Merge remote-tracking branch 'origin/master' into f/centos8
Signed-off-by: Charles Short <charles.short@windriver.com> Change-Id: I63728d8d3a20b98c3114ebead4bd3007fbe187b5
This commit is contained in:
commit
6c2905e665
|
@ -25,6 +25,7 @@ mtce-guestServer
|
|||
nfscheck
|
||||
radvd
|
||||
config-gate-worker
|
||||
isolcpus-device-plugin
|
||||
kernel-rt
|
||||
kernel-module-igb-uio
|
||||
kernel-module-igb-uio-rt
|
||||
|
@ -33,6 +34,7 @@ kernel-rt-modules-extra
|
|||
kmod-e1000e-rt
|
||||
kmod-i40e-rt
|
||||
kmod-iavf-rt
|
||||
kmod-ice-rt
|
||||
kmod-ixgbe-rt
|
||||
kmod-ixgbevf-rt
|
||||
kmod-igb_uio-rt
|
||||
|
@ -53,3 +55,7 @@ openvswitch-config
|
|||
pci-irq-affinity-agent
|
||||
kvm-timer-advance
|
||||
sysinv-fpga-agent
|
||||
kernel-rt-headers
|
||||
kernel-rt-devel
|
||||
kernel-headers
|
||||
kernel-devel
|
||||
|
|
|
@ -13,6 +13,7 @@ kernel-rt-modules-extra
|
|||
kmod-e1000e-rt
|
||||
kmod-i40e-rt
|
||||
kmod-iavf-rt
|
||||
kmod-ice-rt
|
||||
kmod-ixgbe-rt
|
||||
kmod-ixgbevf-rt
|
||||
kmod-igb_uio-rt
|
||||
|
@ -26,3 +27,5 @@ qat17-rt
|
|||
kernel-rt-tools
|
||||
kernel-rt-tools-libs
|
||||
kmod-drbd-rt
|
||||
kernel-rt-headers
|
||||
kernel-rt-devel
|
||||
|
|
|
@ -11,6 +11,7 @@ kernel-module-igb-uio
|
|||
kmod-e1000e
|
||||
kmod-i40e
|
||||
kmod-iavf
|
||||
kmod-ice
|
||||
kmod-ixgbe
|
||||
kmod-ixgbevf
|
||||
kmod-igb_uio
|
||||
|
@ -23,3 +24,5 @@ kernel-tools
|
|||
kernel-tools-libs
|
||||
kmod-drbd
|
||||
kernel-modules-extra
|
||||
kernel-headers
|
||||
kernel-devel
|
||||
|
|
|
@ -69,6 +69,7 @@ influxdb
|
|||
influxdb-extensions
|
||||
io-monitor
|
||||
io-scheduler
|
||||
isolcpus-device-plugin
|
||||
isomd5sum
|
||||
ipxe-roms-qemu
|
||||
kernel-module-openvswitch
|
||||
|
@ -120,8 +121,6 @@ nova-tests
|
|||
nova-api-proxy
|
||||
nova-placement-api
|
||||
novnc
|
||||
net-snmp
|
||||
net-snmp-config
|
||||
openstack-aodh-api
|
||||
openstack-aodh-commmon
|
||||
openstack-aodh-compat
|
||||
|
@ -256,7 +255,6 @@ qemu-kvm-ev
|
|||
qemu-kvm-tools-ev
|
||||
radvd
|
||||
rubygem-rdoc
|
||||
snmp-ext
|
||||
task-cloud-compute
|
||||
task-cloud-controller
|
||||
tgt
|
||||
|
@ -290,6 +288,7 @@ kernel-rt-modules-extra
|
|||
kmod-e1000e-rt
|
||||
kmod-i40e-rt
|
||||
kmod-iavf-rt
|
||||
kmod-ice-rt
|
||||
kmod-ixgbe-rt
|
||||
kmod-ixgbevf-rt
|
||||
kmod-igb_uio-rt
|
||||
|
@ -304,7 +303,6 @@ kernel-rt-tools
|
|||
kernel-rt-tools-libs
|
||||
NaviCLI-Linux-64-x86-en_US
|
||||
kmod-drbd-rt
|
||||
snmp-audittrail
|
||||
wrs-ssl
|
||||
tpm2-tools
|
||||
tss2
|
||||
|
@ -340,6 +338,11 @@ stx-oidc-auth-helm
|
|||
stx-cert-manager-helm
|
||||
stx-nginx-ingress-controller-helm
|
||||
stx-portieris-helm
|
||||
stx-snmp-helm
|
||||
stx-vault-helm
|
||||
sysinv-fpga-agent
|
||||
k8s-pod-recovery
|
||||
kernel-rt-headers
|
||||
kernel-rt-devel
|
||||
kernel-headers
|
||||
kernel-devel
|
||||
|
|
|
@ -81,8 +81,6 @@ nova-tests
|
|||
nova-api-proxy
|
||||
nova-placement-api
|
||||
novnc
|
||||
net-snmp
|
||||
net-snmp-config
|
||||
openldap-backend-bdb
|
||||
openldap-backend-dnssrv
|
||||
openldap-backend-hdb
|
||||
|
@ -138,7 +136,6 @@ python-swiftclient
|
|||
python-wsme
|
||||
fm-mgr
|
||||
fm-rest-api
|
||||
snmp-ext
|
||||
sm
|
||||
sm-api
|
||||
sm-client
|
||||
|
@ -258,6 +255,7 @@ kernel-rt-modules-extra
|
|||
kmod-e1000e-rt
|
||||
kmod-i40e-rt
|
||||
kmod-iavf-rt
|
||||
kmod-ice-rt
|
||||
kmod-ixgbe-rt
|
||||
kmod-ixgbevf-rt
|
||||
kmod-igb_uio-rt
|
||||
|
@ -272,7 +270,6 @@ kernel-rt-tools
|
|||
kernel-rt-tools-libs
|
||||
NaviCLI-Linux-64-x86-en_US
|
||||
kmod-drbd-rt
|
||||
snmp-audittrail
|
||||
wrs-ssl
|
||||
tpm2-tools
|
||||
tss2
|
||||
|
@ -301,5 +298,8 @@ stx-oidc-auth-helm
|
|||
stx-cert-manager-helm
|
||||
stx-nginx-ingress-controller-helm
|
||||
stx-portieris-helm
|
||||
stx-snmp-helm
|
||||
stx-vault-helm
|
||||
k8s-pod-recovery
|
||||
kernel-rt-headers
|
||||
kernel-rt-devel
|
||||
|
|
|
@ -81,8 +81,6 @@ nova-tests
|
|||
nova-api-proxy
|
||||
nova-placement-api
|
||||
novnc
|
||||
net-snmp
|
||||
net-snmp-config
|
||||
neutron-plugin-ml2
|
||||
neutron-server
|
||||
neutron-tests
|
||||
|
@ -141,7 +139,6 @@ python-swiftclient
|
|||
python-wsme
|
||||
fm-mgr
|
||||
fm-rest-api
|
||||
snmp-ext
|
||||
sm
|
||||
sm-api
|
||||
sm-client
|
||||
|
@ -261,6 +258,7 @@ kernel-module-igb-uio
|
|||
kmod-e1000e
|
||||
kmod-i40e
|
||||
kmod-iavf
|
||||
kmod-ice
|
||||
kmod-ixgbe
|
||||
kmod-ixgbevf
|
||||
kmod-igb_uio
|
||||
|
@ -274,7 +272,6 @@ kernel-tools-libs
|
|||
kernel-modules-extra
|
||||
NaviCLI-Linux-64-x86-en_US
|
||||
kmod-drbd-rt
|
||||
snmp-audittrail
|
||||
wrs-ssl
|
||||
tpm2-tools
|
||||
tss2
|
||||
|
@ -302,5 +299,8 @@ stx-oidc-auth-helm
|
|||
stx-cert-manager-helm
|
||||
stx-nginx-ingress-controller-helm
|
||||
stx-portieris-helm
|
||||
stx-snmp-helm
|
||||
stx-vault-helm
|
||||
k8s-pod-recovery
|
||||
kernel-headers
|
||||
kernel-devel
|
||||
|
|
|
@ -29,11 +29,12 @@
|
|||
## ETCD_STOR_SIZE = 5GiB
|
||||
## CEPH_MON_SIZE = 20GiB
|
||||
## KUBELET_STOR_SIZE = 10GiB
|
||||
## DC_VAULT_SIZE = 15GiB
|
||||
## RESERVED_PE = 16MiB (based on pesize=32768)
|
||||
##
|
||||
## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 163.02GiB
|
||||
## CGCS_PV_SIZE = (10 + 2*10 + 25 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 178.02GiB
|
||||
##
|
||||
##***************************************************************************************************
|
||||
##**********************************************************************************************************
|
||||
## Small disk install - (for disks below 240GB)
|
||||
## - DB size is doubled to allow for upgrades
|
||||
##
|
||||
|
@ -50,11 +51,12 @@
|
|||
## ETCD_STOR_SIZE = 5GiB
|
||||
## CEPH_MON_SIZE = 20GiB
|
||||
## KUBELET_STOR_SIZE = 10GiB
|
||||
## DC_VAULT_SIZE = 15GiB
|
||||
## RESERVED_PE = 16MiB (based on pesize=32768)
|
||||
##
|
||||
## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10)GiB + 16MiB/1024 = 148.02GiB
|
||||
## CGCS_PV_SIZE = (10 + 2*5 + 20 + 8 + 16 + 2 + 1 + 30 + 16 + 5 + 20 + 10 + 15)GiB + 16MiB/1024 = 163.02GiB
|
||||
##
|
||||
##***************************************************************************************************
|
||||
##*********************************************************************************************************
|
||||
## Tiny disk install - (for disks below 154GB)
|
||||
##
|
||||
## NOTE: Tiny disk setup is mainly for StarlingX running in QEMU/KVM VM.
|
||||
|
@ -89,15 +91,15 @@ EFI_SIZE=300
|
|||
# which are DEFAULT_SMALL_DISK_SIZE
|
||||
# MINIMUM_SMALL_DISK_SIZE
|
||||
default_small_disk_size=240
|
||||
minimum_small_disk_size=181
|
||||
minimum_small_disk_size=196
|
||||
sz=$(blockdev --getsize64 $rootfs_device)
|
||||
# Round CGCS_PV_SIZE to the closest upper value that can be divided by 1024.
|
||||
if [ $sz -gt $(($default_small_disk_size*$gb)) ] ; then
|
||||
# Large disk: CGCS_PV_SIZE=164GiB*1024=167936
|
||||
CGCS_PV_SIZE=167936
|
||||
# Large disk: CGCS_PV_SIZE=179GiB*1024=183296
|
||||
CGCS_PV_SIZE=183296
|
||||
elif [ $sz -ge $(($minimum_small_disk_size*$gb)) ] ; then
|
||||
# Small disk: CGCS_PV_SIZE=149GiB*1024=152576
|
||||
CGCS_PV_SIZE=152576
|
||||
# Small disk: CGCS_PV_SIZE=164GiB*1024=167936
|
||||
CGCS_PV_SIZE=167936
|
||||
else
|
||||
# Tiny disk: CGCS_PV_SIZE=43GiB*1024=44032
|
||||
# Using a disk with a size under 60GiB will fail.
|
||||
|
|
|
@ -167,6 +167,13 @@ else
|
|||
# Avoid wiping ceph osds if sysinv tells us so
|
||||
if [ ${WIPE_CEPH_OSDS} == "false" ]; then
|
||||
wipe_dev="true"
|
||||
|
||||
pvs | grep -q "$dev *ceph"
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "skip rook provisoned disk $dev"
|
||||
continue
|
||||
fi
|
||||
|
||||
part_numbers=( `parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}'` )
|
||||
# Scanning the partitions looking for CEPH OSDs and
|
||||
# skipping any disk found with such partitions
|
||||
|
@ -178,7 +185,15 @@ else
|
|||
wipe_dev="false"
|
||||
break
|
||||
fi
|
||||
|
||||
pvs | grep -q -e "${dev}${part_number} *ceph" -e "${dev}p${part_number} *ceph"
|
||||
if [ $? -eq 0 ]; then
|
||||
wlog "Rook OSD found on $dev$part_number, skip wipe"
|
||||
wipe_dev="false"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$wipe_dev" == "false" ]; then
|
||||
continue
|
||||
fi
|
||||
|
|
|
@ -6,6 +6,6 @@ COPY_LIST="pxe-network-installer/* \
|
|||
/import/mirrors/CentOS/stx-installer/vmlinuz \
|
||||
"
|
||||
|
||||
TIS_PATCH_VER=28
|
||||
TIS_PATCH_VER=PKG_GITREVCOUNT+13
|
||||
BUILD_IS_BIG=4
|
||||
BUILD_IS_SLOW=4
|
||||
|
|
|
@ -110,6 +110,7 @@ install -v -m 644 %{_sourcedir}/efi-centos-pxe-worker_lowlatency-install \
|
|||
install -v -m 644 %{_sourcedir}/efi-centos-pxe-smallsystem_lowlatency-install \
|
||||
%{buildroot}/pxeboot/pxelinux.cfg.files/efi-pxe-smallsystem_lowlatency-install-%{platform_release}
|
||||
|
||||
ln -sf /pxeboot/EFI/grubx64.efi %{buildroot}/pxeboot/grubx64.efi
|
||||
|
||||
sed -i "s/xxxSW_VERSIONxxx/%{platform_release}/g" \
|
||||
%{buildroot}/pxeboot/pxelinux.cfg.files/pxe-* \
|
||||
|
|
|
@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
|
|||
*
|
||||
*************************************************************************/
|
||||
|
||||
string bmcUtil_create_data_fn ( string & hostname,
|
||||
string file_suffix,
|
||||
bmc_protocol_enum protocol )
|
||||
string bmcUtil_create_data_fn ( const string & hostname,
|
||||
string file_suffix,
|
||||
bmc_protocol_enum protocol )
|
||||
{
|
||||
/* create the output filename */
|
||||
string datafile ;
|
||||
|
|
|
@ -82,6 +82,14 @@ typedef struct
|
|||
|
||||
} bmc_info_type ;
|
||||
|
||||
/* Host identity plus the board management controller (BMC) address and
 * credentials needed to issue a BMC request against that host. */
typedef struct
{
    string hostname ; /* name of the host                       */
    string host_ip  ; /* address of the host itself             */
    string bm_ip    ; /* address of the host's BMC              */
    string bm_un    ; /* BMC username                           */
    string bm_pw    ; /* BMC password                           */
} bmcUtil_accessInfo_type ;
|
||||
|
||||
/* BMC commands */
|
||||
typedef enum
|
||||
|
@ -107,6 +115,7 @@ typedef enum
|
|||
#define BMC_QUERY_FILE_SUFFIX ((const char *)("_root_query"))
|
||||
#define BMC_INFO_FILE_SUFFIX ((const char *)("_bmc_info"))
|
||||
#define BMC_POWER_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
|
||||
#define BMC_RESET_CMD_FILE_SUFFIX ((const char *)("_reset"))
|
||||
#define BMC_BOOTDEV_CMD_FILE_SUFFIX ((const char *)("_bootdev"))
|
||||
#define BMC_RESTART_CAUSE_FILE_SUFFIX ((const char *)("_restart_cause"))
|
||||
#define BMC_POWER_STATUS_FILE_SUFFIX ((const char *)("_power_status"))
|
||||
|
@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
|
|||
bmc_protocol_enum protocol );
|
||||
|
||||
/* create the output filename */
|
||||
string bmcUtil_create_data_fn ( string & hostname,
|
||||
string file_suffix,
|
||||
bmc_protocol_enum protocol );
|
||||
string bmcUtil_create_data_fn ( const string & hostname,
|
||||
string file_suffix,
|
||||
bmc_protocol_enum protocol );
|
||||
|
||||
/* Get power state from query response data. */
|
||||
int bmcUtil_is_power_on ( string hostname,
|
||||
|
|
|
@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un )
|
|||
return (false);
|
||||
}
|
||||
|
||||
/* Report whether 'pw' holds a usable password.
 *
 * Returns true only when the string is non-empty and differs from the
 * NONE placeholder ; returns false otherwise. */
bool hostUtil_is_valid_pw ( string pw )
{
    /* compare() yields non-zero when the two strings differ */
    const bool usable = ( pw.empty() == false ) && ( pw.compare(NONE) != 0 ) ;
    return ( usable );
}
|
||||
|
||||
bool hostUtil_is_valid_mac_addr ( string mac )
|
||||
{
|
||||
if ( !mac.empty() )
|
||||
|
|
|
@ -46,6 +46,7 @@ string hostUtil_getPrefixPath ( void );
|
|||
bool hostUtil_is_valid_uuid ( string uuid );
|
||||
bool hostUtil_is_valid_ip_addr ( string ip );
|
||||
bool hostUtil_is_valid_username ( string un );
|
||||
bool hostUtil_is_valid_pw ( string pw );
|
||||
bool hostUtil_is_valid_bm_type ( string bm_type );
|
||||
|
||||
int hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data );
|
||||
|
|
|
@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty
|
|||
ipmiUtil_bmc_info_log ( hostname, bmc_info, rc );
|
||||
return (rc);
|
||||
}
|
||||
|
||||
|
||||
/*****************************************************************************
 *
 * Name       : ipmiUtil_reset_host_now
 *
 * Purpose    : Issue an ipmitool power reset request for the specified
 *              host inline, using a blocking system() call.
 *
 * Parameters : hostname        - name used in the latency warning log
 *              accessInfo      - host identity, BMC address and credentials
 *              output_filename - output file handed to the request builder
 *
 * Returns    : 0 when system() returns 0, otherwise FAIL_SYSTEM_CALL.
 *
 *****************************************************************************/
int ipmiUtil_reset_host_now ( string hostname,
                              bmcUtil_accessInfo_type accessInfo,
                              string output_filename)
{
    dlog("%s %s BMC [IP:%s UN:%s]",
             accessInfo.hostname.c_str(),
             accessInfo.host_ip.c_str(),
             accessInfo.bm_ip.c_str(),
             accessInfo.bm_un.c_str());

    /* create the bmc and ipmitool output directories if they are missing */
    if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false )
        daemon_make_dir(BMC_OUTPUT_DIR) ;
    if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false )
        daemon_make_dir(IPMITOOL_OUTPUT_DIR) ;

    /* create temp password file */
    thread_info_type info ;
    info.hostname = accessInfo.hostname ;
    info.password_file = "" ;
    info.pw_file_fd = 0 ;

    /* Use common utility to create a temp pw file */
    bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL );

    /* create request */
    string request =
        ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD,
                                  accessInfo.bm_ip,
                                  accessInfo.bm_un,
                                  info.password_file,
                                  output_filename );

    /* issue request
     *
     * Note: Could launch a thread to avoid any stall.
     *       However, mtcClient can withstand up to a 25 second stall
     *       before pmon will fail it due to active monitoring.
     *       UT showed that there is no stall at all. */
    unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ;
    unsigned long long before_time = gettime_monotonic_nsec () ;
    int rc = system ( request.data()) ;
    unsigned long long after_time = gettime_monotonic_nsec () ;
    unsigned long long delta_time = after_time-before_time ;
    if ( rc )
    {
        /* any non-zero system() result is collapsed into one failure code */
        wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) );
        rc = FAIL_SYSTEM_CALL ;
    }
    if ( delta_time > (latency_threshold_secs*1000000000))
    {
        /* Print whole seconds followed by the nanosecond remainder as a
         * decimal fraction. The remainder must be zero padded to nine
         * digits ; space padding made a 5 msec remainder read as .5 sec.
         * NOTE(review): assumes NSEC_TO_SEC is 1000000000 - confirm */
        unsigned long long secs  = delta_time / NSEC_TO_SEC ;
        unsigned long long nsecs = delta_time % NSEC_TO_SEC ;
        wlog ("%s bmc system call took %2llu.%09llu sec", hostname.c_str(), secs, nsecs );
    }

    /* Cleanup ; close the password file if one was opened, then remove it */
    if ( info.pw_file_fd > 0 )
        close(info.pw_file_fd);
    daemon_remove_file ( info.password_file.data());
    return (rc);
}
|
||||
|
|
|
@ -57,6 +57,8 @@ int ipmiUtil_init ( void );
|
|||
|
||||
int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info );
|
||||
|
||||
int ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename );
|
||||
|
||||
/* Create the ipmi request */
|
||||
string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out );
|
||||
|
||||
|
|
|
@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
|
|||
case MTC_REQ_MTCALIVE: return ("mtcAlive req");
|
||||
case MTC_MSG_LOCKED: return ("locked msg");
|
||||
case MTC_CMD_LAZY_REBOOT: return ("lazy reboot");
|
||||
case MTC_MSG_INFO: return ("info msg");
|
||||
case MTC_CMD_SYNC: return ("sync");
|
||||
|
||||
/* goenabled commands and messages */
|
||||
case MTC_MSG_MAIN_GOENABLED: return ("goEnabled main msg");
|
||||
|
@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
|
|||
case MTC_EVENT_PMON_MAJOR: return("pmon major event");
|
||||
case MTC_EVENT_PMON_MINOR: return("pmon minor event");
|
||||
case MTC_EVENT_PMON_LOG: return("pmon log");
|
||||
case MTC_EVENT_PMOND_RAISE: return("pmon raise");
|
||||
case MTC_EVENT_PMOND_RAISE: return("pmond raise");
|
||||
case MTC_EVENT_PMOND_CLEAR: return("pmond clear");
|
||||
|
||||
/* data port events */
|
||||
case MTC_EVENT_AVS_CLEAR: return("AVS clear");
|
||||
|
@ -394,10 +397,9 @@ void mtc_stages_init ( void )
|
|||
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start";
|
||||
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak";
|
||||
recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change";
|
||||
recoveryStages_str[MTC_RECOVERY__ENABLE_START ] = "Enable-Start";
|
||||
recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure";
|
||||
recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT ] = "Enable-Wait";
|
||||
recoveryStages_str[MTC_RECOVERY__ENABLE ] = "Enable";
|
||||
recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown";
|
||||
|
||||
disableStages_str [MTC_DISABLE__START ] = "Disable-Start";
|
||||
|
|
|
@ -185,7 +185,7 @@ typedef enum
|
|||
#define DEFAULT_MTCALIVE_TIMEOUT (1200)
|
||||
#define DEFAULT_GOENABLE_TIMEOUT (300)
|
||||
#define DEFAULT_DOR_MODE_TIMEOUT (20)
|
||||
#define DEFAULT_DOR_MODE_CPE_TIMEOUT (600)
|
||||
#define DEFAULT_DOR_MODE_AIO_TIMEOUT (600)
|
||||
|
||||
/** TODO: Convert names to omit JSON part */
|
||||
#define MTC_JSON_INV_LABEL "ihosts"
|
||||
|
@ -263,6 +263,7 @@ typedef enum
|
|||
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
|
||||
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
|
||||
#define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
|
||||
#define MTC_TASK_RECOVERY_FAIL_HB "Graceful Recovery Heartbeat Failure, re-enabling"
|
||||
#define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
|
||||
#define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
|
||||
#define MTC_TASK_RECOVERED "Gracefully Recovered"
|
||||
|
@ -311,7 +312,7 @@ typedef enum
|
|||
#define MTC_TASK_POWERCYCLE_FAIL "Critical Event Power-Cycle %d; failed"
|
||||
#define MTC_TASK_POWERCYCLE_DOWN "Critical Event Power-Down ; due to persistent critical sensor"
|
||||
#define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor"
|
||||
#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
|
||||
#define MTC_TASK_AIO_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
|
||||
#define MTC_TASK_SELF_UNLOCK_MSG "Unlocking active controller, please stand-by while it reboots"
|
||||
#define MTC_TASK_FAILED_SWACT_REQ "Critical failure.Requesting SWACT to enabled standby controller"
|
||||
#define MTC_TASK_FAILED_NO_BACKUP "Critical failure.Please provision/enable standby controller"
|
||||
|
@ -383,8 +384,8 @@ typedef enum
|
|||
/* 5 milliseconds */
|
||||
#define MTCAGENT_SELECT_TIMEOUT (5000)
|
||||
|
||||
/* dedicate more idle time in CPE ; there is less maintenance to do */
|
||||
#define MTCAGENT_CPE_SELECT_TIMEOUT (10000)
|
||||
/* dedicate more idle time in AIO ; there is less maintenance to do */
|
||||
#define MTCAGENT_AIO_SELECT_TIMEOUT (10000)
|
||||
|
||||
/** Number of retries maintenance will do when it experiences
|
||||
* a REST API call failure ; any failure */
|
||||
|
@ -751,7 +752,9 @@ typedef struct
|
|||
#define MTC_CMD_START_STORAGE_SVCS 19 /* to host */
|
||||
#define MTC_CMD_LAZY_REBOOT 20 /* to host */
|
||||
#define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */
|
||||
#define MTC_CMD_LAST 22
|
||||
#define MTC_MSG_INFO 22 /* to host */
|
||||
#define MTC_CMD_SYNC 23 /* to host */
|
||||
#define MTC_CMD_LAST 24
|
||||
|
||||
#define RESET_PROG_MAX_REBOOTS_B4_RESET (5)
|
||||
#define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2)
|
||||
|
@ -946,7 +949,7 @@ typedef enum
|
|||
string get_delStages_str ( mtc_delStages_enum stage );
|
||||
|
||||
|
||||
#define MTC_MAX_FAST_ENABLES (3)
|
||||
#define MTC_MAX_FAST_ENABLES (5)
|
||||
typedef enum
|
||||
{
|
||||
MTC_RECOVERY__START = 0,
|
||||
|
@ -972,10 +975,9 @@ typedef enum
|
|||
MTC_RECOVERY__HEARTBEAT_START,
|
||||
MTC_RECOVERY__HEARTBEAT_SOAK,
|
||||
MTC_RECOVERY__STATE_CHANGE,
|
||||
MTC_RECOVERY__ENABLE_START,
|
||||
MTC_RECOVERY__FAILURE,
|
||||
MTC_RECOVERY__WORKQUEUE_WAIT,
|
||||
MTC_RECOVERY__ENABLE_WAIT,
|
||||
MTC_RECOVERY__ENABLE,
|
||||
MTC_RECOVERY__STAGES,
|
||||
} mtc_recoveryStages_enum ;
|
||||
|
||||
|
@ -1263,6 +1265,14 @@ typedef enum
|
|||
MTC_AR_DISABLE_CAUSE__NONE,
|
||||
} autorecovery_disable_cause_enum ;
|
||||
|
||||
/* Codes that each name one specific group of maintenance information,
 * typically the information belonging to a single feature. */
typedef enum
{
    MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO = 0,
    MTC_INFO_CODE__LAST /* end marker ; keep as the final entry */
} mtcInfo_enum ;
|
||||
|
||||
/* Service Based Auto Recovery Control Structure */
|
||||
typedef struct
|
||||
{
|
||||
|
|
|
@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl )
|
|||
return (false);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : thread_done_consume
|
||||
*
|
||||
* Description: Return to IDLE stage.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info )
|
||||
{
|
||||
if ( ctrl.stage == THREAD_STAGE__IDLE )
|
||||
{
|
||||
return PASS ;
|
||||
}
|
||||
else if ( ctrl.done == false )
|
||||
{
|
||||
if ( info.runcount > ctrl.runcount )
|
||||
{
|
||||
ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d",
|
||||
info.hostname.c_str(),
|
||||
info.command,
|
||||
ctrl.runcount,
|
||||
info.runcount);
|
||||
ctrl.done = true ;
|
||||
ctrl.stage = THREAD_STAGE__DONE ;
|
||||
thread_handler (ctrl, info);
|
||||
return PASS ;
|
||||
}
|
||||
else
|
||||
{
|
||||
thread_kill(ctrl, info);
|
||||
return RETRY ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ctrl.stage = THREAD_STAGE__DONE ;
|
||||
thread_handler( ctrl, info );
|
||||
return PASS ;
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : thread_launch
|
||||
|
@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info )
|
|||
( ctrl.stage != THREAD_STAGE__WAIT ) &&
|
||||
( ctrl.stage != THREAD_STAGE__IDLE ))
|
||||
{
|
||||
blog ("%s kill request\n", ctrl.hostname.c_str() );
|
||||
wlog ("%s kill request\n", ctrl.hostname.c_str() );
|
||||
_stage_change ( ctrl, THREAD_STAGE__KILL );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -284,6 +284,7 @@ bool thread_done ( thread_ctrl_type & ctrl );
|
|||
bool thread_idle ( thread_ctrl_type & ctrl );
|
||||
void thread_kill ( thread_ctrl_type & ctrl , thread_info_type & info );
|
||||
string thread_stage ( thread_ctrl_type & ctrl );
|
||||
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info );
|
||||
|
||||
/* Cooperative service of cancel and exit requests from parent */
|
||||
void pthread_signal_handler ( thread_info_type * info_ptr );
|
||||
|
|
|
@ -38,15 +38,15 @@ using namespace std ;
|
|||
/* List of different types */
|
||||
typedef enum
|
||||
{
|
||||
SYSTEM_TYPE__NORMAL =0,
|
||||
SYSTEM_TYPE__CPE_MODE__DUPLEX =1,
|
||||
SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT =2,
|
||||
SYSTEM_TYPE__CPE_MODE__SIMPLEX =3,
|
||||
SYSTEM_TYPE__NORMAL =0,
|
||||
SYSTEM_TYPE__AIO__DUPLEX =1,
|
||||
SYSTEM_TYPE__AIO__DUPLEX_DIRECT =2,
|
||||
SYSTEM_TYPE__AIO__SIMPLEX =3,
|
||||
} system_type_enum ;
|
||||
|
||||
|
||||
/** Called by signal handler on daemon exit
|
||||
* Performs cleanup by closing open files
|
||||
* Performs cleanup by closing open files
|
||||
* and freeing used memory */
|
||||
void daemon_exit ( void );
|
||||
|
||||
|
|
|
@ -347,7 +347,7 @@ string daemon_mgmnt_iface ( void )
|
|||
system_type_enum daemon_system_type ( void )
|
||||
{
|
||||
char buffer [BUFFER];
|
||||
system_type_enum system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
|
||||
system_type_enum system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
|
||||
FILE * cfg_file_stream = fopen ( PLATFORM_CONF_FILE, "r" );
|
||||
if ( cfg_file_stream != NULL )
|
||||
{
|
||||
|
@ -401,11 +401,11 @@ system_type_enum daemon_system_type ( void )
|
|||
if ( !mode.empty() )
|
||||
{
|
||||
if ( mode.compare("duplex") == 0 )
|
||||
system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX ;
|
||||
system_type = SYSTEM_TYPE__AIO__DUPLEX ;
|
||||
else if ( mode.compare("duplex-direct") == 0 )
|
||||
system_type = SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT ;
|
||||
system_type = SYSTEM_TYPE__AIO__DUPLEX_DIRECT ;
|
||||
else if ( mode.compare("simplex") == 0 )
|
||||
system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
|
||||
system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
|
||||
else
|
||||
{
|
||||
elog ("%s All-In-One system type ; mode unknown\n", SYSTEM_TYPE_PREFIX );
|
||||
|
@ -438,21 +438,21 @@ system_type_enum daemon_system_type ( void )
|
|||
ilog("%s Standard System\n", SYSTEM_TYPE_PREFIX);
|
||||
break ;
|
||||
}
|
||||
case SYSTEM_TYPE__CPE_MODE__DUPLEX_DIRECT:
|
||||
case SYSTEM_TYPE__AIO__DUPLEX_DIRECT:
|
||||
{
|
||||
ilog ("%s All-in-one Duplex Direct Connect\n", SYSTEM_TYPE_PREFIX );
|
||||
break ;
|
||||
}
|
||||
case SYSTEM_TYPE__CPE_MODE__DUPLEX:
|
||||
case SYSTEM_TYPE__AIO__DUPLEX:
|
||||
{
|
||||
ilog ("%s All-in-one Duplex\n", SYSTEM_TYPE_PREFIX );
|
||||
break ;
|
||||
}
|
||||
case SYSTEM_TYPE__CPE_MODE__SIMPLEX:
|
||||
case SYSTEM_TYPE__AIO__SIMPLEX:
|
||||
default:
|
||||
{
|
||||
ilog ("%s All-in-one Simplex \n", SYSTEM_TYPE_PREFIX );
|
||||
system_type = SYSTEM_TYPE__CPE_MODE__SIMPLEX ;
|
||||
system_type = SYSTEM_TYPE__AIO__SIMPLEX ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,22 +1,13 @@
|
|||
[Unit]
|
||||
Description=StarlingX Maintenance Heartbeat Agent
|
||||
After=network.target syslog.service config.service
|
||||
After=hbsClient.service
|
||||
Before=pmon.service
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
||||
ExecStop=/etc/rc.d/init.d/hbsAgent start
|
||||
ExecStop=/etc/rc.d/init.d/hbsAgent stop
|
||||
PIDFile=/var/run/hbsAgent.pid
|
||||
KillMode=process
|
||||
SendSIGKILL=no
|
||||
|
||||
# Process recovery is handled by pmond if it's running.
|
||||
# Delay 10 seconds to give pmond a chance to recover
|
||||
# before systemd kicks in to do it as a backup plan.
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
#daily
|
||||
nodateext
|
||||
start 1
|
||||
compress
|
||||
copytruncate
|
||||
notifempty
|
||||
missingok
|
||||
#
|
||||
# Copyright (c) 2018-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/mtcalarmd.log
|
||||
{
|
||||
create 0640 root root
|
||||
start 1
|
||||
size 10M
|
||||
rotate 20
|
||||
sharedscripts
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
|||
{
|
||||
ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
}
|
||||
ptr->alarms_loaded = false ;
|
||||
ptr->active_alarms = "" ; /* no active alarms */
|
||||
|
||||
ptr->cfgEvent.base = NULL ;
|
||||
ptr->sysinvEvent.base= NULL ;
|
||||
|
@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
|||
return ptr ;
|
||||
}
|
||||
|
||||
|
||||
struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
|
||||
{
|
||||
/* check for empty list condition */
|
||||
|
@ -2706,7 +2707,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2818,7 +2819,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2835,7 +2836,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2853,7 +2854,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2871,7 +2872,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2889,7 +2890,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -2940,7 +2941,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = MTC_OPER_STATE__DISABLED ;
|
||||
node_ptr->availStatus = MTC_AVAIL_STATUS__OFFLINE ;
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = MTC_OPER_STATE__DISABLED ;
|
||||
node_ptr->availStatus_subf = MTC_AVAIL_STATUS__OFFLINE ;
|
||||
|
@ -2958,7 +2959,7 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
|||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
|
@ -3295,6 +3296,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
*
|
||||
* Name : build_mtcInfo_dict
|
||||
*
|
||||
* Purpose : Build a json dictionary for the specified info code enum
|
||||
*
|
||||
* Assumptions : Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported
|
||||
*
|
||||
* Returns : Returns a json dictionary of mtcInfo.
|
||||
*
|
||||
* {
|
||||
* "controller-0":{
|
||||
* "mgmt_ip":"192.168.204.2",
|
||||
* "bm_ip":"xxx.xxx.xx.23",
|
||||
* "bm_un":"root",
|
||||
* "bm_pw":"root"
|
||||
* },
|
||||
* "controller-1":{
|
||||
* "mgmt_ip":"192.168.204.3",
|
||||
* "bm_ip":"xxx.xxx.xx.24",
|
||||
* "bm_un":"root",
|
||||
* "bm_pw":"root"
|
||||
* }
|
||||
* }
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/* Build the mtcInfo json dictionary string for the given info code.
 *
 * Walks the node list from 'head' and, for MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO,
 * appends one entry per node whose nodetype has the CONTROLLER_TYPE bit set.
 * Each entry is keyed by hostname and carries mgmt_ip, bm_ip, bm_un and bm_pw.
 *
 * Returns an empty string when the list is empty (head == NULL), otherwise
 * a "{...}" string ; that is "{}" when no node matched or the code is not
 * the supported one.
 *
 * NOTE(review): field values are concatenated as-is with no json escaping. */
string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code )
|
||||
{
|
||||
string mtcInfo_dict = "" ;
|
||||
|
||||
/* loop/exit control ; counts the controller entries appended so far */
|
||||
int temp = 0 ;
|
||||
|
||||
/* should never happen but better to be safe */
|
||||
if ( head == NULL )
|
||||
return mtcInfo_dict ;
|
||||
|
||||
/* force the update to be a dictionary */
|
||||
mtcInfo_dict = "{" ;
|
||||
|
||||
/* no loop condition here ; the walk ends through the breaks below */
for ( struct node * ptr = head ; ; ptr = ptr->next )
|
||||
{
|
||||
if (( ptr->nodetype & CONTROLLER_TYPE ) &&
|
||||
( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ))
|
||||
{
|
||||
/* separate this entry from the previous one */
if ( temp )
|
||||
mtcInfo_dict.append(",");
|
||||
mtcInfo_dict.append("\"" + ptr->hostname + "\":{");
|
||||
mtcInfo_dict.append("\"mgmt_ip\":\"" + ptr->ip + "\",");
|
||||
mtcInfo_dict.append("\"bm_ip\":\"" + ptr->bm_ip + "\",");
|
||||
mtcInfo_dict.append("\"bm_un\":\"" + ptr->bm_un + "\",");
|
||||
mtcInfo_dict.append("\"bm_pw\":\"" + ptr->bm_pw + "\"}");
|
||||
/* stop once two controller entries have been added */
if ( ++temp >= 2 )
|
||||
break ;
|
||||
}
|
||||
/* end of list reached */
if (( ptr->next == NULL ) || ( ptr == tail ))
|
||||
break ;
|
||||
}
|
||||
mtcInfo_dict.append("}");
|
||||
return mtcInfo_dict ;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Name : mtcInfo_handler
|
||||
*
|
||||
* Purpose : Send mtcInfo update to provisioned controllers when
|
||||
* the push flag is set.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/* When want_mtcInfo_push is set, build the mtcInfo dictionary and send it
 * as a MTC_MSG_INFO command over MGMNT_INTERFACE to CONTROLLER_0, and to
 * CONTROLLER_1 as well when more than one controller is counted.
 * The push flag is cleared afterwards. */
void nodeLinkClass::mtcInfo_handler ( void )
|
||||
{
|
||||
/* This is set in the bm_handler once access to the BMC using
|
||||
* provisioned credentials has been verified. */
|
||||
if ( this->want_mtcInfo_push )
|
||||
{
|
||||
/* handler will enhance when more codes are introduced */
|
||||
mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ;
|
||||
|
||||
string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code);
|
||||
/* an empty string means there was nothing to build ; send nothing */
if ( ! mtcInfo_dict.empty() )
|
||||
{
|
||||
string temp = CONTROLLER_0 ;
|
||||
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
|
||||
if ( this->controllers > 1 )
|
||||
{
|
||||
temp = CONTROLLER_1;
|
||||
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
|
||||
}
|
||||
}
|
||||
/* the request is cleared whether or not anything was sent ;
 * the send_mtc_cmd return values are not checked */
this->want_mtcInfo_push = false ;
|
||||
}
|
||||
}
|
||||
|
||||
/* Lock Rules
|
||||
*
|
||||
* 1. Cannot lock this controller
|
||||
|
@ -4034,6 +4131,18 @@ int nodeLinkClass::get_uptime_refresh_ctr ( string & hostname )
|
|||
return (0);
|
||||
}
|
||||
|
||||
|
||||
/* Return the mtce_flags value stored for the named host.
 * Returns 0 when getNode finds no node for 'hostname' ; a caller cannot
 * tell that case apart from a host whose flags are 0. */
int nodeLinkClass::get_mtce_flags ( string & hostname )
|
||||
{
|
||||
nodeLinkClass::node* node_ptr ;
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
return ( node_ptr->mtce_flags );
|
||||
}
|
||||
/* host not found ; no flags to report */
return (0);
|
||||
}
|
||||
|
||||
void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
|
||||
{
|
||||
nodeLinkClass::node* node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
|
@ -4114,7 +4223,7 @@ void nodeLinkClass::set_mtce_flags ( string hostname, int flags, int iface )
|
|||
|
||||
|
||||
/* Deal with sub-function if AIO controller host */
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
if ( flags & MTC_FLAG__SUBF_GOENABLED )
|
||||
{
|
||||
|
@ -4422,6 +4531,18 @@ string nodeLinkClass::get_bm_ip ( string hostname )
|
|||
return ("");
|
||||
}
|
||||
|
||||
/* Return the bm_pw string stored for the named host.
 * When getNode finds no node for 'hostname' an error is logged and an
 * empty string is returned. */
string nodeLinkClass::get_bm_pw ( string hostname )
|
||||
{
|
||||
nodeLinkClass::node* node_ptr ;
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
return (node_ptr->bm_pw);
|
||||
}
|
||||
/* host not found ; log only the hostname */
elog ("%s bm pw lookup failed\n", hostname.c_str() );
|
||||
return ("");
|
||||
}
|
||||
|
||||
string nodeLinkClass::get_bm_un ( string hostname )
|
||||
{
|
||||
nodeLinkClass::node* node_ptr ;
|
||||
|
@ -4774,7 +4895,10 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
|||
|
||||
/* Otherwise this is a single host that has recovered
|
||||
* possibly as part of a mnfa group or simply a lone wolf */
|
||||
else
|
||||
else if (( node_ptr->hbs_minor[MGMNT_IFACE] == false ) &&
|
||||
(( clstr_network_provisioned == false ) ||
|
||||
(( clstr_network_provisioned == true ) &&
|
||||
( node_ptr->hbs_minor[CLSTR_IFACE] == false ))))
|
||||
{
|
||||
if ( node_ptr->mnfa_graceful_recovery == true )
|
||||
{
|
||||
|
@ -4782,6 +4906,8 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
|||
mnfa_awol_list.remove(node_ptr->hostname);
|
||||
}
|
||||
|
||||
/* Don't recover until heartbeat is working over all
|
||||
* monitored interfaces */
|
||||
mnfa_recover_host ( node_ptr );
|
||||
|
||||
if ( mnfa_active == true )
|
||||
|
@ -4819,17 +4945,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
|||
}
|
||||
|
||||
if ( temp_count != mnfa_host_count[iface] )
|
||||
{
|
||||
{
|
||||
slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
mnfa_host_count[iface], temp_count );
|
||||
mnfa_host_count[iface] = temp_count ;
|
||||
mnfa_host_count[iface] = temp_count ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s MNFA host tally (%s:%d)\n",
|
||||
dlog ("%s MNFA host tally (%s:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
mnfa_host_count[iface] );
|
||||
|
@ -4935,11 +5061,28 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
|||
}
|
||||
return ;
|
||||
}
|
||||
else if ( node_ptr->recoveryStage == MTC_RECOVERY__HEARTBEAT_SOAK )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss *** (during recovery soak)\n",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
force_full_enable ( node_ptr );
|
||||
return ;
|
||||
}
|
||||
|
||||
mnfa_add_host ( node_ptr , iface );
|
||||
|
||||
if ( mnfa_active == false )
|
||||
{
|
||||
/* if node is already in graceful recovery just ignore the event */
|
||||
if ( node_ptr->graceful_recovery_counter != 0 )
|
||||
{
|
||||
dlog ("%s %s loss event ; already in graceful recovery try %d",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
node_ptr->graceful_recovery_counter );
|
||||
return ;
|
||||
}
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
if ( iface == CLSTR_IFACE )
|
||||
{
|
||||
|
@ -4980,6 +5123,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
|||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : manage_heartbeat_clear
|
||||
*
|
||||
* Description: Manage clearing heartbeat failure status
|
||||
*
|
||||
* Assumptions : Called by both hbsAgent and mtcAgent
|
||||
*
|
||||
***************************************************************************/
|
||||
void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
||||
{
|
||||
nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
|
@ -4995,13 +5147,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
|||
node_ptr->heartbeat_failed[i] = false ;
|
||||
if ( i == MGMNT_IFACE )
|
||||
{
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
}
|
||||
if ( i == CLSTR_IFACE )
|
||||
{
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5010,13 +5166,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
|
|||
node_ptr->heartbeat_failed[iface] = false ;
|
||||
if ( iface == MGMNT_IFACE )
|
||||
{
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
|
||||
}
|
||||
else if ( iface == CLSTR_IFACE )
|
||||
{
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
if ( heartbeat )
|
||||
node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
if ( maintenance )
|
||||
node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5795,9 +5955,6 @@ int nodeLinkClass::critical_process_failed( string & hostname,
|
|||
node_ptr->hostname.c_str()); /* dlog */
|
||||
}
|
||||
|
||||
/* Start fresh the next time we enter graceful recovery handler */
|
||||
node_ptr->graceful_recovery_counter = 0 ;
|
||||
|
||||
/* Set node as unlocked-disabled-failed */
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__DISABLED,
|
||||
|
@ -6755,7 +6912,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
|
|||
}
|
||||
|
||||
/** Validate and log Recovery stage changes */
|
||||
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
mtc_recoveryStages_enum newHdlrStage )
|
||||
{
|
||||
int rc = PASS ;
|
||||
|
@ -6763,14 +6920,14 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
|
|||
if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
|
||||
( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
|
||||
{
|
||||
slog ("%s Invalid recovery stage (%d:%d)\n",
|
||||
slog ("%s Invalid recovery stage (%d:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->recoveryStage,
|
||||
node_ptr->recoveryStage,
|
||||
newHdlrStage );
|
||||
|
||||
if ( newHdlrStage < MTC_RECOVERY__STAGES )
|
||||
{
|
||||
clog ("%s ? -> %s\n",
|
||||
clog ("%s ? -> %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_recoveryStages_str(newHdlrStage).c_str());
|
||||
|
||||
|
@ -6782,11 +6939,11 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
|
|||
rc = FAIL ;
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
clog ("%s %s -> %s\n",
|
||||
clog ("%s %s -> %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
|
||||
get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
|
||||
get_recoveryStages_str(newHdlrStage).c_str());
|
||||
|
||||
node_ptr->recoveryStage = newHdlrStage ;
|
||||
|
@ -7514,7 +7671,7 @@ int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
|
|||
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
|
||||
|
||||
if (( NOT_THIS_HOST ) &&
|
||||
( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ))
|
||||
( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX ))
|
||||
{
|
||||
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
|
||||
this->ar_threshold [node_ptr->ar_cause] )
|
||||
|
@ -7746,7 +7903,11 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
|
|||
|
||||
if ( true_false == true )
|
||||
{
|
||||
ilog ("%s heartbeat start", hostname.c_str());
|
||||
ilog ("%s %s heartbeat %sstart",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
node_ptr->monitor[iface] ? "re" : "");
|
||||
|
||||
node_ptr->no_work_log_throttle = 0 ;
|
||||
node_ptr->b2b_misses_count[iface] = 0 ;
|
||||
node_ptr->hbs_misses_count[iface] = 0 ;
|
||||
|
@ -7758,7 +7919,12 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
|
|||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s heartbeat stop", hostname.c_str());
|
||||
if ( node_ptr->monitor[iface] == true )
|
||||
{
|
||||
ilog ("%s %s heartbeat stop",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
}
|
||||
}
|
||||
node_ptr->monitor[iface] = true_false ;
|
||||
}
|
||||
|
@ -7771,7 +7937,7 @@ int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool sen
|
|||
void nodeLinkClass::set_hwmond_monitor_state ( string & hostname, bool state )
|
||||
{
|
||||
if ( hostname.length() )
|
||||
{
|
||||
{
|
||||
struct nodeLinkClass::node* node_ptr ;
|
||||
node_ptr = nodeLinkClass::getNode ( hostname );
|
||||
if ( node_ptr != NULL )
|
||||
|
@ -8511,7 +8677,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
|
|||
|
||||
|
||||
|
||||
#define HBS_LOSS_REPORT_THROTTLE (100)
|
||||
#define HBS_LOSS_REPORT_THROTTLE (100000)
|
||||
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
{
|
||||
int lost = 0 ;
|
||||
|
@ -8551,6 +8717,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
|
||||
{
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " +
|
||||
get_iface_name_str(iface) +
|
||||
" heartbeat miss " +
|
||||
itos(pulse_ptr->b2b_misses_count[iface]));
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
|
@ -8657,57 +8830,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
}
|
||||
}
|
||||
|
||||
/* Turn the cluster-host heartbeat loss into a degrade only
|
||||
* condition if the clstr_degrade_only flag is set */
|
||||
if (( iface == CLSTR_IFACE ) &&
|
||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
||||
( clstr_degrade_only == true ))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
|
||||
/* Turn the clstr heartbeat loss into a degrade only
|
||||
* condition for inactive controller on normal system. */
|
||||
else if (( iface == CLSTR_IFACE ) &&
|
||||
( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
|
||||
( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
|
||||
if (( iface == CLSTR_IFACE ) &&
|
||||
((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
|
||||
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
|
||||
( clstr_degrade_only == true )))
|
||||
{
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
|
||||
{
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
|
||||
wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
get_iface_name_str(iface),
|
||||
clstr_degrade_only ? "config option" : "system type");
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||
}
|
||||
}
|
||||
|
||||
else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
|
||||
// else if ( pulse_ptr->hbs_failure[iface] == false )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
|
||||
pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
pulse_ptr->b2b_misses_count[iface]);
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
|
||||
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
if ( pulse_ptr->hbs_failure[iface] == false )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
{
|
||||
|
@ -8715,10 +8874,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
|||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
}
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
|
||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
|
@ -8963,21 +9120,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtcAlive_online ? 'Y' : 'N',
|
||||
node_ptr->mtcAlive_offline ? 'Y' : 'N',
|
||||
node_ptr->mtcAlive_count,
|
||||
node_ptr->mtcAlive_gate ? "closed" : "open",
|
||||
node_ptr->mtcAlive_misses);
|
||||
node_ptr->mtcAlive_misses);
|
||||
mem_log (str);
|
||||
}
|
||||
|
||||
void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .",
|
||||
node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .",
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .",
|
||||
|
@ -8987,6 +9144,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
|
|||
mem_log (str);
|
||||
}
|
||||
|
||||
/* Write the node's active_alarms string to the memory log, prefixed with
 * its hostname. Nothing is logged when active_alarms is empty. */
void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
if ( ! node_ptr->active_alarms.empty() )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
/* snprintf bounds the formatted line to MAX_MEM_LOG_DATA bytes */
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->active_alarms.c_str());
|
||||
mem_log (str);
|
||||
}
|
||||
}
|
||||
|
||||
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
|
@ -9037,8 +9206,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mac.c_str(),
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mac.c_str(),
|
||||
node_ptr->ip.c_str(),
|
||||
node_ptr->clstr_ip.c_str(),
|
||||
node_ptr->uptime );
|
||||
|
@ -9050,11 +9219,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
|
|||
char str[MAX_MEM_LOG_DATA] ;
|
||||
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
|
||||
{
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str (iface),
|
||||
node_ptr->hbs_minor[iface] ? "true " : "false",
|
||||
node_ptr->hbs_degrade[iface] ? "true " : "false",
|
||||
node_ptr->hbs_minor[iface] ? "true " : "false",
|
||||
node_ptr->hbs_degrade[iface] ? "true " : "false",
|
||||
node_ptr->hbs_failure[iface] ? "true " : "false",
|
||||
node_ptr->monitor[iface] ? "YES" : "no" );
|
||||
mem_log (str);
|
||||
|
@ -9083,8 +9252,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
|
|||
void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
|
||||
node_ptr->oos_test_count,
|
||||
get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
|
||||
|
@ -9117,7 +9286,7 @@ void nodeLinkClass::mem_log_type_info ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->function);
|
||||
mem_log (str);
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tSub-Function: %s (%u) (SubFunc Enabled:%c)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
|
@ -9156,6 +9325,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
|
|||
// mem_log_reset_info ( node_ptr );
|
||||
mem_log_power_info ( node_ptr );
|
||||
mem_log_alarm1 ( node_ptr );
|
||||
mem_log_alarm2 ( node_ptr );
|
||||
mem_log_mtcalive ( node_ptr );
|
||||
mem_log_stage ( node_ptr );
|
||||
mem_log_bm ( node_ptr );
|
||||
|
|
|
@ -76,11 +76,11 @@ using namespace std;
|
|||
#define LARGE_SYSTEM \
|
||||
( this->system_type == SYSTEM_TYPE__NORMAL )
|
||||
|
||||
#define CPE_SYSTEM \
|
||||
#define AIO_SYSTEM \
|
||||
( this->system_type != SYSTEM_TYPE__NORMAL )
|
||||
|
||||
#define SIMPLEX_CPE_SYSTEM \
|
||||
( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
#define SIMPLEX_AIO_SYSTEM \
|
||||
( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
|
||||
/**
|
||||
* @addtogroup nodeLinkClass
|
||||
|
@ -652,12 +652,12 @@ private:
|
|||
|
||||
/** @} private_monitoring_services_variables */
|
||||
|
||||
/* List of alarms and current severity */
|
||||
#define MAX_ALARMS (10)
|
||||
/* List of alarms current severity */
|
||||
EFmAlarmSeverityT alarms[MAX_ALARMS];
|
||||
|
||||
/* tracks whether the alarms for this host have been loaded already or not */
|
||||
bool alarms_loaded ;
|
||||
/* string containing active alarms and their severity
|
||||
* ... for logging purposes only */
|
||||
string active_alarms ;
|
||||
|
||||
/** true if this host has recovered before the mnfa timeout period.
|
||||
* This bool flags the graceful recovery handler that this node
|
||||
|
@ -665,8 +665,6 @@ private:
|
|||
* and uptime accordingly */
|
||||
bool mnfa_graceful_recovery ;
|
||||
|
||||
int stress_iteration ;
|
||||
|
||||
/* BMC Protocol Learning Controls and State */
|
||||
|
||||
/* specifies what BMC protocol is selected for this host
|
||||
|
@ -828,10 +826,13 @@ private:
|
|||
int oos_test_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int stress_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
int uptime_handler ( void );
|
||||
|
||||
void mtcInfo_handler ( void );
|
||||
|
||||
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
/* Starts the specified 'reset or powercycle' recovery monitor */
|
||||
|
@ -840,6 +841,9 @@ private:
|
|||
/* server specific power state query handler */
|
||||
bool (*is_poweron_handler) (string hostname, string query_response );
|
||||
|
||||
/* Audit that monitors and auto corrects alarm state mismatches */
|
||||
void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
/* Calculate the overall reset progression timeout */
|
||||
int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
|
||||
|
||||
|
@ -851,13 +855,22 @@ private:
|
|||
void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state );
|
||||
void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface );
|
||||
|
||||
/********* mtcInfo in the database ************/
|
||||
int mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value );
|
||||
string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key );
|
||||
void mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key );
|
||||
void mtcInfo_log ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
int set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info );
|
||||
|
||||
/********* mtcInfo that gets pushed out to daemons ***********/
|
||||
|
||||
|
||||
/* flag telling mtce when a mtcInfo push needs to be done */
|
||||
bool want_mtcInfo_push = false ;
|
||||
|
||||
/* performs the mtcInfo push */
|
||||
void push_mtcInfo ( void );
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : bmc_command_send
|
||||
|
@ -1192,11 +1205,11 @@ private:
|
|||
* Set to true when the autorecovery threshold is reached
|
||||
* and we want to avoid taking further autorecovery action
|
||||
* even though it may be requested. */
|
||||
bool autorecovery_disabled ;
|
||||
bool autorecovery_disabled = false ;
|
||||
|
||||
/* Set to true by fault detection methods that are
|
||||
* autorecoverable when in simplex mode. */
|
||||
bool autorecovery_enabled ;
|
||||
bool autorecovery_enabled = false ;
|
||||
|
||||
/** Tracks the number of hosts that 'are currently' in service trouble
|
||||
* wrt heartbeat (above minor threshold).
|
||||
|
@ -1292,6 +1305,7 @@ private:
|
|||
void mem_log_state1 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_state2 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_stage ( struct nodeLinkClass::node * node_ptr );
|
||||
void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );
|
||||
|
@ -1464,11 +1478,14 @@ public:
|
|||
|
||||
/***********************************************************/
|
||||
|
||||
/** Number of provisioned controllers */
|
||||
int controllers = 0 ;
|
||||
|
||||
/** Number of provisioned hosts (nodes) */
|
||||
int hosts ;
|
||||
int hosts = 0 ;
|
||||
|
||||
/* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */
|
||||
bool unlock_ready_wait ;
|
||||
bool unlock_ready_wait = false ;
|
||||
|
||||
/** Host has been deleted */
|
||||
bool host_deleted ;
|
||||
|
@ -1517,6 +1534,9 @@ public:
|
|||
/** Return the number of inventoried hosts */
|
||||
int num_hosts ( void );
|
||||
|
||||
/** Return the number of inventoried controllers */
|
||||
int num_controllers ( void );
|
||||
|
||||
/** **********************************************************************
|
||||
*
|
||||
* Name : nodeLinkClass::workQueue_enqueue
|
||||
|
@ -1664,6 +1684,9 @@ public:
|
|||
/* Clear heartbeat failed flag for all interfaces */
|
||||
void manage_heartbeat_clear ( string hostname, iface_enum iface );
|
||||
|
||||
/* Build a json dictionary containing code-specified maintenance info */
|
||||
string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code );
|
||||
|
||||
/** Test and Debug Members and Variables */
|
||||
|
||||
/** Print node info banner */
|
||||
|
@ -1752,6 +1775,7 @@ public:
|
|||
#define MTC_FLAG__I_AM_LOCKED (0x00000008)
|
||||
*/
|
||||
void set_mtce_flags ( string hostname, int flags, int iface );
|
||||
int get_mtce_flags ( string & hostname );
|
||||
|
||||
/** Updates the node's health code
|
||||
* Codes are found in nodeBase.h
|
||||
|
@ -1789,6 +1813,7 @@ public:
|
|||
|
||||
string get_bm_ip ( string hostname );
|
||||
string get_bm_un ( string hostname );
|
||||
string get_bm_pw ( string hostname );
|
||||
string get_bm_type ( string hostname );
|
||||
|
||||
string get_hostname_from_bm_ip ( string bm_ip );
|
||||
|
|
|
@ -1,15 +1,19 @@
|
|||
#daily
|
||||
nodateext
|
||||
#
|
||||
# Copyright (c) 2015-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/fsmond.log
|
||||
{
|
||||
size 10M
|
||||
create 0640 root root
|
||||
start 1
|
||||
missingok
|
||||
size 10M
|
||||
rotate 20
|
||||
compress
|
||||
sharedscripts
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l
|
|||
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
|
||||
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
|
||||
|
||||
CCFLAGS = -g -O2 -Wall -Wextra -Werror
|
||||
CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11
|
||||
|
||||
STATIC_ANALYSIS_TOOL = cppcheck
|
||||
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)
|
||||
|
|
|
@ -1381,6 +1381,7 @@ int daemon_init ( string iface, string nodetype )
|
|||
hbs_ctrl.locked = true ;
|
||||
}
|
||||
|
||||
|
||||
daemon_init_fit();
|
||||
return (rc);
|
||||
}
|
||||
|
@ -1521,6 +1522,7 @@ void hbs_sm_handler ( void )
|
|||
* False if time delta is greater
|
||||
*
|
||||
***************************************************************************/
|
||||
#define HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES (10000)
|
||||
bool manage_sm_heartbeat ( void )
|
||||
{
|
||||
struct timespec ts ;
|
||||
|
@ -1532,8 +1534,9 @@ bool manage_sm_heartbeat ( void )
|
|||
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
|
||||
{
|
||||
sm_heartbeat_count = 0;
|
||||
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
|
||||
(!( sm_heartbeat_count_b2b_misses % 100 )))
|
||||
if ((( ++sm_heartbeat_count_b2b_misses < 20 ) ||
|
||||
(!( sm_heartbeat_count_b2b_misses % 1000 ))) &&
|
||||
( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
|
||||
{
|
||||
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
|
||||
delta.secs, delta.msecs,
|
||||
|
@ -1817,6 +1820,10 @@ void daemon_service_run ( void )
|
|||
inv.name = hbsInv.my_hostname ;
|
||||
inv.nodetype = CONTROLLER_TYPE ;
|
||||
hbsInv.add_heartbeat_host ( inv );
|
||||
|
||||
/* add this host to local inventory */
|
||||
hostname_inventory.push_front(hbsInv.my_hostname);
|
||||
ilog ("%s added to inventory (self)", hbsInv.my_hostname.c_str());
|
||||
}
|
||||
|
||||
/* enable the base level signal handler latency monitor */
|
||||
|
@ -1841,7 +1848,7 @@ void daemon_service_run ( void )
|
|||
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
|
||||
|
||||
/* no need for the heartbeat audit in a simplex system */
|
||||
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
/* start the state audit */
|
||||
/* run the first audit in 30 seconds */
|
||||
|
@ -2056,7 +2063,7 @@ void daemon_service_run ( void )
|
|||
hbsInv.active_controller ? "" : "in" );
|
||||
|
||||
/* no need for the heartbeat audit in a simplex system */
|
||||
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( hbsInv.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
/* Due to activity state change we will dump
|
||||
* the heartbeat cluster state at now time
|
||||
|
@ -2074,6 +2081,7 @@ void daemon_service_run ( void )
|
|||
inv.nodetype = msg.parm[0];
|
||||
hbsInv.add_heartbeat_host ( inv ) ;
|
||||
hostname_inventory.push_back ( inv.name );
|
||||
hostname_inventory.unique(); // avoid duplicates
|
||||
ilog ("%s added to heartbeat service (%d)\n",
|
||||
inv.name.c_str(),
|
||||
inv.nodetype);
|
||||
|
@ -2119,7 +2127,7 @@ void daemon_service_run ( void )
|
|||
{
|
||||
if ( hostname != hbsInv.my_hostname )
|
||||
{
|
||||
hbsInv.mon_host ( hostname, false, true );
|
||||
hbsInv.mon_host ( hostname, false, false );
|
||||
hbs_cluster_del ( hostname );
|
||||
ilog ("%s heartbeat service disabled by stop command",
|
||||
hostname.c_str());
|
||||
|
@ -2366,6 +2374,7 @@ void daemon_service_run ( void )
|
|||
arrival_histogram[iface] = "" ;
|
||||
unexpected_pulse_list[iface] = "" ;
|
||||
|
||||
|
||||
rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
|
||||
if ( rc != 0 )
|
||||
{
|
||||
|
@ -2523,7 +2532,9 @@ void daemon_service_run ( void )
|
|||
}
|
||||
}
|
||||
/* log cluster throttled */
|
||||
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
|
||||
if ((( heartbeat_ok == false ) &&
|
||||
( !( sm_heartbeat_count_b2b_misses % 1000 ))) &&
|
||||
( sm_heartbeat_count_b2b_misses < HUGE_NUMBER_B2B_SM_HEARTBEAT_MISSES ))
|
||||
{
|
||||
hbs_state_audit ( );
|
||||
}
|
||||
|
|
|
@ -326,7 +326,7 @@ void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, stri
|
|||
void hbs_sm_handler ( void );
|
||||
|
||||
/* send the cluster vault to SM */
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
||||
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
|
||||
|
||||
/* copy cluster data from src to dst */
|
||||
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
||||
|
@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
|||
/* Heartbeat service state audit */
|
||||
void hbs_state_audit ( void );
|
||||
|
||||
/* Send state change message to SM if there has been a
|
||||
* state change in the last period */
|
||||
void hbs_cluster_change_notifier ( void );
|
||||
|
||||
/**
|
||||
* @} hbs_base
|
||||
*/
|
||||
|
|
|
@ -69,6 +69,8 @@ typedef struct
|
|||
|
||||
msgClassSock * sm_socket_ptr ;
|
||||
|
||||
string cluster_change_reason ;
|
||||
|
||||
} hbs_cluster_ctrl_type ;
|
||||
|
||||
/* Cluster control structure construct allocation. */
|
||||
|
@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
|
|||
{
|
||||
ctrl.sm_socket_ptr = sm_socket_ptr ;
|
||||
}
|
||||
ctrl.cluster_change_reason = "";
|
||||
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
|
||||
|
@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
|
|||
|
||||
void hbs_cluster_change ( string cluster_change_reason )
|
||||
{
|
||||
hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
|
||||
ilog ("reason: %s", cluster_change_reason.c_str());
|
||||
if ( ctrl.cluster_change_reason.empty() )
|
||||
ctrl.cluster_change_reason = cluster_change_reason ;
|
||||
else
|
||||
ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : hbs_cluster_change_notifier
|
||||
*
|
||||
* Description : Send SM the cluster info if there has been a state change.
|
||||
*
|
||||
***************************************************************************/
|
||||
void hbs_cluster_change_notifier ( void )
|
||||
{
|
||||
if ( ! ctrl.cluster_change_reason.empty () )
|
||||
{
|
||||
if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
|
||||
ctrl.cluster_change_reason ) == PASS )
|
||||
{
|
||||
ctrl.cluster_change_reason.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
|
||||
"Unable to store history beyond %d ",
|
||||
ctrl.cluster.histories );
|
||||
hbs_cluster_change_notifier ();
|
||||
return ;
|
||||
}
|
||||
else
|
||||
|
@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
|
|||
else
|
||||
history_ptr->oldest_entry_index++ ;
|
||||
|
||||
hbs_cluster_change_notifier ();
|
||||
|
||||
/* clear the log throttle if we are updating history ok. */
|
||||
ctrl.log_throttle = 0 ;
|
||||
}
|
||||
|
@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
|
|||
*
|
||||
***************************************************************************/
|
||||
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
||||
int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
|
||||
{
|
||||
int rc = FAIL_SOCKET_SENDTO ;
|
||||
ctrl.cluster.reqid = (unsigned short)reqid ;
|
||||
if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
|
||||
{
|
||||
ilog ("cluster state notification Reason: %s", reason.c_str());
|
||||
int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
|
||||
int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
|
||||
if ( bytes <= 0 )
|
||||
|
@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
|
|||
elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
|
||||
bytes , errno, strerror(errno));
|
||||
}
|
||||
hbs_cluster_dump ( ctrl.cluster );
|
||||
else
|
||||
{
|
||||
/* limit the string length */
|
||||
ilog ("reason: %s", reason.substr(0,80).c_str());
|
||||
hbs_cluster_dump ( ctrl.cluster );
|
||||
rc = PASS ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("cannot send cluster info due to socket error");
|
||||
}
|
||||
return(rc);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
|
|||
{
|
||||
if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
|
||||
{
|
||||
hbs_cluster_change ("peer controller cluster event " +
|
||||
hbs_cluster_change ("peer cluster delta " +
|
||||
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
|
||||
}
|
||||
|
||||
|
|
|
@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void )
|
|||
void nodeLinkClass::mnfa_exit ( bool force )
|
||||
{ force = force ; }
|
||||
|
||||
int send_mtc_cmd ( string & hostname, int cmd, int interface )
|
||||
{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; }
|
||||
int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict)
|
||||
{
|
||||
UNUSED(hostname);
|
||||
UNUSED(cmd);
|
||||
UNUSED(interface);
|
||||
UNUSED(json_dict);
|
||||
return PASS ;
|
||||
}
|
||||
|
||||
int nodeLinkClass::mtcInvApi_subf_states ( string hostname,
|
||||
string oper_subf,
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
#daily
|
||||
nodateext
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/hostwd.log
|
||||
{
|
||||
nodateext
|
||||
size 10M
|
||||
create 0640 root root
|
||||
start 1
|
||||
missingok
|
||||
size 10M
|
||||
rotate 20
|
||||
compress
|
||||
sharedscripts
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -254,7 +254,7 @@ void hwmonGroup_init ( string & hostname , struct sensor_group_type * group_ptr
|
|||
group_ptr->actions_critical_choices.append(HWMON_ACTION_ALARM);
|
||||
|
||||
/* Don't support reset and power cycle in AIO simplex mode */
|
||||
if ( obj_ptr->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( obj_ptr->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
group_ptr->actions_critical_choices.append(",");
|
||||
group_ptr->actions_critical_choices.append(HWMON_ACTION_RESET);
|
||||
|
|
|
@ -964,6 +964,10 @@ static int _parse_redfish_sensor_data( char * json_str_ptr, thread_info_type * i
|
|||
{
|
||||
strcpy(_sample_list[samples].status, "cr");
|
||||
}
|
||||
else if (!strcmp (health.data(), REDFISH_SEVERITY__NONRECOVERABLE ))
|
||||
{
|
||||
strcpy(_sample_list[samples].status, "nr");
|
||||
}
|
||||
else
|
||||
{
|
||||
strcpy(_sample_list[samples].status, "na");
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#define REDFISH_SEVERITY__GOOD "OK"
|
||||
#define REDFISH_SEVERITY__MAJOR "Warning"
|
||||
#define REDFISH_SEVERITY__CRITICAL "Critical"
|
||||
#define REDFISH_SEVERITY__NONRECOVERABLE "NonRecoverable"
|
||||
|
||||
#define BMC_SENSOR_DEFAULT_UNIT_TYPE_TEMP "degrees"
|
||||
#define BMC_SENSOR_DEFAULT_UNIT_TYPE_VOLT "Volts"
|
||||
|
|
|
@ -1,28 +1,21 @@
|
|||
#daily
|
||||
nodateext
|
||||
start 1
|
||||
missingok
|
||||
notifempty
|
||||
compress
|
||||
sharedscripts
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/hwmond.log
|
||||
{
|
||||
size 50M
|
||||
rotate 5
|
||||
}
|
||||
|
||||
/var/log/hwmond_event.log
|
||||
{
|
||||
size 50M
|
||||
rotate 5
|
||||
}
|
||||
|
||||
/var/log/hwmond_api.log
|
||||
{
|
||||
create 0640 root root
|
||||
start 1
|
||||
size 50M
|
||||
rotate 5
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
#daily
|
||||
nodateext
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/lmond.log
|
||||
{
|
||||
nodateext
|
||||
size 10M
|
||||
create 0640 root root
|
||||
start 1
|
||||
missingok
|
||||
size 10M
|
||||
rotate 20
|
||||
compress
|
||||
sharedscripts
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient
|
|||
LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid
|
||||
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
|
||||
INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public
|
||||
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces
|
||||
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11
|
||||
|
||||
STATIC_ANALYSIS_TOOL = cppcheck
|
||||
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)
|
||||
|
|
|
@ -26,6 +26,7 @@ using namespace std;
|
|||
#include "daemon_common.h" /* */
|
||||
|
||||
#include "nodeBase.h" /* */
|
||||
#include "nodeClass.h" /* */
|
||||
#include "nodeTimers.h" /* */
|
||||
#include "nodeUtil.h" /* */
|
||||
#include "mtcAlarm.h" /* for ... this module header */
|
||||
|
@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
|
|||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : mtcAlarm_audit
|
||||
*
|
||||
* Purpose : Monitor and Auto-Correct maintenance alarms
|
||||
*
|
||||
* Description: Query locked state alarm (raw)
|
||||
* if successful
|
||||
* - Query alarms
|
||||
* - compare to running state
|
||||
* - correct mismatches ; internal state takes precedence
|
||||
* - log all alarm state changes
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/*
|
||||
* Read locked state alarm directly to detect fm access failures.
|
||||
* If successful further reads are done using a wrapper utility.
|
||||
*/
|
||||
SFmAlarmDataT alarm_query ;
|
||||
AlarmFilter alarm_filter ;
|
||||
EFmErrorT rc ;
|
||||
|
||||
memset(&alarm_query, 0, sizeof(alarm_query));
|
||||
memset(&alarm_filter, 0, sizeof(alarm_filter));
|
||||
snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
|
||||
LOCK_ALARM_ID);
|
||||
snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
|
||||
ENTITY_PREFIX, node_ptr->hostname.data());
|
||||
rc = fm_get_fault ( &alarm_filter, &alarm_query );
|
||||
if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
|
||||
{
|
||||
wlog("%s alarm query failure ; code:%d",
|
||||
node_ptr->hostname.c_str(),
|
||||
rc );
|
||||
return ;
|
||||
}
|
||||
|
||||
/* With FM comms proven working let's check the other mtc alarms */
|
||||
string active_alarms = "";
|
||||
for ( int i = 0 ; i < MAX_ALARMS ; i++ )
|
||||
{
|
||||
mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
|
||||
if ( id == MTC_ALARM_ID__LOCK )
|
||||
{
|
||||
/* Unexpected severity case */
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
||||
{
|
||||
if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
|
||||
{
|
||||
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
|
||||
|
||||
wlog("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
|
||||
|
||||
}
|
||||
if (!active_alarms.empty())
|
||||
active_alarms.append(", ");
|
||||
active_alarms.append(_getIdentity(id) + ":");
|
||||
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
|
||||
}
|
||||
/* Unexpected assertion case */
|
||||
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
|
||||
{
|
||||
node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
|
||||
wlog("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
mtcAlarm_clear ( node_ptr->hostname, id );
|
||||
}
|
||||
}
|
||||
else if (( id == MTC_ALARM_ID__CONFIG ) ||
|
||||
( id == MTC_ALARM_ID__ENABLE ) ||
|
||||
( id == MTC_ALARM_ID__BM ) ||
|
||||
( id == MTC_ALARM_ID__CH_CONT) ||
|
||||
( id == MTC_ALARM_ID__CH_COMP))
|
||||
{
|
||||
EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
|
||||
if ( severity != node_ptr->alarms[id] )
|
||||
{
|
||||
ilog ("%s %s alarm mismatch ; %s -> %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
_getIdentity(id).c_str(),
|
||||
alarmUtil_getSev_str(severity).c_str(),
|
||||
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
|
||||
|
||||
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, id );
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
|
||||
}
|
||||
}
|
||||
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
if (!active_alarms.empty())
|
||||
active_alarms.append(", ");
|
||||
active_alarms.append(_getIdentity(id) + ":");
|
||||
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
|
||||
}
|
||||
}
|
||||
/* else don't care about other alarm ids ; logs events etc */
|
||||
}
|
||||
|
||||
/* manage logging of active alarms */
|
||||
if ( !active_alarms.empty() )
|
||||
{
|
||||
if ( node_ptr->active_alarms != active_alarms )
|
||||
{
|
||||
ilog ("%s active alarms: %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
active_alarms.c_str());
|
||||
|
||||
node_ptr->active_alarms = active_alarms ;
|
||||
}
|
||||
/* else
|
||||
* do nothing because there are active alarms
|
||||
* that have not changed since the last audit.
|
||||
*/
|
||||
}
|
||||
else if ( ! node_ptr->active_alarms.empty() )
|
||||
{
|
||||
/* clear active alarm list since there 'were' active alarms
|
||||
* but there are no longer active alarms */
|
||||
node_ptr->active_alarms.clear();
|
||||
ilog ("%s no active alarms", node_ptr->hostname.c_str());
|
||||
}
|
||||
/* else
|
||||
* no active alarms ; don't log */
|
||||
}
|
||||
|
||||
/************************* A L A R M I N G **************************/
|
||||
|
||||
/* Raise the specified maintenance alarm severity */
|
||||
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
|
||||
{
|
||||
switch ( severity )
|
||||
{
|
||||
case FM_ALARM_SEVERITY_MINOR:
|
||||
return (mtcAlarm_minor(hostname,id));
|
||||
case FM_ALARM_SEVERITY_MAJOR:
|
||||
return (mtcAlarm_major(hostname,id));
|
||||
case FM_ALARM_SEVERITY_CRITICAL:
|
||||
return (mtcAlarm_critical(hostname,id));
|
||||
default:
|
||||
return (FAIL_BAD_PARM);
|
||||
}
|
||||
}
|
||||
|
||||
/* Clear the specified host's maintenance alarm */
|
||||
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
|
||||
{
|
||||
|
|
|
@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
|
|||
/** Clear the specified maintenance alarm for specific host */
|
||||
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id );
|
||||
|
||||
/** Raise specified severity level alarm for the specified host */
|
||||
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
|
||||
|
||||
/** Assert a specified mtce alarm against the specified host with a WARNING severity level */
|
||||
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id );
|
||||
|
||||
|
|
|
@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
|
|||
{
|
||||
int rc = PASS ;
|
||||
|
||||
/* handle 'kill of in-progress' thread or 'done but not consumed' thread */
|
||||
if ( ! thread_idle ( node_ptr->bmc_thread_ctrl ))
|
||||
{
|
||||
if ( ! thread_done ( node_ptr->bmc_thread_ctrl ))
|
||||
{
|
||||
thread_kill ( node_ptr->bmc_thread_ctrl,
|
||||
node_ptr->bmc_thread_info );
|
||||
return (RETRY);
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer );
|
||||
if ( thread_done_consume ( node_ptr->bmc_thread_ctrl,
|
||||
node_ptr->bmc_thread_info ) != PASS )
|
||||
{
|
||||
return (RETRY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node_ptr->bmc_thread_info.command = command ;
|
||||
|
||||
/* Update / Setup the BMC access credentials */
|
||||
|
@ -437,6 +457,13 @@ bmc_command_recv_cleanup:
|
|||
|
||||
if ( rc != RETRY )
|
||||
{
|
||||
ilog ("%s %s recv '%s' command (%s) (rc:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->bmc_thread_ctrl.name.c_str(),
|
||||
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(),
|
||||
bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(),
|
||||
rc);
|
||||
|
||||
node_ptr->bmc_thread_ctrl.done = true ;
|
||||
node_ptr->bmc_thread_ctrl.retries = 0 ;
|
||||
node_ptr->bmc_thread_ctrl.id = 0 ;
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/un.h> /* for ... unix domain sockets */
|
||||
#include <sys/un.h> /* for ... unix domain sockets */
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/socket.h>
|
||||
#include <net/if.h>
|
||||
|
@ -29,8 +29,8 @@
|
|||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <list> /* for the list of conf file names */
|
||||
|
||||
#include <list> /* for ... list of conf file names */
|
||||
#include <unistd.h> /* for ... sync */
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
@ -70,11 +70,15 @@ void stop_pmon( void )
|
|||
{
|
||||
/* max pipe command response length */
|
||||
#define PIPE_COMMAND_RESPON_LEN (100)
|
||||
|
||||
ilog("Stopping collectd.");
|
||||
int rc = system("/usr/local/sbin/pmon-stop collectd");
|
||||
sleep (2);
|
||||
ilog("Stopping pmon to prevent process recovery during shutdown");
|
||||
for ( int retry = 0 ; retry < 5 ; retry++ )
|
||||
{
|
||||
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
|
||||
int rc = system("/usr/bin/systemctl stop pmon");
|
||||
rc = system("/usr/bin/systemctl stop pmon");
|
||||
sleep(2);
|
||||
|
||||
/* confirm pmon is no longer active */
|
||||
|
@ -204,6 +208,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
|
|||
mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str());
|
||||
return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
|
||||
}
|
||||
else if ( msg.cmd == MTC_MSG_INFO )
|
||||
{
|
||||
mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str());
|
||||
load_mtcInfo_msg ( msg );
|
||||
return ( PASS ); /* no ack for this message */
|
||||
}
|
||||
else if ( msg.cmd == MTC_CMD_SYNC )
|
||||
{
|
||||
ilog ("mtc '%s' message received (%s network)\n",
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
interface_name.c_str());
|
||||
|
||||
ilog ("Sync Start");
|
||||
sync ();
|
||||
ilog ("Sync Done");
|
||||
|
||||
return ( PASS ); /* no ack for this message */
|
||||
}
|
||||
else if ( msg.cmd == MTC_MSG_LOCKED )
|
||||
{
|
||||
/* Only recreate the file if its not already present */
|
||||
|
@ -603,7 +625,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
|
|||
}
|
||||
|
||||
/** Send an event to the mtcAgent **/
|
||||
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr )
|
||||
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
|
||||
{
|
||||
mtc_message_type event ;
|
||||
|
||||
|
@ -619,6 +641,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
|
|||
/* We don't use the buffer for mtce events to remove it from the size */
|
||||
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
|
||||
}
|
||||
else if ( cmd == MTC_EVENT_MONITOR_READY )
|
||||
{
|
||||
string event_info = "{\"" ;
|
||||
event_info.append(MTC_JSON_INV_NAME);
|
||||
event_info.append("\":\"");
|
||||
event_info.append(get_hostname());
|
||||
event_info.append("\",\"");
|
||||
event_info.append(MTC_JSON_SERVICE);
|
||||
event_info.append("\":\"");
|
||||
event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
|
||||
event_info.append("\"}");
|
||||
|
||||
size_t len = event_info.length()+1 ;
|
||||
snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
|
||||
snprintf ( &event.buf[0], len, "%s", event_info.data());
|
||||
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
|
||||
ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
|
||||
}
|
||||
else if (( cmd == MTC_EVENT_AVS_CLEAR ) ||
|
||||
( cmd == MTC_EVENT_AVS_MAJOR ) ||
|
||||
( cmd == MTC_EVENT_AVS_CRITICAL ))
|
||||
|
@ -666,7 +706,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
|
|||
{
|
||||
if ( bytes == 0 )
|
||||
{
|
||||
slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd );
|
||||
slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd );
|
||||
rc = FAIL_NO_DATA ;
|
||||
}
|
||||
else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes )
|
||||
|
@ -912,15 +952,18 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
|
|||
}
|
||||
|
||||
/* Send to controller-1 cluster address */
|
||||
if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
|
||||
( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
|
||||
if ( get_ctrl_ptr()->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
|
||||
sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog("mtc_client_tx_socket_c1_clstr not ok");
|
||||
if (( sock_ptr->mtc_client_tx_socket_c1_clstr ) &&
|
||||
( sock_ptr->mtc_client_tx_socket_c1_clstr->sock_ok() == true ))
|
||||
{
|
||||
print_mtc_message ( CONTROLLER_1, MTC_CMD_TX, msg, get_iface_name_str(CLSTR_INTERFACE), false );
|
||||
sock_ptr->mtc_client_tx_socket_c1_clstr->write((char*)&msg.hdr[0], bytes ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog("mtc_client_tx_socket_c1_clstr not ok");
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -933,32 +976,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
|
|||
return (PASS) ;
|
||||
}
|
||||
|
||||
/* Accelerated Virtual Switch 'events' socket
|
||||
* - for receiving data port state change event
|
||||
* Event strings are
|
||||
*
|
||||
* {"type":"port-state", "severity":"critical|major|clear"}
|
||||
*
|
||||
* type:port-state - the provider network data port status has changed to the supplied fault severity
|
||||
*
|
||||
* severity:
|
||||
* critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services)
|
||||
* major - port has failed and is part of an aggregate with other inservice-ports (degrade only)
|
||||
* clear - port has recovered from a failed state and is operational (clear degrade, enable services)
|
||||
*
|
||||
* NOTE: The port status can transition from any of the above states to any other state.
|
||||
*
|
||||
* The neutron agent monitors the vswitch ports at a 2 second interval.
|
||||
* If a port changes link state during the polling period, it will
|
||||
* raise/clear the alarm, but now also calculates the impact of that port
|
||||
* failure on the provider network data interface.
|
||||
*
|
||||
* The overall aggregated state across all provider network interfaces will
|
||||
* be reported to maintenance when ports enter a link down or up state.
|
||||
* The agent will also periodically send the current provider network port
|
||||
* status to maintenance every 30 seconds.
|
||||
*
|
||||
*/
|
||||
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port)
|
||||
{
|
||||
mtc_message_type msg ;
|
||||
int bytes = 0 ;
|
||||
MEMSET_ZERO (msg);
|
||||
snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header());
|
||||
msg.cmd = cmd ;
|
||||
|
||||
switch ( cmd )
|
||||
{
|
||||
case MTC_CMD_SYNC:
|
||||
{
|
||||
ilog ("Sending '%s' command to %s:%s:%d",
|
||||
get_mtcNodeCommand_str(cmd),
|
||||
hostname.c_str(),
|
||||
address.c_str(), port);
|
||||
|
||||
msg.num = 0 ;
|
||||
|
||||
/* buffer not used in this message */
|
||||
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
|
||||
|
||||
break ;
|
||||
}
|
||||
default:
|
||||
{
|
||||
slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd );
|
||||
return (FAIL_BAD_CASE);
|
||||
}
|
||||
}
|
||||
int rc = FAIL ;
|
||||
|
||||
/* Send to the caller-specified address and port */
|
||||
if (( sock_ptr->mtc_client_tx_socket ) &&
|
||||
( sock_ptr->mtc_client_tx_socket->sock_ok() == true ))
|
||||
{
|
||||
print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false );
|
||||
rc = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes, address.data(), port ) ;
|
||||
if ( 0 >= rc )
|
||||
{
|
||||
elog("failed to send command to mtcClient (%d) (%d:%s)", rc, errno, strerror(errno));
|
||||
rc = FAIL_SOCKET_SENDTO ;
|
||||
}
|
||||
else
|
||||
rc = PASS ;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog("mtc_client_tx_socket not ok");
|
||||
rc = FAIL_BAD_STATE ;
|
||||
}
|
||||
return (rc) ;
|
||||
}
|
||||
|
||||
int mtcCompMsg_testhead ( void )
|
||||
{
|
||||
|
|
|
@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
|
|||
obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT );
|
||||
return (PASS);
|
||||
}
|
||||
else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
|
||||
{
|
||||
ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
|
||||
|
||||
/* if this ready event is from the mtcClient of a
|
||||
* controller that has valid bmc access info then
|
||||
* build the 'peer controller kill' mtcInfo and
|
||||
* send it to that mtcClient */
|
||||
if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )
|
||||
{
|
||||
string bm_pw = obj_ptr->get_bm_pw ( hostname ) ;
|
||||
if ( !bm_pw.empty() && ( bm_pw != NONE ))
|
||||
{
|
||||
string bm_un = obj_ptr->get_bm_un ( hostname ) ;
|
||||
string bm_ip = obj_ptr->get_bm_ip ( hostname ) ;
|
||||
if (( hostUtil_is_valid_username ( bm_un )) &&
|
||||
( hostUtil_is_valid_ip_addr ( bm_ip )))
|
||||
{
|
||||
send_mtc_cmd ( hostname,
|
||||
MTC_MSG_INFO,
|
||||
MGMNT_INTERFACE,
|
||||
obj_ptr->build_mtcInfo_dict (
|
||||
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO));
|
||||
}
|
||||
}
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
if ( service == MTC_SERVICE_HWMOND_NAME )
|
||||
{
|
||||
std::list<string>::iterator temp ;
|
||||
|
@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
|
|||
return (rc);
|
||||
}
|
||||
|
||||
int send_mtc_cmd ( string & hostname, int cmd , int interface )
|
||||
int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict )
|
||||
{
|
||||
int rc = FAIL ;
|
||||
bool force = false ;
|
||||
mtc_message_type mtc_cmd ;
|
||||
string data = "" ;
|
||||
mtc_socket_type * sock_ptr = get_sockPtr ();
|
||||
memset (&mtc_cmd,0,sizeof(mtc_message_type));
|
||||
|
||||
|
@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
|
|||
|
||||
switch ( cmd )
|
||||
{
|
||||
case MTC_MSG_INFO:
|
||||
{
|
||||
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
|
||||
mtc_cmd.cmd = cmd ;
|
||||
mtc_cmd.num = 0 ;
|
||||
data = "{\"mtcInfo\":" + json_dict + "}";
|
||||
ilog("%s mtc info update", hostname.c_str());
|
||||
rc = PASS ;
|
||||
break ;
|
||||
}
|
||||
case MTC_REQ_MTCALIVE:
|
||||
{
|
||||
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
|
||||
|
@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
|
|||
* Note: the minus 1 is to overwrite the null */
|
||||
snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data());
|
||||
|
||||
string data = "{\"address\":\"";
|
||||
data.append(obj_ptr->my_float_ip) ;
|
||||
data.append("\",\"interface\":\"");
|
||||
data.append(get_iface_name_str(interface));
|
||||
data.append("\"}");
|
||||
/* If data is empty then at least add where the message came from */
|
||||
if ( data.empty() )
|
||||
{
|
||||
data = "{\"address\":\"";
|
||||
data.append(obj_ptr->my_float_ip) ;
|
||||
data.append("\",\"interface\":\"");
|
||||
data.append(get_iface_name_str(interface));
|
||||
data.append("\"}");
|
||||
}
|
||||
else
|
||||
{
|
||||
; /* data is already pre loaded by the command case above */
|
||||
}
|
||||
/* copy data into message buffer */
|
||||
snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data());
|
||||
bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1)));
|
||||
|
||||
|
@ -1176,7 +1224,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY )
|
||||
{
|
||||
/* no heartbeating in simplex mode */
|
||||
if ( obj_ptr->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( obj_ptr->system_type == SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
return (PASS);
|
||||
}
|
||||
|
@ -1214,13 +1262,68 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
|||
{
|
||||
elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str());
|
||||
}
|
||||
/* Send the start event to the heartbeat service for all enabled hosts */
|
||||
/* Consider sending the 'start' request to the heartbeat service
|
||||
* for all enabled hosts. */
|
||||
if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) &&
|
||||
((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
(obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED )))
|
||||
{
|
||||
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
|
||||
/* However, bypass sending heartbeat 'start' for nodes that
|
||||
* are not ready to heartbeat; enabling, configuring, testing.
|
||||
* Such cases are if a host is:
|
||||
*
|
||||
* 1. running the add_handler or
|
||||
* 2. running the enable_handler or
|
||||
* 3. running the enable_subf_handler or
|
||||
* 4. not configured or
|
||||
* 5. not tested (goenabled not complete)
|
||||
*
|
||||
*/
|
||||
mtc_nodeAdminAction_enum current_action =
|
||||
obj_ptr->get_adminAction (hostname);
|
||||
if (( current_action != MTC_ADMIN_ACTION__ADD ) &&
|
||||
( current_action != MTC_ADMIN_ACTION__ENABLE ) &&
|
||||
( current_action != MTC_ADMIN_ACTION__ENABLE_SUBF ))
|
||||
{
|
||||
int mtce_flags = obj_ptr->get_mtce_flags(hostname);
|
||||
if (( mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) &&
|
||||
( mtce_flags & MTC_FLAG__I_AM_HEALTHY ) &&
|
||||
( mtce_flags & MTC_FLAG__MAIN_GOENABLED ))
|
||||
{
|
||||
if (( obj_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
|
||||
( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE ))
|
||||
{
|
||||
/* If its an AIO then its worker subfunction
|
||||
* needs to have been configured and tested. */
|
||||
if (( mtce_flags & MTC_FLAG__SUBF_CONFIGURED ) &&
|
||||
( mtce_flags & MTC_FLAG__SUBF_GOENABLED ))
|
||||
{
|
||||
ilog("%s heartbeat start (AIO controller)",
|
||||
hostname.c_str());
|
||||
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s not heartbeat ready (subf) (oob:%x)",
|
||||
hostname.c_str(),
|
||||
mtce_flags);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog("%s heartbeat start (from ready event)",
|
||||
hostname.c_str());
|
||||
send_hbs_command ( hostname, MTC_CMD_START_HOST, controller );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s not heartbeat ready (main) (oob:%x)",
|
||||
hostname.c_str(),
|
||||
mtce_flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ilog ("%s %s inventory push ... done",
|
||||
|
|
|
@ -974,7 +974,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
|
|||
else
|
||||
avail = " " ;
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
if ( ! oper_subf.empty() )
|
||||
{
|
||||
|
@ -1016,7 +1016,7 @@ int nodeLinkClass::mtcInvApi_update_states_now ( struct nodeLinkClass::node * no
|
|||
this->sysinvEvent.payload.erase(len-1,1);
|
||||
this->sysinvEvent.payload.append ( "]");
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
ilog ("%s %s-%s-%s %s-%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
|
|
|
@ -43,9 +43,9 @@
|
|||
#include <signal.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
//#include <syslog.h> /* for ... syslog */
|
||||
#include <sys/stat.h>
|
||||
#include <list>
|
||||
#include <json-c/json.h> /* for ... json_tokener_parse */
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
@ -56,6 +56,10 @@ using namespace std;
|
|||
#include "nodeBase.h" /* for ... Common Definitions */
|
||||
#include "nodeTimers.h" /* fpr ... Timer Service */
|
||||
#include "nodeUtil.h" /* for ... Common Utilities */
|
||||
#include "hostUtil.h" /* for ... hostUtil_is_valid_... */
|
||||
#include "jsonUtil.h" /* for ... jsonUtil_get_key_value_string */
|
||||
#include "bmcUtil.h" /* for ... bmcUtil_accessInfo_type */
|
||||
#include "ipmiUtil.h" /* for ... ipmiUtil_reset_host_now */
|
||||
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
|
||||
#include "mtcNodeMsg.h" /* for ... common maintenance messaging */
|
||||
#include "mtcNodeComp.h" /* for ... this module header */
|
||||
|
@ -96,7 +100,7 @@ string get_hostname ( void )
|
|||
* Daemon Configuration Structure - The allocated struct
|
||||
* @see daemon_common.h for daemon_config_type struct format.
|
||||
*/
|
||||
static daemon_config_type mtc_config ;
|
||||
static daemon_config_type mtc_config ;
|
||||
daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
|
||||
|
||||
/**
|
||||
|
@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
|
|||
static mtc_socket_type mtc_sock ;
|
||||
static mtc_socket_type * sock_ptr ;
|
||||
|
||||
static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"};
|
||||
static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"};
|
||||
|
||||
int run_goenabled_scripts ( string type );
|
||||
|
||||
|
@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc)
|
|||
mtcTimer_stop_int_safe ( ctrl.hostservices.timer );
|
||||
ctrl.hostservices.timer.ring = true ;
|
||||
}
|
||||
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid )
|
||||
{
|
||||
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
|
||||
mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer );
|
||||
}
|
||||
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid )
|
||||
{
|
||||
/* use auto restart */
|
||||
ctrl.peer_ctrlr_reset.audit_timer.ring = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcTimer_stop_tid_int_safe ( tid_ptr );
|
||||
|
@ -207,9 +223,8 @@ void daemon_exit ( void )
|
|||
exit (0) ;
|
||||
}
|
||||
|
||||
|
||||
/* Startup config read */
|
||||
static int mtc_config_handler ( void * user,
|
||||
static int mtc_config_handler ( void * user,
|
||||
const char * section,
|
||||
const char * name,
|
||||
const char * value)
|
||||
|
@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user,
|
|||
config_ptr->failsafe_shutdown_delay = atoi(value);
|
||||
ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay );
|
||||
}
|
||||
else
|
||||
if (( ctrl.nodetype & CONTROLLER_TYPE ) &&
|
||||
(MATCH("client", "sync_b4_peer_ctrlr_reset")))
|
||||
{
|
||||
return (PASS);
|
||||
ctrl.peer_ctrlr_reset.sync = atoi(value);
|
||||
ilog("SyncB4 Reset: %s",
|
||||
ctrl.peer_ctrlr_reset.sync ? "Yes" : "No" );
|
||||
}
|
||||
return (FAIL);
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
/* Read the mtc.ini file and load control */
|
||||
|
@ -431,7 +449,7 @@ void setup_clstr_tx_sockets ( void )
|
|||
mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok(false);
|
||||
}
|
||||
}
|
||||
if ( ctrl.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
dlog ("setup of %s TX\n", CONTROLLER_1_CLUSTER_HOST);
|
||||
|
||||
|
@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void )
|
|||
_scripts_cleanup (ctrl.active_script_set) ;
|
||||
}
|
||||
|
||||
int issue_reset_and_cleanup ( void )
|
||||
{
|
||||
int rc = FAIL ;
|
||||
const char peer_ctrlr [] = "Peer controller reset" ;
|
||||
|
||||
ilog("SM %s request", peer_ctrlr );
|
||||
/* check creds */
|
||||
if (( hostUtil_is_valid_ip_addr ( peer_controller.bm_ip ) == false ) ||
|
||||
( hostUtil_is_valid_username ( peer_controller.bm_un ) == false ) ||
|
||||
( hostUtil_is_valid_pw ( peer_controller.bm_pw ) == false ))
|
||||
{
|
||||
elog("%s cannot reset peer BMC host at %s due to invalid credentials",
|
||||
ctrl.hostname, peer_controller.bm_ip.c_str());
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* create output filename - no need to delete after operation */
|
||||
string output_filename = bmcUtil_create_data_fn ( ctrl.hostname,
|
||||
BMC_RESET_CMD_FILE_SUFFIX,
|
||||
BMC_PROTOCOL__IPMITOOL );
|
||||
if ( output_filename.empty() )
|
||||
{
|
||||
elog("%s ; failed to create output filename", peer_ctrlr);
|
||||
rc = FAIL_STRING_EMPTY ;
|
||||
}
|
||||
else if ( ipmiUtil_reset_host_now ( ctrl.hostname,
|
||||
peer_controller,
|
||||
output_filename ) == PASS )
|
||||
{
|
||||
string result = daemon_get_file_str ( output_filename.data() );
|
||||
ilog("%s succeeded", peer_ctrlr);
|
||||
|
||||
/* don't fail the operation if the result is unexpected ; but log it */
|
||||
if ( result.compare( IPMITOOL_POWER_RESET_RESP ) )
|
||||
{
|
||||
dlog("... but reset command output was unexpected ; %s",
|
||||
result.c_str());
|
||||
}
|
||||
rc = PASS ;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog("%s failed", peer_ctrlr);
|
||||
rc = FAIL_OPERATION ;
|
||||
}
|
||||
|
||||
if ( rc == PASS )
|
||||
{
|
||||
/* give the host a chance to reset before
|
||||
* telling SM the reset is done */
|
||||
sleep (2) ;
|
||||
|
||||
/* Don't want to remove the file if the reset was not successful */
|
||||
dlog("removing %s", RESET_PEER_NOW );
|
||||
daemon_remove_file ( RESET_PEER_NOW );
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
|
||||
/* The main service loop */
|
||||
int daemon_init ( string iface, string nodetype_str )
|
||||
|
@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str )
|
|||
ctrl.subfunction = 0 ;
|
||||
ctrl.system_type = daemon_system_type ();
|
||||
ctrl.clstr_iface_provisioned = false ;
|
||||
ctrl.peer_ctrlr_reset.sync = false ;
|
||||
|
||||
/* convert node type to integer */
|
||||
ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
|
||||
|
@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str )
|
|||
mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" );
|
||||
mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" );
|
||||
|
||||
/* initialize peer controller reset feature */
|
||||
mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" ),
|
||||
mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" ),
|
||||
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
|
||||
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
|
||||
ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ;
|
||||
|
||||
/* initialize the script group control structures */
|
||||
script_ctrl_init ( &ctrl.goenabled );
|
||||
script_ctrl_init ( &ctrl.hostservices );
|
||||
|
@ -1073,6 +1158,17 @@ void daemon_service_run ( void )
|
|||
/* Send first mtcAlive ASAP */
|
||||
mtcTimer_start ( ctrl.timer, timer_handler, 1 );
|
||||
|
||||
/* Monitor for peer controller reset requests when this
|
||||
* daemon runs on a controller */
|
||||
if ( ctrl.nodetype & CONTROLLER_TYPE )
|
||||
{
|
||||
mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer,
|
||||
timer_handler,
|
||||
ctrl.peer_ctrlr_reset.audit_period );
|
||||
}
|
||||
|
||||
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
|
||||
|
||||
/* lets go select so that the sock does not go crazy */
|
||||
dlog ("%s running main loop with %d msecs socket timeout\n",
|
||||
&ctrl.hostname[0], (SOCKET_WAIT/1000) );
|
||||
|
@ -1305,8 +1401,20 @@ void daemon_service_run ( void )
|
|||
socket_reinit = true ;
|
||||
}
|
||||
|
||||
/* Clstr Tx */
|
||||
else if (( ctrl.clstr_iface_provisioned == true ) &&
|
||||
/* Clstr Tx ; AIO SX */
|
||||
else if ((ctrl.system_type == SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
|
||||
( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false )))
|
||||
{
|
||||
wlog ("calling setup_clstr_tx_sockets (auto-recovery)\n");
|
||||
setup_clstr_tx_sockets();
|
||||
socket_reinit = true ;
|
||||
}
|
||||
|
||||
/* Clstr Tx ; not AIO SX */
|
||||
else if ((ctrl.system_type != SYSTEM_TYPE__AIO__SIMPLEX) &&
|
||||
( ctrl.clstr_iface_provisioned == true ) &&
|
||||
(( mtc_sock.mtc_client_tx_socket_c0_clstr == NULL ) ||
|
||||
( mtc_sock.mtc_client_tx_socket_c1_clstr == NULL ) ||
|
||||
( mtc_sock.mtc_client_tx_socket_c0_clstr->sock_ok() == false ) ||
|
||||
|
@ -1384,7 +1492,51 @@ void daemon_service_run ( void )
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* service controller specific audits */
|
||||
if ( ctrl.nodetype & CONTROLLER_TYPE )
|
||||
{
|
||||
/* peer controller reset service audit */
|
||||
if ( ctrl.peer_ctrlr_reset.audit_timer.ring )
|
||||
{
|
||||
if ( daemon_is_file_present ( RESET_PEER_NOW ) )
|
||||
{
|
||||
if ( ctrl.peer_ctrlr_reset.sync )
|
||||
{
|
||||
if ( ctrl.peer_ctrlr_reset.sync_timer.ring )
|
||||
{
|
||||
issue_reset_and_cleanup ();
|
||||
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
|
||||
}
|
||||
else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL )
|
||||
{
|
||||
if ( send_mtcClient_cmd ( &mtc_sock,
|
||||
MTC_CMD_SYNC,
|
||||
peer_controller.hostname,
|
||||
peer_controller.host_ip,
|
||||
mtc_config.mtc_rx_mgmnt_port) == PASS )
|
||||
{
|
||||
mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 );
|
||||
ilog("... waiting for peer controller to sync - %d secs", MTC_SECS_10);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog("failed to send 'sync' command to peer controller mtcClient");
|
||||
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
; /* wait longer */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
issue_reset_and_cleanup ();
|
||||
}
|
||||
}
|
||||
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
|
||||
}
|
||||
}
|
||||
daemon_signal_hdlr ();
|
||||
}
|
||||
daemon_exit();
|
||||
|
@ -1573,7 +1725,7 @@ int run_hostservices_scripts ( unsigned int cmd )
|
|||
|
||||
|
||||
/* For the stop command we need the mtcClient to run both controller and
|
||||
* worker stop services if we are on a CPE system.
|
||||
* worker stop services if we are on an AIO system.
|
||||
* This saves the mtcAgent from having to issue and manage 2 commands,
|
||||
* one for controller and 1 for worker */
|
||||
if ( ctrl.system_type != SYSTEM_TYPE__NORMAL )
|
||||
|
@ -1750,7 +1902,6 @@ void daemon_sigchld_hdlr ( void )
|
|||
}
|
||||
default:
|
||||
{
|
||||
wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set );
|
||||
return ;
|
||||
}
|
||||
}
|
||||
|
@ -1820,6 +1971,84 @@ void daemon_sigchld_hdlr ( void )
|
|||
}
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
*
|
||||
* Name : load_mtcInfo_msg
|
||||
*
|
||||
* Description: Extract the mtc info from the MTC_MSG_INFO message.
|
||||
*
|
||||
* Assumptions: So far only the peer controller reset feature uses this.
|
||||
*
|
||||
* Returns : Nothing
|
||||
*
|
||||
***************************************************************************/
|
||||
|
||||
void load_mtcInfo_msg ( mtc_message_type & msg )
{
    /* mtcInfo is only expected on controllers */
    if ( ( ctrl.nodetype & CONTROLLER_TYPE ) == 0 )
    {
        slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname);
        return ;
    }

    const char * payload = &msg.buf[0] ;

    /* NOTE(review): the raw payload is logged here and again on each of the
     * warning paths below ; it looks like it can carry the 'bm_pw' value
     * that the info log further down deliberately hides - confirm whether
     * the raw buffer should be kept out of the logs. */
    mlog1("%s", payload);

    struct json_object * root_obj = json_tokener_parse( payload );
    if ( root_obj == NULL )
    {
        wlog("message buffer tokenize error ; %s", payload);
        return ;
    }

    /* the peer is whichever of the two controllers this host is not */
    peer_controller.hostname =
        strcmp(&ctrl.hostname[0], CONTROLLER_0 ) ? CONTROLLER_0 : CONTROLLER_1 ;

    /* outer level ; the "mtcInfo" label */
    struct json_object * info_obj = NULL ;
    json_bool info_rc = json_object_object_get_ex( root_obj,
                                                   "mtcInfo",
                                                   &info_obj );
    if ( ( info_rc != TRUE ) || ( info_obj == NULL ) )
    {
        wlog("mtcInfo label parse error (rc:%d) ; %s",
                  info_rc, payload);
    }
    else
    {
        /* inner level ; the object labelled with the peer's hostname */
        struct json_object * peer_obj = NULL ;
        json_bool peer_rc =
            json_object_object_get_ex( info_obj,
                                       peer_controller.hostname.data(),
                                       &peer_obj );

        if ( ( peer_rc != TRUE ) || ( peer_obj == NULL ) )
        {
            wlog("peer mtcInfo missing (rc:%d) ; %s",
                      peer_rc, payload);
        }
        else
        {
            peer_controller.host_ip = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_HOSTIP) ;
            peer_controller.bm_ip   = jsonUtil_get_key_value_string(peer_obj, MTC_JSON_INV_BMIP) ;
            peer_controller.bm_un   = jsonUtil_get_key_value_string(peer_obj, "bm_un");
            peer_controller.bm_pw   = jsonUtil_get_key_value_string(peer_obj, "bm_pw");

            /* report the loaded bmc info without the password itself ;
             * only show whether it validates ('ok') or not ('none') */
            ilog ("%s is my peer [host:%s bmc:%s:%s:%s]",
                      peer_controller.hostname.c_str(),
                      peer_controller.host_ip.c_str(),
                      peer_controller.bm_ip.c_str(),
                      peer_controller.bm_un.c_str(),
                      hostUtil_is_valid_pw(peer_controller.bm_pw) ? "ok":"none");
        }
    }

    /* drop the reference taken by the tokener on the parsed tree */
    json_object_put(root_obj);
}
|
||||
|
||||
|
||||
/* Push daemon state to log file */
|
||||
void daemon_dump_info ( void )
|
||||
{
|
||||
|
@ -1853,13 +2082,13 @@ int daemon_run_testhead ( void )
|
|||
* STAGE 1: some test
|
||||
************************************************/
|
||||
printf ( "| Test %d : Maintenance Service Test ............. ", stage );
|
||||
if ( rc != PASS )
|
||||
if ( rc != PASS )
|
||||
{
|
||||
FAILED_STR ;
|
||||
rc = FAIL ;
|
||||
}
|
||||
else
|
||||
PASSED ;
|
||||
PASSED ;
|
||||
|
||||
printf ("+---------------------------------------------------------+\n");
|
||||
return PASS ;
|
||||
|
|
|
@ -17,6 +17,10 @@
|
|||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include "nodeTimers.h" /* for ... Timer Service */
|
||||
|
||||
/** Compute Config mask */
|
||||
#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\
|
||||
CONFIG_CLIENT_MTC_MGMNT_PORT |\
|
||||
|
@ -59,6 +63,22 @@ typedef struct
|
|||
} script_ctrl_type ;
|
||||
void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );
|
||||
|
||||
/* peer controller reset control structure and associated definitions */
|
||||
|
||||
/* This is a flag file set by SM when SM wants maintenance to perform a
|
||||
* BMC reset of the other (peer) controller */
|
||||
#define RESET_PEER_NOW "/var/run/.sm_reset_peer"
|
||||
|
||||
#define PEER_CTRLR_AUDIT_PERIOD (2)
|
||||
typedef struct
|
||||
{
|
||||
struct
|
||||
mtc_timer sync_timer ;
|
||||
mtc_timer audit_timer ;
|
||||
int audit_period ;
|
||||
bool sync ;
|
||||
} peer_ctrlr_reset_type ;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char hostname [MAX_HOST_NAME_SIZE+1];
|
||||
|
@ -76,7 +96,7 @@ typedef struct
|
|||
unsigned int function ;
|
||||
unsigned int subfunction ;
|
||||
|
||||
struct mtc_timer timer ; /* mtcAlive timer */
|
||||
struct mtc_timer timer ; /* mtcAlive timer */
|
||||
|
||||
bool clstr_iface_provisioned ;
|
||||
|
||||
|
@ -102,6 +122,7 @@ typedef struct
|
|||
/* Where to send events */
|
||||
string mtcAgent_ip ;
|
||||
|
||||
peer_ctrlr_reset_type peer_ctrlr_reset;
|
||||
} ctrl_type ;
|
||||
|
||||
ctrl_type * get_ctrl_ptr ( void );
|
||||
|
@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void );
|
|||
bool is_subfunction_worker ( void );
|
||||
int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
|
||||
int run_hostservices_scripts ( unsigned int cmd );
|
||||
void load_mtcInfo_msg ( mtc_message_type & msg );
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1187,15 +1187,6 @@ int _self_provision ( void )
|
|||
|
||||
if ( my_identity.name == record_info.name )
|
||||
{
|
||||
/* If the active controller was 'locked' and is being auto-corrected
|
||||
* to 'unlocked' then ensure that there is no locked alarm set for it */
|
||||
if ( record_info.admin != "locked" )
|
||||
{
|
||||
mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
|
||||
/* this is not required because its already inited to clear */
|
||||
// node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
|
||||
}
|
||||
|
||||
if ( my_identity.mac != record_info.mac )
|
||||
{
|
||||
wlog ("%s mac address mismatch (%s - %s)\n",
|
||||
|
@ -1326,6 +1317,7 @@ void nodeLinkClass::fsm ( void )
|
|||
daemon_signal_hdlr ();
|
||||
mtcHttpSvr_look ( mtce_event );
|
||||
}
|
||||
mtcInv.mtcInfo_handler();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1515,9 +1507,9 @@ void daemon_service_run ( void )
|
|||
|
||||
if ( ts.tv_sec < MTC_MINS_15 )
|
||||
{
|
||||
/* CPE DOR window is much greater in CPE since heartbeat
|
||||
* cannot start until the inactive CPE has run both manifests */
|
||||
int timeout = DEFAULT_DOR_MODE_CPE_TIMEOUT ;
|
||||
/* AIO DOR window is much greater in AIO since heartbeat
|
||||
* cannot start until the inactive AIO has run both manifests */
|
||||
int timeout = DEFAULT_DOR_MODE_AIO_TIMEOUT ;
|
||||
|
||||
/* override the timeout to a smaller value for normal system */
|
||||
if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
|
||||
|
@ -1601,7 +1593,7 @@ void daemon_service_run ( void )
|
|||
if ( mtcInv.system_type == SYSTEM_TYPE__NORMAL )
|
||||
mtc_sock.waitd.tv_usec = MTCAGENT_SELECT_TIMEOUT ;
|
||||
else
|
||||
mtc_sock.waitd.tv_usec = MTCAGENT_CPE_SELECT_TIMEOUT ;
|
||||
mtc_sock.waitd.tv_usec = MTCAGENT_AIO_SELECT_TIMEOUT ;
|
||||
|
||||
/* This is used as a delay up to select_timeout */
|
||||
rc = select( socks.back()+1, &mtc_sock.readfds, NULL, NULL, &mtc_sock.waitd);
|
||||
|
|
|
@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
/* Monitor and Manage active threads */
|
||||
thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
|
||||
if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
|
||||
{
|
||||
/* do nothing while thread is being killed */
|
||||
return RETRY ;
|
||||
}
|
||||
|
||||
/* manage the host connected state and board management alarms */
|
||||
nodeLinkClass::bmc_handler ( node_ptr );
|
||||
|
@ -310,10 +315,10 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
|
||||
/****************************************************************************
|
||||
* No Op: Do nothing for this Healthy Enabled Locked CPE Simplex Host
|
||||
* No Op: Do nothing for this Healthy Enabled Locked AIO Simplex Host
|
||||
****************************************************************************
|
||||
*/
|
||||
else if (( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) &&
|
||||
else if (( this->system_type == SYSTEM_TYPE__AIO__SIMPLEX ) &&
|
||||
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
|
||||
( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ))
|
||||
{
|
||||
|
|
|
@ -481,7 +481,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
if ( node_ptr->adminAction == MTC_ADMIN_ACTION__UNLOCK )
|
||||
{
|
||||
bool aio = false ;
|
||||
if ( SIMPLEX_CPE_SYSTEM )
|
||||
if ( SIMPLEX_AIO_SYSTEM )
|
||||
aio = true ;
|
||||
else
|
||||
aio = false ;
|
||||
|
@ -525,7 +525,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
mtcInvApi_update_states_now ( node_ptr, "unlocked", "disabled" , "offline", "disabled", "offline" );
|
||||
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_CPE_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
|
||||
mtcInvApi_update_task_now ( node_ptr, aio ? MTC_TASK_AIO_SX_UNLOCK_MSG : MTC_TASK_SELF_UNLOCK_MSG );
|
||||
|
||||
wlog ("%s unlocking %s with reboot\n",
|
||||
my_hostname.c_str(),
|
||||
|
@ -546,7 +546,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
* Condition 1: While there is no in-service backup controller
|
||||
* to swact to. In this case the active controller
|
||||
* - is only degraded to avoid a system outage.
|
||||
* - the CPE subfunction is failed
|
||||
* - the AIO subfunction is failed
|
||||
* - worker SubFunction Alarm is raised
|
||||
* - Enable alarm is raised
|
||||
* - A process monitor alarm may also be raised if
|
||||
|
@ -648,7 +648,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
else
|
||||
{
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
/* Raise Critical Compute Function Alarm */
|
||||
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL );
|
||||
|
@ -661,7 +661,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->graceful_recovery_counter = 0 ;
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
node_ptr->inservice_failed_subf = true ;
|
||||
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
||||
|
@ -1358,7 +1358,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
* have a worker function and the heartbeat for those hosts
|
||||
* are started at the end of the subfunction handler. */
|
||||
if (( THIS_HOST ) ||
|
||||
(( CPE_SYSTEM ) && ( is_controller(node_ptr)) ))
|
||||
(( AIO_SYSTEM ) && ( is_controller(node_ptr)) ))
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
||||
}
|
||||
|
@ -1523,8 +1523,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
if ( is_controller(node_ptr) )
|
||||
{
|
||||
/* Defer telling SM the controller state if
|
||||
* this is a CPE and this is the only controller */
|
||||
if ( CPE_SYSTEM && ( num_controllers_enabled() > 0 ))
|
||||
* this is an AIO and this is the only controller */
|
||||
if ( AIO_SYSTEM && ( num_controllers_enabled() > 0 ))
|
||||
{
|
||||
wlog ("%s deferring SM enable notification till subfunction-enable complete\n",
|
||||
node_ptr->hostname.c_str());
|
||||
|
@ -1555,7 +1555,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr)))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr)))
|
||||
{
|
||||
ilog ("%s running worker sub-function enable handler\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF );
|
||||
|
@ -1637,9 +1637,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->http_retries_cur = 0 ;
|
||||
node_ptr->unknown_health_reported = false ;
|
||||
|
||||
plog ("%s %sGraceful Recovery (uptime was %d)\n",
|
||||
plog ("%s %sGraceful Recovery (%d) (uptime was %d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mnfa_graceful_recovery ? "MNFA " : "",
|
||||
node_ptr->graceful_recovery_counter,
|
||||
node_ptr->uptime );
|
||||
|
||||
/* Cancel any outstanding timers */
|
||||
|
@ -1660,7 +1661,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
* 2. Setting the node operational state to Disabled
|
||||
* 3. Setting the Enable action
|
||||
*/
|
||||
if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
|
||||
node_ptr->graceful_recovery_counter++ ;
|
||||
if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
|
||||
{
|
||||
/* gate off further mtcAlive messaging timme the offline
|
||||
* handler runs. This prevents stale messages from making it
|
||||
|
@ -1772,10 +1774,11 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
else if ( node_ptr->mnfa_graceful_recovery == true )
|
||||
{
|
||||
if ( node_ptr->uptime > MTC_MINS_10 )
|
||||
if ( node_ptr->uptime > MTC_MINS_15 )
|
||||
{
|
||||
/* did not reboot case */
|
||||
wlog ("%s Connectivity Recovered ; host did not reset\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s Connectivity Recovered ; host did not reset (uptime:%d)\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime);
|
||||
wlog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
||||
|
||||
|
@ -1788,7 +1791,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
else
|
||||
{
|
||||
/* did reboot case */
|
||||
wlog ("%s Connectivity Recovered ; host has reset\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s Connectivity Recovered ; host has reset (uptime:%d)\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->uptime);
|
||||
ilog ("%s ... continuing with MNFA graceful recovery\n", node_ptr->hostname.c_str());
|
||||
ilog ("%s ... without additional reboot %s\n",
|
||||
node_ptr->hostname.c_str(), node_ptr->bm_ip.empty() ? "or reset" : "" );
|
||||
|
@ -1806,12 +1810,13 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
break ;
|
||||
}
|
||||
}
|
||||
else if (( node_ptr->uptime_save ) && ( node_ptr->uptime >= node_ptr->uptime_save ))
|
||||
else if ( node_ptr->uptime > MTC_MINS_15 )
|
||||
{
|
||||
/* did not reboot case */
|
||||
wlog ("%s Connectivity Recovered ; host did not reset%s\n",
|
||||
wlog ("%s Connectivity Recovered ; host did not reset%s (uptime:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->was_dor_recovery_mode ? " (DOR)" : "" );
|
||||
node_ptr->was_dor_recovery_mode ? " (DOR)" : "",
|
||||
node_ptr->uptime);
|
||||
|
||||
wlog ("%s ... continuing with graceful recovery\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... with no affect to host services\n", node_ptr->hostname.c_str());
|
||||
|
@ -1875,7 +1880,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
MTC_OPER_STATE__DISABLED,
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
|
@ -1905,7 +1910,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
int timeout = 0 ;
|
||||
|
||||
/* Set the FSM task state to booting */
|
||||
/* Set the FSM task state to 'Graceful Recovery Wait' */
|
||||
node_ptr->uptime = 0 ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
||||
|
||||
|
@ -2266,7 +2271,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
/* The active controller would never get/be here but
|
||||
* if it did then just fall through to change state. */
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
/* Here we need to run the sub-function goenable and start
|
||||
* host services if this is the other controller in a AIO
|
||||
|
@ -2442,10 +2447,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
else /* success path */
|
||||
{
|
||||
/* allow the fsm to wait for up to 1 minute for the
|
||||
* hbsClient's ready event before starting heartbeat
|
||||
/* allow the fsm to wait for up to 'worker config timeout'
|
||||
* for the hbsClient's ready event before starting heartbeat
|
||||
* test. */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_MINS_1 );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_WORKER_CONFIG_TIMEOUT );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__HEARTBEAT_START );
|
||||
}
|
||||
break ;
|
||||
|
@ -2502,6 +2507,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
ilog ("%s heartbeating", node_ptr->hostname.c_str());
|
||||
/* if heartbeat is not working then we will
|
||||
* never get here and enable the host */
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__STATE_CHANGE );
|
||||
|
@ -2510,7 +2516,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
case MTC_RECOVERY__STATE_CHANGE:
|
||||
{
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
/* Set node as unlocked-enabled */
|
||||
subfStateChange ( node_ptr, MTC_OPER_STATE__ENABLED,
|
||||
|
@ -2555,7 +2561,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
else if ( rc == PASS )
|
||||
{
|
||||
/* Start Graceful Recovery */
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ;
|
||||
break ;
|
||||
}
|
||||
else if ( rc == FAIL_WORKQ_TIMEOUT )
|
||||
|
@ -2571,51 +2577,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
nodeLinkClass::force_full_enable ( node_ptr );
|
||||
break ;
|
||||
}
|
||||
case MTC_RECOVERY__ENABLE_START:
|
||||
case MTC_RECOVERY__ENABLE:
|
||||
{
|
||||
/* Create the recovery enable timer. This timer is short.
|
||||
* A node needs to stay enabled with the heartbeat service
|
||||
* running for a period of time before declaring it enabled */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
||||
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
|
||||
break;
|
||||
}
|
||||
case MTC_RECOVERY__ENABLE_WAIT:
|
||||
{
|
||||
/* When this timer fires the host has been up for enough time */
|
||||
if ( node_ptr->mtcTimer.ring == true )
|
||||
if ( is_controller(node_ptr) )
|
||||
{
|
||||
if ( is_controller(node_ptr) )
|
||||
if ( mtcSmgrApi_request ( node_ptr,
|
||||
CONTROLLER_ENABLED,
|
||||
SMGR_MAX_RETRIES ) != PASS )
|
||||
{
|
||||
if ( mtcSmgrApi_request ( node_ptr,
|
||||
CONTROLLER_ENABLED,
|
||||
SMGR_MAX_RETRIES ) != PASS )
|
||||
{
|
||||
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
|
||||
node_ptr->hostname.c_str());
|
||||
}
|
||||
wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n",
|
||||
node_ptr->hostname.c_str());
|
||||
}
|
||||
/* Node Has Recovered */
|
||||
node_ptr->graceful_recovery_counter = 0 ;
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
node_ptr->enabled_count++ ;
|
||||
node_ptr->http_retries_cur = 0 ;
|
||||
|
||||
doneQueue_purge ( node_ptr );
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is ENABLED" );
|
||||
}
|
||||
else
|
||||
{
|
||||
plog ("%s is ENABLED (Gracefully Recovered)\n",
|
||||
node_ptr->hostname.c_str());
|
||||
}
|
||||
alarm_enabled_clear ( node_ptr, false );
|
||||
}
|
||||
/* Node Has Recovered */
|
||||
node_ptr->graceful_recovery_counter = 0 ;
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
node_ptr->enabled_count++ ;
|
||||
node_ptr->http_retries_cur = 0 ;
|
||||
|
||||
doneQueue_purge ( node_ptr );
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
report_dor_recovery ( node_ptr , "is ENABLED" );
|
||||
}
|
||||
else
|
||||
{
|
||||
plog ("%s is ENABLED (Gracefully Recovered)\n",
|
||||
node_ptr->hostname.c_str());
|
||||
}
|
||||
alarm_enabled_clear ( node_ptr, false );
|
||||
break ;
|
||||
}
|
||||
default:
|
||||
|
@ -2783,7 +2775,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
MTC_OPER_STATE__DISABLED,
|
||||
locked_status );
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
subfStateChange ( node_ptr, MTC_OPER_STATE__DISABLED,
|
||||
locked_status );
|
||||
|
@ -3432,7 +3424,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
/* otherwise change state */
|
||||
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL,"offline" );
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
mtcInvApi_update_state(node_ptr, MTC_JSON_INV_AVAIL_SUBF,"offline" );
|
||||
}
|
||||
|
@ -3473,7 +3465,7 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->hostname.c_str());
|
||||
|
||||
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL, "online" );
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
mtcInvApi_update_state ( node_ptr, MTC_JSON_INV_AVAIL_SUBF, "online" );
|
||||
}
|
||||
|
@ -6093,7 +6085,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
mtcInfo_log(node_ptr);
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
if ( daemon_is_file_present ( CONFIG_COMPLETE_WORKER ) == false )
|
||||
{
|
||||
|
@ -6120,52 +6112,38 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
mtcInvApi_update_state ( node_ptr, "availability", "available" );
|
||||
}
|
||||
|
||||
/* handle other cases */
|
||||
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
|
||||
MTC_ALARM_ID__ENABLE);
|
||||
/* Query FM for existing Enable and Config alarm status */
|
||||
EFmAlarmSeverityT enable_alarm_severity =
|
||||
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
|
||||
EFmAlarmSeverityT config_alarm_severity =
|
||||
mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
|
||||
|
||||
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
|
||||
/* Clear generic enable alarm over process restart.
|
||||
* Will get reasserted if the cause condition still exists */
|
||||
if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
|
||||
|
||||
/* If the node is locked then the Enable alarm
|
||||
* should not be present */
|
||||
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
sev = FM_ALARM_SEVERITY_CLEAR ;
|
||||
}
|
||||
ilog ("%s found enable alarm ; clearing %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
alarmUtil_getSev_str(enable_alarm_severity).c_str());
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
|
||||
/* Manage enable alarm over process restart.
|
||||
*
|
||||
* - clear the alarm in the active controller case
|
||||
* - maintain the alarm, set degrade state in MAJOR and CRIT cases
|
||||
* - clear alarm for all other severities.
|
||||
*/
|
||||
if ( THIS_HOST )
|
||||
/* The config alarm is maintained if it exists.
|
||||
* The in-service test handler will clear the alarm
|
||||
* if the config failure is gone */
|
||||
if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
|
||||
( sev == FM_ALARM_SEVERITY_MAJOR ))
|
||||
{
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
|
||||
}
|
||||
else if ( sev != FM_ALARM_SEVERITY_CLEAR )
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
}
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
|
||||
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
|
||||
ilog ("%s found config alarm ; loaded %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
alarmUtil_getSev_str(config_alarm_severity).c_str());
|
||||
}
|
||||
|
||||
if ( is_controller(node_ptr) )
|
||||
{
|
||||
this->controllers++ ;
|
||||
|
||||
mtc_cmd_enum state = CONTROLLER_DISABLED ;
|
||||
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
|
@ -6199,7 +6177,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
|
||||
|
||||
/* Work Around for issue: */
|
||||
mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
|
||||
|
@ -6233,7 +6210,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
|
||||
}
|
||||
}
|
||||
|
||||
if ( daemon_get_cfg_ptr()->debug_level & 1 )
|
||||
nodeLinkClass::host_print (node_ptr);
|
||||
|
||||
|
@ -6290,6 +6266,40 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
break ;
|
||||
}
|
||||
/* Handle catching and recovering/restoring hosts that might
|
||||
* have been in the Graceful Recovery Wait state.
|
||||
*
|
||||
* Prevents an extra reboot for hosts that might be in
|
||||
* Graceful Recovery over a maintenance process restart. */
|
||||
else if (( NOT_THIS_HOST ) &&
|
||||
( !node_ptr->task.compare(MTC_TASK_RECOVERY_WAIT)))
|
||||
{
|
||||
ilog ("%s is in %s ; restoring state",
|
||||
node_ptr->hostname.c_str(),
|
||||
MTC_TASK_RECOVERY_WAIT);
|
||||
|
||||
/* Complete necessary add operations before switching
|
||||
* to Recovery */
|
||||
LOAD_NODETYPE_TIMERS ;
|
||||
workQueue_purge ( node_ptr );
|
||||
if (( hostUtil_is_valid_bm_type ( node_ptr->bm_type )) &&
|
||||
( hostUtil_is_valid_ip_addr ( node_ptr->bm_ip )) &&
|
||||
( hostUtil_is_valid_username ( node_ptr->bm_un )))
|
||||
{
|
||||
set_bm_prov ( node_ptr, true ) ;
|
||||
}
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
node_ptr->addStage = MTC_ADD__START;
|
||||
|
||||
/* Switch into recovery_handler's Graceful Recovery Wait
|
||||
* state with the Graceful Recovery Wait timeout */
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
|
||||
node_ptr->mtcalive_timeout );
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__MTCALIVE_WAIT );
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( is_controller(node_ptr) )
|
||||
|
@ -6354,7 +6364,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
|
||||
|
||||
if ( ( CPE_SYSTEM ) || ( is_worker (node_ptr) == true ))
|
||||
if ( ( AIO_SYSTEM ) || ( is_worker (node_ptr) == true ))
|
||||
{
|
||||
send_guest_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
|
||||
}
|
||||
|
@ -6368,6 +6378,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
case MTC_ADD__WORKQUEUE_WAIT:
|
||||
{
|
||||
|
||||
rc = workQueue_done ( node_ptr );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
|
@ -6393,11 +6404,11 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
/* start the heartbeat service in all cases except for
|
||||
* THIS host and CPE controller hosts */
|
||||
* THIS host and AIO controller hosts */
|
||||
if ( NOT_THIS_HOST )
|
||||
{
|
||||
if (( LARGE_SYSTEM ) ||
|
||||
(( CPE_SYSTEM ) && ( this->dor_mode_active == false )))
|
||||
(( AIO_SYSTEM ) && ( this->dor_mode_active == false )))
|
||||
{
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
|
@ -6430,7 +6441,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->configAction = MTC_CONFIG_ACTION__INSTALL_PASSWD ;
|
||||
}
|
||||
|
||||
if (( ! SIMPLEX_CPE_SYSTEM ) &&
|
||||
if (( ! SIMPLEX_AIO_SYSTEM ) &&
|
||||
( node_ptr->bmc_provisioned == true ))
|
||||
{
|
||||
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__BM );
|
||||
|
@ -6438,7 +6449,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
|
||||
/* Special Add handling for the AIO system */
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
|
@ -6455,6 +6466,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
|
||||
node_ptr->addStage = MTC_ADD__START;
|
||||
|
||||
plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
|
||||
node_ptr->add_completed = true ;
|
||||
break ;
|
||||
|
@ -6635,6 +6647,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR );
|
||||
node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ;
|
||||
}
|
||||
/* store mtcInfo, which specifies the selected BMC protocol,
|
||||
* into the sysinv database */
|
||||
mtcInvApi_update_mtcInfo ( node_ptr );
|
||||
|
||||
ilog ("%s bmc control using %s:%s",
|
||||
|
@ -6751,8 +6765,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
node_ptr->bmc_thread_ctrl.done = true ;
|
||||
node_ptr->bmc_thread_info.command = 0 ;
|
||||
}
|
||||
/* store mtcInfo, which specifies the selected BMC protocol,
|
||||
* into the sysinv database */
|
||||
mtcInvApi_update_mtcInfo ( node_ptr );
|
||||
|
||||
/* push the BMC access info out to the mtcClient when
|
||||
* a controller's BMC connection is established/verified */
|
||||
if ( node_ptr->nodetype & CONTROLLER_TYPE )
|
||||
this->want_mtcInfo_push = true ;
|
||||
|
||||
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
|
||||
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
|
@ -6942,6 +6963,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
} /* end power off detection handling */
|
||||
|
||||
/* push the BMC access info out to the mtcClient when
|
||||
* a controller's BMC connection is established/verified */
|
||||
if ( node_ptr->nodetype & CONTROLLER_TYPE )
|
||||
this->want_mtcInfo_push = true ;
|
||||
|
||||
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
|
||||
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
|
||||
|
@ -7199,6 +7225,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
|
||||
/* audit alarms */
|
||||
mtcAlarm_audit (node_ptr );
|
||||
|
||||
break ;
|
||||
}
|
||||
case MTC_OOS_TEST__WAIT:
|
||||
|
@ -7494,7 +7523,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
* In the restart case the subfunction fsm enable handler is not run so
|
||||
* we try to detect the missing goenabled_subf flag as an inservice test.
|
||||
*
|
||||
* Only in CPE type
|
||||
* Only in AIO type
|
||||
* - clear the alarm if the issue goes away -
|
||||
* i.e. the goenabled tests eventually pass. Today
|
||||
* they are not re-run in the background but someday they may be
|
||||
|
@ -7502,7 +7531,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
* and we have only a single enabled controller (which must be this one)
|
||||
* and the alarm is not already raised.
|
||||
**/
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
|
@ -7597,7 +7626,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
|
||||
/* Monitor the health of the host - no pass file */
|
||||
/* Monitor the health of the host */
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
|
@ -7623,6 +7652,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
|
||||
/*
|
||||
* In-service Config Failure/Alarm handling
|
||||
*/
|
||||
|
||||
/* Detect new config failure condition */
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
|
||||
{
|
||||
/* not healthy .... */
|
||||
|
@ -7634,16 +7668,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
{
|
||||
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
||||
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
|
||||
{
|
||||
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
|
||||
|
||||
/* threshold is reached so raise the config alarm if it is not already raised */
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
||||
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
}
|
||||
alarm_config_failure ( node_ptr );
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -7663,6 +7688,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
}
|
||||
}
|
||||
}
|
||||
/* or correct an alarmed config failure that has cleared */
|
||||
else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
|
||||
{
|
||||
if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
|
||||
alarm_config_clear ( node_ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
|
|
|
@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
if ( node_ptr->mnfa_graceful_recovery == true )
|
||||
{
|
||||
/* Restart the heartbeat for this recovered host */
|
||||
// send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
|
||||
|
||||
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
|
||||
{
|
||||
ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str());
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
ilog ("%s graceful recovery (graceful recover count:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->graceful_recovery_counter);
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() );
|
||||
wlog ("%s graceful recovery restart (graceful recover count:%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->graceful_recovery_counter );
|
||||
}
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
|||
* Clear heartbeat degrades */
|
||||
for ( struct node * ptr = head ; ; ptr = ptr->next )
|
||||
{
|
||||
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
|
||||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
|
||||
( ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
std::list<string>::iterator mnfa_awol_ptr ;
|
||||
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
|
||||
mnfa_awol_ptr != mnfa_awol_list.end() ;
|
||||
mnfa_awol_ptr++ )
|
||||
{
|
||||
ptr->hbs_minor[MGMNT_IFACE] = false ;
|
||||
ptr->hbs_minor[CLSTR_IFACE] = false ;
|
||||
/* skip host if not in the mnfa pool */
|
||||
if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
|
||||
continue ;
|
||||
|
||||
if ( force == true )
|
||||
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
|
||||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
|
||||
( ptr->operState == MTC_OPER_STATE__ENABLED ))
|
||||
{
|
||||
elog ("... %s failed ; auto-recovering\n",
|
||||
ptr->hostname.c_str());
|
||||
ptr->hbs_minor[MGMNT_IFACE] = false ;
|
||||
ptr->hbs_minor[CLSTR_IFACE] = false ;
|
||||
|
||||
/* Set node as failed */
|
||||
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
|
||||
enableStageChange ( ptr, MTC_ENABLE__START );
|
||||
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
|
||||
if ( force == true )
|
||||
{
|
||||
if ( ptr->degrade_mask == 0 )
|
||||
{
|
||||
availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE );
|
||||
}
|
||||
}
|
||||
elog ("... %s failed ; auto-recovering\n",
|
||||
ptr->hostname.c_str());
|
||||
|
||||
if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
|
||||
{
|
||||
recoveryStageChange ( ptr, MTC_RECOVERY__START );
|
||||
adminActionChange ( ptr, MTC_ADMIN_ACTION__RECOVER );
|
||||
/* Set node as failed */
|
||||
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
|
||||
enableStageChange ( ptr, MTC_ENABLE__START );
|
||||
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() );
|
||||
mnfa_recover_host ( ptr );
|
||||
}
|
||||
}
|
||||
break ;
|
||||
}
|
||||
if (( ptr->next == NULL ) || ( ptr == tail ))
|
||||
break ;
|
||||
|
|
|
@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
|
|||
|
||||
int recv_mtc_reply_noblock ( void );
|
||||
|
||||
int send_mtc_cmd ( string & hostname, int cmd, int interface );
|
||||
int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" );
|
||||
int mtc_service_command ( mtc_socket_type * sock_ptr , int interface );
|
||||
int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status );
|
||||
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr );
|
||||
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr );
|
||||
int mtc_clstr_init ( mtc_socket_type * sock_ptr , char * iface );
|
||||
string get_who_i_am ( void );
|
||||
|
||||
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -96,7 +96,7 @@ int nodeLinkClass::mtcSmgrApi_request ( struct nodeLinkClass::node * node_ptr, m
|
|||
int rc = PASS ;
|
||||
string operation_string = "unknown" ;
|
||||
|
||||
if ( system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
if ( system_type == SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
dlog ("%s simpex mode ; SM '%d' request not sent\n", node_ptr->hostname.c_str(), operation );
|
||||
return ( PASS );
|
||||
|
|
|
@ -110,14 +110,16 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
if ( node_ptr->mtce_flags & MTC_FLAG__SUBF_CONFIGURED )
|
||||
{
|
||||
mtcTimer_reset (node_ptr->mtcTimer);
|
||||
plog ("%s Subf Configured OK\n", name.c_str());
|
||||
plog ("%s Subf Configured OK (oob:%x)\n",
|
||||
name.c_str(), node_ptr->mtce_flags);
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
|
||||
alarm_config_clear ( node_ptr );
|
||||
break ;
|
||||
}
|
||||
|
||||
if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
|
||||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
|
||||
if (( node_ptr->mtce_flags ) &&
|
||||
(( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) ||
|
||||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
|
||||
{
|
||||
mtcTimer_reset (node_ptr->mtcTimer);
|
||||
|
||||
|
@ -140,9 +142,10 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
/* timeout handling */
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
elog ("%s configuration timeout (%d secs)\n",
|
||||
elog ("%s configuration timeout (%d secs) (oob:%x)\n",
|
||||
name.c_str(),
|
||||
MTC_WORKER_CONFIG_TIMEOUT );
|
||||
MTC_WORKER_CONFIG_TIMEOUT,
|
||||
node_ptr->mtce_flags);
|
||||
|
||||
alarm_config_failure ( node_ptr );
|
||||
|
||||
|
@ -169,7 +172,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
*
|
||||
* issue: subfunction go-enable patching script fails and
|
||||
* maintenance reboots the active controller when no-reboot
|
||||
* patching maintenance in CPE.
|
||||
* patching maintenance in AIO.
|
||||
*
|
||||
* The fix is to avoid running the subfunction go-enabled tests
|
||||
* on self while patching.
|
||||
|
@ -490,7 +493,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
|
|||
|
||||
fail = true ;
|
||||
}
|
||||
else if ( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
else if ( this->system_type != SYSTEM_TYPE__AIO__SIMPLEX )
|
||||
{
|
||||
/* Loop over the heartbeat interfaces and fail the Enable if any of them are failing */
|
||||
for ( int i = 0 ; i < MAX_IFACES ; i++ )
|
||||
|
|
|
@ -231,6 +231,7 @@ typedef struct
|
|||
recovery_method_type recovery_method ; /**< How processes are recovered */
|
||||
bool reload_config ;
|
||||
bool patching_in_progress ;
|
||||
bool last_alarm_query_pass;
|
||||
|
||||
} pmon_ctrl_type ;
|
||||
void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );
|
||||
|
|
|
@ -38,14 +38,14 @@ void pmonAlarm_init ( void )
|
|||
alarmUtil_type * ptr ;
|
||||
|
||||
/** Process Failure Alarm ****************************************************/
|
||||
|
||||
|
||||
ptr = &alarm_list[PMON_ALARM_ID__PMOND];
|
||||
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
|
||||
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID);
|
||||
|
||||
ptr->name = "process failure" ;
|
||||
ptr->instc_prefix = "process=" ;
|
||||
|
||||
|
||||
ptr->critl_reason = "";
|
||||
ptr->minor_reason = "";
|
||||
ptr->major_reason = "";
|
||||
|
@ -56,12 +56,12 @@ void pmonAlarm_init ( void )
|
|||
ptr->alarm.inhibit_alarms = FM_FALSE;
|
||||
ptr->alarm.service_affecting = FM_TRUE ;
|
||||
ptr->alarm.suppression = FM_TRUE ;
|
||||
|
||||
|
||||
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
|
||||
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
|
||||
|
||||
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
|
||||
"If problem consistently occurs after Host is locked and unlocked then "
|
||||
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
|
||||
"If problem consistently occurs after Host is locked and unlocked then "
|
||||
"contact next level of support for root cause analysis and recovery.");
|
||||
}
|
||||
|
||||
|
@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )
|
|||
|
||||
/******************************************************************************
|
||||
*
|
||||
* Name : manage_queried_alarms
|
||||
* Name : query_alarms
|
||||
*
|
||||
* Description: query FM for all the existing process monitor alarms and build
|
||||
* up the callers 'saved_alarm_list' with those process names and
|
||||
* corresponding severity.
|
||||
*
|
||||
* Assumptions: If the hostname is passed in as not empty then assume the clear
|
||||
* is requested.
|
||||
*
|
||||
* Updates : callers saved_alarm_list
|
||||
*
|
||||
* Returns : PASS if FM returns no error
|
||||
* FAIL_REQUEST ... alarmUtil_query_identity failed
|
||||
* FAIL_OPERATION ... fm_get_fault failed
|
||||
* FAIL_NULL_POINTER ... failed to get memory
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
|
||||
int query_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
|
||||
{
|
||||
static const char HOSTNAME_LABEL [] = "host=" ;
|
||||
static const char PROCNAME_LABEL [] = ".process=" ;
|
||||
|
||||
int rc = FAIL ;
|
||||
saved_alarm_list.clear();
|
||||
|
||||
/**
|
||||
* Query all the pmon alarms and if there is an alarm for a
|
||||
* process that is functioning properly then clear the alarm.
|
||||
**/
|
||||
SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
|
||||
if ( alarm_list_ptr )
|
||||
{
|
||||
if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
|
||||
/* Query all the pmon alarms */
|
||||
rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
|
||||
if ( rc == RETRY )
|
||||
{
|
||||
dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
|
||||
rc = PASS ;
|
||||
}
|
||||
else if ( rc == PASS )
|
||||
{
|
||||
for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
|
||||
{
|
||||
/* loop over each active alarm and maintain its activity state */
|
||||
if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
|
||||
{
|
||||
int rc ;
|
||||
AlarmFilter alarm_filter ;
|
||||
SFmAlarmDataT alarm_query ;
|
||||
memset(&alarm_query, 0, sizeof(alarm_query));
|
||||
|
@ -139,34 +147,49 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
|
|||
|
||||
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
|
||||
{
|
||||
string entity = alarm_filter.entity_instance_id ;
|
||||
size_t pos = entity.find("process=");
|
||||
if ( pos != std::string::npos )
|
||||
{
|
||||
string pn = entity.substr(pos+strlen("process="));
|
||||
ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
|
||||
rc = PASS ;
|
||||
|
||||
/* filter out 'process=pmond' as that alarm is handled by hbsAgent */
|
||||
if ( pn.compare("pmond") )
|
||||
string entity = alarm_filter.entity_instance_id ;
|
||||
size_t pos_hn = entity.find(HOSTNAME_LABEL);
|
||||
size_t pos_pn = entity.find(PROCNAME_LABEL);
|
||||
|
||||
if (( pos_hn != std::string::npos ) &&
|
||||
( pos_pn != std::string::npos ))
|
||||
{
|
||||
string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-strlen(HOSTNAME_LABEL));
|
||||
string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
|
||||
|
||||
/* verify hostname */
|
||||
if ( ( hn.length() == 0 ) || ( hn != hostname ) )
|
||||
{
|
||||
if ( !hostname.empty() )
|
||||
{
|
||||
pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
|
||||
}
|
||||
else
|
||||
{
|
||||
active_process_alarms_type this_alarm ;
|
||||
this_alarm.process = pn ;
|
||||
this_alarm.severity = alarm_query.severity ;
|
||||
saved_alarm_list.push_front ( this_alarm );
|
||||
}
|
||||
/* ignore alarms not for this host */
|
||||
dlog ("%s %s %s alarm not for this host",
|
||||
entity.c_str(),
|
||||
hn.c_str(),
|
||||
pn.c_str());
|
||||
continue ;
|
||||
}
|
||||
dlog ("%s alarm is %s (process:%s)\n",
|
||||
alarm_filter.entity_instance_id,
|
||||
alarmUtil_getSev_str(alarm_query.severity).c_str(),
|
||||
pn.c_str());
|
||||
|
||||
/* filter out 'process=pmond'
|
||||
* ... that alarm is handled by hbsAgent */
|
||||
if ( pn != MTC_SERVICE_PMOND_NAME )
|
||||
{
|
||||
active_process_alarms_type this_alarm ;
|
||||
this_alarm.process = pn ;
|
||||
this_alarm.severity = alarm_query.severity ;
|
||||
saved_alarm_list.push_front ( this_alarm );
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("fm_get_fault failed (rc:%d)\n", rc );
|
||||
wlog ("fm_get_fault failed (rc:%d)\n", rc );
|
||||
rc = FAIL_OPERATION ;
|
||||
break ;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -174,10 +197,21 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
|
|||
dlog2 ("last entry %d\n", i);
|
||||
break ;
|
||||
}
|
||||
}
|
||||
} /* for loop */
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog("failed to query alarms from fm ; rc:%d", rc);
|
||||
rc = FAIL_REQUEST ;
|
||||
}
|
||||
free(alarm_list_ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog ("unable to allocate memory for alarm list");
|
||||
rc = FAIL_NULL_POINTER ;
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/************************* A L A R M I N G **************************/
|
||||
|
|
|
@ -37,8 +37,10 @@ typedef struct
|
|||
EFmAlarmSeverityT severity ;
|
||||
} active_process_alarms_type ;
|
||||
|
||||
/* Clear any pending alarms if the specified hostname is valid */
|
||||
void manage_queried_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
|
||||
/* Query FM for a list of Process Monitor (200.006) alarms */
|
||||
int query_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
|
||||
|
||||
void alarmed_process_audit ( void );
|
||||
|
||||
void pmonAlarm_init ( void );
|
||||
|
||||
|
|
|
@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
|
|||
std::list<string> config_files ;
|
||||
std::list<string>::iterator string_iter_ptr ;
|
||||
|
||||
/* If there is an alarm in the list that matches one in the process list
|
||||
* then update that process with its severity and failed state.
|
||||
* If there is a process in the saved list that is not in the process list
|
||||
* then clear its alarm as it is no longer valid.
|
||||
*/
|
||||
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
||||
process_config_type * const ptr,
|
||||
int const processes );
|
||||
|
||||
static process_config_type process_config[MAX_PROCESSES] ;
|
||||
|
||||
/* lookup process control by index and return its pointer if found.
|
||||
|
@ -216,6 +207,7 @@ void pmon_timer_init ( void )
|
|||
/* Init the timer for this process */
|
||||
mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
|
||||
}
|
||||
_pmon_ctrl_ptr->last_alarm_query_pass = false ;
|
||||
}
|
||||
|
||||
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
|
||||
|
@ -371,7 +363,7 @@ void init_process_config_memory ( void )
|
|||
* all the process config files from /etc/pmon.d */
|
||||
void load_processes ( void )
|
||||
{
|
||||
list<active_process_alarms_type> saved_alarm_list ;
|
||||
list<active_process_alarms_type> queried_alarm_list ;
|
||||
|
||||
int rc = PASS ;
|
||||
|
||||
|
@ -385,10 +377,6 @@ void load_processes ( void )
|
|||
close_process_socket ( &process_config[i] );
|
||||
}
|
||||
|
||||
/* Query fm for existing pmon process alarms and
|
||||
* for each that is found store their 'name' and
|
||||
* 'severity' in the passed in saved list */
|
||||
manage_queried_alarms ( saved_alarm_list );
|
||||
|
||||
/* init the process config memory */
|
||||
init_process_config_memory ();
|
||||
|
@ -454,13 +442,8 @@ void load_processes ( void )
|
|||
}
|
||||
_pmon_ctrl_ptr->reload_config = false ;
|
||||
|
||||
/* If there were process alarms that existed over the reload
|
||||
* then ensure that those processes are updated with that information. */
|
||||
if ( saved_alarm_list.size () )
|
||||
{
|
||||
ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
|
||||
manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
|
||||
}
|
||||
/* use the audit to clear pre-existing alarms at process startup */
|
||||
alarmed_process_audit ();
|
||||
}
|
||||
|
||||
|
||||
|
@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
|
|||
}
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
/***************************************************************************
|
||||
*
|
||||
* Name : manage_process_alarms
|
||||
* Name : alarmed_process_audit
|
||||
*
|
||||
* Description: This interface manages process alarms over a process
|
||||
* configuration reload
|
||||
* Purpose : Verify the process state matches the queried alarm state
|
||||
*
|
||||
* Steps:
|
||||
* Description: To correct process alarm state mismatches.
|
||||
*
|
||||
* 1. Loop over each item in the list and mark the process as failed
|
||||
* with the specified severity level.
|
||||
*
|
||||
* 2. If the process is not found then clear its alarm as it is no
|
||||
* longer a valid process in the new profile and we don't want a
|
||||
* lingering stuck alarm.
|
||||
*
|
||||
*************************************************************************/
|
||||
***************************************************************************/
|
||||
|
||||
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
||||
process_config_type * const ptr,
|
||||
int const processes )
|
||||
void alarmed_process_audit ( void )
|
||||
{
|
||||
/* get out if the list is empty ; should not have been called if
|
||||
* empty but ... just in case */
|
||||
if ( ! _list.empty() )
|
||||
/* Don't audit FM in service after the last query was successful.
|
||||
* There is a blocking issue that needs to be dealt with */
|
||||
if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
|
||||
return ;
|
||||
|
||||
/*
|
||||
* Query fm for existing pmon process alarms and
|
||||
* for each that is found store their 'name' and
|
||||
* 'severity' in the passed in queried_alarm_list.
|
||||
*/
|
||||
list<active_process_alarms_type> queried_alarm_list ;
|
||||
int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
|
||||
_pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
|
||||
|
||||
/* just return if query failed */
|
||||
if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
|
||||
return ;
|
||||
|
||||
if ( queried_alarm_list.size () )
|
||||
{
|
||||
list<active_process_alarms_type>::iterator _iter_ptr ;
|
||||
|
||||
alog ("audit found %ld active alarms", queried_alarm_list.size());
|
||||
|
||||
/* loop over the list ... */
|
||||
for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
|
||||
for ( _iter_ptr=queried_alarm_list.begin();
|
||||
_iter_ptr!=queried_alarm_list.end();
|
||||
++_iter_ptr )
|
||||
{
|
||||
/* for each item assume it is not found */
|
||||
bool found = false ;
|
||||
alog ("%s audit", _iter_ptr->process.c_str());
|
||||
|
||||
/* try and find this process in the new process profile */
|
||||
for ( int i = 0 ; i < processes ; i++ )
|
||||
/* find this process*/
|
||||
for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
|
||||
{
|
||||
if ( ! _iter_ptr->process.compare((ptr+i)->process) )
|
||||
{
|
||||
/* If the process is found then mark it as failed and update its severity.
|
||||
* At this point we then assume that there is an alarm raised for this process. */
|
||||
found = true ;
|
||||
process_config_type * ptr = &process_config[i];
|
||||
|
||||
(ptr+i)->failed = false ;
|
||||
wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
||||
if ( ! _iter_ptr->process.compare(ptr->process) )
|
||||
{
|
||||
found = true ;
|
||||
if ( ptr->failed == false )
|
||||
{
|
||||
ilog ("%s stale alarm ; clearing",
|
||||
_iter_ptr->process.c_str() );
|
||||
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
_iter_ptr->process );
|
||||
}
|
||||
else if ( _iter_ptr->severity != ptr->alarm_severity )
|
||||
{
|
||||
wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
|
||||
ptr->process,
|
||||
alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
|
||||
alarmUtil_getSev_str(ptr->alarm_severity).c_str());
|
||||
if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
|
||||
{
|
||||
pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process, 0);
|
||||
}
|
||||
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
|
||||
{
|
||||
pmonAlarm_major(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process);
|
||||
}
|
||||
else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process);
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s unexpected severity '%s' ; clearing alarm",
|
||||
ptr->process,
|
||||
ptr->severity);
|
||||
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
ptr->process );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
alog ("%s is alarmed '%s' ; audit",
|
||||
ptr->process,
|
||||
ptr->severity);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if not found then just clear the alarm */
|
||||
if ( found == false)
|
||||
{
|
||||
wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
||||
wlog ("%s is not a monitored process ; clearing alarm",
|
||||
_iter_ptr->process.c_str());
|
||||
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
|
||||
PMON_ALARM_ID__PMOND,
|
||||
_iter_ptr->process );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
{
|
||||
std::list<int> socks ;
|
||||
|
@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
|||
{
|
||||
_get_events ();
|
||||
mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
|
||||
|
||||
alarmed_process_audit ();
|
||||
}
|
||||
|
||||
/* Run the degrade set/clear by audit */
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
#daily
|
||||
nodateext
|
||||
#
|
||||
# Copyright (c) 2015-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/pmond.log
|
||||
{
|
||||
nodateext
|
||||
size 10M
|
||||
create 0640 root root
|
||||
start 1
|
||||
missingok
|
||||
size 10M
|
||||
rotate 20
|
||||
compress
|
||||
sharedscripts
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
/var/log/crash/vmcore.tar
|
||||
/var/log/crash/vmcore_first.tar
|
||||
{
|
||||
nodateext
|
||||
size 1K
|
||||
start 1
|
||||
rotate 1
|
||||
|
|
|
@ -87,6 +87,10 @@ sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
|
|||
daemon_log_port = 2121 ; daemon logger port
|
||||
mtcalarm_req_port = 2122 ;
|
||||
|
||||
sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient
|
||||
; before issuing BMC reset.
|
||||
|
||||
|
||||
[timeouts] ; configurable maintenance timeout values in seconds
|
||||
|
||||
failsafe_shutdown_delay = 120;
|
||||
|
|
|
@ -1,59 +1,67 @@
|
|||
#daily
|
||||
|
||||
# Apply all these options to all the logs
|
||||
nodateext
|
||||
start 1
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
sharedscripts
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
|
||||
#
|
||||
# Copyright (c) 2015-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
/var/log/mtcAgent.log
|
||||
{
|
||||
size 100M
|
||||
create 0640 root root
|
||||
start 1
|
||||
rotate 10
|
||||
size 100M
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
||||
/var/log/hbsAgent.log
|
||||
{
|
||||
size 20M
|
||||
rotate 5
|
||||
}
|
||||
|
||||
/var/log/mtcClient.log
|
||||
{
|
||||
size 20M
|
||||
rotate 5
|
||||
}
|
||||
|
||||
/var/log/hbsClient.log
|
||||
{
|
||||
size 20M
|
||||
create 0640 root root
|
||||
start 1
|
||||
rotate 5
|
||||
size 20M
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
||||
/var/log/mtclogd.log
|
||||
{
|
||||
size 10M
|
||||
create 0640 root root
|
||||
start 1
|
||||
rotate 5
|
||||
size 10M
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
postrotate
|
||||
systemctl reload syslog-ng > /dev/null 2>&1 || true
|
||||
endscript
|
||||
delaycompress
|
||||
}
|
||||
|
||||
# The mtclogd opens and closes these log files on every log addition.
|
||||
# Therefore they do not require a notification on log rotation.
|
||||
/var/log/mtcAgent_event.log
|
||||
/var/log/mtcAgent_alarm.log
|
||||
/var/log/mtcAgent_api.log
|
||||
{
|
||||
size 20M
|
||||
create 0640 root root
|
||||
start 1
|
||||
rotate 5
|
||||
}
|
||||
|
||||
/var/log/mtcAgent_event.log
|
||||
{
|
||||
size 20M
|
||||
rotate 5
|
||||
}
|
||||
/var/log/mtcAgent_alarm.log
|
||||
{
|
||||
size 10M
|
||||
rotate 5
|
||||
compress
|
||||
notifempty
|
||||
missingok
|
||||
delaycompress
|
||||
}
|
||||
|
|
|
@ -18,6 +18,28 @@ usage ()
|
|||
exit 1
|
||||
}
|
||||
|
||||
# Systemd automatically remounts all the mounted filesystems at shutdown
|
||||
# When we are deleting a partition, we have to unmount its corresponding filesystem
|
||||
# because remounting deleted filesystems at shutdown will throw errors
|
||||
#######################################
# Unmount the filesystem associated with a device or mount point, if it
# is currently mounted.
# Arguments: $1 - device or mount point to unmount
# Outputs:   progress / result messages on stdout
# Returns:   0 if the filesystem was unmounted,
#            1 if the umount command failed,
#            2 if the filesystem was not mounted
#######################################
unmount_fs()
{
    local fs=$1
    local ret_code=0
    echo "Trying to unmount $fs"
    # "$fs" is quoted so that a path containing whitespace or glob
    # characters reaches findmnt/umount as exactly one argument, and an
    # empty value is not silently dropped (a bare 'findmnt' with no
    # argument lists every mount and succeeds).
    if findmnt "$fs" > /dev/null 2>&1 ; then
        if umount -f "$fs" ; then
            echo "$fs has been successfully unmounted"
        else
            echo "Error! Failed to unmount $fs"
            ret_code=1
        fi
    else
        echo "Warning! $fs is not mounted"
        ret_code=2
    fi
    return $ret_code
}
|
||||
|
||||
OPTS=`getopt -o h -l force -- "$@"`
|
||||
if [ $? != 0 ]
|
||||
then
|
||||
|
@ -100,11 +122,14 @@ fi
|
|||
BACKUP_PART_GUID="BA5EBA11-0000-1111-2222-000000000002"
|
||||
part_type_guid_str="Partition GUID code"
|
||||
|
||||
# get the nodetype variable to check later if this node is a controller
|
||||
. /etc/platform/platform.conf
|
||||
|
||||
for dev in $WIPE_HDD
|
||||
do
|
||||
if [[ -e $dev ]]
|
||||
then
|
||||
if [ "$dev" == "$rootfs" ]
|
||||
if [[ "$dev" == "$rootfs" && "${nodetype}" == "controller" ]]
|
||||
then
|
||||
part_numbers=( $(parted -s $dev print | awk '$1 == "Number" {i=1; next}; i {print $1}') )
|
||||
for part_number in "${part_numbers[@]}"; do
|
||||
|
@ -128,6 +153,7 @@ do
|
|||
# Skip / or we will lose access to the tools on the system.
|
||||
if [[ $part != $rootfs_part ]]
|
||||
then
|
||||
unmount_fs $part
|
||||
dd if=/dev/zero of=$part bs=512 count=34
|
||||
dd if=/dev/zero of=$part bs=512 count=34 seek=$((`blockdev --getsz $part` - 34))
|
||||
fi
|
||||
|
@ -141,6 +167,7 @@ do
|
|||
else
|
||||
echo "Wiping $dev..."
|
||||
wipefs -f -a $dev
|
||||
unmount_fs $dev
|
||||
|
||||
# Clearing previous GPT tables or LVM data
|
||||
# Delete the first few bytes at the start and end of the partition. This is required with
|
||||
|
|
Loading…
Reference in New Issue