62532a7eac
Maintenance's success path messaging does not depend on cluster network messaging. However, there are a number of failure mode cases that do depend on cluster network messaging to properly diagnose and offer higher availability handling for some failure cases. For instance, when the management interface goes down, without cluster network messaging remote hosts can be isolated. Being able to command-reboot a host over the cluster-host network offers higher availability. Maintenance is designed to use the cluster network, if provisioned, as a backup path for mtcAlive, node locked, reboot and several other commands and acknowledgements. Unfortunately, it was recently observed that maintenance is using the 'nfs-controller' label to resolve cluster network addressing, which resolves to management network IPs. As a result, all messages intended to go over the cluster-host network are instead just redundant management network messages. While debugging this issue, several additional cluster network messaging related issues were observed and fixed. This update implements the following fixes: 1. since there is no floating address for the cluster network, the mtcClient was modified to send messages to both controllers, where only the active controller will be listening and acting. 2. fixes the port number mtce listens on for cluster-host network messages. 3. fixes the port number mtce sends cluster-host network messages to. 4. mtcAlive messages are also sent on the provisioned cluster network. 5. locked state notifications and acks are sent on the provisioned cluster network. 6. reboot request and acks are sent on the provisioned cluster network. 7. fixed command acknowledgement messaging. This update also: 1. envelops the mtcAlive gate control to allow debug tracing of all gate state changes. 2. moves the graceful recovery handling heartbeat failure state clear to the end of the recovery handler, just before heartbeat start. 3.
adds sm unhealthy support to fail and automatically recover the inactive controller from an SM UNHEALTHY state. ---------- Test Plan: ---------- Functional: PASS: Verify management network messaging PASS: Verify cluster-host network messaging PASS: Verify cluster-host messages with tcpdump PASS: Verify cluster-host network mtcAlive messaging PASS: Verify reboot request and ack reply over management network PASS: Verify reboot request and ack reply over cluster-host network PASS: Verify lock state notification and ack reply over management network PASS: Verify lock state notification and ack reply over cluster-host network PASS: Verify acknowledgement messaging PASS: Verify maintenance daemon logging PASS: Verify maintenance socket initialization System: PASS: Verify compute system install PASS: Verify AIO system install Feature: PASS: Verify sm node unhealthy handling (active:ignore, inactive:recover) Change-Id: I092596d3e22438dd8a613a073614c188f6f5721d Closes-Bug: #835268 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
114 lines
3.5 KiB
C
114 lines
3.5 KiB
C
#ifndef __INCLUDE_MTCNODECOMP_HH__
|
|
#define __INCLUDE_MTCNODECOMP_HH__
|
|
/*
|
|
* Copyright (c) 2015-2016 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* Wind River CGTS Platform Node Maintenance Client 'mtcClient' Header
|
|
*
|
|
*/
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
|
|
/**
 * Compute Config mask.
 *
 * Bitwise OR of three configuration bits: the agent mtc management
 * port, the client mtc management port and the client mtc cluster
 * port. The individual CONFIG_* bits are defined outside this header.
 *
 * NOTE(review): presumably compared against the set of configuration
 * items actually loaded by the client - confirm against the user of
 * this mask.
 */
#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT  |\
                            CONFIG_CLIENT_MTC_MGMNT_PORT |\
                            CONFIG_CLIENT_MTC_CLSTR_PORT)
|
|
|
|
/* Capacity of the script[] array held in each script_ctrl_type */
#define MAX_RUN_SCRIPTS (20)
|
|
|
|
/**
 * Identifies a set of scripts.
 *
 * Used by ctrl_type both for the set that is currently executing
 * (active_script_set) and for the queue of requested sets
 * (posted_script_set).
 *
 * Enumerators take the default values 0..3 in declaration order.
 */
typedef enum
{
    NO_SCRIPTS,             /* no script set selected (value 0)          */
    GOENABLED_MAIN_SCRIPTS, /* go-enabled scripts, main function level    */
    GOENABLED_SUBF_SCRIPTS, /* go-enabled scripts, subfunction level      */
    HOSTSERVICES_SCRIPTS,   /* host services scripts                      */
} script_set_enum ;
|
|
|
|
typedef struct
|
|
{
|
|
int status ; /* script execution exit status */
|
|
pid_t pid ; /* the script's PID */
|
|
bool done ; /* set to true when a script has completed */
|
|
string name ; /* the full path/filename of the script */
|
|
|
|
time_debug_type time_start ; /* time stamps used to measure the */
|
|
time_debug_type time_stop ; /* execution time of */
|
|
time_delta_type time_delta ; /* the script */
|
|
} script_exec_type;
|
|
/* Initialize the script execution record pointed to by script_exec_ptr.
 * NOTE(review): defined outside this header; presumably resets every
 * member to its idle default - confirm against the implementation. */
void script_exec_init ( script_exec_type * script_exec_ptr );
|
|
|
|
typedef struct
|
|
{
|
|
unsigned int posted ; /* posted for execution command */
|
|
unsigned int monitor ; /* set to the previously posted command
|
|
* after this commands' scripts have
|
|
* been launched. */
|
|
int scripts ; /* the number of scripts to run */
|
|
int scripts_done ; /* number of scripts that completed */
|
|
struct mtc_timer timer ; /* the scripts completion timeout timer */
|
|
script_exec_type script[MAX_RUN_SCRIPTS]; /* array of script exec status */
|
|
|
|
} script_ctrl_type ;
|
|
/* Initialize the script control block pointed to by script_ctrl_ptr.
 * NOTE(review): defined outside this header; presumably resets the
 * counters and every script[] entry - confirm against the implementation. */
void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );
|
|
|
|
typedef struct
|
|
{
|
|
char hostname [MAX_HOST_NAME_SIZE+1];
|
|
string macaddr ;
|
|
string address ;
|
|
string address_clstr ;
|
|
string who_i_am ;
|
|
|
|
string nodetype_str ;
|
|
|
|
string mgmnt_iface ;
|
|
string clstr_iface ;
|
|
|
|
unsigned int nodetype ;
|
|
unsigned int function ;
|
|
unsigned int subfunction ;
|
|
|
|
struct mtc_timer timer ; /* mtcAlive timer */
|
|
|
|
bool clstr_iface_provisioned ;
|
|
|
|
/* tracks the time the level specific goenabled file was last created */
|
|
time_t goenabled_main_time ;
|
|
time_t goenabled_subf_time ;
|
|
|
|
/* Go Enable Control execution control struct, timing and completion status */
|
|
script_ctrl_type goenabled ;
|
|
|
|
/* Start/Stop Hosts Services execution control timing and completion status */
|
|
script_ctrl_type hostservices ;
|
|
|
|
/* The script set that is executing */
|
|
script_set_enum active_script_set ;
|
|
|
|
/* The list of posted script set requests */
|
|
list<script_set_enum> posted_script_set;
|
|
|
|
/* The system type */
|
|
system_type_enum system_type ;
|
|
|
|
/* Where to send events */
|
|
string mtcAgent_ip ;
|
|
|
|
} ctrl_type ;
|
|
|
|
/* Returns a pointer to a ctrl_type control structure.
 * NOTE(review): presumably the single process-wide instance owned by the
 * client - confirm against the definition, which is not in this header. */
ctrl_type * get_ctrl_ptr ( void );
|
|
|
|
/* Reports whether this host's subfunction is 'worker'.
 * NOTE(review): inferred from the name; presumably tests the subfunction
 * member of ctrl_type - confirm against the definition. */
bool is_subfunction_worker ( void );
|
|
/* Run the go-enabled scripts on behalf of 'requestor', using the maintenance
 * socket structure 'sock_ptr'. Returns an int status.
 * NOTE(review): the meaning of the return value and of the requestor string
 * is not visible in this header - confirm against the definition. */
int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
|
|
/* Run the host services scripts for command 'cmd'. Returns an int status.
 * NOTE(review): the accepted cmd values and the return code semantics are
 * not visible in this header - confirm against the definition. */
int run_hostservices_scripts ( unsigned int cmd );
|
|
|
|
#endif
|