
This update replaces compute references to worker in mtce, kickstarts,
installer and bsp files.

Tests Performed:

Non-containerized deployment
AIO-SX: Sanity and Nightly automated test suite
AIO-DX: Sanity and Nightly automated test suite
2+2 System: Sanity and Nightly automated test suite
2+2 System: Horizon Patch Orchestration

Kubernetes deployment:
AIO-SX: Create, delete, reboot and rebuild instances
2+2+2 System: worker nodes are unlock enable and no alarms

Story: 2004022
Task: 27013

Depends-On: https://review.openstack.org/#/c/624452/
Change-Id: I225f7d7143d841f80459603b27b95ac3f846c46f
Signed-off-by: Tao Liu <tao.liu@windriver.com>

/*
 * Copyright (c) 2013-2017 Wind River Systems, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 */

/**
 * @file
 * Wind River CGCS Platform Resource Monitor Handler
 */

#include "rmon.h"            /* rmon header file                    */
#include "rmonHttp.h"        /* for rmon HTTP libEvent utilities    */
#include "rmonApi.h"         /* vswitch calls                       */
#include <sys/wait.h>
#include <time.h>
#include <signal.h>
#include <fstream>
#include <sstream>
#include <ctime>
#include <vector>            /* for storing dynamic resource names  */
#include <dirent.h>
#include <algorithm>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <cctype>
#include <pthread.h>
#include <linux/rtnetlink.h> /* for ... RTMGRP_LINK                 */
#include "nlEvent.h"         /* for ... open_netlink_socket         */
#include "nodeEvent.h"       /* for inotify                         */
#include <json-c/json.h>     /* for ... json-c json string parsing  */
#include "jsonUtil.h"
#include "tokenUtil.h"       /* for ... tokenUtil_new_token         */

/* Preserve a local copy of a pointer to the control struct to
 * avoid having to publish a get utility prototype into rmon.h */
static rmon_ctrl_type * _rmon_ctrl_ptr = NULL ;

static interface_resource_config_type interface_resource_config[MAX_RESOURCES] ;
static resource_config_type resource_config[MAX_RESOURCES] ;
static thinmeta_resource_config_type thinmeta_resource_config[MAX_RESOURCES] ;
static registered_clients registered_clt[MAX_CLIENTS];

static libEvent_type ceilometerEvent; // for ceilometer REST API request
static libEvent tokenEvent;           // for token request

/* Used to set alarms through the FM API */
static SFmAlarmDataT alarmData;

static struct mtc_timer rmonTimer_event ;
static struct mtc_timer rmonTimer_pm ;
static struct mtc_timer rmonTimer_ntp ;

static struct mtc_timer rtimer[MAX_RESOURCES] ;
static struct mtc_timer thinmetatimer[MAX_RESOURCES] ;

static ntpStage_enum ntp_stage ; /* The stage the ntp is in within the resource handler fsm */
static int ntp_status ;          /* status returned by the ntpq command */
static int ntp_child_pid ;

/* for dynamic resources */
bool modifyingResources = false;
vector<string> criticality_resource;
vector<string> dynamic_resource;
vector<string> types;
vector<string> devices;
vector<int> fs_index;
vector<string> fs_state;

/** List of config files */
std::list<string> config_files ;
std::list<string>::iterator string_iter_ptr ;
std::list<string> interface_config_files ;

/* percent or abs value for fs resources */
int fs_percent = 0;
int swact_count = 0;

/* for cpu usage */
time_t t1, t2;
int num_cpus = 0;
int num_base_cpus = 0;
int included_cpu[MAX_BASE_CPU];

static string hostUUID = "";

/* Initial cpu time */
vector<unsigned long long> cpu_time_initial;
/* Later cpu time */
vector<unsigned long long> cpu_time_later;

void save_fs_resource ( string resource_name, string criticality,
                        int enabled, int percent, int abs_values[3],
                        int alarm_type, string type, string device, int mounted );
void calculate_fs_usage( resource_config_type * ptr );
void _space_to_underscore (string & str );

struct thread_data
{
    pid_t tid;
    pid_t pid;
    unsigned long long nr_switches_count;
    bool thread_running;
    double resource_usage;
    resource_config_type * resource;
};

/* info passed to pthreads */
struct thread_data t_data;
pthread_t thread;
pthread_mutex_t lock;

/* strict memory accounting off = 0 or on = 1 */
int IS_STRICT = 0;

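/*
 * Note: 't_data', 'thread' and 'lock' above back the instance mount
 * monitoring worker thread set up in rmon_hdlr_init(); 'lock' is also
 * taken around 'modifyingResources' while the dynamic filesystem list is
 * being rebuilt (see process_dynamic_fs_file / add_dynamic_fs_resource).
 */
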
void mem_log_ctrl ( rmon_ctrl_type * ptr )
{
#define MAX_LEN 500
    char str[MAX_LEN] ;
    snprintf (&str[0], MAX_LEN, "%s %s %s\n",
              &ptr->my_hostname[0],
              ptr->my_address.c_str(),
              ptr->my_macaddr.c_str() );
    mem_log(str);
}

void mem_log_resource ( resource_config_type * ptr )
{
#define MAX_LEN 500
    char str[MAX_LEN] ;
    snprintf (&str[0], MAX_LEN, "Resource:%-15s Sev:%-8s Tries:%u Debounce:%d\n",
              ptr->resource, ptr->severity, ptr->count, ptr->debounce);
    mem_log(str);
}

void mem_log_interface_resource ( interface_resource_config_type * ptr )
{
#define MAX_LEN 500
    char str[MAX_LEN] ;
    snprintf (&str[0], MAX_LEN, "Resource:%-15s Sev:%-8s Debounce:%d\n",
              ptr->resource, ptr->severity, ptr->debounce);
    mem_log(str);
}

int _config_dir_load   (void);
int _config_files_load (void);

const char rmonStages_str [RMON_STAGE__STAGES][32] =
{
    "Handler-Init",
    "Handler-Start",
    "Manage-Restart",
    "Monitor-Wait",
    "Monitor-Resource",
    "Restart-Wait",
    "Ignore-Resource",
    "Handler-Finish",
    "Failed-Resource",
    "Failed-Resource-clr",
} ;

const char ntpStages_str [NTP_STAGE__STAGES][32] =
{
    "Begin",
    "Execute-NTPQ",
    "Execute-NTPQ-Wait",
} ;

registered_clients * get_registered_clients_ptr ( int index )
{
    if ( index <= _rmon_ctrl_ptr->clients )
        return ( &registered_clt[index] );
    return ( NULL );
}

rmon_ctrl_type * get_rmon_ctrl_ptr ()
{
    return _rmon_ctrl_ptr;
}

interface_resource_config_type * get_interface_ptr ( int index )
{
    if ( index <= _rmon_ctrl_ptr->interface_resources )
        return ( &interface_resource_config[index] );
    return ( NULL );
}

resource_config_type * get_resource_ptr ( int index )
{
    if ( index >= 0 && index <= _rmon_ctrl_ptr->resources )
        return ( &resource_config[index] );
    return NULL;
}

/*****************************************************************************
 *
 * Name    : get_resource_index
 *
 * Purpose : Get the resource's index based on the name
 *
 *****************************************************************************/
int get_resource_index ( const char *resource_name, int *index )
{
    for ( int i = 0 ; i < _rmon_ctrl_ptr->resources ; i++ )
    {
        if ( strcmp(resource_config[i].resource, resource_name) == 0)
        {
            *index = i;
            return (PASS);
        }
    }
    return (FAIL);
}

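/*
 * Illustrative usage only (mirrors virtual_space_usage_init further below):
 *
 *     int index ;
 *     if ( get_resource_index ( V_CINDER_THINPOOL_RESOURCE_NAME, &index ) == PASS )
 *     {
 *         resource_config_type * res_ptr = get_resource_ptr ( index );
 *         ...
 *     }
 */
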
/*****************************************************************************
 *
 * Name    : rmon_hdlr_fini
 *
 * Purpose : Clean up the resource monitor module
 *
 *****************************************************************************/
void rmon_hdlr_fini ( rmon_ctrl_type * ctrl_ptr )
{
    for ( int i = 0 ; i < ctrl_ptr->resources ; i++ )
    {
        // mem_log ('\n');
        mem_log_resource ( &resource_config[i] );
    }
    pthread_mutex_destroy(&lock);

    /* Turn off inotify */
    //set_inotify_close ( ctrl_ptr->fd, ctrl_ptr->wd );
}

/*****************************************************************************
 *
 * Name    : resourceStageChange
 *
 * Purpose : Put a resource in the requested stage for use by the resource handler
 *
 *****************************************************************************/
int resourceStageChange ( resource_config_type * ptr , rmonStage_enum newStage )
{
    if (( newStage < RMON_STAGE__STAGES ) &&
        ( ptr->stage < RMON_STAGE__STAGES ))
    {
        clog ("%s %s -> %s (%d->%d)\n",
              ptr->resource,
              rmonStages_str[ptr->stage],
              rmonStages_str[newStage],
              ptr->stage, newStage);
        ptr->stage = newStage ;
        return (PASS);
    }
    else
    {
        slog ("%s Invalid Stage (now:%d new:%d)\n",
              ptr->resource, ptr->stage, newStage );
        ptr->stage = RMON_STAGE__FINISH ;
        return (FAIL);
    }
}

/*****************************************************************************
 *
 * Name    : ntpStageChange
 *
 * Purpose : Stage change handler for NTP resource
 *
 *****************************************************************************/
int ntpStageChange ( ntpStage_enum newStage )
{
    if (( newStage < NTP_STAGE__STAGES ) &&
        ( ntp_stage < NTP_STAGE__STAGES ))
    {
        clog ("NTP %s -> %s (%d->%d)\n",
              ntpStages_str[ntp_stage],
              ntpStages_str[newStage],
              ntp_stage, newStage);
        ntp_stage = newStage ;
        return (PASS);
    }
    else
    {
        slog ("NTP Invalid Stage (now:%d new:%d)\n", ntp_stage, newStage );
        ntp_stage = NTP_STAGE__BEGIN ;
        return (FAIL);
    }
}

/*****************************************************************************
 *
 * Name    : _config_files_load
 *
 * Purpose : Load the content of each config file into resource_config[x]
 *
 *****************************************************************************/
int _config_files_load (void)
{
    int i = 0 ;

    /* Load each monitored resource config file */
    for ( string_iter_ptr  = config_files.begin () ;
          string_iter_ptr != config_files.end () ;
          string_iter_ptr++ )
    {
        if ( i >= MAX_RESOURCES )
        {
            wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
            break ;
        }

        /* Read the resource config file */
        resource_config[i].mask = 0 ;
        if (ini_parse( string_iter_ptr->data(), rmon_resource_config,
                       &resource_config[i]) < 0)
        {
            ilog("Read Failure : %s\n", string_iter_ptr->data() );
        }
        else
        {
            dlog ("Config File : %s\n", string_iter_ptr->c_str());

            /* Init the timer for this resource */
            mtcTimer_reset ( rtimer[i] ) ;
            rtimer[i].service = resource_config[i].resource ;

            resource_config[i].i = i ;

            /* allow to clear an existing alarm if the first reading is good
               after reboot */
            resource_config[i].failed         = false ;
            resource_config[i].count          = 0 ;
            resource_config[i].resource_value = 0 ;
            resource_config[i].resource_prev  = 0 ;
            resource_config[i].stage          = RMON_STAGE__INIT ;
            resource_config[i].sev            = SEVERITY_CLEARED ;
            resource_config[i].alarm_type     = STANDARD_ALARM;
            resource_config[i].failed_send    = 0;
            resource_config[i].alarm_raised   = false;

            /* add the alarm ids for the FM API per resource monitored */
            if (strcmp(resource_config[i].resource, CPU_RESOURCE_NAME) == 0) {
                /* platform cpu utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, CPU_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__CPU_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, V_CPU_RESOURCE_NAME) == 0) {
                /* vswitch cpu utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_CPU_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__CPU_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, MEMORY_RESOURCE_NAME) == 0) {
                /* platform memory utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, MEMORY_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__MEMORY_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, V_MEMORY_RESOURCE_NAME) == 0) {
                /* vswitch memory utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_MEMORY_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__MEMORY_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, FS_RESOURCE_NAME) == 0) {
                /* platform disk utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, FS_ALARM_ID);
                resource_config[i].mounted  = MOUNTED;
                resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, INSTANCE_RESOURCE_NAME) == 0) {
                /* platform disk utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, INSTANCE_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, V_CINDER_THINPOOL_RESOURCE_NAME) == 0) {
                /* platform virtual thin pool utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_CINDER_THINPOOL_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, V_NOVA_THINPOOL_RESOURCE_NAME) == 0) {
                /* platform virtual thin pool utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_NOVA_THINPOOL_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, V_PORT_RESOURCE_NAME) == 0) {
                /* vswitch port utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
                         V_PORT_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__PORT ;
            }
            else if (!strcmp(resource_config[i].resource, V_INTERFACE_RESOURCE_NAME) ||
                     !strcmp(resource_config[i].resource, V_LACP_INTERFACE_RESOURCE_NAME)) {
                /* vswitch interface (lacp or otherwise) utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
                         V_INTERFACE_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__INTERFACE ;
            }
            else if (!strcmp(resource_config[i].resource, V_OVSDB_RESOURCE_NAME)) {
                /* vswitch OVSDB manager utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
                         V_OVSDB_MANAGER_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__DATABASE_USAGE ;
            }
            else if (!strcmp(resource_config[i].resource, V_OPENFLOW_RESOURCE_NAME)) {
                /* vswitch Openflow utilization */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
                         V_OPENFLOW_CONTROLLER_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__NETWORK_USAGE ;
            }
            else if (strcmp(resource_config[i].resource, REMOTE_LOGGING_RESOURCE_NAME) == 0) {
                /* remote logging connectivity */
                snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
                         REMOTE_LOGGING_CONTROLLER_CONNECTIVITY_ALARM_ID);
                resource_config[i].res_type = RESOURCE_TYPE__CONNECTIVITY ;
            }
            else
            {
                resource_config[i].res_type = RESOURCE_TYPE__UNKNOWN ;
            }

            ilog ("Monitoring %2d: %s (%s)\n",
                  i,
                  resource_config[i].resource,
                  resource_config[i].severity);
            mem_log_resource ( &resource_config[i] );
            i++;
        }
    }

    _rmon_ctrl_ptr->resources = i ;
    ilog ("Monitoring %d Resources\n", _rmon_ctrl_ptr->resources );
    return (PASS);
}

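/*
 * Illustrative only: each file in CONFIG_DIR is a small ini file parsed by
 * rmon_resource_config(); the exact key names live in that parser, so the
 * sketch below only assumes keys corresponding to the fields logged by
 * mem_log_resource (resource, severity, num_tries, debounce).
 *
 *     [resource]                          ; hypothetical section/keys
 *     resource  = Platform Memory Usage
 *     severity  = critical
 *     debounce  = 20
 *     num_tries = 2
 */
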
/*****************************************************************************
 *
 * Name    : _inter_config_load
 *
 * Purpose : Load the content of each config file into interface_resource_config[x]
 *
 *****************************************************************************/
int _inter_config_load (void)
{
    int i = 0 ;

    for ( string_iter_ptr  = interface_config_files.begin () ;
          string_iter_ptr != interface_config_files.end () ;
          string_iter_ptr++ )
    {
        if ( i >= MAX_RESOURCES )
        {
            wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
            break ;
        }

        /* Read the interface resource config file */
        resource_config[i].mask = 0 ;
        if (ini_parse( string_iter_ptr->data(), rmon_interface_config,
                       &interface_resource_config[i]) < 0)
        {
            ilog("Read Failure : %s\n", string_iter_ptr->data() );
        }
        else
        {
            dlog ("Config File : %s\n", string_iter_ptr->c_str());
            ilog ("Monitoring %2d: %s (%s)\n", i, interface_resource_config[i].resource ,
                  interface_resource_config[i].severity );

            interface_resource_config[i].i = i ;
            interface_resource_config[i].failed = false ;
            interface_resource_config[i].stage = RMON_STAGE__INIT ;
            interface_resource_config[i].sev = SEVERITY_CLEARED ;
            interface_resource_config[i].failed_send = 0;
            interface_resource_config[i].alarm_raised = false;

            /* add the alarm ids for the FM API per resource monitored */
            if (strcmp(interface_resource_config[i].resource, OAM_INTERFACE_NAME) == 0) {
                /* add the alarm id for the FM API per resource monitored */
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, OAM_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, OAM_PORT_ALARM_ID);
            }
            else if (strcmp(interface_resource_config[i].resource, MGMT_INTERFACE_NAME) == 0) {
                /* add the alarm id for the FM API per resource monitored */
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, MGMT_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, MGMT_PORT_ALARM_ID);
            }
            else if (strcmp(interface_resource_config[i].resource, INFRA_INTERFACE_NAME) == 0) {
                /* add the alarm id for the FM API per resource monitored */
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, INFRA_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, INFRA_PORT_ALARM_ID);
            }

            mem_log_interface_resource ( &interface_resource_config[i] );
            i++;
        }
    }

    _rmon_ctrl_ptr->interface_resources = i ;
    ilog ("Monitoring %d Interface Resources\n", _rmon_ctrl_ptr->interface_resources );
    return (PASS);
}

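/*
 * Note: only the oam, mgmt and infra platform interfaces are expected here;
 * each one gets both an interface alarm id and a port alarm id so that
 * interface level and port level faults can be reported separately.
 */
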
/*****************************************************************************
 *
 * Name    : _thinmeta_config_load
 *
 * Purpose : Load the content of each config file into thinmeta_config[x]
 *
 *****************************************************************************/
int _thinmeta_config_load (void)
{
    int i = 0 ;

    /* Set hard-coded defaults for all structures */
    for ( int j = 0; j < MAX_RESOURCES; j++)
    {
        thinmeta_resource_config_type * res;
        res = &thinmeta_resource_config[j]; /* index with j ; 'i' is still 0 here */
        res->critical_threshold = THINMETA_DEFAULT_CRITICAL_THRESHOLD;
        res->alarm_on           = THINMETA_DEFAULT_ALARM_ON;
        res->autoextend_on      = THINMETA_DEFAULT_AUTOEXTEND_ON;
        res->autoextend_by      = THINMETA_DEFAULT_AUTOEXTEND_BY;
        res->autoextend_percent = THINMETA_DEFAULT_AUTOEXTEND_PERCENT;
        res->audit_period       = THINMETA_DEFAULT_AUDIT_PERIOD;
    }

    /* Load resources */
    for ( string_iter_ptr  = config_files.begin () ;
          string_iter_ptr != config_files.end () ;
          string_iter_ptr++ )
    {
        if ( i >= MAX_RESOURCES )
        {
            wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
            break ;
        }

        /* Read the resource config file */
        if (ini_parse( string_iter_ptr->data(), rmon_thinmeta_config,
                       &thinmeta_resource_config[i]) < 0)
        {
            ilog("Read Failure : %s\n", string_iter_ptr->data() );
        }
        else
        {
            thinmeta_resource_config_type * res;
            res = &thinmeta_resource_config[i];
            if (!res->section_exists)
            {
                dlog3 ("Config File : %s does not have a [%s] section\n",
                       string_iter_ptr->c_str(), THINMETA_CONFIG_SECTION);
                continue;
            }
            dlog ("Config File : %s\n", string_iter_ptr->c_str());

            /* validate loaded configuration */
            if (!res->vg_name || !res->thinpool_name)
            {
                elog("Invalid VG and/or thinpool names for thinpool metadata "
                     "in config file: %s, disabling monitoring", string_iter_ptr->c_str());
                res->critical_threshold = RESOURCE_DISABLE;
                res->vg_name = THINMETA_INVALID_NAME;
                res->thinpool_name = THINMETA_INVALID_NAME;
            }
            else if (res->critical_threshold > 99)
            {
                elog("Metadata monitoring error in config file: %s. Option critical_threshold > 99%%, "
                     "value in config file: %i, disabling monitoring",
                     string_iter_ptr->c_str(), res->critical_threshold);
                res->critical_threshold = 0;
            }
            else if (res->alarm_on > 1)
            {
                elog("Metadata monitoring error in config file: %s. Option alarm_on is NOT boolean, "
                     "value in config file: %i, disabling monitoring", string_iter_ptr->c_str(), res->alarm_on);
                res->critical_threshold = RESOURCE_DISABLE;
            }
            else if (res->autoextend_on > 1)
            {
                elog("Metadata monitoring error in config file: %s. Option autoextend_on is NOT boolean, "
                     "value in config file: %i, disabling monitoring",
                     string_iter_ptr->c_str(), res->autoextend_on);
                res->critical_threshold = RESOURCE_DISABLE;
            }
            else if (res->autoextend_percent > 1)
            {
                elog("Metadata monitoring error in config file: %s. Option autoextend_percent is NOT boolean, "
                     "value in config file: %i, disabling monitoring",
                     string_iter_ptr->c_str(), res->autoextend_percent);
                res->critical_threshold = RESOURCE_DISABLE;
            }
            else if ((res->autoextend_percent && res->autoextend_by > 100) ||
                     (res->autoextend_on && res->autoextend_by < 1))
            {
                elog("Metadata monitoring error in config file: %s. Option autoextend_by not in [1,100] interval, "
                     "value in config file: %i, disabling monitoring",
                     string_iter_ptr->c_str(), res->autoextend_by);
                res->critical_threshold = RESOURCE_DISABLE;
            }
            else if ((res->audit_period < 1) || (res->audit_period > 10000))
            {
                elog("Metadata monitoring error in config file: %s. Option audit_period not in [1,10000] interval, "
                     "value in config file: %i, disabling monitoring",
                     string_iter_ptr->c_str(), res->audit_period);
                res->critical_threshold = RESOURCE_DISABLE;
            }

            ilog ("%s/%s pool metadata monitored; resource index: %2d\n", res->vg_name ,
                  res->thinpool_name, i );
            i++;
        }
    }

    _rmon_ctrl_ptr->thinmeta_resources = i ;
    ilog ("Monitoring %d Thinpool Metadata Resources\n", _rmon_ctrl_ptr->thinmeta_resources );
    return (PASS);
}

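/*
 * Summary of the thinpool metadata validation above:
 *   - vg_name / thinpool_name : both must be present
 *   - critical_threshold      : percentage, at most 99
 *   - alarm_on, autoextend_on,
 *     autoextend_percent      : boolean (0 or 1)
 *   - autoextend_by           : 1..100 when autoextend is enabled
 *   - audit_period            : 1..10000
 * Any violation disables monitoring for that pool.
 */
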
/*****************************************************************************
 *
 * Name    : rmon_hdlr_init
 *
 * Purpose : Init the handler but also support re-init that might occur over a SIGHUP
 *
 *****************************************************************************/

#define RMON_TIMER_TYPE__EVENT "event"
#define RMON_TIMER_TYPE__PM    "pm"
#define RMON_TIMER_TYPE__NTP   "ntp"
#define RMON_TIMER_TYPE__RES   "resource"
#define RMON_TIMER_TYPE__THIN  "thinpool"

int rmon_hdlr_init ( rmon_ctrl_type * ctrl_ptr )
{
    /* Save the control pointer */
    _rmon_ctrl_ptr = ctrl_ptr ;

    mtcTimer_init ( rmonTimer_event, LOCALHOST, RMON_TIMER_TYPE__EVENT) ;
    mtcTimer_init ( rmonTimer_pm, LOCALHOST, RMON_TIMER_TYPE__PM ) ;

    if (is_controller())
        mtcTimer_init ( rmonTimer_ntp, LOCALHOST, RMON_TIMER_TYPE__NTP ) ;

    for ( int i = 0 ; i < MAX_RESOURCES ; i++ )
        mtcTimer_init ( rtimer[i], LOCALHOST, RMON_TIMER_TYPE__RES );
    ctrl_ptr->resources = 0 ;

    for ( int i = 0 ; i < MAX_RESOURCES ; i++ )
        mtcTimer_init ( thinmetatimer[i], LOCALHOST, RMON_TIMER_TYPE__THIN );
    ctrl_ptr->thinmeta_resources = 0 ;

    /* Initialize the Resource Monitor Array */
    memset ( (char*)&resource_config[0], 0, sizeof(resource_config_type)*MAX_RESOURCES);
    memset ( (char*)&interface_resource_config[0], 0, sizeof(interface_resource_config_type)*MAX_RESOURCES);
    memset ( (char*)&thinmeta_resource_config[0], 0, sizeof(thinmeta_resource_config_type)*MAX_RESOURCES);
    memset ( (char*)&registered_clt[0], 0, sizeof(registered_clients)*MAX_CLIENTS);

    /* Read in the list of config files and their contents */
    load_filenames_in_dir ( CONFIG_DIR, config_files ) ;
    /* Read in the list of interface config files and their contents */
    load_filenames_in_dir ( INT_CONFIG_DIR, interface_config_files ) ;

    _thinmeta_config_load();
    _config_files_load ();
    _inter_config_load ();

    /* init Thin Metadata Monitoring after config reload - including timers */
    thinmeta_init(thinmeta_resource_config, thinmetatimer, ctrl_ptr->thinmeta_resources);

    /* Log the control setting going into the main loop */
    mem_log_ctrl ( _rmon_ctrl_ptr );

    /* Initialize instance mount monitoring */
    if (pthread_mutex_init(&lock, NULL) != 0)
    {
        elog("mutex init failed \n");
    }

    t_data.thread_running = false;
    t_data.resource_usage = MOUNTED;
    t_data.nr_switches_count = 0;
    t_data.pid = getpid();

    return (PASS) ;
}

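/*
 * Note: rmon_hdlr_init() may be re-entered on a SIGHUP driven re-init, so the
 * memsets above wipe the previous resource tables before the config
 * directories are re-read and reloaded.
 */
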
/*****************************************************************************
 *
 * Name    : _set_resource_usage
 *
 * Purpose : Restores the resource value and the severity of the alarm
 *
 *****************************************************************************/
void _set_resource_usage ( string reason_text, resource_config_type * ptr )
{
    size_t found;
    string res_val;
    size_t last_index;
    string temp_val;
    char resource_usage[10];

    /* extract the resource value from the reason text */
    found = reason_text.find_last_of( ' ' );
    temp_val = reason_text.substr(found+1);
    last_index = temp_val.find_first_not_of("0123456789");
    res_val = temp_val.substr(0, last_index);
    snprintf (resource_usage, sizeof(resource_usage), "%s", res_val.c_str());
    sscanf(resource_usage, "%lf", &ptr->resource_value);
}

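/*
 * Illustrative only: for a reason text ending in something like
 * "... actual 94%", the code above takes the substring after the last space
 * ("94%"), strips the trailing non-digits and stores 94 in
 * ptr->resource_value. The sample wording is hypothetical; the real reason
 * text is whatever was written into the active alarm.
 */
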
/*****************************************************************************
 *
 * Name    : build_entity_instance_id
 *
 * Purpose : Build the alarm's entity_instance_id based on the
 *           resource type and alarm type.
 *
 *****************************************************************************/
void build_entity_instance_id ( resource_config_type *ptr, char *entity_instance_id )
{
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n", ptr->resource, ptr->type, ptr->alarm_type);

    // Make certain the id is cleared
    entity_instance_id[0] = 0;

    if ( ptr->alarm_type == DYNAMIC_ALARM )
    {
        if ((ptr->type != NULL) && (strcmp(ptr->type, "lvg") == 0 ))
        {
            /* This case covers volume groups */
            /* Use host=<x>.volumegroup=type for id */
            snprintf((char*)entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.volumegroup=%s", _rmon_ctrl_ptr->my_hostname, ptr->resource);
        }
        else
        {
            /* Use host=<x>.filesystem=type for id */
            snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.filesystem=%s", _rmon_ctrl_ptr->my_hostname, ptr->resource);
        }
    }
    else if ( ptr->alarm_type == STATIC_ALARM )
    {
        /* Use host=<x>.filesystem=type for id */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.filesystem=%s", _rmon_ctrl_ptr->my_hostname, ptr->resource);
    }
    else if ((ptr->alarm_type == STANDARD_ALARM) && (strstr(ptr->resource, V_MEMORY_RESOURCE_NAME) != NULL))
    {
        /* AVS memory */
        snprintf(alarmData.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.processor=%d", _rmon_ctrl_ptr->my_hostname, ptr->socket_id);
    }
    else if (strstr(ptr->resource, V_CINDER_THINPOOL_RESOURCE_NAME) != NULL)
    {
        /* Cinder thin pool alarm should not be raised against a specific host */
        /* as the volumes are synced between controllers through drbd.         */
        /* Instead we use a common entity instance id for both controllers.    */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "host=controller");
    }
    else
    {
        /* Use hostname for alarm */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", _rmon_ctrl_ptr->my_hostname);
    }

    dlog ("resource %s entity instance id: %s\n", ptr->resource, entity_instance_id);

    return;
}

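/*
 * Resulting ids look like the following, with <host> being my_hostname and
 * the specific volume group / filesystem names purely illustrative:
 *   dynamic lvg alarm     : <host>.volumegroup=nova-local
 *   static/dynamic fs     : <host>.filesystem=/var/log
 *   vswitch memory        : <host>.processor=0
 *   cinder thin pool      : host=controller
 *   everything else       : <host>
 */
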
/*****************************************************************************
 *
 * Name    : thinpool_virtual_space_usage_init
 *
 * Purpose : Determine if we should monitor virtual usage or not: no purpose
 *           in doing so if thin provisioning is not used.
 *
 * Params  : index - the index of the virtual space resource
 *
 * Return  : None.
 *
 *****************************************************************************/
void thinpool_virtual_space_usage_init(int index,
                                       const char *poolName,
                                       const char *poolOwner) {

    if (!poolName or !poolOwner) {
        slog ("No poolName or poolOwner provided");
        return;
    }
    ilog("index = %d, poolName = %s, poolOwner = %s", index, poolName, poolOwner);

    /* Buffer (and its size) for keeping the result of executing
       the pool type query command below. */
    char current_pool_type[BUFFER_SIZE];
    const unsigned int buffer_size = BUFFER_SIZE;
    /* The command for seeing if the pool type is thin. */
    char lvm_thin_cmd[BUFFER_SIZE];
    const char *thin_pool_expected_result = NULL;

    MEMSET_ZERO(current_pool_type);
    MEMSET_ZERO(lvm_thin_cmd);

    if (strcmp(poolName, "nova-local-pool") == 0) {
        const char *nova_thin_pool_expected_result = "thin-pool";
        thin_pool_expected_result = nova_thin_pool_expected_result;
        sprintf(lvm_thin_cmd, "lvs --segments | grep \"%s\" | awk '{print $5}'", poolName);
    }
    else if (strcmp(poolName, "cinder-volumes-pool") == 0) {
        const char *cinder_thin_pool_expected_result = "thin";
        thin_pool_expected_result = cinder_thin_pool_expected_result;
        sprintf(lvm_thin_cmd, "cat /etc/cinder/cinder.conf | awk -F = '/^lvm_type.*=.*/ { print $2; }' | tail -n 1 | tr -d ' '");
    }
    else {
        slog("Invalid pool name given.");
        return;
    }

    /* Result code. */
    int rc;

    /* Execute the command. */
    rc = execute_pipe_cmd(lvm_thin_cmd, current_pool_type, buffer_size);

    /* If the command has been executed successfully, continue. */
    if (rc == PASS) {
        if (current_pool_type != NULL) {
            /* If the pool type is not thin, disable the alarm for virtual
               usage. */
            ilog("%s current pool type is set to = %s", poolOwner, current_pool_type);
            if(strcmp(current_pool_type, thin_pool_expected_result) != 0) {
                resource_config[index].alarm_status = ALARM_OFF;
                ilog("%s LVM Thinpool Usage alarm off: thin provisioning not used", poolOwner);
            } else {
                resource_config[index].alarm_status = ALARM_ON;
                ilog("%s LVM Thinpool Usage alarm on: thin provisioning used", poolOwner);
            }
        }
    } else {
        resource_config[index].alarm_status = ALARM_OFF;
        elog("%s LVM Thinpool monitoring state unknown ; alarm disabled (rc:%i)",
             poolOwner, rc);
    }
}

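/*
 * Design note: the nova pool type is read straight from LVM (lvs --segments),
 * while the cinder pool type comes from the lvm_type option in
 * /etc/cinder/cinder.conf; both paths reduce to a single string that is then
 * compared against the expected "thin-pool" / "thin" marker above.
 */
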
/*****************************************************************************
 *
 * Name    : virtual_space_usage_init
 *
 * Purpose : Determine if we should monitor virtual usage or not: no purpose
 *           in doing so if thin provisioning is not used.
 *
 * Return  : None.
 *
 *****************************************************************************/

void virtual_space_usage_init(const char* resource_name) {

    ilog ("Initialize thin pools for resource %s\n", resource_name);
    int index;
    if ( get_resource_index( resource_name, &index ) == PASS ) {

        if (strcmp(resource_name, V_CINDER_THINPOOL_RESOURCE_NAME) == 0) {
            thinpool_virtual_space_usage_init(index, "cinder-volumes-pool", "Cinder");

        } else if (strcmp(resource_name, V_NOVA_THINPOOL_RESOURCE_NAME) == 0) {
            thinpool_virtual_space_usage_init(index, "nova-local-pool", "Nova");
        }
    }
    else {
        wlog ("failed get_resource_index for resource %s\n", resource_name);
    }
}

/*****************************************************************************
 *
 * Name    : rmon_alarming_init
 *
 * Purpose : Re-adopts any previously raised rmon alarms if rmon is restarted,
 *           restoring the resource's failed state and severity from the
 *           active alarm
 *
 *****************************************************************************/
void rmon_alarming_init ( resource_config_type * ptr )
{
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n", ptr->resource, ptr->type, ptr->alarm_type);

    AlarmFilter alarmFilter;

    SFmAlarmDataT *active_alarm = (SFmAlarmDataT*) calloc (1, sizeof (SFmAlarmDataT));
    if (active_alarm == NULL)
    {
        elog("Failed to allocate memory for SFmAlarmDataT\n");
        return;
    }

    build_entity_instance_id (ptr, alarmData.entity_instance_id);

    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, ptr->alarm_id);
    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, alarmData.entity_instance_id);

    if (fm_get_fault( &alarmFilter, active_alarm) == FM_ERR_OK)
    {
        if (active_alarm != NULL) {

            string reasonText(active_alarm->reason_text);
            /* Set the resource severity */
            ptr->failed = true;
            ptr->alarm_raised = true;
            ptr->count = ptr->num_tries;
            if ( active_alarm->severity == FM_ALARM_SEVERITY_MINOR )
            {
                ptr->sev = SEVERITY_MINOR;
            }
            else if ( active_alarm->severity == FM_ALARM_SEVERITY_MAJOR )
            {
                ptr->sev = SEVERITY_MAJOR;
                if ( ptr->res_type == RESOURCE_TYPE__FILESYSTEM_USAGE )
                {
                    string err_res_name(ptr->resource);
                    _space_to_underscore(err_res_name);

                    /* clear host degrade for fs usage alarms */
                    snprintf(ptr->errorMsg, sizeof(ptr->errorMsg), "%s %s:",
                             err_res_name.c_str(),
                             DEGRADE_CLEAR_MSG );

                    rmon_send_request ( ptr, _rmon_ctrl_ptr->clients );
                }
            }
            else
            {
                ptr->sev = SEVERITY_CRITICAL;
            }
            resourceStageChange ( ptr, RMON_STAGE__MONITOR_WAIT );

            if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) != 0)
            {
                /* Set the resource severity */
                _set_resource_usage( reasonText, ptr );
                ilog ("%s setting previously failed resource alarm id: %s entity_instance_id: %s usage: %0.2f\n",
                      ptr->resource, ptr->alarm_id, alarmFilter.entity_instance_id, ptr->resource_value);
            }
            else
            {
                ilog ("%s setting previously failed resource alarm id: %s entity_instance_id: %s\n",
                      ptr->resource, ptr->alarm_id, alarmFilter.entity_instance_id);
            }
        }
    }
    free(active_alarm);
}

/*****************************************************************************
 *
 * Name    : send_clear_msg
 *
 * Purpose : Send a message to all registered clients to set the node to
 *           available (clear the degrade)
 *
 *****************************************************************************/
void send_clear_msg ( int index )
{
    int count = 0;
    AlarmFilter alarmFilter;

    SFmAlarmDataT *active_alarm = (SFmAlarmDataT*) calloc (1, sizeof (SFmAlarmDataT));
    if (active_alarm == NULL)
    {
        elog("Failed to allocate memory for SFmAlarmDataT\n");
        return;
    }

    string err_res_name(resource_config[index].resource);
    _space_to_underscore(err_res_name);
    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, resource_config[index].alarm_id);

    build_entity_instance_id (&resource_config[index], alarmData.entity_instance_id);

    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, alarmData.entity_instance_id);

    /* Notify rmon clients of fault being cleared */
    snprintf(resource_config[index].errorMsg, sizeof(resource_config[index].errorMsg),
             "%s cleared_alarms_for_resource:", err_res_name.c_str());

    /* check if there is an alarm first for this resource. If there is not then the node */
    /* should not be in a degrade state                                                  */
    EFmErrorT ret = fm_get_fault( &alarmFilter, active_alarm);
    if ( (ret == FM_ERR_OK) && (active_alarm != NULL) )
    {
        while (( rmon_send_request ( &resource_config[index], _rmon_ctrl_ptr->clients ) != PASS ) && (count < 3 ))
        {
            wlog ("%s request send failed \n", resource_config[index].resource);
            count++;
        }
        if (count > 2)
        {
            wlog ("%s request send failed, count:%d \n", resource_config[index].resource, count);
            resource_config[index].failed_send++;
        }
        if ((resource_config[index].failed_send == MAX_FAIL_SEND) || (count < 3))
        {
            /* Reset the values to defaults */
            swact_count = 0;
            ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
            resource_config[index].failed       = false ;
            resource_config[index].alarm_raised = false ;
            resource_config[index].count        = 0 ;
            resource_config[index].sev          = SEVERITY_CLEARED ;
            resource_config[index].stage        = RMON_STAGE__START ;
            resource_config[index].failed_send  = 0;
        }
    }
    else //alarm not found or error
    {
        if (ret == FM_ERR_ENTITY_NOT_FOUND)
        {
            dlog ("Alarm not found for resource: %s entity_instance_id: %s \n", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
        }
        else
        {
            wlog ("fm_get_fault failed for resource: %s entity_instance_id: %s err: %d\n", alarmFilter.alarm_id,
                  alarmFilter.entity_instance_id, ret);
        }

        if (active_alarm == NULL)
        {
            elog("fm_get_fault returned null active_alarm\n");
        }

        swact_count++;
        if (swact_count == MAX_SWACT_COUNT)
        {
            /* Reset the values to defaults */
            while (( rmon_send_request ( &resource_config[index], _rmon_ctrl_ptr->clients ) != PASS ) && (count < 3 ))
            {
                wlog ("%s request send failed \n", resource_config[index].resource);
                count++;
            }
            swact_count = 0;
            ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
            resource_config[index].failed       = false ;
            resource_config[index].alarm_raised = false ;
            resource_config[index].count        = 0 ;
            resource_config[index].sev          = SEVERITY_CLEARED ;
            resource_config[index].stage        = RMON_STAGE__START ;
            resource_config[index].failed_send  = 0;
        }
    }
    free(active_alarm);
}

/*****************************************************************************
 *
 * Name    : read_fs_file
 *
 * Purpose : Read the memory mapped dynamic file system file
 *
 *****************************************************************************/
void read_fs_file ( vector<string> & dynamic_resources )
{
    FILE * pFile;
    char buf[MAX_LEN];
    int fd;
    string delimiter = ",";
    size_t pos;
    string token;
    struct stat fileInfo;
    struct flock fl;

    memset ((char *)&fileInfo, 0 , sizeof(fileInfo));

    fl.l_whence = SEEK_SET;
    fl.l_start  = 0;
    fl.l_len    = 0;
    fl.l_pid    = getpid();

    pFile = fopen (DYNAMIC_FS_FILE , "r");
    if (pFile != NULL) {

        fd = fileno(pFile);
        /* lock the file */
        fl.l_type = F_RDLCK;

        /* lock the file for read and write */
        fcntl(fd, F_SETLKW, &fl);

        if (fd == -1)
        {
            elog("Error opening file for reading");
        }

        if (fstat(fd, &fileInfo) == -1)
        {
            elog("Error getting the file size");
        }

        char *map = static_cast<char*>( mmap(0, fileInfo.st_size, PROT_READ, MAP_SHARED, fd, 0));
        if (map == MAP_FAILED)
        {
            elog("Error mmapping the file");
        }
        string str(map);

        snprintf( buf, MAX_LEN, "%s", str.c_str());
        /* free the mmapped memory */
        if (munmap(map, fileInfo.st_size) == -1)
        {
            elog("Error un-mmapping the file");
        }
        fclose(pFile);
        /* unlock the file */
        fl.l_type = F_UNLCK;
        fcntl(fd, F_SETLK, &fl);

        while ((pos = str.find(delimiter)) != string::npos) {
            /* separate the resources from the file */
            token = str.substr(0, pos);
            dynamic_resources.push_back(token);
            dlog("reading resource %s \n", token.c_str());
            str.erase(0, pos + delimiter.length());
        }
    }
}

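/*
 * Illustrative only: each record in DYNAMIC_FS_FILE is ','-terminated and is
 * parsed in add_dynamic_fs_resource below as up to five whitespace separated
 * fields:
 *
 *     <resource> <state> <type> <device> [<mount_point>]
 *
 * e.g. a hypothetical record: "nova-local enabled lvg /dev/nova-local ,".
 * Records without a mount point are tracked by their device name instead.
 */
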
/*****************************************************************************
 *
 * Name    : add_dynamic_fs_resource
 *
 * Purpose : Add the dynamic file system resources
 *
 *****************************************************************************/
void add_dynamic_fs_resource ( bool send_response )
{
#ifdef WANT_FS_MONITORING
    char resource[50];
    char temp_resource[50];
    char device [50];
    char mount_point[50];
    char temp_state[20];
    char type [50];
    char buf[200];
    string criticality = "critical";
    vector<string> resource_list;
    int absolute_thresholds[3];

    memset(absolute_thresholds, 0, sizeof(absolute_thresholds));
    fs_index.clear();
    fs_state.clear();

    /* get a list of all the dynamic fs mounts */
    read_fs_file(resource_list);

    for(std::vector<string>::iterator it = resource_list.begin(); it != resource_list.end(); ++it)
    {
        string str = *it;
        snprintf(buf, sizeof(buf), "%s", str.c_str());

        // For resources without mounts the mount_point will be NULL
        memset(&mount_point[0], 0, sizeof(mount_point));
        sscanf(buf, "%49s %19s %49s %49s %49s", temp_resource, temp_state, type, device, mount_point);
        string state(temp_state);

        bool found = false;

        if (mount_point[0] != '\0')
        {
            // for resources with mounts, the resource name is the mount value
            snprintf(resource, sizeof(resource), "%s", mount_point);
        }
        else
        {
            // for resources without mounts, the resource name is the device value
            snprintf(resource, sizeof(resource), "%s", device);
        }

        /* the dynamic file system is enabled, add it if need be */
        for (int i=0; i<_rmon_ctrl_ptr->resources; i++)
        {
            if ( strcmp(resource, resource_config[i].resource) == 0)
            {
                dlog ("resource %s already exists, update the state to %s \n", resource, state.c_str());
                /* resource already exists, no need to add it again */
                /* update the state, it may have changed            */
                fs_index.push_back(i);
                fs_state.push_back(state);
                found = true;
                break;
            }
        }

        if (!found) // new resource to monitor, lets add it
        {
            int enabled_resource = ALARM_OFF;
            if (strcmp(temp_state,"enabled") == 0)
            {
                enabled_resource = ALARM_ON;
            }

            if (mount_point[0] != '\0')
            {
                save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, DYNAMIC_ALARM, type, device, MOUNTED );
            }
            else
            {
                save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, DYNAMIC_ALARM, type, device, NOT_MOUNTED );
            }

            if (enabled_resource == ALARM_ON) {
                calculate_fs_usage( &resource_config[_rmon_ctrl_ptr->resources - 1] );
                rmon_alarming_init( &resource_config[_rmon_ctrl_ptr->resources - 1] );
            }
        }
    }
#endif
    if (send_response)
    {
#ifdef WANT_FS_MONITORING
        ilog ("sending response to dynamic FS add, to the rmon client\n");
#else
        ilog("dynamic filesystem monitoring moved to collectd\n");
#endif
        /* let the rmon client know that we are done with the file */
        rmon_resource_response(_rmon_ctrl_ptr->clients);
    }
}

/*****************************************************************************
 *
 * Name    : clear_alarm_for_resource
 *
 * Purpose : Clear the alarm of the resource passed in
 *
 *****************************************************************************/
void clear_alarm_for_resource ( resource_config_type * ptr )
{
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n", ptr->resource, ptr->type, ptr->alarm_type);
    AlarmFilter alarmFilter;

    build_entity_instance_id (ptr, alarmData.entity_instance_id);

    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, ptr->alarm_id);
    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, alarmData.entity_instance_id);

    int ret = rmon_fm_clear(&alarmFilter);
    if (ret == FM_ERR_OK)
    {
        ilog ("Cleared stale alarm %s for entity instance id: %s", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
    }
    else if (ret == FM_ERR_ENTITY_NOT_FOUND)
    {
        dlog ("Stale alarm %s for entity instance id: %s was not found", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
    }
    else
    {
        wlog ("Failed to clear stale alarm %s for entity instance id: %s error: %d", alarmFilter.alarm_id, alarmFilter.entity_instance_id, ret);
    }
}

/*****************************************************************************
 *
 * Name    : process_dynamic_fs_file
 *
 * Purpose : Read the dynamic files directory and add the dynamic filesystem
 *           resources when the file is updated
 *
 *****************************************************************************/
void process_dynamic_fs_file()
{
    int index = 0;

    pthread_mutex_lock(&lock);
    modifyingResources = true;
    pthread_mutex_unlock(&lock);

    add_dynamic_fs_resource(true);

    pthread_mutex_lock(&lock);
    modifyingResources = false;
    pthread_mutex_unlock(&lock);

    /* deal with changes of dynamic file system enabled state */
    for (unsigned int i=0; i<fs_index.size(); i++)
    {
        index = fs_index.at(i);
        if ( strcmp(fs_state.at(i).c_str(), "disable") == 0 )
        {
            /* resource has been disabled, stop alarming on it */
            ilog("%s is no longer enabled\n", resource_config[index].resource);

            if ( resource_config[index].failed == true )
            {
                resource_config[index].alarm_status = ALARM_OFF;

                if ( _rmon_ctrl_ptr->clients > 0 )
                {
                    // send a clear degrade to the node
                    send_clear_msg(index);
                }

                // we need to clear the resource's alarm if there was any set for this resource
                clear_alarm_for_resource(&resource_config[index]);
            }
            else
            {
                /* There was no active alarm to clear */
                ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
                resource_config[index].alarm_status = ALARM_OFF;
                resource_config[index].failed       = false;
                resource_config[index].alarm_raised = false;
                resource_config[index].count        = 0 ;
                resource_config[index].sev          = SEVERITY_CLEARED ;
                resource_config[index].stage        = RMON_STAGE__START ;
            }
        }
        else if ( strcmp(fs_state.at(i).c_str(), "enabled") == 0 )
        {
            // resource has been enabled
            if ( resource_config[index].alarm_status == ALARM_OFF )
            {
                /* Turn the resource checking back on if it was off */
                resource_config[index].alarm_status = ALARM_ON;

                // reset values
                resource_config[index].failed       = false;
                resource_config[index].alarm_raised = false;
                resource_config[index].count        = 0 ;
                resource_config[index].sev          = SEVERITY_CLEARED ;
                resource_config[index].stage        = RMON_STAGE__START ;

                rmon_alarming_init( &resource_config[index] );

                ilog("%s is now enabled \n", resource_config[index].resource);
                if (strcmp(resource_config[index].resource, CINDER_VOLUMES) == 0)
                {
                    virtual_space_usage_init(V_CINDER_THINPOOL_RESOURCE_NAME);
                }
                if (strcmp(resource_config[index].resource, NOVA_LOCAL) == 0)
                {
                    virtual_space_usage_init(V_NOVA_THINPOOL_RESOURCE_NAME);
                }
            }
            else // alarm already on (enabled)
            {
                ilog("%s is already enabled \n", resource_config[index].resource);
            }
        }
        else
        {
            wlog("%s invalid dynamic file system state: %s \n", resource_config[index].resource, fs_state.at(i).c_str());
        }
    }
}

/*****************************************************************************
 *
 * Name    : process_static_fs_file
 *
 * Purpose : Reads in the list of static file systems for monitoring
 *
 *****************************************************************************/
void process_static_fs_file()
{
    FILE * pFile;
    vector<string> mounts;
    char buf[MAX_LEN];
    char resource[50];
    char type[50];
    char device[50];
    bool found = false;
    int enabled_resource = ALARM_ON;
    string criticality = "critical";
    int absolute_thresholds[3] = {0};

    pFile = fopen (STATIC_FS_FILE , "r");
    if (pFile != NULL) {
        ifstream fin( STATIC_FS_FILE );
        string line;

        while( getline( fin, line )) {
            /* process each line */
            mounts.push_back(line);
        }
        fclose(pFile);

        for(std::vector<string>::iterator it = mounts.begin(); it != mounts.end(); ++it)
        {
            string str = *it;
            snprintf(buf, MAX_LEN, "%s", str.c_str());
            sscanf(buf, "%49s %49s %49s %d %d %d", resource, device, type, &absolute_thresholds[0], &absolute_thresholds[1], &absolute_thresholds[2]);

            if (!found)
            {
                if (fs_percent == PERCENT_USED)
                {
                    /* do not use the absolute thresholds */
                    memset(absolute_thresholds, 0, sizeof(absolute_thresholds));
                }
                /* add the resource */
                save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, STATIC_ALARM, type, device, MOUNTED );
                calculate_fs_usage( &resource_config[_rmon_ctrl_ptr->resources - 1] );
            }
        }
    }
    else
    {
        elog("Error, no static file system file present at: %s\n", STATIC_FS_FILE);
    }
}

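/*
 * Illustrative only: each STATIC_FS_FILE line is parsed above as
 *
 *     <resource> <device> <type> <abs_threshold_1> <abs_threshold_2> <abs_threshold_3>
 *
 * e.g. a hypothetical line: "/var/log /dev/sda4 ext4 1000 2000 3000"
 * where the three trailing numbers are the absolute thresholds, which are
 * zeroed out when percentage based thresholds (PERCENT_USED) are in effect.
 */
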
/*****************************************************************************
 *
 * Name    : rmon_timer_handler
 *
 * Purpose : Looks up the timer ID and asserts the corresponding ringer
 *
 *****************************************************************************/
void rmon_timer_handler ( int sig, siginfo_t *si, void *uc)
{
    timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;

    /* Avoid compiler errors/warnings for parms we must
     * have but currently do nothing with */
    UNUSED(sig);
    UNUSED(uc);

    if ( !(*tid_ptr) )
    {
        // tlog ("Called with a NULL Timer ID\n");
        return ;
    }

    /* is event rmon timer */
    if ( *tid_ptr == rmonTimer_event.tid )
    {
        mtcTimer_stop_int_safe ( rmonTimer_event);
        rmonTimer_event.ring = true ;
    }

    else if ( *tid_ptr == rmonTimer_pm.tid )
    {
        mtcTimer_stop_int_safe ( rmonTimer_pm);
        rmonTimer_pm.ring = true ;
    }

    else if ( (is_controller()) && (*tid_ptr == rmonTimer_ntp.tid) )
    {
        mtcTimer_stop_int_safe ( rmonTimer_ntp);
        rmonTimer_ntp.ring = true ;
    }

    else
    {
        bool found = false ;
        for ( int i = 0 ; i < _rmon_ctrl_ptr->resources ; i++ )
        {
            if ( *tid_ptr == rtimer[i].tid )
            {
                mtcTimer_stop_int_safe ( rtimer[i] );
                rtimer[i].ring = true ;
                found = true ;
                break ;
            }
        }
        if ( !found )
        {
            for ( int i = 0 ; i < _rmon_ctrl_ptr->thinmeta_resources ; i++ )
            {
                if ( *tid_ptr == thinmetatimer[i].tid )
                {
                    mtcTimer_stop_int_safe ( thinmetatimer[i] );
                    thinmetatimer[i].ring = true ;
                    found = true ;
                    break ;
                }
            }
        }
        if ( !found )
        {
            /* try and cleanup by stopping this unknown timer via its tid */
            mtcTimer_stop_tid_int_safe (tid_ptr);
        }
    }
}

/*****************************************************************************
 *
 * Name    : clear_ntp_alarms
 *
 * Purpose : Loop through the current alarms and delete them if the server
 *           is now reachable or the server is no longer assigned to ntpq
 *
 *****************************************************************************/
void clear_ntp_alarms(std::list<string> &non_reachable_ntp_servers, unsigned int alarm_count, SFmAlarmDataT *active_alarms, bool clear_major_alarm)
{
    dlog ("Total NTP alarm_count:%d", alarm_count);
    AlarmFilter alarmFilter;
    char alarm_to_search[FM_MAX_BUFFER_LENGTH];

    fm_alarm_id alarm_id;
    snprintf(alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);

    // clear the major alarm if required
    if (clear_major_alarm)
    {
        snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID );
        snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp", _rmon_ctrl_ptr->my_hostname);

        int ret = rmon_fm_clear(&alarmFilter);
        if (ret != FM_ERR_OK)
        {
            if (ret != FM_ERR_ENTITY_NOT_FOUND)
            {
                wlog ("Failed to clear major alarm %s for entity instance id:%s error:%d", NTP_ALARM_ID, alarmFilter.entity_instance_id, ret);
            }
        }
        else
        {
            ilog ("Cleared major alarm %s for entity instance id:%s", NTP_ALARM_ID, alarmFilter.entity_instance_id);
        }
    }

    if (active_alarms == NULL)
    {
        elog ("Null pointer for active_alarms");
        return;
    }

    // clear minor alarms if required
    bool found;
    std::list<string>::iterator iter;
    std::list<string>::iterator iter_bad_list;

    // for each NTP alarm in the system, see if it matches any of the invalid NTP servers;
    // if it does not match then the alarm must be removed since that NTP server
    // is no longer being monitored or is now valid
    for ( unsigned int i = 0; i < alarm_count; i++ )
    {
        if ( ((active_alarms+i)->severity) == FM_ALARM_SEVERITY_MINOR )
        {
            // Verify that this NTP minor alarm is still valid. This server could no longer exist or is now marked
            // reachable
            dlog ("Verify NTP minor alarm is still valid, entity instance id:%s", (active_alarms+i)->entity_instance_id);

            found = false;

            // check for stale minor alarm
            for ( iter = non_reachable_ntp_servers.begin (); iter != non_reachable_ntp_servers.end (); iter++ )
            {
                // e.g. host=controller-0.ntp=102.111.2.2
                snprintf(alarm_to_search, FM_MAX_BUFFER_LENGTH, "%s.ntp=%s", _rmon_ctrl_ptr->my_hostname, iter->c_str());

                dlog ("Non reachable NTP server to search %s", iter->c_str());

                if (strstr((active_alarms+i)->entity_instance_id, iter->c_str()) != NULL)
                {
                    // server is in the non reachable list, do not clear it
                    found = true;
                    dlog ("Alarm is still valid %s", iter->c_str());
                    break;
                }
            }

            if (!found)
            {
                // lets clear it but only if it's this controller's alarm, it could be the peer controller's alarm
                if (strstr((active_alarms+i)->entity_instance_id, _rmon_ctrl_ptr->my_hostname) != NULL)
                {
                    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
                    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", (active_alarms+i)->entity_instance_id);

                    if (rmon_fm_clear(&alarmFilter) != FM_ERR_OK)
                    {
                        wlog ("Failed to clear minor alarm %s for entity instance id:%s", NTP_ALARM_ID, (active_alarms+i)->entity_instance_id);
                    }
                    else
                    {
                        ilog ("Cleared minor alarm %s for entity instance id:%s", NTP_ALARM_ID, (active_alarms+i)->entity_instance_id);
                    }
                }
            }
        }
    }
}

/*****************************************************************************
|
|
*
|
|
* Name : ntp_query_results
|
|
*
|
|
* Purpose : Analyze the return code from script query_ntp_servers.sh.
|
|
* Create alarms if the servers are non reachable, Clear alarms if they are
|
|
* now reachable
|
|
*
|
|
*****************************************************************************/
|
|
void ntp_query_results (int ntp_query_status )
|
|
{
|
|
dlog ("ntp_query_results ntp_query_status:%d", ntp_query_status);
|
|
|
|
std::list<string> non_reachable_ntp_servers;
|
|
|
|
// if no NTP servers are provisioned on the system, we still need to clear old NTP
|
|
// alarms if there are any. But we do not need to read the tmp server file.
|
|
if (ntp_query_status != NTP_NOT_PROVISIONED)
|
|
{
|
|
// read the temp file which contains a list of reachable and non reachable servers
|
|
// this file is the output from the query_ntp_servers.sh script
|
|
|
|
const char *server_info = "/tmp/ntpq_server_info";
|
|
FILE *pFile;
|
|
pFile = fopen(server_info, "r");
|
|
if (pFile != NULL)
|
|
{
|
|
const char * delim = ";\n\r";
|
|
char * ip;
|
|
char line[500];
|
|
|
|
int pos = 0;
|
|
while ( memset(line, 0, sizeof(line)) && (fgets((char*) &line, sizeof(line), pFile) != NULL) )
|
|
{
|
|
// the first line in the tmp file is the reachable servers, the second is the non reachable servers
|
|
if (pos == 1)
|
|
{
|
|
for (ip = strtok (line, delim); ip; ip = strtok (NULL, delim))
|
|
{
|
|
non_reachable_ntp_servers.push_back(ip);
|
|
dlog("Found non reachable NTP servers:%s\n", ip);
|
|
}
|
|
break;
|
|
}
|
|
pos++;
|
|
}
|
|
fclose(pFile);
|
|
}
|
|
else
|
|
{
|
|
elog("Failed to open file: %s\n", server_info);
|
|
return;
|
|
}
|
|
}
|
|
|
|
    // retrieve all the current NTP alarms
    int rc;
    unsigned int max_alarms=75;
    fm_alarm_id alarm_id;
    snprintf(alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
    SFmAlarmDataT *active_alarms = (SFmAlarmDataT*) calloc (max_alarms, sizeof (SFmAlarmDataT));
    if (active_alarms == NULL)
    {
        elog ("Failed to allocate memory for NTP alarms");
        return;
    }

    int ret = fm_get_faults_by_id( &alarm_id, active_alarms, &max_alarms);
    if (!(ret == FM_ERR_OK || ret == FM_ERR_ENTITY_NOT_FOUND))
    {
        elog ("fm_get_faults_by_id failed trying to retrieve all the NTP alarms, error:%d", ret);
        free(active_alarms);
        return;
    }

    // Clear alarms if required

    bool clear_major_alarm = false;
    bool created_major_alarm = false;

    if ( ntp_query_status == NTP_NOT_PROVISIONED || ntp_query_status == NTP_SOME_REACHABLE || ntp_query_status == NTP_OK )
    {
        // We are going to clear the major alarm since there is at least one
        // server selected or no servers are provisioned.
        clear_major_alarm = true;
    }

    // fm_get_faults_by_id updates max_alarms with the number of alarms found
    if (max_alarms != 0)
    {
        // verify which alarms need to be cleared and clear them
        clear_ntp_alarms(non_reachable_ntp_servers, max_alarms, active_alarms, clear_major_alarm);
    }

    // There are no NTP servers provisioned so there are no alarms to raise
    if (ntp_query_status == NTP_NOT_PROVISIONED)
    {
        free(active_alarms);
        return;
    }

    // Raise alarms if required

    // Set up the common alarm data
    AlarmFilter alarmFilter;
    snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action), "Monitor and if condition persists, contact next level of support.");
    snprintf(alarmData.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
    strcpy(alarmData.uuid, "");
    snprintf(alarmData.entity_type_id, FM_MAX_BUFFER_LENGTH, "ntp");
    alarmData.alarm_state = FM_ALARM_STATE_SET;
    alarmData.alarm_type = FM_ALARM_COMM;
    alarmData.probable_cause = FM_ALARM_CAUSE_UNKNOWN;
    alarmData.timestamp = 0;
    alarmData.service_affecting = FM_FALSE;
    alarmData.suppression = FM_FALSE;
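
    // Entity instance id conventions used below:
    //   host level major alarm : "<hostname>.ntp"
    //   per server minor alarm : "<hostname>.ntp=<server address>"
    // e.g. "controller-0.ntp=10.10.10.45" (hostname and address are illustrative).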

    // Raise the major alarm if required
    if (ntp_query_status == NTP_NONE_REACHABLE || ntp_query_status == NTP_SOME_REACHABLE_NONE_SELECTED)
    {
        wlog("NTP configuration does not contain any valid or reachable NTP servers");

        // Check if the alarm is raised already
        snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp", _rmon_ctrl_ptr->my_hostname);

        bool found = false;
        for ( unsigned int i = 0; i < max_alarms; i++ )
        {
            if ( strncmp((active_alarms+i)->entity_instance_id, alarmFilter.entity_instance_id, sizeof((active_alarms+i)->entity_instance_id)) == 0 )
            {
                // Alarm already exists
                dlog("Alarm %s already raised for entity instance id:%s\n", NTP_ALARM_ID, alarmFilter.entity_instance_id);
                found = true;
                break;
            }
        }

        // Alarm does not exist so raise it
        if (!found && !created_major_alarm)
        {
            alarmData.severity = FM_ALARM_SEVERITY_MAJOR;
            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text), "NTP configuration does not contain any valid or reachable NTP servers.");
            snprintf(alarmData.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmFilter.entity_instance_id);

            rc = rmon_fm_set(&alarmData, NULL);
            if (rc == FM_ERR_OK )
            {
                ilog("Alarm %s created for entity instance id:%s \n", NTP_ALARM_ID, alarmData.entity_instance_id);
                created_major_alarm = true;
            }
            else
            {
                ilog("Failed to create alarm %s for entity instance id:%s error: %d \n", NTP_ALARM_ID, alarmData.entity_instance_id, (int)rc);
            }
        }
    }

    // Raise alarms for the individual non reachable servers
    if (ntp_query_status != NTP_OK)
    {
        wlog("Some or all of the NTP servers are not reachable");
        std::list<string>::iterator iter;
        alarmData.severity = FM_ALARM_SEVERITY_MINOR;

        // Loop through all the non reachable NTP servers and check whether an
        // alarm is already raised for each server. If no alarm is found for a
        // server then raise one.
        for ( iter = non_reachable_ntp_servers.begin (); iter != non_reachable_ntp_servers.end (); iter++ )
        {
            bool found = false;

            // Build the alarm entity instance id
            snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp=%s", _rmon_ctrl_ptr->my_hostname, iter->c_str());

            dlog("Search alarms for entity instance id:%s \n", alarmFilter.entity_instance_id);
            for ( unsigned int i = 0; i < max_alarms; i++ )
            {
                if ( strncmp((active_alarms+i)->entity_instance_id, alarmFilter.entity_instance_id, sizeof((active_alarms+i)->entity_instance_id)) == 0 )
                {
                    dlog("Alarm %s already raised for entity instance id:%s\n", NTP_ALARM_ID, alarmFilter.entity_instance_id);
                    found = true;
                    break;
                }
            }

            // If the NTP alarm was not found then raise one for this NTP server
            if (!found)
            {
                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text), "NTP address %s is not a valid or a reachable NTP server.", iter->c_str() );
                snprintf(alarmData.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmFilter.entity_instance_id);

                rc = rmon_fm_set(&alarmData, NULL);
                if (rc == FM_ERR_OK )
                {
                    ilog("Alarm %s created for entity instance id:%s \n", NTP_ALARM_ID, alarmData.entity_instance_id);
                }
                else
                {
                    ilog("Failed to create alarm %s for entity instance id:%s error:%d \n", NTP_ALARM_ID, alarmData.entity_instance_id, (int)rc);
                }
            }
        }
    }

    free(active_alarms);
    return;
}

/*****************************************************************************
 *
 * Name : query_ntp_servers
 *
 * Purpose : Execute the query_ntp_servers.sh script, which runs "ntpq -np"
 * to query the health of the provisioned NTP servers. The script returns a
 * status code and also creates a temporary file listing the reachable and
 * non reachable NTP servers. That temp file is required to generate the
 * proper alarms.
 *
 *****************************************************************************/
int query_ntp_servers ( )
{
    pid_t child_pid;

    dlog ("Main Pid:%d \n", getpid() );

    ntp_child_pid = child_pid = fork ();
    if (child_pid == 0)
    {
        dlog ("Child Pid:%d \n", getpid() );

        char* argv[] = {(char*)NTPQ_QUERY_SCRIPT, NULL};
        char cmd[MAX_FILE_SIZE] ;
        memset (cmd,0,MAX_FILE_SIZE);

        snprintf ( &cmd[0], MAX_FILE_SIZE, "%s/%s", RMON_FILES_DIR, NTPQ_QUERY_SCRIPT );

        bool close_file_descriptors = true ;
        if ( setup_child ( close_file_descriptors ) != PASS )
        {
            exit(NTP_ERROR);
        }

        /* Restore default SIGCHLD handling in the child */
        signal (SIGCHLD, SIG_DFL);

        /* Set up the exec argument and launch the script */
        int res = execv(cmd, argv);
        elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) );
        exit (NTP_ERROR);
    }

    if ( child_pid == -1 )
    {
        elog ("Fork failed (%s)\n", strerror(errno));

        /* TODO: Consider making this a critical fault after 100 retries.
         * All failure causes listed in the man page are due to resource
         * limitations, and if that does not resolve within 100 retries
         * then it probably never will. */
        return (FAIL);
    }

    return (PASS);
}
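
/* Collection of the script result happens outside this excerpt. A minimal
 * sketch of what the reaping side is assumed to look like (illustrative
 * only, not the actual daemon code):
 *
 *   int status ;
 *   if ( waitpid ( ntp_child_pid, &status, WNOHANG ) == ntp_child_pid )
 *   {
 *       if ( WIFEXITED ( status ) )
 *           ntp_query_results ( WEXITSTATUS ( status ) );
 *   }
 */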

/*****************************************************************************
 *
 * Name : rmonHdlr_ceilometer_handler
 *
 * Purpose : Handles the ceilometer sample create response message
 *
 *****************************************************************************/
void rmonHdlr_ceilometer_handler( struct evhttp_request *req, void *arg )
{
    if ( !req )
    {
        elog (" Request Timeout\n");
        ceilometerEvent.status = FAIL_TIMEOUT;
        goto _ceilometer_handler_done ;
    }

    ceilometerEvent.status = rmonHttpUtil_status(ceilometerEvent);
    if ( ceilometerEvent.status != PASS )
    {
        elog ("ceilometer HTTP request Failed (%d)\n", ceilometerEvent.status);
        rmonHttpUtil_get_response(ceilometerEvent);
        goto _ceilometer_handler_done ;
    }

_ceilometer_handler_done:
    event_base_loopbreak((struct event_base *)arg);
}

/*****************************************************************************
 *
 * Name : generate_ceilometer_pm
 *
 * Purpose : Generate ceilometer PMs through the REST API
 *
 *****************************************************************************/
void generate_ceilometer_pm ( string r_id, string m_id, string m_type,
                              string m_unit, string m_volume,
                              string m_metadata )
{
    int rc = PASS;
    daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
    string command_path="";
    string host_ip = cfg_ptr->keystone_auth_host;
    int port = cfg_ptr->ceilometer_port;
    int count = 0;

    rmonHttpUtil_libEvent_init ( &ceilometerEvent, CEILOMETER_EVENT_SIG, host_ip, port);

    ceilometerEvent.address.append("/v2/meters/");
    ceilometerEvent.address.append(m_id);

    ceilometerEvent.user_agent = "ceilometerclient.openstack.common.apiclient";

    ceilometerEvent.payload = "[{";
    ceilometerEvent.payload.append("\"resource_id\":\"");
    ceilometerEvent.payload.append(r_id);
    ceilometerEvent.payload.append("\",\"counter_name\":\"");
    ceilometerEvent.payload.append(m_id);
    ceilometerEvent.payload.append("\",\"counter_type\":\"");
    ceilometerEvent.payload.append(m_type);
    ceilometerEvent.payload.append("\",\"counter_unit\":\"");
    ceilometerEvent.payload.append(m_unit);
    ceilometerEvent.payload.append("\",\"counter_volume\":\"");
    ceilometerEvent.payload.append(m_volume);
    ceilometerEvent.payload.append("\",\"resource_metadata\":");
    // the resource metadata is a dictionary of key-value pairs
    ceilometerEvent.payload.append(m_metadata);
    ceilometerEvent.payload.append("}]");
    dlog ("Payload is : %s\n", ceilometerEvent.payload.c_str());

    rc = rmonHttpUtil_api_request (CEILOMETER_SAMPLE_CREATE, ceilometerEvent, command_path);
    do
    {
        if ( rc != PASS )
        {
            count++;
            wlog ("ceilometer failed request (%d) ... retrying (%d)\n", rc, count);
        }
        rmonHttpUtil_log_event (ceilometerEvent);

    } while ( ( rc!=PASS ) && ( count < REST_API_RETRY_COUNT ) );

    if ( rc!= PASS )
    {
        elog ("ceilometer sample create Failed (%d) (cnt:%d)\n", rc, count);
    }
}
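
/* Example of the payload string built above (meter name and values are
 * illustrative only):
 *
 * [{"resource_id":"controller-0","counter_name":"platform.cpu.util",
 *   "counter_type":"gauge","counter_unit":"%","counter_volume":"12.5",
 *   "resource_metadata":{"host":"controller-0"}}]
 */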

void clear_rmon_api_counts ( registered_clients * ptr )
{
    if ( ptr->b2b_miss_count > ptr->b2b_miss_peak )
    {
        ptr->b2b_miss_peak = ptr->b2b_miss_count ;
    }

    if ( ptr->mesg_err_cnt > ptr->mesg_err_peak )
    {
        ptr->mesg_err_peak = ptr->mesg_err_cnt ;
    }
    ptr->b2b_miss_count = 0 ;
    ptr->send_err_cnt   = 0 ;
    ptr->recv_err_cnt   = 0 ;
    ptr->mesg_err_cnt   = 0 ;
}

/*****************************************************************************
 *
 * Name : _space_to_underscore
 *
 * Purpose : Converts spaces in a string to underscores
 *
 *****************************************************************************/
void _space_to_underscore (string & str )
{
    char space = ' ';
    for(unsigned int i = 0; i < str.size(); i++)
    {
        if(str[i] == space)
        {
            str[i] = '_';
        }
    }
}
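
/* Usage note (illustrative): a resource named "Platform Memory Usage" would
 * be reported to registered clients as "Platform_Memory_Usage"; the actual
 * resource names come from the rmon configuration files. */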

/*****************************************************************************
 *
 * Name : set_alarm_defaults
 *
 * Purpose : Set the defaults for the fm alarms
 *
 *****************************************************************************/
void set_alarm_defaults ( resource_config_type * ptr )
{
    strcpy(alarmData.uuid, "");
    /* common data for all alarm messages */
    snprintf(alarmData.entity_type_id, FM_MAX_BUFFER_LENGTH, "system.host");

    build_entity_instance_id (ptr, alarmData.entity_instance_id);

    alarmData.alarm_state = FM_ALARM_STATE_SET;
    alarmData.alarm_type = FM_ALARM_OPERATIONAL;
    alarmData.probable_cause = FM_ALARM_THRESHOLD_CROSSED;
    alarmData.timestamp = 0;
    alarmData.service_affecting = FM_FALSE;
    alarmData.suppression = FM_TRUE;
    snprintf(alarmData.alarm_id, FM_MAX_BUFFER_LENGTH, ptr->alarm_id);
}
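
/* Note: severity, reason_text and proposed_repair_action are deliberately not
 * set here; resource_handler() fills them in per threshold crossing before
 * calling rmon_fm_set(). */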

/*****************************************************************************
 *
 * Name : resource_handler
 *
 * Purpose : Handle the failed resources and raise alarms through the FM API,
 * as well as notifying the registered clients of the condition.
 *
 *****************************************************************************/
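
/* Stage flow handled below (descriptive summary of this handler only):
 *
 *   START        : log the failure counters for the resource
 *   MANAGE       : build the reason text for the crossed severity, raise the
 *                  FM alarm and, if debounce is configured, notify the
 *                  registered clients (candidate for host degrade)
 *   MONITOR_WAIT : retry a previously failed client notification
 *   FINISH       : clear the FM alarm, notify the clients and reset state
 *   IGNORE       : nothing to do; fall through to FINISH
 */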
|
|
int resource_handler ( resource_config_type * ptr )
|
|
{
|
|
int rc = RETRY ;
|
|
AlarmFilter alarmFilter;
|
|
string err_res_name(ptr->resource);
|
|
_space_to_underscore(err_res_name);
|
|
|
|
if ( ptr->stage < RMON_STAGE__STAGES )
|
|
{
|
|
dlog2 ("%s %s Stage %d\n", ptr->resource, rmonStages_str[ptr->stage], ptr->stage );
|
|
}
|
|
else
|
|
{
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
|
|
switch ( ptr->stage )
|
|
{
|
|
case RMON_STAGE__START:
|
|
{
|
|
dlog ( "%s failed:%d set_cnt:%d debounce_cnt:%d\n",
|
|
ptr->resource,
|
|
ptr->failed,
|
|
ptr->count,
|
|
ptr->debounce_cnt);
|
|
break ;
|
|
}
|
|
case RMON_STAGE__MANAGE:
|
|
{
|
|
            /* send messages to maintenance if thresholds are crossed */
|
|
if (ptr->alarm_status == ALARM_ON)
|
|
{
|
|
/* set up the fm api alarm defaults */
|
|
set_alarm_defaults( ptr );
|
|
if ( strcmp(ptr->resource, MEMORY_RESOURCE_NAME) == 0 )
|
|
{
|
|
snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
|
|
"Monitor and if condition persists, contact next level of support; may require additional memory on Host.");
|
|
}
|
|
else if ( strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) == 0 )
|
|
{
|
|
snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
|
|
"Check Management and Infrastructure Networks and Controller or Storage Nodes.");
|
|
}
|
|
else
|
|
{
|
|
if ((ptr->type != NULL) && (strcmp(ptr->type, "lvg") == 0 ))
|
|
{
|
|
snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
|
|
"Monitor and if condition persists, consider adding additional physical volumes to the volume group.");
|
|
}
|
|
else
|
|
{
|
|
snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
|
|
"Monitor and if condition persists, contact next level of support.");
|
|
}
|
|
}
|
|
|
|
if ( ptr->sev == SEVERITY_MINOR )
|
|
{
|
|
alarmData.severity = FM_ALARM_SEVERITY_MINOR;
|
|
|
|
if ( ptr->percent == PERCENT_USED ) {
|
|
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->resource, ptr->minor_threshold, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
|
|
ptr->resource, ptr->minor_threshold, ptr->resource_value);
|
|
}
|
|
else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->minor_threshold, ptr->resource_value);
|
|
                        snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                 "Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                 ptr->minor_threshold, ptr->resource_value);
|
|
}
|
|
} else {
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->resource, ptr->minor_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->resource, ptr->minor_threshold_abs_node0, ptr->resource_value);
|
|
} else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->minor_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->minor_threshold_abs_node0, ptr->resource_value);
|
|
}
|
|
}
|
|
snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
|
|
"%s minor_threshold_set", err_res_name.c_str());
|
|
}
|
|
else if ( ptr->sev == SEVERITY_MAJOR )
|
|
{
|
|
alarmData.severity = FM_ALARM_SEVERITY_MAJOR;
|
|
|
|
if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) != 0)
|
|
{
|
|
if (ptr->percent == PERCENT_USED){
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->resource, ptr->major_threshold, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
|
|
ptr->resource, ptr->major_threshold, ptr->resource_value);
|
|
}
|
|
else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->major_threshold, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
|
|
ptr->major_threshold, ptr->resource_value);
|
|
}
|
|
} else {
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->resource, ptr->major_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->resource, ptr->major_threshold_abs_node0, ptr->resource_value);
|
|
} else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->major_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->major_threshold_abs_node0, ptr->resource_value);
|
|
}
|
|
}
|
|
}
|
|
else if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) == 0)
|
|
{
|
|
/* instance alarming is a special case of alarm */
|
|
wlog ("No access to remote VM volumes.\n");
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"No access to remote VM volumes.");
|
|
}
|
|
|
|
if ( ptr->res_type == RESOURCE_TYPE__FILESYSTEM_USAGE )
|
|
{
|
|
snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
|
|
"%s %s",err_res_name.c_str(), DEGRADE_CLEAR_MSG );
|
|
}
|
|
else
|
|
{
|
|
snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
|
|
"%s major_threshold_set",err_res_name.c_str());
|
|
}
|
|
}
|
|
else if ( ptr->sev == SEVERITY_CRITICAL )
|
|
{
|
|
alarmData.severity = FM_ALARM_SEVERITY_CRITICAL;
|
|
|
|
if (ptr->percent == PERCENT_USED){
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->resource, ptr->critical_threshold, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
|
|
ptr->resource, ptr->critical_threshold, ptr->resource_value);
|
|
}
|
|
else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
|
|
ptr->critical_threshold, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
|
|
ptr->critical_threshold, ptr->resource_value);
|
|
}
|
|
} else {
|
|
if ( ptr->alarm_type == STANDARD_ALARM )
|
|
{
|
|
ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->resource, ptr->critical_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->resource, ptr->critical_threshold_abs_node0, ptr->resource_value);
|
|
} else {
|
|
ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
|
|
ptr->critical_threshold_abs_node0, ptr->resource_value);
|
|
snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
|
|
"Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
|
|
ptr->critical_threshold_abs_node0, ptr->resource_value);
|
|
}
|
|
}
|
|
                snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
                         "%s critical_threshold_set", err_res_name.c_str());
|
|
}
|
|
|
|
rc = rmon_fm_set(&alarmData, NULL);
|
|
if (rc == FM_ERR_OK ) {
|
|
ilog("%s: %s alarm\n",
|
|
ptr->resource,
|
|
FmAlarmSeverity_to_string(alarmData.severity).c_str());
|
|
ptr->alarm_raised = true;
|
|
} else {
|
|
ilog("%s: %s alarm failed (rc:%d)\n",
|
|
ptr->resource,
|
|
FmAlarmSeverity_to_string(alarmData.severity).c_str(),
|
|
(int)rc);
|
|
}
|
|
|
|
if (ptr->alarm_raised)
|
|
{
|
|
if ((_rmon_ctrl_ptr->clients > 0) && (ptr->failed_send < MAX_FAIL_SEND))
|
|
{
|
|
/* If degrade debounce is non-zero then this
|
|
* alarm condition is candidate for host degrade */
|
|
if (ptr->debounce)
|
|
{
|
|
if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
|
|
{
|
|
ptr->failed_send++;
|
|
wlog ("%s request send failed (count:%d)\n",
|
|
ptr->resource,
|
|
ptr->failed_send );
|
|
}
|
|
else
|
|
{
|
|
ptr->failed_send = 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ptr->failed_send = 0;
|
|
}
|
|
resourceStageChange ( ptr, RMON_STAGE__MONITOR_WAIT );
|
|
}
|
|
}
|
|
else {
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
case RMON_STAGE__IGNORE:
|
|
{
|
|
|
|
//nothing to do here, go to the finished stage
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
|
|
break ;
|
|
}
|
|
|
|
case RMON_STAGE__MONITOR_WAIT:
|
|
{
|
|
if ((_rmon_ctrl_ptr->clients > 0) && (ptr->failed_send < MAX_FAIL_SEND) && (ptr->failed_send > 0))
|
|
{
|
|
if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
|
|
{
|
|
wlog ("%s request send failed \n", ptr->resource);
|
|
ptr->failed_send++;
|
|
}
|
|
else
|
|
{
|
|
ptr->failed_send = 0;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
case RMON_STAGE__FINISH:
|
|
{
|
|
if ((ptr->alarm_status == ALARM_ON) && (ptr->alarm_raised))
|
|
{
|
|
snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, ptr->alarm_id);
|
|
|
|
build_entity_instance_id (ptr, alarmData.entity_instance_id);
|
|
|
|
snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, alarmData.entity_instance_id);
|
|
ilog ("%s alarm clear\n", ptr->resource );
|
|
|
|
/* clear the alarm */
|
|
EFmErrorT ret = rmon_fm_clear(&alarmFilter);
|
|
if (( ret == FM_ERR_OK ) || ( ret == FM_ERR_ENTITY_NOT_FOUND ))
|
|
{
|
|
if (ret == FM_ERR_ENTITY_NOT_FOUND)
|
|
{
|
|
dlog ("%s alarm clear failed, entity '%s' not found",
|
|
ptr->resource, alarmData.entity_instance_id);
|
|
}
|
|
|
|
snprintf(ptr->errorMsg, sizeof(ptr->errorMsg), "%s cleared_alarms_for_resource", err_res_name.c_str());
|
|
if ( (_rmon_ctrl_ptr->clients > 0) && ( ptr->failed_send < MAX_FAIL_SEND ) && (ret == FM_ERR_OK) )
|
|
{
|
|
while (( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS ) &&
|
|
( ptr->failed_send < MAX_FAIL_SEND ))
|
|
{
|
|
wlog ("%s request send failed \n", ptr->resource);
|
|
ptr->failed_send++;
|
|
}
|
|
|
|
ptr->alarm_raised = false;
|
|
ptr->failed_send = 0;
|
|
ptr->failed = false ;
|
|
ptr->count = 0 ;
|
|
ptr->sev = SEVERITY_CLEARED ;
|
|
ptr->stage = RMON_STAGE__START ;
|
|
}
|
|
else
|
|
{
|
|
ptr->alarm_raised = false;
|
|
ptr->failed_send = 0;
|
|
ptr->failed = false ;
|
|
ptr->count = 0 ;
|
|
ptr->sev = SEVERITY_CLEARED ;
|
|
ptr->stage = RMON_STAGE__START ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
wlog("%s alarm clear failed, entity '%s' (rc:%d)\n",
|
|
ptr->resource,
|
|
alarmData.entity_instance_id,
|
|
ret);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ptr->alarm_raised = false;
|
|
ptr->failed_send = 0;
|
|
ptr->failed = false ;
|
|
ptr->count = 0 ;
|
|
ptr->sev = SEVERITY_CLEARED ;
|
|
ptr->stage = RMON_STAGE__START ;
|
|
}
|
|
rc = PASS ;
|
|
break ;
|
|
}
|
|
default:
|
|
{
|
|
slog ("%s Invalid stage (%d)\n", ptr->resource, ptr->stage );
|
|
|
|
/* Default to finish for invalid case.
|
|
* If there is an issue then it will be detected */
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*****************************************************************************
 *
 * Name : process_failures
 *
 * Purpose : Check whether a percentage-based resource is to be failed or a
 * failure threshold is to be cleared by the resource_handler
 *
 *****************************************************************************/
|
|
void process_failures ( resource_config_type * ptr )
|
|
{
|
|
if (ptr->stage == RMON_STAGE__INIT)
|
|
{
|
|
/* first time after restart/reboot, clear the alarm if the first reading is good */
|
|
resourceStageChange ( ptr, RMON_STAGE__START );
|
|
if (ptr->resource_value < ptr->minor_threshold)
|
|
{
|
|
// assuming we left as alarm on last time
|
|
ptr->alarm_status = ALARM_ON;
|
|
ptr->alarm_raised = true;
|
|
ptr->failed = true;
|
|
ilog("%s Setting the state to FINISH\n", ptr->resource);
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
// Now we start counting as normal ...
|
|
}
|
|
else
|
|
{
|
|
if (ptr->failed)
|
|
{
|
|
/* If the resource is already failed, check to see if it is to be cleared */
|
|
if ((( ptr->sev == SEVERITY_MINOR) && ( ptr->resource_value < ptr->minor_threshold )) ||
|
|
(( ptr->sev == SEVERITY_MAJOR) && ( ptr->resource_value < ptr->major_threshold )) ||
|
|
(( ptr->sev == SEVERITY_CRITICAL) && ( ptr->resource_value < ptr->critical_threshold )))
|
|
{
|
|
if (ptr->count > ptr->num_tries)
|
|
ptr->count = ptr->num_tries;
|
|
|
|
if (ptr->count > 0)
|
|
ptr->count--;
|
|
|
|
if (ptr->count == 0) {
|
|
ptr->sev = SEVERITY_CLEARED;
|
|
ilog("%s Setting the state to FINISH\n", ptr->resource);
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* While in failed state, the resource usage must sustain normal level
|
|
* num_tries number of times before an alarm can be cleared. Keep incrementing the counter
|
|
* as it will be set to num_tries in the above block as soon as resource usage returns to
|
|
* normal level.*/
|
|
ptr->count++;
|
|
|
|
// rmon needs to send degrade assert message periodically as the
|
|
// condition might be cleared by maintenance over controller swact.
|
|
//
|
|
// added meaning to the debounce config setting.
|
|
// must be non-zero to degrade the host.
|
|
if ((ptr->alarm_raised) && (ptr->debounce) &&
|
|
(_rmon_ctrl_ptr->clients > 0))
|
|
{
|
|
if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
|
|
{
|
|
ptr->failed_send++ ;
|
|
wlog ("%s request send failed (count:%d)\n",
|
|
ptr->resource,
|
|
ptr->failed_send);
|
|
}
|
|
else
|
|
{
|
|
mlog ("%s rmon_send_request ok\n", ptr->resource );
|
|
ptr->failed_send = 0 ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* typical path for resources that
|
|
* - do not degrade host
|
|
* - do not raise alarms */
|
|
dlog ("%s: alarm:%d debounce:%d clients:%d\n",
|
|
ptr->resource,
|
|
(ptr->alarm_raised),
|
|
(ptr->debounce),
|
|
(_rmon_ctrl_ptr->clients));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Check to see if a resource is over the failure thresholds for: minor, major and critical failures */
|
|
if (( ptr->resource_value >= ptr->minor_threshold ) &&
|
|
( ptr->resource_value < ptr->major_threshold )
|
|
&& (ptr->sev != SEVERITY_MINOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries) {
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MINOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
|
|
else if (( ptr->resource_value >= ptr->major_threshold ) &&
|
|
( ptr->resource_value < ptr->critical_threshold )
|
|
&& (ptr->sev != SEVERITY_MAJOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MAJOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else if (( ptr->resource_value >= ptr->critical_threshold )&&
|
|
(ptr->sev != SEVERITY_CRITICAL))
|
|
{
|
|
ptr->count++;
|
|
if (ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_CRITICAL;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* if the host experienced a resource blip in the previous audit run and usage
|
|
* is now back at the normal level, decrement the count.*/
|
|
if ((!ptr->failed) && (ptr->count > 0)){
|
|
ptr->count--;
|
|
dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
|
|
}
|
|
}
|
|
}
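
/* Summary of the counting scheme above (descriptive only, no new behavior):
 *  - while usage is below all thresholds the count decays toward zero
 *  - each audit pass at or above a threshold increments the count; once it
 *    reaches num_tries the resource is marked failed at that severity and
 *    the handler moves to RMON_STAGE__MANAGE to raise the alarm
 *  - once failed, the count must fall back to zero (num_tries consecutive
 *    in-range readings) before the severity is cleared and
 *    RMON_STAGE__FINISH clears the alarm
 */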
|
|
|
|
/*****************************************************************************
 *
 * Name : process_failures_absolute
 *
 * Purpose : Check whether an absolute resource is to be failed or a
 * failure threshold is to be cleared by the resource_handler
 *
 *****************************************************************************/
|
|
void process_failures_absolute ( resource_config_type * ptr )
|
|
{
|
|
int node = 0;
|
|
|
|
if (strcmp(ptr->resource,"processor_node1") == 0)
|
|
{
|
|
/* per node memory checking is enabled */
|
|
node = 1;
|
|
}
|
|
|
|
if (ptr->failed) {
|
|
/* If the resource is already failed, check to see if it is to be cleared */
|
|
if (node == 0) {
|
|
|
|
if ((( ptr->sev == SEVERITY_MINOR) && ( ptr->resource_value > ptr->minor_threshold_abs_node0 )) ||
|
|
(( ptr->sev == SEVERITY_MAJOR) && ( ptr->resource_value > ptr->major_threshold_abs_node0 )) ||
|
|
(( ptr->sev == SEVERITY_CRITICAL) && ( ptr->resource_value > ptr->critical_threshold_abs_node0 )))
|
|
{
|
|
if (ptr->count > ptr->num_tries)
|
|
ptr->count = ptr->num_tries;
|
|
if (ptr->count > 0)
|
|
ptr->count--;
|
|
|
|
if (ptr->count == 0) {
|
|
ptr->sev = SEVERITY_CLEARED;
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* While in failed state, the resource usage must sustain normal level
|
|
* num_tries number of times before an alarm can be cleared. Keep incrementing the counter
|
|
* as it will be set to num_tries in the above block as soon as resource usage returns to
|
|
* normal level.*/
|
|
ptr->count++;
|
|
}
|
|
}
|
|
else {
|
|
|
|
if ((( ptr->sev == SEVERITY_MINOR) && ( ptr->resource_value > ptr->minor_threshold_abs_node1 )) ||
|
|
(( ptr->sev == SEVERITY_MAJOR) && ( ptr->resource_value > ptr->major_threshold_abs_node1 )) ||
|
|
(( ptr->sev == SEVERITY_CRITICAL) && ( ptr->resource_value > ptr->critical_threshold_abs_node1 )))
|
|
{
|
|
if (ptr->count > ptr->num_tries)
|
|
ptr->count = ptr->num_tries;
|
|
if (ptr->count > 0)
|
|
ptr->count--;
|
|
|
|
if (ptr->count == 0) {
|
|
ptr->sev = SEVERITY_CLEARED;
|
|
resourceStageChange ( ptr, RMON_STAGE__FINISH );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* While in failed state, the resource usage must sustain normal level
|
|
* num_tries number of times before an alarm can be cleared. Keep incrementing the counter
|
|
* as it will be set to num_tries in the above block as soon as resource usage returns to
|
|
* normal level.*/
|
|
ptr->count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (node == 0) {
|
|
/* Check to see if a resource is over the failure thresholds for: minor, major and critical failures node 0 */
|
|
if (( ptr->resource_value <= ptr->minor_threshold_abs_node0 ) &&
|
|
( ptr->resource_value > ptr->major_threshold_abs_node0 ) &&
|
|
(ptr->sev != SEVERITY_MINOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MINOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
|
|
else if (( ptr->resource_value <= ptr->major_threshold_abs_node0 ) &&
|
|
( ptr->resource_value > ptr->critical_threshold_abs_node0 ) &&
|
|
(ptr->sev != SEVERITY_MAJOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MAJOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else if (( ptr->resource_value < ptr->critical_threshold_abs_node0 )&&
|
|
(ptr->sev != SEVERITY_CRITICAL))
|
|
{
|
|
ptr->count++;
|
|
if (ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_CRITICAL;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* if the host experienced a resource blip in the previous audit run and usage
|
|
* is now back at the normal level, decrement the count.*/
|
|
if ((!ptr->failed) && (ptr->count > 0)){
|
|
ptr->count--;
|
|
dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
|
|
}
|
|
}
|
|
} else {
|
|
|
|
/* Check to see if a resource is over the failure thresholds for: minor, major and critical failures node 1 */
|
|
if (( ptr->resource_value <= ptr->minor_threshold_abs_node1 ) &&
|
|
( ptr->resource_value > ptr->major_threshold_abs_node1 ) &&
|
|
(ptr->sev != SEVERITY_MINOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MINOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else if (( ptr->resource_value <= ptr->major_threshold_abs_node1 ) &&
|
|
( ptr->resource_value > ptr->critical_threshold_abs_node1 ) &&
|
|
(ptr->sev != SEVERITY_MAJOR))
|
|
{
|
|
ptr->count++;
|
|
if ( ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_MAJOR;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else if (( ptr->resource_value < ptr->critical_threshold_abs_node1 )&&
|
|
(ptr->sev != SEVERITY_CRITICAL))
|
|
{
|
|
ptr->count++;
|
|
if (ptr->count >= ptr->num_tries){
|
|
ptr->failed = true;
|
|
ptr->sev = SEVERITY_CRITICAL;
|
|
resourceStageChange ( ptr, RMON_STAGE__MANAGE);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* if the host experienced a resource blip in the previous audit run and usage
|
|
* is now back at the normal level, decrement the count.*/
|
|
if ((!ptr->failed) && (ptr->count > 0)){
|
|
ptr->count--;
|
|
dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
|
|
}
|
|
}
|
|
}
|
|
}
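
/* Note: the absolute thresholds track remaining capacity (e.g. MB of free
 * memory), so lower readings are worse. That is why the comparisons above are
 * inverted relative to process_failures(): a reading at or below the minor
 * threshold (but still above the major threshold) is a minor condition, and
 * so on down to critical. */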

void update_total_clients (int total_clients)
{
    _rmon_ctrl_ptr->clients = total_clients;
}

void add_registered_client (registered_clients client)
{
    registered_clt[_rmon_ctrl_ptr->clients] = client;
    ilog("added registered client: %s \n", client.client_name);
}
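
/* Note (assumption based on the code above): add_registered_client() stores
 * the new client at index _rmon_ctrl_ptr->clients but does not advance the
 * count itself; the caller is expected to follow up with
 * update_total_clients() so that the next registration lands in the next
 * slot. */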

/*****************************************************************************
 *
 * Name : add_fs_resource
 *
 * Purpose : Add a dynamic or static fs resource by reading
 * the /etc/rmonfiles.d/dynamic.conf file
 *
 *****************************************************************************/
|
|
void add_fs_resource ( int resource_index, int criticality_index, int enabled,
|
|
int |