metal/mtce/src/rmon/rmonHdlr.cpp
Tao Liu 9661e49411 Change compute node to worker node personality
This update replaces compute references to worker in mtce,
kickstarts, installer and bsp files.

Tests Performed:
Non-containerized deployment
AIO-SX: Sanity and Nightly automated test suite
AIO-DX: Sanity and Nightly automated test suite
2+2 System: Sanity and Nightly automated test suite
2+2 System: Horizon Patch Orchestration

Kubernetes deployment:
AIO-SX: Create, delete, reboot and rebuild instances
2+2+2 System: worker nodes are unlock enable and no alarms

Story: 2004022
Task: 27013

Depends-On: https://review.openstack.org/#/c/624452/

Change-Id: I225f7d7143d841f80459603b27b95ac3f846c46f
Signed-off-by: Tao Liu <tao.liu@windriver.com>
2018-12-13 13:08:48 -05:00

4925 lines
190 KiB
C++

/*
* Copyright (c) 2013-2017 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River CGCS Platform Resource Monitor Handler
*/
#include "rmon.h" /* rmon header file */
#include "rmonHttp.h" /* for rmon HTTP libEvent utilties */
#include "rmonApi.h" /* vswitch calls */
#include <sys/wait.h>
#include <time.h>
#include <signal.h>
#include <fstream>
#include <sstream>
#include <ctime>
#include <vector> /* for storing dynamic resource names */
#include <dirent.h>
#include <algorithm>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <cctype>
#include <pthread.h>
#include <linux/rtnetlink.h> /* for ... RTMGRP_LINK */
#include "nlEvent.h" /* for ... open_netlink_socket */
#include "nodeEvent.h" /* for inotify */
#include <json-c/json.h> /* for ... json-c json string parsing */
#include "jsonUtil.h"
#include "tokenUtil.h" /* for ... tokenUtil_new_token */
/* Preserve a local copy of a pointer to the control struct to
 * avoid having to publish a get utility prototype into rmon.h.
 * It is NULL until rmon_hdlr_init assigns it. */
static rmon_ctrl_type * _rmon_ctrl_ptr = NULL ;
/* Interface resource table ; filled in by _inter_config_load */
static interface_resource_config_type interface_resource_config[MAX_RESOURCES] ;
/* Resource table ; filled in by _config_files_load */
static resource_config_type resource_config[MAX_RESOURCES] ;
/* Thinpool metadata resource table ; filled in by _thinmeta_config_load */
static thinmeta_resource_config_type thinmeta_resource_config[MAX_RESOURCES] ;
/* Registered client table ; zeroed by rmon_hdlr_init */
static registered_clients registered_clt[MAX_CLIENTS];
static libEvent_type ceilometerEvent; // for ceilometer REST API request
static libEvent tokenEvent; // for token request
/* Used to set alarms through the FM API.
 * Its entity_instance_id member is also used as a scratch buffer by
 * callers of build_entity_instance_id in this file. */
static SFmAlarmDataT alarmData;
/* Handler timers ; initialized in rmon_hdlr_init
 * (the ntp timer only when is_controller() is true) */
static struct mtc_timer rmonTimer_event ;
static struct mtc_timer rmonTimer_pm ;
static struct mtc_timer rmonTimer_ntp ;
/* One timer per resource_config[] slot ; rtimer[i] serves resource_config[i] */
static struct mtc_timer rtimer[MAX_RESOURCES] ;
/* Timers handed to thinmeta_init together with thinmeta_resource_config */
static struct mtc_timer thinmetatimer[MAX_RESOURCES] ;
static ntpStage_enum ntp_stage ; /* The stage the ntp is in within the resource handler fsm */
static int ntp_status ; /* status returned by the ntpq command */
static int ntp_child_pid ; /* NOTE(review): presumably the pid of the ntpq child process - confirm */
/* for dynamic resources */
bool modifyingResources = false;
vector<string> criticality_resource;
vector<string> dynamic_resource;
vector<string> types;
vector<string> devices;
vector<int> fs_index;
vector<string> fs_state;
/** List of config files ; filled by load_filenames_in_dir in rmon_hdlr_init */
std::list<string> config_files ;
/* Shared iterator used by the config loaders to walk the file lists */
std::list<string>::iterator string_iter_ptr ;
/** List of interface config files ; filled by load_filenames_in_dir in rmon_hdlr_init */
std::list<string> interface_config_files ;
/* percent or abs value for fs resources */
int fs_percent = 0;
int swact_count = 0;
/* for cpu usage */
time_t t1, t2;
int num_cpus = 0;
int num_base_cpus = 0;
int included_cpu[MAX_BASE_CPU];
static string hostUUID = "";
/* Initial cpu time */
vector<unsigned long long> cpu_time_initial;
/* Later cpu time */
vector<unsigned long long> cpu_time_later;
/* Forward declarations for functions defined outside the handlers below */
void save_fs_resource ( string resource_name, string criticality,
int enabled, int percent, int abs_values[3],
int alarm_type, string type, string device, int mounted );
void calculate_fs_usage( resource_config_type * ptr );
void _space_to_underscore (string & str );
/* Context shared with the monitoring pthread (see t_data below).
 * rmon_hdlr_init seeds pid, nr_switches_count, thread_running and
 * resource_usage ; the remaining members are set elsewhere. */
struct thread_data
{
pid_t tid; /* NOTE(review): presumably the monitored thread's id - confirm */
pid_t pid; /* process id ; set from getpid() in rmon_hdlr_init */
unsigned long long nr_switches_count; /* starts at 0 ; presumably a context switch count - confirm */
bool thread_running; /* starts false */
double resource_usage; /* starts as MOUNTED */
resource_config_type * resource; /* resource this context refers to */
};
/* info passed to pthreads */
struct thread_data t_data;
pthread_t thread;
/* Mutex created in rmon_hdlr_init and destroyed in rmon_hdlr_fini */
pthread_mutex_t lock;
/* strict memory accounting off = 0 or on = 1 */
int IS_STRICT = 0;
/* Record the host identity held in the control structure (hostname,
 * address and MAC address) as a single line in the memory log. */
void mem_log_ctrl ( rmon_ctrl_type * ptr )
{
#define MAX_LEN 500
    char line[MAX_LEN] ;
    const char * hostname = &ptr->my_hostname[0] ;
    const char * address  = ptr->my_address.c_str() ;
    const char * macaddr  = ptr->my_macaddr.c_str() ;

    snprintf ( line, sizeof(line), "%s %s %s\n", hostname, address, macaddr );
    mem_log ( line );
}
/* Record one resource's name, severity, try count and debounce value
 * as a single line in the memory log. */
void mem_log_resource ( resource_config_type * ptr )
{
#define MAX_LEN 500
    char line[MAX_LEN] ;

    snprintf ( line, sizeof(line),
               "Resource:%-15s Sev:%-8s Tries:%u Debounce:%d\n",
               ptr->resource,
               ptr->severity,
               ptr->count,
               ptr->debounce );
    mem_log ( line );
}
/* Record one interface resource's name, severity and debounce value
 * as a single line in the memory log. */
void mem_log_interface_resource ( interface_resource_config_type * ptr )
{
#define MAX_LEN 500
    char line[MAX_LEN] ;

    snprintf ( line, sizeof(line),
               "Resource:%-15s Sev:%-8s Debounce:%d\n",
               ptr->resource,
               ptr->severity,
               ptr->debounce );
    mem_log ( line );
}
/* Forward declarations of the config loaders.
 * _config_files_load is defined further down in this file. */
int _config_dir_load (void);
int _config_files_load (void);
/* Printable names of the resource handler stages.
 * resourceStageChange indexes this table with a rmonStage_enum value, so
 * the entries presumably follow the enum's declaration order - keep the
 * two in step when adding a stage. */
const char rmonStages_str [RMON_STAGE__STAGES][32] =
{
"Handler-Init",
"Handler-Start",
"Manage-Restart",
"Monitor-Wait",
"Monitor-Resource",
"Restart-Wait",
"Ignore-Resource",
"Handler-Finish",
"Failed-Resource",
"Failed-Resource-clr",
} ;
/* Printable names of the NTP handler stages.
 * ntpStageChange indexes this table with a ntpStage_enum value, so the
 * entries presumably follow the enum's declaration order. */
const char ntpStages_str [NTP_STAGE__STAGES][32] =
{
"Begin",
"Execute-NTPQ",
"Execute-NTPQ-Wait",
} ;
/* Get a pointer to the registered client entry at 'index'.
 *
 * Returns NULL when the handler has not been initialized, when 'index'
 * falls outside the registered_clt array or when it is beyond the
 * current client count.
 *
 * The historical 'index <= clients' comparison is kept so that existing
 * callers see no change for in-range indexes ; the array bound check is
 * what prevents a negative or too large index from producing an out of
 * bounds pointer. */
registered_clients * get_registered_clients_ptr ( int index )
{
    if ( _rmon_ctrl_ptr == NULL )
        return ( NULL );

    if (( index < 0 ) || ( index >= MAX_CLIENTS ))
        return ( NULL );

    if ( index <= _rmon_ctrl_ptr->clients )
        return ( &registered_clt[index] );

    return ( NULL );
}
/* Accessor for the control structure pointer saved by rmon_hdlr_init.
 * The returned pointer is NULL if the handler was never initialized. */
rmon_ctrl_type * get_rmon_ctrl_ptr ()
{
    rmon_ctrl_type * ctrl_ptr = _rmon_ctrl_ptr ;
    return ( ctrl_ptr );
}
/* Get a pointer to the interface resource entry at 'index'.
 *
 * Returns NULL when the handler has not been initialized, when 'index'
 * falls outside the interface_resource_config array or when it is
 * beyond the current interface resource count.
 *
 * The historical 'index <= interface_resources' comparison is kept for
 * backward compatibility ; the array bound check is what prevents an
 * out of bounds pointer from being handed out. */
interface_resource_config_type * get_interface_ptr ( int index )
{
    if ( _rmon_ctrl_ptr == NULL )
        return ( NULL );

    if (( index < 0 ) || ( index >= MAX_RESOURCES ))
        return ( NULL );

    if ( index <= _rmon_ctrl_ptr->interface_resources )
        return ( &interface_resource_config[index] );

    return ( NULL );
}
/* Get a pointer to the resource entry at 'index'.
 *
 * Returns NULL when the handler has not been initialized, when 'index'
 * falls outside the resource_config array or when it is beyond the
 * current resource count.
 *
 * The historical 'index <= resources' comparison is kept for backward
 * compatibility ; the added upper bound covers the case where the
 * table is full and 'index == resources' would point one element past
 * the end of the array. */
resource_config_type * get_resource_ptr ( int index )
{
    if ( _rmon_ctrl_ptr == NULL )
        return NULL;

    if (( index < 0 ) || ( index >= MAX_RESOURCES ))
        return NULL;

    if ( index <= _rmon_ctrl_ptr->resources )
        return ( &resource_config[index] );

    return NULL;
}
/*****************************************************************************
 *
 * Name    : get_resource_index
 *
 * Purpose : Get the resource's index based on the name
 *
 * Params  : resource_name - name to look for in resource_config[]
 *           index         - output ; set to the matching slot on success
 *
 * Returns : PASS when a resource with that exact name is loaded.
 *           FAIL when it is not found, when either pointer argument is
 *           NULL or when the handler has not been initialized ; 'index'
 *           is left untouched in all FAIL cases.
 *
 *****************************************************************************/
int get_resource_index ( const char *resource_name, int *index )
{
    /* strcmp on a NULL name and a write through a NULL index are both
     * undefined behavior ; treat them as 'not found' instead */
    if (( resource_name == NULL ) || ( index == NULL ) || ( _rmon_ctrl_ptr == NULL ))
    {
        return (FAIL);
    }

    for ( int i = 0 ; i < _rmon_ctrl_ptr->resources ; i++ )
    {
        if ( strcmp ( resource_config[i].resource, resource_name ) == 0 )
        {
            *index = i ;
            return (PASS);
        }
    }
    return (FAIL);
}
/*****************************************************************************
 *
 * Name    : rmon_hdlr_fini
 *
 * Purpose : Clean up the resource monitor module ; dump every loaded
 *           resource to the memory log and destroy the handler mutex.
 *
 *****************************************************************************/
void rmon_hdlr_fini ( rmon_ctrl_type * ctrl_ptr )
{
    /* Log the final state of each loaded resource */
    int index = 0 ;
    while ( index < ctrl_ptr->resources )
    {
        mem_log_resource ( &resource_config[index] );
        index++ ;
    }

    pthread_mutex_destroy ( &lock );

    /* Note: inotify is not closed here ; the set_inotify_close call
     * that used to follow is disabled. */
}
/*****************************************************************************
 *
 * Name    : resourceStageChange
 *
 * Purpose : Put a resource in the requested stage for use by the
 *           resource handler.
 *
 * Returns : PASS when both the current and the requested stage are below
 *           RMON_STAGE__STAGES ; the resource is moved to newStage.
 *           FAIL otherwise ; the resource is forced to RMON_STAGE__FINISH.
 *
 *****************************************************************************/
int resourceStageChange ( resource_config_type * ptr , rmonStage_enum newStage )
{
    const bool requested_ok = ( newStage   < RMON_STAGE__STAGES );
    const bool current_ok   = ( ptr->stage < RMON_STAGE__STAGES );

    if ( !( requested_ok && current_ok ) )
    {
        slog ("%s Invalid Stage (now:%d new:%d)\n",
                  ptr->resource, ptr->stage, newStage );
        ptr->stage = RMON_STAGE__FINISH ;
        return (FAIL);
    }

    clog ("%s %s -> %s (%d->%d)\n",
              ptr->resource,
              rmonStages_str[ptr->stage],
              rmonStages_str[newStage],
              ptr->stage, newStage);
    ptr->stage = newStage ;
    return (PASS);
}
/*****************************************************************************
 *
 * Name    : ntpStageChange
 *
 * Purpose : Stage change handler for NTP resource
 *
 * Returns : PASS when both the current and the requested stage are below
 *           NTP_STAGE__STAGES ; ntp_stage is moved to newStage.
 *           FAIL otherwise ; ntp_stage is reset to NTP_STAGE__BEGIN.
 *
 *****************************************************************************/
int ntpStageChange ( ntpStage_enum newStage )
{
    const bool requested_ok = ( newStage  < NTP_STAGE__STAGES );
    const bool current_ok   = ( ntp_stage < NTP_STAGE__STAGES );

    if ( !( requested_ok && current_ok ) )
    {
        slog ("NTP Invalid Stage (now:%d new:%d)\n", ntp_stage, newStage );
        ntp_stage = NTP_STAGE__BEGIN ;
        return (FAIL);
    }

    clog ("NTP %s -> %s (%d->%d)\n",
              ntpStages_str[ntp_stage],
              ntpStages_str[newStage],
              ntp_stage, newStage);
    ntp_stage = newStage ;
    return (PASS);
}
/*****************************************************************************
*
* Name : _config_files_load
*
* Purpose : Load the content of each config file into resource_config[x]
*
*****************************************************************************/
int _config_files_load (void)
{
int i = 0 ;
/* Run Maintenance on Inventory */
for ( string_iter_ptr = config_files.begin () ;
string_iter_ptr != config_files.end () ;
string_iter_ptr++ )
{
if ( i >= MAX_RESOURCES )
{
wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
break ;
}
/* Read the resource config file */
resource_config[i].mask = 0 ;
if (ini_parse( string_iter_ptr->data(), rmon_resource_config,
&resource_config[i]) < 0)
{
ilog("Read Failure : %s\n", string_iter_ptr->data() );
}
else
{
dlog ("Config File : %s\n", string_iter_ptr->c_str());
/* Init the timer for this resource */
mtcTimer_reset ( rtimer[i] ) ;
rtimer[i].service = resource_config[i].resource ;
resource_config[i].i = i ;
/* allow to clear an existing alarm if the first reading is good
after reboot
*/
resource_config[i].failed = false ;
resource_config[i].count = 0 ;
resource_config[i].resource_value = 0 ;
resource_config[i].resource_prev = 0 ;
resource_config[i].stage = RMON_STAGE__INIT ;
resource_config[i].sev = SEVERITY_CLEARED ;
resource_config[i].alarm_type = STANDARD_ALARM;
resource_config[i].failed_send = 0;
resource_config[i].alarm_raised = false;
/* add the alarm ids for the FM API per resource monitored */
if (strcmp(resource_config[i].resource, CPU_RESOURCE_NAME) == 0) {
/* platform cpu utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, CPU_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__CPU_USAGE ;
}
else if (strcmp(resource_config[i].resource, V_CPU_RESOURCE_NAME) == 0) {
/* vswitch cpu utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_CPU_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__CPU_USAGE ;
}
else if (strcmp(resource_config[i].resource, MEMORY_RESOURCE_NAME) == 0) {
/* platform memory utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, MEMORY_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__MEMORY_USAGE ;
}
else if (strcmp(resource_config[i].resource, V_MEMORY_RESOURCE_NAME) == 0) {
/* vswitch memory utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_MEMORY_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__MEMORY_USAGE ;
}
else if (strcmp(resource_config[i].resource, FS_RESOURCE_NAME) == 0) {
/* platform disk utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, FS_ALARM_ID);
resource_config[i].mounted = MOUNTED;
resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
}
else if (strcmp(resource_config[i].resource, INSTANCE_RESOURCE_NAME) == 0) {
/* platform disk utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, INSTANCE_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
}
else if (strcmp(resource_config[i].resource, V_CINDER_THINPOOL_RESOURCE_NAME) == 0) {
/* platform virtual thin pool utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_CINDER_THINPOOL_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
}
else if (strcmp(resource_config[i].resource, V_NOVA_THINPOOL_RESOURCE_NAME) == 0) {
/* platform virtual thin pool utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, V_NOVA_THINPOOL_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__FILESYSTEM_USAGE ;
}
else if (strcmp(resource_config[i].resource, V_PORT_RESOURCE_NAME) == 0) {
/* vswitch port utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
V_PORT_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__PORT ;
}
else if (!strcmp(resource_config[i].resource, V_INTERFACE_RESOURCE_NAME) ||
!strcmp(resource_config[i].resource, V_LACP_INTERFACE_RESOURCE_NAME)) {
/* vswitch interface(lacp or otherwise) utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
V_INTERFACE_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__INTERFACE ;
}
else if (!strcmp(resource_config[i].resource, V_OVSDB_RESOURCE_NAME)) {
/* vswitch OVSDB manager utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
V_OVSDB_MANAGER_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__DATABASE_USAGE ;
}
else if (!strcmp(resource_config[i].resource, V_OPENFLOW_RESOURCE_NAME)) {
/* vswitch Openflow utilization */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
V_OPENFLOW_CONTROLLER_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__NETWORK_USAGE ;
}
else if (strcmp(resource_config[i].resource, REMOTE_LOGGING_RESOURCE_NAME) == 0) {
/* remote logging connectivity */
snprintf(resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH,
REMOTE_LOGGING_CONTROLLER_CONNECTIVITY_ALARM_ID);
resource_config[i].res_type = RESOURCE_TYPE__CONNECTIVITY ;
}
else
{
resource_config[i].res_type = RESOURCE_TYPE__UNKNOWN ;
}
ilog ("Monitoring %2d: %s (%s)\n",
i,
resource_config[i].resource,
resource_config[i].severity);
mem_log_resource ( &resource_config[i] );
i++;
}
}
_rmon_ctrl_ptr->resources = i ;
ilog ("Monitoring %d Resources\n", _rmon_ctrl_ptr->resources );
return (PASS);
}
/*****************************************************************************
 *
 * Name    : _inter_config_load
 *
 * Purpose : Load the content of each config file into interface_resource_config[x]
 *
 * Notes   : Walks interface_config_files with the shared string_iter_ptr
 *           iterator. A file that fails to parse does not consume a
 *           slot. Each loaded interface gets its runtime state reset
 *           and, for the oam, mgmt and infra interface names, its FM
 *           alarm ids assigned. The number of loaded interfaces is
 *           stored in the control structure.
 *
 * Returns : PASS always
 *
 *****************************************************************************/
int _inter_config_load (void)
{
    int i = 0 ;

    for ( string_iter_ptr  = interface_config_files.begin () ;
          string_iter_ptr != interface_config_files.end () ;
          string_iter_ptr++ )
    {
        if ( i >= MAX_RESOURCES )
        {
            wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
            break ;
        }

        /* Read the interface resource config file.
         *
         * The mask being cleared must be the one of the interface slot
         * about to be parsed. This used to clear resource_config[i].mask,
         * which wiped the mask of the already loaded resource in the
         * same slot because _config_files_load runs before this loader. */
        interface_resource_config[i].mask = 0 ;
        if (ini_parse( string_iter_ptr->data(), rmon_interface_config,
                       &interface_resource_config[i]) < 0)
        {
            ilog("Read Failure : %s\n", string_iter_ptr->data() );
        }
        else
        {
            dlog ("Config File : %s\n", string_iter_ptr->c_str());
            ilog ("Monitoring %2d: %s (%s)\n", i, interface_resource_config[i].resource ,
                      interface_resource_config[i].severity );

            /* Start from a clean runtime state */
            interface_resource_config[i].i = i ;
            interface_resource_config[i].failed = false ;
            interface_resource_config[i].stage = RMON_STAGE__INIT ;
            interface_resource_config[i].sev = SEVERITY_CLEARED ;
            interface_resource_config[i].failed_send = 0;
            interface_resource_config[i].alarm_raised = false;

            /* add the interface and port alarm ids for the FM API per
             * resource monitored */
            if (strcmp(interface_resource_config[i].resource, OAM_INTERFACE_NAME) == 0)
            {
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, OAM_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, OAM_PORT_ALARM_ID);
            }
            else if (strcmp(interface_resource_config[i].resource, MGMT_INTERFACE_NAME) == 0)
            {
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, MGMT_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, MGMT_PORT_ALARM_ID);
            }
            else if (strcmp(interface_resource_config[i].resource, INFRA_INTERFACE_NAME) == 0)
            {
                snprintf(interface_resource_config[i].alarm_id, FM_MAX_BUFFER_LENGTH, INFRA_ALARM_ID);
                snprintf(interface_resource_config[i].alarm_id_port, FM_MAX_BUFFER_LENGTH, INFRA_PORT_ALARM_ID);
            }

            mem_log_interface_resource ( &interface_resource_config[i] );
            i++;
        }
    }

    _rmon_ctrl_ptr->interface_resources = i ;
    ilog ("Monitoring %d Interface Resources\n", _rmon_ctrl_ptr->interface_resources );
    return (PASS);
}
/*****************************************************************************
 *
 * Name    : _thinmeta_config_load
 *
 * Purpose : Load the content of each config file into thinmeta_resource_config[x]
 *
 * Notes   : Every slot is first given the hard-coded defaults so that
 *           options missing from a config file still have sane values.
 *           Files that fail to parse, or that have no thinpool metadata
 *           section, do not consume a slot. A loaded configuration that
 *           fails validation still consumes its slot but has monitoring
 *           disabled through critical_threshold = RESOURCE_DISABLE.
 *           The number of loaded entries is stored in the control
 *           structure.
 *
 * Returns : PASS always
 *
 *****************************************************************************/
int _thinmeta_config_load (void)
{
    int i = 0 ;

    /* Set hard-coded defaults for all structures.
     * The slot must be selected with the loop index ; selecting it with
     * 'i' (still 0 here) only ever initialized the first slot. */
    for ( int j = 0 ; j < MAX_RESOURCES ; j++ )
    {
        thinmeta_resource_config_type * res = &thinmeta_resource_config[j];
        res->critical_threshold = THINMETA_DEFAULT_CRITICAL_THRESHOLD;
        res->alarm_on           = THINMETA_DEFAULT_ALARM_ON;
        res->autoextend_on      = THINMETA_DEFAULT_AUTOEXTEND_ON;
        res->autoextend_by      = THINMETA_DEFAULT_AUTOEXTEND_BY;
        res->autoextend_percent = THINMETA_DEFAULT_AUTOEXTEND_PERCENT;
        res->audit_period       = THINMETA_DEFAULT_AUDIT_PERIOD;
    }

    /* Load resources */
    for ( string_iter_ptr  = config_files.begin () ;
          string_iter_ptr != config_files.end () ;
          string_iter_ptr++ )
    {
        if ( i >= MAX_RESOURCES )
        {
            wlog ("Cannot Monitor more than %d resources\n", MAX_RESOURCES );
            break ;
        }

        /* Read the resource config file */
        if (ini_parse( string_iter_ptr->data(), rmon_thinmeta_config,
                       &thinmeta_resource_config[i]) < 0)
        {
            ilog("Read Failure : %s\n", string_iter_ptr->data() );
            continue ;
        }

        thinmeta_resource_config_type * res = &thinmeta_resource_config[i];
        if (!res->section_exists)
        {
            dlog3 ("Config File : %s does not have a [%s] section\n",
                       string_iter_ptr->c_str(), THINMETA_CONFIG_SECTION);
            continue;
        }
        dlog ("Config File : %s\n", string_iter_ptr->c_str());

        /* validate loaded configuration ; only the first failing check
         * is reported and any failure disables monitoring */
        if (!res->vg_name || !res->thinpool_name)
        {
            elog("Invalid VG and/or thinpool names for thinpool metadata "
                 "in config file: %s, disabling monitoring", string_iter_ptr->c_str());
            res->critical_threshold = RESOURCE_DISABLE;
            /* give the names a printable value for the log below */
            res->vg_name = THINMETA_INVALID_NAME;
            res->thinpool_name = THINMETA_INVALID_NAME;
        }
        else if (res->critical_threshold > 99)
        {
            elog("Metadata monitoring error in config file: %s. Option critical_threshold > 99%%, "
                 "value in config file: %i, disabling monitoring",
                 string_iter_ptr->c_str(), res->critical_threshold);
            /* use the same disable value as every other failed check */
            res->critical_threshold = RESOURCE_DISABLE;
        }
        else if (res->alarm_on > 1)
        {
            elog("Metadata monitoring error in config file: %s. Option alarm_on is NOT boolean, "
                 "value in config file: %i, disabling monitoring", string_iter_ptr->c_str(), res->alarm_on);
            res->critical_threshold = RESOURCE_DISABLE;
        }
        else if (res->autoextend_on > 1)
        {
            elog("Metadata monitoring error in config file: %s. Option autoextend_on is NOT boolean, "
                 "value in config file: %i, disabling monitoring",
                 string_iter_ptr->c_str(), res->autoextend_on);
            res->critical_threshold = RESOURCE_DISABLE;
        }
        else if (res->autoextend_percent > 1)
        {
            elog("Metadata monitoring error in config file: %s. Option autoextend_percent is NOT boolean, "
                 "value in config file: %i, disabling monitoring",
                 string_iter_ptr->c_str(), res->autoextend_percent);
            res->critical_threshold = RESOURCE_DISABLE;
        }
        else if ((res->autoextend_percent && res->autoextend_by > 100) ||
                 (res->autoextend_on && res->autoextend_by < 1))
        {
            elog("Metadata monitoring error in config file: %s. Option autoextend_by not in [1,100] interval, "
                 "value in config file: %i, disabling monitoring",
                 string_iter_ptr->c_str(), res->autoextend_by);
            res->critical_threshold = RESOURCE_DISABLE;
        }
        else if ((res->audit_period < 1) || (res->audit_period > 10000))
        {
            elog("Metadata monitoring error in config file: %s. Option audit_period not in [1,10000] interval, "
                 "value in config file: %i, disabling monitoring",
                 string_iter_ptr->c_str(), res->audit_period);
            res->critical_threshold = RESOURCE_DISABLE;
        }

        ilog ("%s/%s pool metadata monitored; resource index: %2d\n", res->vg_name ,
                  res->thinpool_name, i );
        i++;
    }

    _rmon_ctrl_ptr->thinmeta_resources = i ;
    ilog ("Monitoring %d Thinpool Metadata Resources\n", _rmon_ctrl_ptr->thinmeta_resources );
    return (PASS);
}
/*****************************************************************************
 *
 * Name    : rmon_hdlr_init
 *
 * Purpose : Init the handler but also support re-init that might occur over a SIGHUP
 *
 * Notes   : Saves the control pointer, initializes the handler timers,
 *           zeroes the resource and client tables, loads the config
 *           file lists and their content, starts thin metadata
 *           monitoring and seeds the monitoring thread context.
 *
 * Returns : PASS always ; a mutex init failure is only logged.
 *
 *****************************************************************************/
#define RMON_TIMER_TYPE__EVENT "event"
#define RMON_TIMER_TYPE__PM "pm"
#define RMON_TIMER_TYPE__NTP "ntp"
#define RMON_TIMER_TYPE__RES "resource"
#define RMON_TIMER_TYPE__THIN "thinpool"
int rmon_hdlr_init ( rmon_ctrl_type * ctrl_ptr )
{
    int slot ;

    /* Save the control pointer */
    _rmon_ctrl_ptr = ctrl_ptr ;

    /* Handler level timers ; the ntp timer is only used on controllers */
    mtcTimer_init ( rmonTimer_event, LOCALHOST, RMON_TIMER_TYPE__EVENT) ;
    mtcTimer_init ( rmonTimer_pm, LOCALHOST, RMON_TIMER_TYPE__PM ) ;
    if (is_controller())
    {
        mtcTimer_init ( rmonTimer_ntp,LOCALHOST, RMON_TIMER_TYPE__NTP ) ;
    }

    /* Per resource timers */
    for ( slot = 0 ; slot < MAX_RESOURCES ; slot++ )
    {
        mtcTimer_init ( rtimer[slot], LOCALHOST, RMON_TIMER_TYPE__RES );
    }
    ctrl_ptr->resources = 0 ;

    /* Per thinpool metadata resource timers */
    for ( slot = 0 ; slot < MAX_RESOURCES ; slot++ )
    {
        mtcTimer_init ( thinmetatimer[slot], LOCALHOST, RMON_TIMER_TYPE__THIN );
    }
    ctrl_ptr->thinmeta_resources = 0 ;

    /* Initialize the Resource Monitor Arrays */
    memset ( (char*)resource_config,           0, sizeof(resource_config));
    memset ( (char*)interface_resource_config, 0, sizeof(interface_resource_config));
    memset ( (char*)thinmeta_resource_config,  0, sizeof(thinmeta_resource_config));
    memset ( (char*)registered_clt,            0, sizeof(registered_clt));

    /* Read in the list of config files and their contents */
    load_filenames_in_dir ( CONFIG_DIR, config_files ) ;

    /* Read in the list of interface config files and their contents */
    load_filenames_in_dir ( INT_CONFIG_DIR, interface_config_files ) ;

    _thinmeta_config_load();
    _config_files_load ();
    _inter_config_load ();

    /* init Thin Metadata Monitoring after config reload - including timers */
    thinmeta_init(thinmeta_resource_config, thinmetatimer, ctrl_ptr->thinmeta_resources);

    /* Log the control setting going into the main loop */
    mem_log_ctrl ( _rmon_ctrl_ptr );

    /* Initialize instance mount monitoring */
    const int mutex_rc = pthread_mutex_init(&lock, NULL);
    if ( mutex_rc != 0 )
    {
        elog("mutex init failed \n");
    }

    t_data.pid               = getpid();
    t_data.nr_switches_count = 0;
    t_data.resource_usage    = MOUNTED;
    t_data.thread_running    = false;

    return (PASS) ;
}
/*****************************************************************************
 *
 * Name    : _set_resource_usage
 *
 * Purpose : Restores the resource value from the reason text of a
 *           previously raised alarm.
 *
 * Params  : reason_text - alarm reason text ; the usage value is taken
 *                         from the text that follows its last space
 *           ptr         - resource whose resource_value is restored
 *
 * Notes   : Only the leading run of decimal digits of the last token is
 *           used, so "... 85.5%" restores 85. At most 9 digits are
 *           considered. When the token does not start with a digit the
 *           resource value is left unchanged.
 *
 *****************************************************************************/
void _set_resource_usage ( string reason_text, resource_config_type * ptr )
{
    char resource_usage[10];

    if ( ptr == NULL )
    {
        return ;
    }

    /* extract the resource value from the reason text ; keep the find
     * result in a size_t so that 'no space found' (npos) is handled
     * explicitly rather than relying on unsigned wrap-around */
    const size_t last_space = reason_text.find_last_of ( ' ' );
    const string last_token = ( last_space == string::npos ) ?
                                reason_text :
                                reason_text.substr ( last_space + 1 );

    /* keep only the leading digits of that token */
    const string digits =
        last_token.substr ( 0, last_token.find_first_not_of ( "0123456789" ));
    if ( digits.empty() )
    {
        /* nothing numeric to restore */
        return ;
    }

    /* never use runtime text as the format string itself */
    snprintf ( resource_usage, sizeof(resource_usage), "%s", digits.c_str());
    sscanf ( resource_usage, "%lf", &ptr->resource_value );
}
/*****************************************************************************
 *
 * Name    : build_entity_instance_id
 *
 * Purpose : build the alarm's entity_instance_id based on the
 *           resource type and alarm type.
 *
 * Params  : ptr                - resource the alarm is for
 *           entity_instance_id - output buffer ; must be able to hold
 *                                FM_MAX_BUFFER_LENGTH bytes
 *
 * Formats : dynamic alarm, "lvg" type  <hostname>.volumegroup=<resource>
 *           dynamic alarm, other       <hostname>.filesystem=<resource>
 *           static alarm               <hostname>.filesystem=<resource>
 *           standard vswitch memory    <hostname>.processor=<socket id>
 *           cinder thin pool           host=controller
 *           anything else              <hostname>
 *
 *****************************************************************************/
void build_entity_instance_id ( resource_config_type *ptr, char *entity_instance_id )
{
    /* the resource type may be unset ; never hand NULL to a %s */
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n",
              ptr->resource, ptr->type ? ptr->type : "(null)", ptr->alarm_type);

    // Make certain the id is cleared
    entity_instance_id[0] = 0;

    if ( ptr->alarm_type == DYNAMIC_ALARM )
    {
        if ((ptr->type != NULL) && (strcmp(ptr->type, "lvg") == 0 ))
        {
            /* This case covers volume groups */
            /* Use host=<x>.volumegroup=type for id*/
            snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.volumegroup=%s",
                     _rmon_ctrl_ptr->my_hostname, ptr->resource);
        }
        else
        {
            /* Use host=<x>.filesystem=type for id*/
            snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.filesystem=%s",
                     _rmon_ctrl_ptr->my_hostname, ptr->resource);
        }
    }
    else if ( ptr->alarm_type == STATIC_ALARM )
    {
        /* Use host=<x>.filesystem=type for id*/
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.filesystem=%s",
                 _rmon_ctrl_ptr->my_hostname, ptr->resource);
    }
    else if ((ptr->alarm_type == STANDARD_ALARM) && (strstr(ptr->resource, V_MEMORY_RESOURCE_NAME) != NULL))
    {
        /* AVS memory.
         * Write into the caller's buffer like every other branch ; this
         * used to write into the global alarmData.entity_instance_id,
         * which left the id empty for any caller passing another buffer. */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.processor=%d",
                 _rmon_ctrl_ptr->my_hostname, ptr->socket_id);
    }
    else if (strstr(ptr->resource, V_CINDER_THINPOOL_RESOURCE_NAME) != NULL)
    {
        /* Cinder thin pool alarm should not be raised against a specific host */
        /* as the volumes are synced between controllers through drbd. */
        /* Instead we use a common entity instance id for both controllers. */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "host=controller");
    }
    else
    {
        /* Use hostname for alarm ; copied through "%s" so that the
         * hostname is never interpreted as a format string */
        snprintf(entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", _rmon_ctrl_ptr->my_hostname);
    }

    dlog ("resource %s entity instance id: %s\n", ptr->resource, entity_instance_id);
    return;
}
/*****************************************************************************
*
* Name : thinpool_virtual_space_usage_init
*
* Purpose : Determine if we should monitor virtual usage or not: no purpose
* in doing so if thin provisioning is not used.
*
* Params : index - the index of the virtual space resource
*
* Return : None.
*
*****************************************************************************/
void thinpool_virtual_space_usage_init(int index,
const char *poolName,
const char *poolOwner) {
if (!poolName or !poolOwner) {
slog ("No poolName or poolOwner provided");
return;
}
ilog("index = %d, poolName = %s, poolOwner = %s", index, poolName, poolOwner);
/* Buffer (and its size) for keeping the initial result after executing
the above command. */
char current_pool_type[BUFFER_SIZE];
const unsigned int buffer_size = BUFFER_SIZE;
/* The command for seeing if the pool type is thin. */
char lvm_thin_cmd[BUFFER_SIZE];
const char *thin_pool_expected_result = NULL;
MEMSET_ZERO(current_pool_type);
MEMSET_ZERO(lvm_thin_cmd);
if (strcmp(poolName, "nova-local-pool") == 0) {
const char *nova_thin_pool_expected_result = "thin-pool";
thin_pool_expected_result = nova_thin_pool_expected_result;
sprintf(lvm_thin_cmd, "lvs --segments | grep \"%s\" | awk '{print $5}'", poolName);
}
else if (strcmp(poolName, "cinder-volumes-pool") == 0) {
const char *cinder_thin_pool_expected_result = "thin";
thin_pool_expected_result = cinder_thin_pool_expected_result;
sprintf(lvm_thin_cmd, "cat /etc/cinder/cinder.conf | awk -F = '/^lvm_type.*=.*/ { print $2; }' | tail -n 1 | tr -d ' '");
}
else {
slog("Invalid pool name given.");
return;
}
/* Result code. */
int rc;
/* Execute the command. */
rc = execute_pipe_cmd(lvm_thin_cmd, current_pool_type, buffer_size);
/* If the command has been executed successfuly, continue. */
if (rc == PASS) {
if (current_pool_type != NULL) {
/* If the pool type is not thin, disable the alarm for virtual
usage. */
ilog("%s current pool type is set to = %s", poolOwner, current_pool_type);
if(strcmp(current_pool_type, thin_pool_expected_result) != 0) {
resource_config[index].alarm_status = ALARM_OFF;
ilog("%s LVM Thinpool Usage alarm off: thin provisioning not used", poolOwner);
} else {
resource_config[index].alarm_status = ALARM_ON;
ilog("%s LVM Thinpool Usage alarm on: thin provisioning used", poolOwner);
}
}
} else {
resource_config[index].alarm_status = ALARM_OFF;
elog("%s LVM Thinpool monitoring state unknown ; alarm disabled (rc:%i)",
poolOwner, rc);
}
}
/*****************************************************************************
*
* Name : virtual_space_usage_init
*
* Purpose : Determine if we should monitor virtual usage or not: no purpose
* in doing so if thin provisioning is not used.
*
* Return : None.
*
*****************************************************************************/
void virtual_space_usage_init(const char* resource_name) {
ilog ("Initialize thin pools for resource %s\n", resource_name);
int index;
if ( get_resource_index( resource_name, &index ) == PASS ) {
if (strcmp(resource_name, V_CINDER_THINPOOL_RESOURCE_NAME) == 0) {
thinpool_virtual_space_usage_init(index,"cinder-volumes-pool","Cinder");
} else if (strcmp(resource_name, V_NOVA_THINPOOL_RESOURCE_NAME) == 0) {
thinpool_virtual_space_usage_init(index, "nova-local-pool","Nova");
}
}
else {
wlog ("failed get_resource_index for resource %s\n", resource_name);
}
}
/*****************************************************************************
 *
 * Name    : rmon_alarming_init
 *
 * Purpose : Restore a resource's failed state from an alarm that is
 *           still active in FM, for instance after rmon is restarted.
 *
 * Notes   : Queries FM for the resource's alarm. When one is found the
 *           resource is marked failed with the alarm's severity, moved
 *           to the monitor wait stage and, except for the instance
 *           resource, has its usage value restored from the alarm's
 *           reason text. For a major filesystem usage alarm a degrade
 *           clear request is also sent to the registered clients.
 *           Nothing is changed when no alarm is found or the query fails.
 *
 *****************************************************************************/
void rmon_alarming_init ( resource_config_type * ptr )
{
    /* the resource type may be unset ; never hand NULL to a %s */
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n",
              ptr->resource, ptr->type ? ptr->type : "(null)", ptr->alarm_type);

    AlarmFilter alarmFilter;
    SFmAlarmDataT *active_alarm = (SFmAlarmDataT*) calloc (1, sizeof (SFmAlarmDataT));
    if (active_alarm == NULL)
    {
        elog("Failed to allocate memory for SFmAlarmDataT\n");
        return;
    }

    /* Build the filter that identifies this resource's alarm. The ids
     * are copied through "%s" so that their content is never
     * interpreted as a format string. */
    build_entity_instance_id (ptr, alarmData.entity_instance_id);
    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", ptr->alarm_id);
    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmData.entity_instance_id);

    /* active_alarm was verified non-NULL above */
    if (fm_get_fault( &alarmFilter, active_alarm) == FM_ERR_OK)
    {
        string reasonText(active_alarm->reason_text);

        /* Mark the resource as already failed and alarmed */
        ptr->failed = true;
        ptr->alarm_raised = true;
        ptr->count = ptr->num_tries;

        /* Set the resource severity */
        if ( active_alarm->severity == FM_ALARM_SEVERITY_MINOR )
        {
            ptr->sev = SEVERITY_MINOR;
        }
        else if ( active_alarm->severity == FM_ALARM_SEVERITY_MAJOR )
        {
            ptr->sev = SEVERITY_MAJOR;
            if ( ptr->res_type == RESOURCE_TYPE__FILESYSTEM_USAGE )
            {
                string err_res_name(ptr->resource);
                _space_to_underscore(err_res_name);

                /* clear host degrade for fs usage alarms */
                snprintf(ptr->errorMsg, sizeof(ptr->errorMsg), "%s %s:",
                         err_res_name.c_str(),
                         DEGRADE_CLEAR_MSG );
                rmon_send_request ( ptr, _rmon_ctrl_ptr->clients );
            }
        }
        else
        {
            ptr->sev = SEVERITY_CRITICAL;
        }

        resourceStageChange ( ptr, RMON_STAGE__MONITOR_WAIT );

        if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) != 0)
        {
            /* Restore the resource usage from the alarm's reason text */
            _set_resource_usage( reasonText, ptr );
            ilog ("%s setting previously failed resource alarm id: %s entity_instance_id: %s usage: %0.2f\n",
                      ptr->resource, ptr->alarm_id, alarmFilter.entity_instance_id, ptr->resource_value);
        }
        else
        {
            ilog ("%s setting previously failed resource alarm id: %s entity_instance_id: %s\n",
                      ptr->resource, ptr->alarm_id, alarmFilter.entity_instance_id);
        }
    }

    free(active_alarm);
}
/*****************************************************************************
*
* Name : send_clear_msg
*
* Purpose : Send a message to all registered clients to set the node to
* available (clear the degrade)
*
*****************************************************************************/
void send_clear_msg ( int index )
{
int count = 0;
AlarmFilter alarmFilter;
SFmAlarmDataT *active_alarm = (SFmAlarmDataT*) calloc (1, sizeof (SFmAlarmDataT));
if (active_alarm == NULL)
{
elog("Failed to allocate memory for SFmAlarmDataT\n");
return;
}
string err_res_name(resource_config[index].resource);
_space_to_underscore(err_res_name);
snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, resource_config[index].alarm_id);
build_entity_instance_id (&resource_config[index], alarmData.entity_instance_id);
snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, alarmData.entity_instance_id);
/* Notify rmon clients of fault being cleared */
snprintf(resource_config[index].errorMsg, sizeof(resource_config[index].errorMsg),
"%s cleared_alarms_for_resource:", err_res_name.c_str());
/* check if there is an alarm first for this resource. If there is not then the node */
/* should not be in a degrade state */
EFmErrorT ret = fm_get_fault( &alarmFilter, active_alarm);
if ( (ret == FM_ERR_OK) && (active_alarm != NULL) )
{
while (( rmon_send_request ( &resource_config[index], _rmon_ctrl_ptr->clients ) != PASS ) && (count < 3 ))
{
wlog ("%s request send failed \n", resource_config[index].resource);
count++;
}
if (count > 2)
{
wlog ("%s request send failed, count:%d \n", resource_config[index].resource, count);
resource_config[index].failed_send++;
}
if ((resource_config[index].failed_send == MAX_FAIL_SEND) || (count < 3))
{
/* Reset the values to defaults */
swact_count = 0;
ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
resource_config[index].failed = false ;
resource_config[index].alarm_raised = false ;
resource_config[index].count = 0 ;
resource_config[index].sev = SEVERITY_CLEARED ;
resource_config[index].stage = RMON_STAGE__START ;
resource_config[index].failed_send = 0;
}
}
else //alarm not found or error
{
if (ret == FM_ERR_ENTITY_NOT_FOUND)
{
dlog ("Alarm not found for resource: %s entity_instance_id: %s \n", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
}
else
{
wlog ("fm_get_fault failed for resource: %s entity_instance_id: %s err: %d\n", alarmFilter.alarm_id,
alarmFilter.entity_instance_id, ret);
}
if (active_alarm == NULL)
{
elog("fm_get_fault returned null active_alarm\n");
}
swact_count++;
if (swact_count == MAX_SWACT_COUNT)
{
/* Reset the values to defaults */
while (( rmon_send_request ( &resource_config[index], _rmon_ctrl_ptr->clients ) != PASS ) && (count < 3 ))
{
wlog ("%s request send failed \n", resource_config[index].resource);
count++;
}
swact_count = 0;
ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
resource_config[index].failed = false ;
resource_config[index].alarm_raised = false ;
resource_config[index].count = 0 ;
resource_config[index].sev = SEVERITY_CLEARED ;
resource_config[index].stage = RMON_STAGE__START ;
resource_config[index].failed_send = 0;
}
}
free(active_alarm);
}
/*****************************************************************************
 *
 * Name    : read_fs_file
 *
 * Purpose : Read the memory mapped dynamic file system file
 *
 * Params  : dynamic_resources - list that every comma terminated entry of
 *                               DYNAMIC_FS_FILE is appended to
 *
 * Description:
 *
 *   The file is read under a whole-file read lock. Its content, up to the
 *   first NUL byte or the end of the file, is split on ',' ; text that
 *   follows the last ',' is not a complete entry and is ignored.
 *
 *   Nothing is appended if the file does not exist, is empty, or cannot be
 *   stat'ed or mapped.
 *
 *****************************************************************************/
void read_fs_file ( vector<string> & dynamic_resources )
{
    const string delimiter = ",";

    FILE * pFile = fopen (DYNAMIC_FS_FILE , "r");
    if (pFile == NULL)
    {
        /* no dynamic file system file ; nothing to add */
        return ;
    }

    int fd = fileno(pFile);
    if (fd == -1)
    {
        elog("Error opening file for reading");
        fclose(pFile);
        return ;
    }

    /* take a read lock over the whole file ; blocks until it is granted */
    struct flock fl;
    memset (&fl, 0, sizeof(fl));
    fl.l_type   = F_RDLCK;
    fl.l_whence = SEEK_SET;
    fl.l_start  = 0;
    fl.l_len    = 0;
    if (fcntl(fd, F_SETLKW, &fl) == -1)
    {
        /* best effort ; carry on and read the file without the lock */
        wlog("Failed to lock %s for reading (%s)\n", DYNAMIC_FS_FILE, strerror(errno));
    }

    string str;
    struct stat fileInfo;
    memset (&fileInfo, 0 , sizeof(fileInfo));
    if (fstat(fd, &fileInfo) == -1)
    {
        elog("Error getting the file size");
    }
    else if (fileInfo.st_size > 0) /* a zero length mapping is invalid */
    {
        const size_t map_len = (size_t)fileInfo.st_size;
        void * map = mmap(0, map_len, PROT_READ, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
        {
            elog("Error mmapping the file");
        }
        else
        {
            /* The mapping carries no terminator of its own ; bound the
             * copy by the file size rather than trusting a NUL to exist. */
            const char * data = static_cast<const char*>(map);
            str.assign(data, strnlen(data, map_len));

            /* free the mmapped memory */
            if (munmap(map, map_len) == -1)
            {
                elog("Error un-mmapping the file");
            }
        }
    }

    /* unlock the file while its descriptor is still open, then close it */
    fl.l_type = F_UNLCK;
    fcntl(fd, F_SETLK, &fl);
    fclose(pFile);

    /* separate the resources from the file */
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = str.find(delimiter, start)) != string::npos)
    {
        string token = str.substr(start, pos - start);
        dynamic_resources.push_back(token);
        dlog("reading resource %s \n", token.c_str());
        start = pos + delimiter.length();
    }
}
/*****************************************************************************
 *
 * Name    : add_dynamic_fs_resource
 *
 * Purpose : Add the dynamic file system resources
 *
 * Params  : send_response - when true, tell the rmon client that the
 *                           dynamic file system file has been processed
 *
 * Description:
 *
 *   Only active when built with WANT_FS_MONITORING.
 *
 *   Each entry read from the dynamic file system file is parsed as
 *
 *      <resource> <state> <type> <device> [<mount point>]
 *
 *   The monitored resource name is the mount point when there is one,
 *   otherwise the device. Resources that are already in resource_config
 *   are recorded in fs_index/fs_state, together with the state just read,
 *   for the caller to act on. New resources are added and, if their state
 *   is "enabled", have their usage calculated and alarming initialized.
 *
 *   Entries with fewer than the 4 mandatory fields are ignored.
 *
 *****************************************************************************/
void add_dynamic_fs_resource ( bool send_response )
{
#ifdef WANT_FS_MONITORING

    string criticality = "critical";
    vector<string> resource_list;
    int absolute_thresholds[3];

    memset(absolute_thresholds, 0, sizeof(absolute_thresholds));

    fs_index.clear();
    fs_state.clear();

    /* get a list of all the dynamic fs mounts */
    read_fs_file(resource_list);

    for (std::vector<string>::iterator it = resource_list.begin(); it != resource_list.end(); ++it)
    {
        /* zeroed for every entry so that nothing is carried over from the
         * previous one ; widths below match these sizes (size - 1) */
        char temp_resource[50] = "";
        char temp_state[20]    = "";
        char type[50]          = "";
        char device[50]        = "";
        char mount_point[50]   = "";
        char resource[50]      = "";

        /* For resources without mounts the mount_point stays empty */
        int fields = sscanf(it->c_str(), "%49s %19s %49s %49s %49s",
                            temp_resource, temp_state, type, device, mount_point);
        if (fields < 4)
        {
            wlog ("ignoring malformed dynamic file system entry '%s'\n", it->c_str());
            continue;
        }

        const bool mounted = (mount_point[0] != '\0');

        /* for resources with mounts, the resource name is the mount value
         * for resources without mounts, it is the device value */
        snprintf(resource, sizeof(resource), "%s", mounted ? mount_point : device);

        string state(temp_state);
        bool found = false;

        /* the dynamic file system is enabled, add it if need be */
        for (int i=0; i<_rmon_ctrl_ptr->resources; i++)
        {
            if ( strcmp(resource, resource_config[i].resource) == 0)
            {
                dlog ("resource %s already exists, update the state to %s \n", resource, state.c_str());
                /* resource already exists no need to add it again */
                /* update the state, it may have changed */
                fs_index.push_back(i);
                fs_state.push_back(state);
                found = true;
                break;
            }
        }

        if (!found) // new resource to monitor, lets add it
        {
            int enabled_resource = ALARM_OFF;

            if (strcmp(temp_state,"enabled") == 0)
            {
                enabled_resource = ALARM_ON;
            }

            if (mounted)
            {
                save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, DYNAMIC_ALARM, type, device, MOUNTED );
            }
            else
            {
                save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, DYNAMIC_ALARM, type, device, NOT_MOUNTED );
            }

            if (enabled_resource == ALARM_ON)
            {
                /* the resource just saved is the last one in the table */
                calculate_fs_usage( &resource_config[_rmon_ctrl_ptr->resources - 1] );
                rmon_alarming_init( &resource_config[_rmon_ctrl_ptr->resources - 1] );
            }
        }
    }
#endif

    if (send_response)
    {
#ifdef WANT_FS_MONITORING
        ilog ("sending response to dynamic FS add, to the rmon client\n");
#else
        ilog("dynamic filesystem monitoring moved to collectd\n");
#endif
        /* let the rmon client know that we are done with the file */
        rmon_resource_response(_rmon_ctrl_ptr->clients);
    }
}
/*****************************************************************************
 *
 * Name    : clear_alarm_for_resource
 *
 * Purpose : Clear the alarm of the resource passed in
 *
 * Params  : ptr - resource whose alarm_id and entity instance id select
 *                 the FM alarm to clear
 *
 * Description:
 *
 *   The outcome is only logged ; an alarm that does not exist is not
 *   treated as an error.
 *
 *****************************************************************************/
void clear_alarm_for_resource ( resource_config_type * ptr )
{
    dlog ("resource name: %s, resource type: %s, alarm type: %d \n", ptr->resource, ptr->type, ptr->alarm_type);

    AlarmFilter alarmFilter;

    build_entity_instance_id (ptr, alarmData.entity_instance_id);

    /* Copy through an explicit "%s" ; the ids are data, not format strings */
    snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", ptr->alarm_id);
    snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmData.entity_instance_id);

    int ret = rmon_fm_clear(&alarmFilter);
    if (ret == FM_ERR_OK)
    {
        ilog ("Cleared stale alarm %s for entity instance id: %s", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
    }
    else if (ret == FM_ERR_ENTITY_NOT_FOUND)
    {
        dlog ("Stale alarm %s for entity instance id: %s was not found", alarmFilter.alarm_id, alarmFilter.entity_instance_id);
    }
    else
    {
        wlog ("Failed to clear stale alarm %s for entity instance id: %s error: %d", alarmFilter.alarm_id, alarmFilter.entity_instance_id, ret);
    }
}
/*****************************************************************************
*
* Name : process_dynamic_fs_file
*
* Purpose : read the dynamic files directory and add the dynamic filesystem
* resources when the file is updated
*****************************************************************************/
void process_dynamic_fs_file()
{
int index = 0;
pthread_mutex_lock(&lock);
modifyingResources = true;
pthread_mutex_unlock(&lock);
add_dynamic_fs_resource(true);
pthread_mutex_lock(&lock);
modifyingResources = false;
pthread_mutex_unlock(&lock);
/* deal with changes of dynamic file system enabled state */
for (unsigned int i=0; i<fs_index.size(); i++)
{
index = fs_index.at(i);
if ( strcmp(fs_state.at(i).c_str(), "disable") == 0 )
{
/* resource has been disabled, stop alarming on it */
ilog("%s is no longer enabled\n", resource_config[index].resource);
if ( resource_config[index].failed == true )
{
resource_config[index].alarm_status = ALARM_OFF;
if ( _rmon_ctrl_ptr->clients > 0 )
{
//send a clear degrade node
send_clear_msg(index);
}
// we need to clear the resource's alarm if there was any set for this resource
clear_alarm_for_resource(&resource_config[index]);
}
else
{
/* There was no active alarm to clear */
ilog("Setting resource: %s back to defaults \n", resource_config[index].resource);
resource_config[index].alarm_status = ALARM_OFF;
resource_config[index].failed = false;
resource_config[index].alarm_raised = false;
resource_config[index].count = 0 ;
resource_config[index].sev = SEVERITY_CLEARED ;
resource_config[index].stage = RMON_STAGE__START ;
}
}
else if ( strcmp(fs_state.at(i).c_str(), "enabled") == 0 )
{
// resource has been enabled
if ( resource_config[index].alarm_status == ALARM_OFF )
{
/* Turn the resource checking back on if it was off */
resource_config[index].alarm_status = ALARM_ON;
//reset values
resource_config[index].failed = false;
resource_config[index].alarm_raised = false;
resource_config[index].count = 0 ;
resource_config[index].sev = SEVERITY_CLEARED ;
resource_config[index].stage = RMON_STAGE__START ;
rmon_alarming_init( &resource_config[index] );
ilog("%s is now enabled \n", resource_config[index].resource);
if (strcmp(resource_config[index].resource, CINDER_VOLUMES) == 0)
{
virtual_space_usage_init(V_CINDER_THINPOOL_RESOURCE_NAME);
}
if (strcmp(resource_config[index].resource, NOVA_LOCAL) == 0)
{
virtual_space_usage_init(V_NOVA_THINPOOL_RESOURCE_NAME);
}
}
else // alarm aready on (enabled)
{
ilog("%s is already enabled \n", resource_config[index].resource);
}
}
else
{
wlog("%s invalid dynamic file system state: %s \n", resource_config[index].resource, fs_state.at(i).c_str());
}
}
}
/*****************************************************************************
*
* Name : process_static_fs_file
*
* Purpose : Reads in the list of static file systems for monitoring
*
*****************************************************************************/
void process_static_fs_file()
{
FILE * pFile;
vector<string> mounts;
char buf[MAX_LEN];
char resource[50];
char type[50];
char device[50];
bool found = false;
int enabled_resource = ALARM_ON;
string criticality = "critical";
int absolute_thresholds[3] = {0};
pFile = fopen (STATIC_FS_FILE , "r");
if (pFile != NULL) {
ifstream fin( STATIC_FS_FILE );
string line;
while( getline( fin, line )) {
/* process each line */
mounts.push_back(line);
}
fclose(pFile);
for(std::vector<string>::iterator it = mounts.begin(); it != mounts.end(); ++it)
{
string str = *it;
snprintf(buf, MAX_LEN, str.c_str());
sscanf(buf, "%49s %49s %49s %d %d %d", resource, device, type, &absolute_thresholds[0], &absolute_thresholds[1], &absolute_thresholds[2]);
if (!found)
{
if (fs_percent == PERCENT_USED)
{
/* do not use the absolute thresholds */
memset(absolute_thresholds, 0, sizeof(absolute_thresholds));
}
/* add the resource */
save_fs_resource ( resource, criticality, enabled_resource, fs_percent, absolute_thresholds, STATIC_ALARM, type, device, MOUNTED );
calculate_fs_usage( &resource_config[_rmon_ctrl_ptr->resources - 1] );
}
}
}
else
{
elog("Error, no static file system file present at: %s\n", STATIC_FS_FILE);
}
}
/*****************************************************************************
 *
 * Name    : rmon_timer_handler
 *
 * Purpose : Looks up the timer ID and asserts the corresponding ringer
 *
 * Params  : sig - unused
 *           si  - signal info ; si_value.sival_ptr points to the timer id
 *           uc  - unused
 *
 * Description:
 *
 *   The timer id is matched, in this order, against the event timer, the
 *   pm timer, the ntp timer (on controllers only), the resource timers and
 *   the thinmeta resource timers. The first match is stopped and has its
 *   ring flag set. A timer id that matches none of them is stopped by id.
 *
 *****************************************************************************/
void rmon_timer_handler ( int sig, siginfo_t *si, void *uc)
{
    /* Avoid compiler errors/warnings for parms we must
     * have but currently do nothing with */
    UNUSED(sig);
    UNUSED(uc);

    timer_t * tid_ptr = (timer_t*)si->si_value.sival_ptr ;
    if ( !(*tid_ptr) )
    {
        /* Called with a NULL Timer ID */
        return ;
    }

    /* the event timer */
    if ( *tid_ptr == rmonTimer_event.tid )
    {
        mtcTimer_stop_int_safe ( rmonTimer_event);
        rmonTimer_event.ring = true ;
        return ;
    }

    /* the pm timer */
    if ( *tid_ptr == rmonTimer_pm.tid )
    {
        mtcTimer_stop_int_safe ( rmonTimer_pm);
        rmonTimer_pm.ring = true ;
        return ;
    }

    /* the ntp timer ; only looked at on a controller */
    if ( (is_controller()) && (*tid_ptr == rmonTimer_ntp.tid) )
    {
        mtcTimer_stop_int_safe ( rmonTimer_ntp);
        rmonTimer_ntp.ring = true ;
        return ;
    }

    /* one of the resource timers */
    for ( int r = 0 ; r < _rmon_ctrl_ptr->resources ; r++ )
    {
        if ( *tid_ptr == rtimer[r].tid )
        {
            mtcTimer_stop_int_safe ( rtimer[r] );
            rtimer[r].ring = true ;
            return ;
        }
    }

    /* one of the thinmeta resource timers */
    for ( int t = 0 ; t < _rmon_ctrl_ptr->thinmeta_resources ; t++ )
    {
        if ( *tid_ptr == thinmetatimer[t].tid )
        {
            mtcTimer_stop_int_safe ( thinmetatimer[t] );
            thinmetatimer[t].ring = true ;
            return ;
        }
    }

    /* try and cleanup by stopping this unknown timer via its tid */
    mtcTimer_stop_tid_int_safe (tid_ptr);
}
/*****************************************************************************
*
* Name : clear_ntp_alarms
*
* Purpose : Loop through each current alarms and deleted them if the server
* is now reachable or the server no longer is assigned to ntpq
*
*****************************************************************************/
void clear_ntp_alarms(std::list<string> &non_reachable_ntp_servers, unsigned int alarm_count, SFmAlarmDataT *active_alarms, bool clear_major_alarm)
{
dlog ("Total NTP alarm_count:%d", alarm_count);
AlarmFilter alarmFilter;
char alarm_to_search[FM_MAX_BUFFER_LENGTH];
fm_alarm_id alarm_id;
snprintf(alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
// clear the major alarms if required
if (clear_major_alarm)
{
snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID );
snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp", _rmon_ctrl_ptr->my_hostname);
int ret = rmon_fm_clear(&alarmFilter);
if (ret != FM_ERR_OK)
{
if (ret != FM_ERR_ENTITY_NOT_FOUND)
{
wlog ("Failed to clear major alarm %s for entity instance id:%s error:%d", NTP_ALARM_ID, alarmFilter.entity_instance_id, ret);
}
}
else
{
ilog ("Cleared major alarm %s for entity instance id:%s", NTP_ALARM_ID, alarmFilter.entity_instance_id);
}
}
if (active_alarms == NULL)
{
elog ("Null pointer for active_alarms");
return;
}
// clear minor alarms if required
bool found;
std::list<string>::iterator iter;
std::list<string>::iterator iter_bad_list;
// for each NTP alarms in the system see if it match any of the invalid NTP servers
// if it does not match then the alarm must be removed since that NTP server
// is no longer being monitored or is now valid
for ( unsigned int i = 0; i < alarm_count; i++ )
{
if ( ((active_alarms+i)->severity) == FM_ALARM_SEVERITY_MINOR )
{
// Verify that this NTP minor alarm is still valid, This server could no longer exist or is now marked
// reachable
dlog ("Verify NTP minor alarm is still valid, entity instance id:%s", (active_alarms+i)->entity_instance_id);
found = false;
// check for stale minor alarm
for ( iter = non_reachable_ntp_servers.begin (); iter != non_reachable_ntp_servers.end (); iter++ )
{
// e.g. host=controller-0.ntp=102.111.2.2
snprintf(alarm_to_search, FM_MAX_BUFFER_LENGTH, "%s.ntp=%s", _rmon_ctrl_ptr->my_hostname, iter->c_str());
dlog ("Non reachable NTP server to search %s", iter->c_str());
if (strstr((active_alarms+i)->entity_instance_id, iter->c_str()) != NULL)
{
// server is in non reachable list, do not clear it
found = true;
dlog ("Alarm is still valid %s", iter->c_str());
break;
}
}
if (!found)
{
// lets clear it but only if it's this controller's alarm, it could be the peer controller's alarm
if (strstr((active_alarms+i)->entity_instance_id, _rmon_ctrl_ptr->my_hostname) != NULL)
{
snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", (active_alarms+i)->entity_instance_id);
if (rmon_fm_clear(&alarmFilter) != FM_ERR_OK)
{
wlog ("Failed to clear minor alarm %s for entity instance id:%s", NTP_ALARM_ID, (active_alarms+i)->entity_instance_id);
}
else
{
ilog ("Cleared minor alarm %s for entity instance id:%s", NTP_ALARM_ID, (active_alarms+i)->entity_instance_id);
}
}
}
}
}
}
/*****************************************************************************
 *
 * Name    : ntp_query_results
 *
 * Purpose : Analyze the return code from script query_ntp_servers.sh.
 *           Create alarms if the servers are non reachable, Clear alarms
 *           if they are now reachable
 *
 * Params  : ntp_query_status - status reported by the query script
 *
 * Description:
 *
 *   1. Unless no NTP servers are provisioned, the non reachable servers
 *      are read from the second line of /tmp/ntpq_server_info.
 *   2. The current NTP alarms are fetched from FM.
 *   3. Alarms that are no longer valid are cleared ; the major alarm is
 *      included when servers are not provisioned, some are reachable or
 *      all is ok.
 *   4. A major alarm is raised if no server is reachable or selected and
 *      a minor alarm is raised for every non reachable server that does
 *      not have one yet.
 *
 *   The list of fetched alarms is released on every path out of this
 *   function once it has been allocated.
 *
 *****************************************************************************/
void ntp_query_results (int ntp_query_status )
{
    dlog ("ntp_query_results ntp_query_status:%d", ntp_query_status);

    std::list<string> non_reachable_ntp_servers;

    // if no NTP servers are provisioned on the system, we still need to clear old NTP
    // alarms if there are any. But we do not need to read the tmp server file.
    if (ntp_query_status != NTP_NOT_PROVISIONED)
    {
        // read the temp file which contains a list of reachable and non reachable servers
        // this file is the output from the query_ntp_servers.sh script
        const char *server_info = "/tmp/ntpq_server_info";
        FILE *pFile = fopen(server_info, "r");
        if (pFile == NULL)
        {
            elog("Failed to open file: %s\n", server_info);
            return;
        }

        const char * delim = ";\n\r";
        char line[500];
        int  pos = 0;

        memset(line, 0, sizeof(line));
        while ( fgets(line, sizeof(line), pFile) != NULL )
        {
            // the first line in the tmp file is the reachable servers, the second is the non reachable servers
            if (pos == 1)
            {
                for (char * ip = strtok (line, delim); ip; ip = strtok (NULL, delim))
                {
                    non_reachable_ntp_servers.push_back(ip);
                    dlog("Found non reachable NTP servers:%s\n", ip);
                }
                break;
            }
            pos++;
        }
        fclose(pFile);
    }

    // retrieve all the current NTP alarms
    int rc;
    unsigned int max_alarms=75;
    fm_alarm_id alarm_id;
    snprintf(alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);

    SFmAlarmDataT *active_alarms = (SFmAlarmDataT*) calloc (max_alarms, sizeof (SFmAlarmDataT));
    if (active_alarms == NULL)
    {
        elog ("Failed to allocate memory for NTP alarms");
        return;
    }

    int ret = fm_get_faults_by_id( &alarm_id, active_alarms, &max_alarms);
    if (!(ret == FM_ERR_OK || ret == FM_ERR_ENTITY_NOT_FOUND))
    {
        elog ("fm_get_faults_by_id failed trying to retreive all the NTP alarms, error:%d", ret);
        free(active_alarms);
        return;
    }

    // Clear alarms if required
    bool clear_major_alarm = false;
    if ( ntp_query_status == NTP_NOT_PROVISIONED || ntp_query_status == NTP_SOME_REACHABLE || ntp_query_status == NTP_OK )
    {
        // We are going to clear the major alarm since there is at least one server selected or
        // no servers are provisioned
        clear_major_alarm = true;
    }

    // fm_get_faults_by_id returns the number of alarms found
    if (max_alarms != 0)
    {
        // verify if alarms need to cleared and clear them
        clear_ntp_alarms(non_reachable_ntp_servers, max_alarms, active_alarms, clear_major_alarm);
    }

    // There are no NTP servers provisioned so there is no alarms to raise
    if (ntp_query_status == NTP_NOT_PROVISIONED)
    {
        free(active_alarms);
        return;
    }

    // Raise alarms if required

    // Set up alarms data
    AlarmFilter alarmFilter;

    snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action), "Monitor and if condition persists, contact next level of support.");
    snprintf(alarmData.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", NTP_ALARM_ID);
    strcpy(alarmData.uuid, "");
    snprintf(alarmData.entity_type_id, FM_MAX_BUFFER_LENGTH, "ntp");
    alarmData.alarm_state       = FM_ALARM_STATE_SET;
    alarmData.alarm_type        = FM_ALARM_COMM;
    alarmData.probable_cause    = FM_ALARM_CAUSE_UNKNOWN;
    alarmData.timestamp         = 0;
    alarmData.service_affecting = FM_FALSE;
    alarmData.suppression       = FM_FALSE;

    // Here we raise the major alarm if required
    if (ntp_query_status == NTP_NONE_REACHABLE || ntp_query_status == NTP_SOME_REACHABLE_NONE_SELECTED)
    {
        wlog("NTP configuration does not contain any valid or reachable NTP servers");

        // Check if alarm is raised already
        snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp", _rmon_ctrl_ptr->my_hostname);

        bool found = false;
        for ( unsigned int i = 0; i < max_alarms; i++ )
        {
            if ( strncmp((active_alarms+i)->entity_instance_id, alarmFilter.entity_instance_id, sizeof((active_alarms+i)->entity_instance_id)) == 0 )
            {
                // Alarm already exist
                dlog("Alarm %s already raised for entity instance id:%s\n", NTP_ALARM_ID, alarmFilter.entity_instance_id);
                found = true;
                break;
            }
        }

        // Alarm does not exist so raise it
        if (!found)
        {
            alarmData.severity = FM_ALARM_SEVERITY_MAJOR;
            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text), "NTP configuration does not contain any valid or reachable NTP servers.");
            snprintf(alarmData.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmFilter.entity_instance_id);

            rc = rmon_fm_set(&alarmData, NULL);
            if (rc == FM_ERR_OK )
            {
                ilog("Alarm %s created for entity instance id:%s \n", NTP_ALARM_ID, alarmData.entity_instance_id);
            }
            else
            {
                ilog("Failed to create alarm %s for entity instance id:%s error: %d \n", NTP_ALARM_ID, alarmData.entity_instance_id, (int)rc);
            }
        }
    }

    // Here we raise alarms for individual servers
    if (ntp_query_status != NTP_OK)
    {
        wlog("Some or all of the NTP servers are not reachable");

        std::list<string>::iterator iter;
        alarmData.severity = FM_ALARM_SEVERITY_MINOR;

        // Loop through all the non reachable NTP servers
        // Check to see if an alarm is already raised for the server.
        // If we do not find an alarm for the server then we raise it
        for ( iter = non_reachable_ntp_servers.begin (); iter != non_reachable_ntp_servers.end (); iter++ )
        {
            bool found = false;

            // Build the alarm entity instance id
            snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s.ntp=%s", _rmon_ctrl_ptr->my_hostname, iter->c_str());
            dlog("Search alarms for entity instance id:%s \n", alarmFilter.entity_instance_id);

            for ( unsigned int i = 0; i < max_alarms; i++ )
            {
                if ( strncmp((active_alarms+i)->entity_instance_id, alarmFilter.entity_instance_id, sizeof((active_alarms+i)->entity_instance_id)) == 0 )
                {
                    dlog("Alarm %s already raised for entity instance id:%s\n", NTP_ALARM_ID, alarmFilter.entity_instance_id);
                    found = true;
                    break;
                }
            }

            // If the NTP alarm was not found then raise one for this NTP server
            if (!found)
            {
                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text), "NTP address %s is not a valid or a reachable NTP server.", iter->c_str() );
                snprintf(alarmData.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmFilter.entity_instance_id);

                rc = rmon_fm_set(&alarmData, NULL);
                if (rc == FM_ERR_OK )
                {
                    ilog("Alarm %s created for entity instance id:%s \n", NTP_ALARM_ID, alarmData.entity_instance_id);
                }
                else
                {
                    ilog("Failed to create alarm %s for entity instance id:%s error:%d \n", NTP_ALARM_ID, alarmData.entity_instance_id, (int)rc);
                }
            }
        }
    }

    free(active_alarms);
    return;
}
/*****************************************************************************
 *
 * Name    : query_ntp_servers
 *
 * Purpose : Fork a child that executes the NTPQ_QUERY_SCRIPT found in
 *           RMON_FILES_DIR.
 *
 * Description:
 *
 *   The script queries the health of the NTP servers. It returns a status
 *   code and creates a temporary file that holds the lists of reachable
 *   and non reachable NTP servers ; that file is required to generate the
 *   proper alarms.
 *
 *   The pid returned by fork is saved in ntp_child_pid.
 *
 * Returns : PASS in the parent once the child is forked
 *           FAIL if the fork failed
 *           The child never returns ; it exits with NTP_ERROR if it could
 *           not be set up or the script could not be executed.
 *
 *****************************************************************************/
int query_ntp_servers ( )
{
    dlog ("Main Pid:%d \n", getpid() );

    pid_t child_pid = fork ();
    ntp_child_pid = child_pid ;

    if ( child_pid == -1 )
    {
        elog ("Fork failed (%s)\n", strerror(errno));

        /* TODO: Consider making this a critical fault
         *       after 100 retries.
         *       All possibilities based on man page are
         *       due to resource limitations and if that does
         *       not resolve in 100 retries then it probably will never.
         **/
        return (FAIL);
    }

    if ( child_pid != 0 )
    {
        /* parent ; the child is on its way */
        return (PASS);
    }

    /* From here on this is the child */
    dlog ("Child Pid:%d \n", getpid() );

    char* argv[] = {(char*)NTPQ_QUERY_SCRIPT, NULL};

    /* full path of the script to execute */
    char cmd[MAX_FILE_SIZE] ;
    memset (cmd,0,MAX_FILE_SIZE);
    snprintf ( &cmd[0], MAX_FILE_SIZE, "%s/%s", RMON_FILES_DIR, NTPQ_QUERY_SCRIPT );

    bool close_file_descriptors = true ;
    if ( setup_child ( close_file_descriptors ) != PASS )
    {
        exit(NTP_ERROR);
    }

    /* Put the child exit signal back to its default handling */
    signal (SIGCHLD, SIG_DFL);

    /* Only returns if the script could not be executed */
    int res = execv(cmd, argv);
    elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) );
    exit (NTP_ERROR);
}
/*****************************************************************************
*
* Name : rmonHdlr_ceilometer_handler
*
* Purpose : Handles the ceilometer sample create response message
*
*****************************************************************************/
void rmonHdlr_ceilometer_handler( struct evhttp_request *req, void *arg )
{
if ( !req )
{
elog (" Request Timeout\n");
ceilometerEvent.status = FAIL_TIMEOUT;
goto _ceilometer_handler_done ;
}
ceilometerEvent.status = rmonHttpUtil_status(ceilometerEvent);
if ( ceilometerEvent.status != PASS )
{
elog ("ceilometer HTTP request Failed (%d)\n", ceilometerEvent.status);
rmonHttpUtil_get_response(ceilometerEvent);
goto _ceilometer_handler_done ;
}
_ceilometer_handler_done:
event_base_loopbreak((struct event_base *)arg);
}
/*****************************************************************************
 *
 * Name    : generate_ceilometer_pm
 *
 * Purpose : Generate ceilometer PMs through the REST API
 *
 * Params  : r_id       - sent as "resource_id"
 *           m_id       - sent as "counter_name" ; also selects the meter
 *                        in the request address "/v2/meters/<m_id>"
 *           m_type     - sent as "counter_type"
 *           m_unit     - sent as "counter_unit"
 *           m_volume   - sent as "counter_volume"
 *           m_metadata - sent as "resource_metadata" ; a dictionary of
 *                        key-value pairs that is appended as is
 *
 * Description:
 *
 *   The sample create request is sent to the keystone auth host on the
 *   ceilometer port and is attempted until it passes or has failed
 *   REST_API_RETRY_COUNT times. The final failure is only logged.
 *
 *****************************************************************************/
void generate_ceilometer_pm ( string r_id, string m_id, string m_type,
                              string m_unit, string m_volume,
                              string m_metadata )
{
    int rc = PASS;
    daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
    string command_path="";
    string host_ip = cfg_ptr->keystone_auth_host;
    int port = cfg_ptr->ceilometer_port;
    int count = 0;

    /* The sample is the same for every attempt ; build it once */
    string payload = "[{";
    payload.append("\"resource_id\":\"");
    payload.append(r_id);
    payload.append("\",\"counter_name\":\"");
    payload.append(m_id);
    payload.append("\",\"counter_type\":\"");
    payload.append(m_type);
    payload.append("\",\"counter_unit\":\"");
    payload.append(m_unit);
    payload.append("\",\"counter_volume\":\"");
    payload.append(m_volume);
    payload.append("\",\"resource_metadata\":");
    // the resource metadata is dictionary of key-value pairs
    payload.append(m_metadata);
    payload.append("}]");

    do
    {
        /* Every attempt sets the event up from scratch, exactly the way a
         * new call to this function does, and then issues the request.
         * NOTE(review): relies on the init being repeatable on the shared
         * ceilometerEvent, which it already is from one call to the next. */
        rmonHttpUtil_libEvent_init ( &ceilometerEvent, CEILOMETER_EVENT_SIG, host_ip, port);

        ceilometerEvent.address.append("/v2/meters/");
        ceilometerEvent.address.append(m_id);

        ceilometerEvent.user_agent = "ceilometerclient.openstack.common.apiclient";
        ceilometerEvent.payload = payload;

        dlog ("Payload is : %s\n", ceilometerEvent.payload.c_str());

        rc = rmonHttpUtil_api_request (CEILOMETER_SAMPLE_CREATE, ceilometerEvent, command_path);
        if ( rc != PASS )
        {
            count++;
            wlog ("ceilometer failed request (%d) ... retrying (%d)\n", rc, count);
        }
        rmonHttpUtil_log_event (ceilometerEvent);

    } while ( ( rc!=PASS ) && ( count < REST_API_RETRY_COUNT ) );

    if ( rc!= PASS )
    {
        elog ("ceilometer sample create Failed (%d) (cnt:%d)\n", rc, count);
    }
}
/*****************************************************************************
 *
 * Name    : clear_rmon_api_counts
 *
 * Purpose : Zero the back to back miss and the send, receive and message
 *           error counts of a client. The back to back miss and message
 *           error counts are saved as the new peak first if they exceed
 *           the current one.
 *
 *****************************************************************************/
void clear_rmon_api_counts ( registered_clients * ptr )
{
    /* remember the highest counts seen so far */
    if ( ptr->b2b_miss_count > ptr->b2b_miss_peak )
        ptr->b2b_miss_peak = ptr->b2b_miss_count ;

    if ( ptr->mesg_err_cnt > ptr->mesg_err_peak )
        ptr->mesg_err_peak = ptr->mesg_err_cnt ;

    /* start counting afresh */
    ptr->mesg_err_cnt   = 0 ;
    ptr->recv_err_cnt   = 0 ;
    ptr->send_err_cnt   = 0 ;
    ptr->b2b_miss_count = 0 ;
}
/*****************************************************************************
 *
 * Name    : _space_to_underscore
 *
 * Purpose : Replaces, in place, every space of a string by an underscore.
 *           All other characters, other white space included, are left
 *           as they are.
 *
 *****************************************************************************/
void _space_to_underscore (string & str )
{
    std::replace ( str.begin(), str.end(), ' ', '_' );
}
/*****************************************************************************
 *
 * Name    : set_alarm_defaults
 *
 * Purpose : Set the defaults for the fm alarms
 *
 * Params  : ptr - resource that supplies the alarm id and the entity
 *                 instance id
 *
 * Description:
 *
 *   Loads the shared alarmData with the values that are common to all the
 *   resource alarms ; a set, operational, threshold crossed, non service
 *   affecting "system.host" alarm with suppression enabled, an empty uuid
 *   and a zero timestamp.
 *
 *****************************************************************************/
void set_alarm_defaults ( resource_config_type * ptr )
{
    strcpy(alarmData.uuid, "");

    /* common data for all alarm messages */
    snprintf(alarmData.entity_type_id, FM_MAX_BUFFER_LENGTH, "system.host");
    build_entity_instance_id (ptr, alarmData.entity_instance_id);

    alarmData.alarm_state       = FM_ALARM_STATE_SET;
    alarmData.alarm_type        = FM_ALARM_OPERATIONAL;
    alarmData.probable_cause    = FM_ALARM_THRESHOLD_CROSSED;
    alarmData.timestamp         = 0;
    alarmData.service_affecting = FM_FALSE;
    alarmData.suppression       = FM_TRUE;

    /* Copy through an explicit "%s" ; the id is data, not a format string */
    snprintf(alarmData.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", ptr->alarm_id);
}
/*****************************************************************************
 *
 * Name    : resource_handler
 *
 * Purpose : Stage driven handler for one monitored resource. Depending on
 *           ptr->stage it raises an alarm through rmon_fm_set, clears it
 *           through rmon_fm_clear, and notifies registered clients through
 *           rmon_send_request.
 *
 * Stages  : START        - debug log only
 *           MANAGE       - if alarm_status is ALARM_ON: build the alarm
 *                          (repair action, severity, reason text), raise
 *                          it and, once raised, move to MONITOR_WAIT;
 *                          otherwise move to FINISH
 *           IGNORE       - move to FINISH
 *           MONITOR_WAIT - retry a previously failed client notification
 *           FINISH       - clear a raised alarm and reset the resource's
 *                          failure state back to the START stage
 *           any other    - logged and moved to FINISH
 *
 * Params  : ptr - the resource to handle; its stage, severity, counters and
 *                 errorMsg are updated in place.
 *
 * Returns : PASS from the FINISH stage, the rmon_fm_set result from the
 *           MANAGE stage when alarm_status is ALARM_ON, RETRY otherwise.
 *
 *****************************************************************************/
int resource_handler ( resource_config_type * ptr )
{
    int rc = RETRY ;
    AlarmFilter alarmFilter;

    /* client messages carry the resource name with '_' instead of ' ' */
    string err_res_name(ptr->resource);
    _space_to_underscore(err_res_name);

    if ( ptr->stage < RMON_STAGE__STAGES )
    {
        dlog2 ("%s %s Stage %d\n", ptr->resource, rmonStages_str[ptr->stage], ptr->stage );
    }
    else
    {
        /* out of range stage ; recover through the FINISH stage */
        resourceStageChange ( ptr, RMON_STAGE__FINISH );
    }

    switch ( ptr->stage )
    {
        case RMON_STAGE__START:
        {
            dlog ( "%s failed:%d set_cnt:%d debounce_cnt:%d\n",
                   ptr->resource,
                   ptr->failed,
                   ptr->count,
                   ptr->debounce_cnt);
            break ;
        }

        case RMON_STAGE__MANAGE:
        {
            /* send messages to maintenance if thresholds are crossed */
            if (ptr->alarm_status == ALARM_ON)
            {
                /* set up the fm api alarm defaults */
                set_alarm_defaults( ptr );

                /* select the proposed repair action for this resource */
                if ( strcmp(ptr->resource, MEMORY_RESOURCE_NAME) == 0 )
                {
                    snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
                             "Monitor and if condition persists, contact next level of support; may require additional memory on Host.");
                }
                else if ( strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) == 0 )
                {
                    snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
                             "Check Management and Infrastructure Networks and Controller or Storage Nodes.");
                }
                else
                {
                    if ((ptr->type != NULL) && (strcmp(ptr->type, "lvg") == 0 ))
                    {
                        snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
                                 "Monitor and if condition persists, consider adding additional physical volumes to the volume group.");
                    }
                    else
                    {
                        snprintf(alarmData.proposed_repair_action , sizeof(alarmData.proposed_repair_action),
                                 "Monitor and if condition persists, contact next level of support.");
                    }
                }

                /* Build the severity specific reason text and client message.
                 *
                 * NOTE(review): the absolute (MB) messages below always
                 * report the *_abs_node0 thresholds, whichever node the
                 * resource belongs to - confirm this is intended. */
                if ( ptr->sev == SEVERITY_MINOR )
                {
                    alarmData.severity = FM_ALARM_SEVERITY_MINOR;
                    if ( ptr->percent == PERCENT_USED )
                    {
                        if ( ptr->alarm_type == STANDARD_ALARM )
                        {
                            ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                  ptr->resource, ptr->minor_threshold, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                     ptr->resource, ptr->minor_threshold, ptr->resource_value);
                        }
                        else
                        {
                            ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                  ptr->minor_threshold, ptr->resource_value);
                            /* wording aligned with the log above and with
                             * the major/critical filesystem reason text */
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                     ptr->minor_threshold, ptr->resource_value);
                        }
                    }
                    else
                    {
                        if ( ptr->alarm_type == STANDARD_ALARM )
                        {
                            ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                  ptr->resource, ptr->minor_threshold_abs_node0, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                     ptr->resource, ptr->minor_threshold_abs_node0, ptr->resource_value);
                        }
                        else
                        {
                            ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                  ptr->minor_threshold_abs_node0, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                     ptr->minor_threshold_abs_node0, ptr->resource_value);
                        }
                    }
                    snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
                             "%s minor_threshold_set", err_res_name.c_str());
                }
                else if ( ptr->sev == SEVERITY_MAJOR )
                {
                    alarmData.severity = FM_ALARM_SEVERITY_MAJOR;
                    if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) != 0)
                    {
                        if (ptr->percent == PERCENT_USED)
                        {
                            if ( ptr->alarm_type == STANDARD_ALARM )
                            {
                                ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                      ptr->resource, ptr->major_threshold, ptr->resource_value);
                                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                         "%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                         ptr->resource, ptr->major_threshold, ptr->resource_value);
                            }
                            else
                            {
                                ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                      ptr->major_threshold, ptr->resource_value);
                                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                         "Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                         ptr->major_threshold, ptr->resource_value);
                            }
                        }
                        else
                        {
                            if ( ptr->alarm_type == STANDARD_ALARM )
                            {
                                ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                      ptr->resource, ptr->major_threshold_abs_node0, ptr->resource_value);
                                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                         "%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                         ptr->resource, ptr->major_threshold_abs_node0, ptr->resource_value);
                            }
                            else
                            {
                                ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                      ptr->major_threshold_abs_node0, ptr->resource_value);
                                snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                         "Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                         ptr->major_threshold_abs_node0, ptr->resource_value);
                            }
                        }
                    }
                    else if (strcmp(ptr->resource, INSTANCE_RESOURCE_NAME) == 0)
                    {
                        /* instance alarming is a special case of alarm */
                        wlog ("No access to remote VM volumes.\n");
                        snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                 "No access to remote VM volumes.");
                    }

                    /* Filesystem usage resources send DEGRADE_CLEAR_MSG at
                     * major severity instead of a threshold-set message. */
                    if ( ptr->res_type == RESOURCE_TYPE__FILESYSTEM_USAGE )
                    {
                        snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
                                 "%s %s",err_res_name.c_str(), DEGRADE_CLEAR_MSG );
                    }
                    else
                    {
                        snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
                                 "%s major_threshold_set",err_res_name.c_str());
                    }
                }
                else if ( ptr->sev == SEVERITY_CRITICAL )
                {
                    alarmData.severity = FM_ALARM_SEVERITY_CRITICAL;
                    if (ptr->percent == PERCENT_USED)
                    {
                        if ( ptr->alarm_type == STANDARD_ALARM )
                        {
                            ilog ("%s threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                  ptr->resource, ptr->critical_threshold, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "%s threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                     ptr->resource, ptr->critical_threshold, ptr->resource_value);
                        }
                        else
                        {
                            ilog ("Filesystem threshold exceeded; threshold: %d%%, actual: %.2f%%. \n",
                                  ptr->critical_threshold, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "Filesystem threshold exceeded; threshold: %u%%, actual: %.2f%%.",
                                     ptr->critical_threshold, ptr->resource_value);
                        }
                    }
                    else
                    {
                        if ( ptr->alarm_type == STANDARD_ALARM )
                        {
                            ilog ("%s threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                  ptr->resource, ptr->critical_threshold_abs_node0, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "%s threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                     ptr->resource, ptr->critical_threshold_abs_node0, ptr->resource_value);
                        }
                        else
                        {
                            ilog ("Filesystem threshold exceeded; threshold: %dMB, remaining value: %.2fMB. \n",
                                  ptr->critical_threshold_abs_node0, ptr->resource_value);
                            snprintf(alarmData.reason_text, sizeof(alarmData.reason_text),
                                     "Filesystem threshold exceeded; threshold: %uMB, remaining value: %.2fMB.",
                                     ptr->critical_threshold_abs_node0, ptr->resource_value);
                        }
                    }
                    /* critical severity reuses the major client message */
                    snprintf(ptr->errorMsg, sizeof(ptr->errorMsg),
                             "%s major_threshold_set",err_res_name.c_str());
                }

                /* raise the alarm */
                rc = rmon_fm_set(&alarmData, NULL);
                if (rc == FM_ERR_OK )
                {
                    ilog("%s: %s alarm\n",
                         ptr->resource,
                         FmAlarmSeverity_to_string(alarmData.severity).c_str());
                    ptr->alarm_raised = true;
                }
                else
                {
                    ilog("%s: %s alarm failed (rc:%d)\n",
                         ptr->resource,
                         FmAlarmSeverity_to_string(alarmData.severity).c_str(),
                         (int)rc);
                }

                /* alarm_raised may also be true from an earlier pass, in
                 * which case the notification below still runs even if
                 * this particular rmon_fm_set call failed. */
                if (ptr->alarm_raised)
                {
                    if ((_rmon_ctrl_ptr->clients > 0) && (ptr->failed_send < MAX_FAIL_SEND))
                    {
                        /* If degrade debounce is non-zero then this
                         * alarm condition is candidate for host degrade */
                        if (ptr->debounce)
                        {
                            if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
                            {
                                ptr->failed_send++;
                                wlog ("%s request send failed (count:%d)\n",
                                      ptr->resource,
                                      ptr->failed_send );
                            }
                            else
                            {
                                ptr->failed_send = 0;
                            }
                        }
                    }
                    else
                    {
                        ptr->failed_send = 0;
                    }
                    resourceStageChange ( ptr, RMON_STAGE__MONITOR_WAIT );
                }
            }
            else
            {
                resourceStageChange ( ptr, RMON_STAGE__FINISH );
            }
            break;
        }

        case RMON_STAGE__IGNORE:
        {
            /* nothing to do here, go to the finished stage */
            resourceStageChange ( ptr, RMON_STAGE__FINISH );
            break ;
        }

        case RMON_STAGE__MONITOR_WAIT:
        {
            /* only retry when an earlier send failed and the retry
             * budget has not been used up */
            if ((_rmon_ctrl_ptr->clients > 0) && (ptr->failed_send < MAX_FAIL_SEND) && (ptr->failed_send > 0))
            {
                if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
                {
                    wlog ("%s request send failed \n", ptr->resource);
                    ptr->failed_send++;
                }
                else
                {
                    ptr->failed_send = 0;
                }
            }
            break;
        }

        case RMON_STAGE__FINISH:
        {
            if ((ptr->alarm_status == ALARM_ON) && (ptr->alarm_raised))
            {
                /* Identifiers are data, not format strings: copy them
                 * through "%s" so a '%' in either value can never be
                 * interpreted as a conversion specification. */
                snprintf(alarmFilter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", ptr->alarm_id);
                build_entity_instance_id (ptr, alarmData.entity_instance_id);
                snprintf(alarmFilter.entity_instance_id, FM_MAX_BUFFER_LENGTH, "%s", alarmData.entity_instance_id);

                ilog ("%s alarm clear\n", ptr->resource );

                /* clear the alarm */
                EFmErrorT ret = rmon_fm_clear(&alarmFilter);
                if (( ret == FM_ERR_OK ) || ( ret == FM_ERR_ENTITY_NOT_FOUND ))
                {
                    if (ret == FM_ERR_ENTITY_NOT_FOUND)
                    {
                        dlog ("%s alarm clear failed, entity '%s' not found",
                              ptr->resource, alarmData.entity_instance_id);
                    }

                    snprintf(ptr->errorMsg, sizeof(ptr->errorMsg), "%s cleared_alarms_for_resource", err_res_name.c_str());

                    /* clients are only told about a clear that FM accepted */
                    if ( (_rmon_ctrl_ptr->clients > 0) && ( ptr->failed_send < MAX_FAIL_SEND ) && (ret == FM_ERR_OK) )
                    {
                        while (( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS ) &&
                               ( ptr->failed_send < MAX_FAIL_SEND ))
                        {
                            wlog ("%s request send failed \n", ptr->resource);
                            ptr->failed_send++;
                        }
                    }

                    /* alarm is gone (or was never known to FM) ;
                     * start over whether or not clients were notified */
                    ptr->alarm_raised = false;
                    ptr->failed_send = 0;
                    ptr->failed = false ;
                    ptr->count = 0 ;
                    ptr->sev = SEVERITY_CLEARED ;
                    ptr->stage = RMON_STAGE__START ;
                }
                else
                {
                    /* state is left untouched so that the clear is
                     * attempted again the next time this stage runs */
                    wlog("%s alarm clear failed, entity '%s' (rc:%d)\n",
                         ptr->resource,
                         alarmData.entity_instance_id,
                         ret);
                }
            }
            else
            {
                /* no alarm to clear ; just reset the failure state */
                ptr->alarm_raised = false;
                ptr->failed_send = 0;
                ptr->failed = false ;
                ptr->count = 0 ;
                ptr->sev = SEVERITY_CLEARED ;
                ptr->stage = RMON_STAGE__START ;
            }
            rc = PASS ;
            break ;
        }

        default:
        {
            slog ("%s Invalid stage (%d)\n", ptr->resource, ptr->stage );

            /* Default to finish for invalid case.
             * If there is an issue then it will be detected */
            resourceStageChange ( ptr, RMON_STAGE__FINISH );
            break ;
        }
    }
    return rc;
}
/*****************************************************************************
 *
 * Name    : process_failures
 *
 * Purpose : Evaluate a percentage based resource against its minor, major
 *           and critical thresholds. Moves the resource to the MANAGE
 *           stage when a new severity has been seen num_tries times, and
 *           to the FINISH stage when a failed resource has dropped back
 *           below the threshold of its current severity.
 *
 * Params  : ptr - resource to evaluate; count, failed, sev and stage are
 *                 updated in place.
 *
 *****************************************************************************/
void process_failures ( resource_config_type * ptr )
{
    if ( ptr->stage == RMON_STAGE__INIT )
    {
        /* First audit after restart/reboot. */
        resourceStageChange ( ptr, RMON_STAGE__START );

        if ( ptr->resource_value < ptr->minor_threshold )
        {
            /* The first reading is good. Assume an alarm was left raised
             * by the previous run and drive the FINISH stage to clear it. */
            ptr->alarm_status = ALARM_ON;
            ptr->alarm_raised = true;
            ptr->failed       = true;
            ilog("%s Setting the state to FINISH\n", ptr->resource);
            resourceStageChange ( ptr, RMON_STAGE__FINISH );
        }
        /* from here on counting proceeds as normal */
    }
    else if ( ptr->failed )
    {
        /* Already failed: has usage dropped below the threshold that
         * belongs to the severity currently in effect ? */
        const bool minor_recovered =
            ( ptr->sev == SEVERITY_MINOR ) &&
            ( ptr->resource_value < ptr->minor_threshold );
        const bool major_recovered =
            ( ptr->sev == SEVERITY_MAJOR ) &&
            ( ptr->resource_value < ptr->major_threshold );
        const bool critical_recovered =
            ( ptr->sev == SEVERITY_CRITICAL ) &&
            ( ptr->resource_value < ptr->critical_threshold );

        if ( minor_recovered || major_recovered || critical_recovered )
        {
            /* Cap the counter at num_tries, then count down one per audit;
             * the alarm is cleared once it reaches zero. */
            if ( ptr->count > ptr->num_tries )
            {
                ptr->count = ptr->num_tries;
            }
            if ( ptr->count > 0 )
            {
                ptr->count--;
            }
            if ( ptr->count == 0 )
            {
                ptr->sev = SEVERITY_CLEARED;
                ilog("%s Setting the state to FINISH\n", ptr->resource);
                resourceStageChange ( ptr, RMON_STAGE__FINISH );
            }
        }
        else
        {
            /* Still at or above the active threshold. Keep counting up;
             * the cap above pulls the counter back to num_tries as soon
             * as usage returns to a normal level. */
            ptr->count++;

            /* rmon needs to send the degrade assert message periodically
             * as the condition might be cleared by maintenance over a
             * controller swact. The debounce setting must be non-zero
             * for the resource to degrade the host. */
            const bool reassert_degrade = ( ptr->alarm_raised ) &&
                                          ( ptr->debounce ) &&
                                          ( _rmon_ctrl_ptr->clients > 0 );
            if ( reassert_degrade )
            {
                if ( rmon_send_request ( ptr, _rmon_ctrl_ptr->clients ) != PASS )
                {
                    ptr->failed_send++ ;
                    wlog ("%s request send failed (count:%d)\n",
                          ptr->resource,
                          ptr->failed_send);
                }
                else
                {
                    mlog ("%s rmon_send_request ok\n", ptr->resource );
                    ptr->failed_send = 0 ;
                }
            }
            else
            {
                /* typical path for resources that
                 *  - do not degrade host
                 *  - do not raise alarms */
                dlog ("%s: alarm:%d debounce:%d clients:%d\n",
                      ptr->resource,
                      (ptr->alarm_raised),
                      (ptr->debounce),
                      (_rmon_ctrl_ptr->clients));
            }
        }
    }

    /* Classify the current reading into a severity band. This runs on
     * every call, after the handling above. */
    const bool in_minor_band    = ( ptr->resource_value >= ptr->minor_threshold ) &&
                                  ( ptr->resource_value <  ptr->major_threshold );
    const bool in_major_band    = ( ptr->resource_value >= ptr->major_threshold ) &&
                                  ( ptr->resource_value <  ptr->critical_threshold );
    const bool in_critical_band = ( ptr->resource_value >= ptr->critical_threshold );

    if ( in_minor_band && ( ptr->sev != SEVERITY_MINOR ))
    {
        ptr->count++;
        if ( ptr->count >= ptr->num_tries )
        {
            ptr->failed = true;
            ptr->sev    = SEVERITY_MINOR;
            resourceStageChange ( ptr, RMON_STAGE__MANAGE );
        }
    }
    else if ( in_major_band && ( ptr->sev != SEVERITY_MAJOR ))
    {
        ptr->count++;
        if ( ptr->count >= ptr->num_tries )
        {
            ptr->failed = true;
            ptr->sev    = SEVERITY_MAJOR;
            resourceStageChange ( ptr, RMON_STAGE__MANAGE );
        }
    }
    else if ( in_critical_band && ( ptr->sev != SEVERITY_CRITICAL ))
    {
        ptr->count++;
        if ( ptr->count >= ptr->num_tries )
        {
            ptr->failed = true;
            ptr->sev    = SEVERITY_CRITICAL;
            resourceStageChange ( ptr, RMON_STAGE__MANAGE );
        }
    }
    else if (( !ptr->failed ) && ( ptr->count > 0 ))
    {
        /* The host experienced a resource blip in a previous audit and
         * usage is now back at the normal level ; decrement the count. */
        ptr->count--;
        dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
    }
}
/*****************************************************************************
 *
 * Name    : process_failures_absolute
 *
 * Purpose : Evaluate an absolute (remaining value) resource against its
 *           minor, major and critical thresholds. Lower readings are
 *           worse, so the comparisons are inverted relative to
 *           process_failures. Moves the resource to the MANAGE stage when
 *           a new severity has been seen num_tries times, and to the
 *           FINISH stage when a failed resource has risen back above the
 *           threshold of its current severity.
 *
 * Params  : ptr - resource to evaluate; count, failed, sev and stage are
 *                 updated in place. The *_abs_node1 thresholds are used
 *                 when the resource is named "processor_node1", the
 *                 *_abs_node0 thresholds otherwise.
 *
 * Notes   : A reading exactly equal to the critical threshold is treated
 *           as critical. The major band ends just above the critical
 *           threshold and a critical alarm only clears above it, so the
 *           critical band has to include the threshold itself; otherwise
 *           that reading would match no band at all.
 *
 *****************************************************************************/
void process_failures_absolute ( resource_config_type * ptr )
{
    int node = 0;

    if (strcmp(ptr->resource,"processor_node1") == 0)
    {
        /* per node memory checking is enabled */
        node = 1;
    }

    if (ptr->failed)
    {
        /* If the resource is already failed, check to see if it is to be cleared */
        if (node == 0)
        {
            if ((( ptr->sev == SEVERITY_MINOR) && ( ptr->resource_value > ptr->minor_threshold_abs_node0 )) ||
                (( ptr->sev == SEVERITY_MAJOR) && ( ptr->resource_value > ptr->major_threshold_abs_node0 )) ||
                (( ptr->sev == SEVERITY_CRITICAL) && ( ptr->resource_value > ptr->critical_threshold_abs_node0 )))
            {
                /* Cap the counter at num_tries, then count down one per
                 * audit; the alarm is cleared once it reaches zero. */
                if (ptr->count > ptr->num_tries)
                {
                    ptr->count = ptr->num_tries;
                }
                if (ptr->count > 0)
                {
                    ptr->count--;
                }
                if (ptr->count == 0)
                {
                    ptr->sev = SEVERITY_CLEARED;
                    resourceStageChange ( ptr, RMON_STAGE__FINISH );
                }
            }
            else
            {
                /* Still at or below the active threshold. Keep counting
                 * up; the cap above pulls the counter back to num_tries
                 * as soon as the value returns to a normal level. */
                ptr->count++;
            }
        }
        else
        {
            if ((( ptr->sev == SEVERITY_MINOR) && ( ptr->resource_value > ptr->minor_threshold_abs_node1 )) ||
                (( ptr->sev == SEVERITY_MAJOR) && ( ptr->resource_value > ptr->major_threshold_abs_node1 )) ||
                (( ptr->sev == SEVERITY_CRITICAL) && ( ptr->resource_value > ptr->critical_threshold_abs_node1 )))
            {
                /* same count-down handling as for node 0 */
                if (ptr->count > ptr->num_tries)
                {
                    ptr->count = ptr->num_tries;
                }
                if (ptr->count > 0)
                {
                    ptr->count--;
                }
                if (ptr->count == 0)
                {
                    ptr->sev = SEVERITY_CLEARED;
                    resourceStageChange ( ptr, RMON_STAGE__FINISH );
                }
            }
            else
            {
                /* still at or below the active threshold ; keep counting */
                ptr->count++;
            }
        }
    }

    if (node == 0)
    {
        /* Check to see if a resource is over the failure thresholds for: minor, major and critical failures node 0 */
        if (( ptr->resource_value <= ptr->minor_threshold_abs_node0 ) &&
            ( ptr->resource_value > ptr->major_threshold_abs_node0 ) &&
            (ptr->sev != SEVERITY_MINOR))
        {
            ptr->count++;
            if ( ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_MINOR;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        else if (( ptr->resource_value <= ptr->major_threshold_abs_node0 ) &&
                 ( ptr->resource_value > ptr->critical_threshold_abs_node0 ) &&
                 (ptr->sev != SEVERITY_MAJOR))
        {
            ptr->count++;
            if ( ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_MAJOR;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        /* '<=' so that a reading equal to the critical threshold is
         * covered ; the major band above stops just short of it */
        else if (( ptr->resource_value <= ptr->critical_threshold_abs_node0 ) &&
                 (ptr->sev != SEVERITY_CRITICAL))
        {
            ptr->count++;
            if (ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_CRITICAL;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        else
        {
            /* if the host experienced a resource blip in the previous audit run and usage
             * is now back at the normal level, decrement the count.*/
            if ((!ptr->failed) && (ptr->count > 0))
            {
                ptr->count--;
                dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
            }
        }
    }
    else
    {
        /* Check to see if a resource is over the failure thresholds for: minor, major and critical failures node 1 */
        if (( ptr->resource_value <= ptr->minor_threshold_abs_node1 ) &&
            ( ptr->resource_value > ptr->major_threshold_abs_node1 ) &&
            (ptr->sev != SEVERITY_MINOR))
        {
            ptr->count++;
            if ( ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_MINOR;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        else if (( ptr->resource_value <= ptr->major_threshold_abs_node1 ) &&
                 ( ptr->resource_value > ptr->critical_threshold_abs_node1 ) &&
                 (ptr->sev != SEVERITY_MAJOR))
        {
            ptr->count++;
            if ( ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_MAJOR;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        /* '<=' so that a reading equal to the critical threshold is
         * covered ; the major band above stops just short of it */
        else if (( ptr->resource_value <= ptr->critical_threshold_abs_node1 ) &&
                 (ptr->sev != SEVERITY_CRITICAL))
        {
            ptr->count++;
            if (ptr->count >= ptr->num_tries)
            {
                ptr->failed = true;
                ptr->sev = SEVERITY_CRITICAL;
                resourceStageChange ( ptr, RMON_STAGE__MANAGE);
            }
        }
        else
        {
            /* if the host experienced a resource blip in the previous audit run and usage
             * is now back at the normal level, decrement the count.*/
            if ((!ptr->failed) && (ptr->count > 0))
            {
                ptr->count--;
                dlog("Resource %s is back at the normal level, count is set to %d", ptr->resource, ptr->count);
            }
        }
    }
}
void update_total_clients (int total_clients)
{
_rmon_ctrl_ptr->clients = total_clients;
}
/*****************************************************************************
 *
 * Name    : add_registered_client
 *
 * Purpose : Copy 'client' into the registered client table at the index
 *           given by the control structure's current client count, and
 *           log the client's name. The count itself is not changed here.
 *
 * NOTE(review): the index is not range checked in this function ;
 *               presumably callers keep the count within the table
 *               bounds - confirm.
 *
 *****************************************************************************/
void add_registered_client (registered_clients client)
{
    registered_clients & slot = registered_clt[_rmon_ctrl_ptr->clients] ;
    slot = client ;
    ilog("added registered client: %s \n", client.client_name);
}
/*****************************************************************************
*
* Name : add_fs_resource
*
* Purpose : Add a dynamic or static fs resource by reading
* the: /etc/rmonfiles.d/dynamic.conf file
*****************************************************************************/
void add_fs_resource ( int resource_index, int criticality_index, int enabled,
int