
Maintenance heartbeat service should not be multicast messaging over an 'lo' interface which in IPv6 leads to socket failures, log flooding and the inability to detect and report pmond process failure. To fix that this update - configures pulse messaging to unicast for monitored networks configured as 'lo'. - prevents heartbeating over the cluster network if both it and the management network are both configured on the 'lo' interface. - improves logging to avoid flooding in the presence of socket setup or access errors. - stops logging netlink events (interface state changes) on unmonitored network interfaces. - maintains heartbeat disabled state until the management network is up. - modifies hbsAgent socket failure handling and its pmon conf file so that a persistent socket failure during startup is alarmed as an hbsAgent process failure. Test Plan: PASS: Verify logging over system install and socket errors PASS: Verify unicast messaging when cluster is set to 'lo' PASS: Verify no cluster network heartbeat when it and mgmnt are set to 'lo'. Regression: PASS: Verify heartbeat messaging and cluster info PASS: Verify pmond process failure alarm management PASS: Verify heartbeat failure detection and graceful recovery PASS: Verify AIO SX IPv6 system install and run PASS: Verify AIO DX IPv6 system install and run PASS: Verify Standard IPv6 system install and run PASS: Verify Storage system IPv6 install and run PASS: Verify Storage system IPv4 install and run PASS: Verify MNFA handling in IPv6 storage system Change-Id: I5a2a0b2dee0c690617c4e0b0e2ab8b1172b2dc49 Closes-Bug: 1884585 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
332 lines
10 KiB
C++
332 lines
10 KiB
C++
/*
|
|
* Copyright (c) 2013-2020 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* Wind River CGCS Platform netlink listener event support for maintenance
|
|
*/
|
|
|
|
#include <asm/types.h>
|
|
#include <sys/socket.h>
|
|
#include <unistd.h>
|
|
#include <errno.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <net/if.h>
|
|
#include <netinet/in.h>
|
|
#include <linux/netlink.h>
|
|
#include <linux/rtnetlink.h>
|
|
#include <sys/ioctl.h>
|
|
#include <stdlib.h>
|
|
#include <sys/time.h>
|
|
#include <sys/types.h>
|
|
#include <list>
|
|
|
|
using namespace std;
|
|
|
|
#include "nodeBase.h"
|
|
#include "nodeUtil.h"
|
|
|
|
int get_netlink_events_throttle = 0 ;
|
|
int get_netlink_events ( int nl_socket , std::list<string> & links_gone_down,
|
|
std::list<string> & links_gone_up )
|
|
{
|
|
char buf[4096];
|
|
char name[IF_NAMESIZE];
|
|
|
|
int len ;
|
|
int ret = 0;
|
|
|
|
struct sockaddr_nl sa;
|
|
struct iovec iov = { buf, sizeof (buf) };
|
|
struct msghdr msg = { &sa, sizeof (sa), &iov, 1, NULL, 0, 0 };
|
|
struct nlmsghdr *h;
|
|
struct ifinfomsg *ifi;
|
|
/* struct ifaddrmsg * ifa ; used for addr change events */
|
|
|
|
links_gone_up.clear();
|
|
links_gone_down.clear();
|
|
|
|
len = recvmsg (nl_socket, &msg, 0);
|
|
if (len < 0)
|
|
{
|
|
/* Socket non-blocking so bail out once we have read everything */
|
|
if ( (errno == EWOULDBLOCK) || (errno == EAGAIN))
|
|
{
|
|
return ret ;
|
|
}
|
|
|
|
/* Anything else is an error */
|
|
elog ("failed netlink recvmsg (%d:%d) (%d:%s)\n", nl_socket, len, errno, strerror(errno));
|
|
return len;
|
|
}
|
|
|
|
if (len == 0)
|
|
{
|
|
wlog ("No netlink data read_netlink: EOF\n");
|
|
}
|
|
|
|
/* Handle all the messages from 'recvmsg' */
|
|
h = (struct nlmsghdr *) &buf[0] ;
|
|
for ( ; NLMSG_OK (h,(unsigned int)len); h=NLMSG_NEXT (h,len))
|
|
{
|
|
/* ignore address change events */
|
|
if (h->nlmsg_type == RTM_NEWADDR )
|
|
{
|
|
|
|
#ifdef RTM_NEWADDR_SUPPORTED
|
|
ifa = (struct ifaddrmsg *) NLMSG_DATA (nlh);
|
|
|
|
struct rtattr * rth = IFA_RTA (ifa);
|
|
int rtl = IFA_PAYLOAD (nlh);
|
|
for (;rtl && RTA_OK (rth, rtl); rth = RTA_NEXT (rth,rtl))
|
|
{
|
|
char name[IFNAMSIZ];
|
|
uint32_t ipaddr;
|
|
|
|
if (rth->rta_type != IFA_LOCAL) continue;
|
|
|
|
ipaddr = * ((uint32_t *)RTA_DATA(rth));
|
|
ipaddr = htonl(ipaddr);
|
|
|
|
fprintf (stdout,"%s is now %X\n",if_indextoname(ifa->ifa_index,name),ipaddr);
|
|
}
|
|
#else
|
|
dlog ("unsupported netlink event: RTM_NEWADDR\n");
|
|
continue ;
|
|
#endif
|
|
}
|
|
|
|
/* Finish reading */
|
|
if (h->nlmsg_type == NLMSG_NOOP )
|
|
{
|
|
ilog ("netlink message: Nothing to read\n");
|
|
return ret;
|
|
}
|
|
|
|
/* Finish reading */
|
|
if (h->nlmsg_type == NLMSG_DONE)
|
|
{
|
|
ilog ("netlink message: No more messages\n");
|
|
return ret;
|
|
}
|
|
|
|
/* Message is some kind of error */
|
|
if (h->nlmsg_type == NLMSG_ERROR)
|
|
{
|
|
wlog ("netlink message: indicates error\n");
|
|
return -1;
|
|
}
|
|
|
|
ifi = (ifinfomsg*) NLMSG_DATA (h);
|
|
memset ( name, 0 , IF_NAMESIZE );
|
|
if ( ifi->ifi_index )
|
|
{
|
|
if_indextoname(ifi->ifi_index, name);
|
|
if (ifi->ifi_flags & IFF_RUNNING)
|
|
{
|
|
/* if 'up' then remove interface from 'down' list and add it to the 'up' list */
|
|
links_gone_down.remove(name);
|
|
links_gone_up.push_front(name);
|
|
dlog ( "%s is up and running \n", name );
|
|
}
|
|
else
|
|
{
|
|
if ( ifi->ifi_flags & IFF_UP )
|
|
{
|
|
dlog ("%s is admin:up but oper:down\n", name );
|
|
}
|
|
else
|
|
{
|
|
dlog ("%s is admin:down and oper:down\n", name );
|
|
}
|
|
|
|
/* if 'down' then remove interface from 'up' list and add it to the 'down' list */
|
|
links_gone_up.remove(name);
|
|
links_gone_down.push_front(name);
|
|
}
|
|
get_netlink_events_throttle = 0 ;
|
|
}
|
|
else
|
|
{
|
|
wlog_throttled (get_netlink_events_throttle, 100, "got netlink event for unknown interface index\n");
|
|
}
|
|
ret++ ;
|
|
}
|
|
links_gone_up.unique();
|
|
links_gone_down.unique();
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
void log_link_events ( int netlink_sock,
|
|
int ioctl_sock,
|
|
const char * mgmnt_iface_ptr,
|
|
const char * clstr_iface_ptr,
|
|
bool & mgmnt_link_up_and_running,
|
|
bool & clstr_link_up_and_running)
|
|
{
|
|
std::list<string> links_gone_down ;
|
|
std::list<string> links_gone_up ;
|
|
std::list<string>::iterator iter_curr_ptr ;
|
|
dlog3 ("logging for interfaces %s and %s\n", mgmnt_iface_ptr, clstr_iface_ptr);
|
|
if ( get_netlink_events ( netlink_sock, links_gone_down, links_gone_up ))
|
|
{
|
|
bool running = false ;
|
|
if ( !links_gone_down.empty() )
|
|
{
|
|
dlog3 ("%ld links have dropped\n", links_gone_down.size() );
|
|
|
|
/* Look at the down list */
|
|
for ( iter_curr_ptr = links_gone_down.begin();
|
|
iter_curr_ptr != links_gone_down.end() ;
|
|
iter_curr_ptr++ )
|
|
{
|
|
bool care = false ;
|
|
dlog3 ( "downed link: %s (running:%d:%d)\n",
|
|
iter_curr_ptr->c_str(),
|
|
mgmnt_link_up_and_running,
|
|
clstr_link_up_and_running );
|
|
|
|
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
|
{
|
|
care = true ;
|
|
if ( mgmnt_link_up_and_running == true )
|
|
{
|
|
mgmnt_link_up_and_running = false ;
|
|
wlog ("Mgmnt link %s is down\n", mgmnt_iface_ptr );
|
|
}
|
|
}
|
|
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
|
{
|
|
care = true ;
|
|
if ( clstr_link_up_and_running == true )
|
|
{
|
|
clstr_link_up_and_running = false ;
|
|
wlog ("Cluster-host link %s is down\n", clstr_iface_ptr );
|
|
}
|
|
}
|
|
|
|
if ( care == true )
|
|
{
|
|
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
|
{
|
|
wlog ("%s is down (oper:%s) (%ld)\n",
|
|
iter_curr_ptr->c_str(),
|
|
running ? "up" : "down",
|
|
iter_curr_ptr->length());
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s is down (driver query failed) (len:%ld)\n",
|
|
iter_curr_ptr->c_str(),
|
|
iter_curr_ptr->length());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if ( !links_gone_up.empty() )
|
|
{
|
|
dlog3 ("%ld links have recovered\n", links_gone_up.size());
|
|
|
|
/* Look at the up list */
|
|
for ( iter_curr_ptr = links_gone_up.begin();
|
|
iter_curr_ptr != links_gone_up.end() ;
|
|
iter_curr_ptr++ )
|
|
{
|
|
bool care = false ;
|
|
dlog3 ( "recovered link: %s (running:%d:%d)\n",
|
|
iter_curr_ptr->c_str(),
|
|
mgmnt_link_up_and_running,
|
|
clstr_link_up_and_running );
|
|
|
|
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
|
{
|
|
care = true ;
|
|
mgmnt_link_up_and_running = true ;
|
|
wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr );
|
|
}
|
|
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
|
{
|
|
care = true ;
|
|
clstr_link_up_and_running = true ;
|
|
wlog ("Cluster-host link %s is up\n", clstr_iface_ptr );
|
|
}
|
|
|
|
if ( care == true )
|
|
{
|
|
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
|
{
|
|
wlog ("%s is up (oper:%s) (len:%ld)\n",
|
|
iter_curr_ptr->c_str(),
|
|
running ? "up" : "down",
|
|
iter_curr_ptr->length() );
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s is up (driver query failed) (len:%ld)\n",
|
|
iter_curr_ptr->c_str(),
|
|
iter_curr_ptr->length() );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Open a netlink listener socket and return that socket id.
|
|
* Return 0 on create or bind failure */
|
|
int open_netlink_socket ( int groups )
|
|
{
|
|
struct sockaddr_nl addr;
|
|
int on = 1 ;
|
|
ilog ( "NLMon Groups: %d\n", groups ) ;
|
|
|
|
int nl_socket = socket (AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
|
|
if (nl_socket < 0)
|
|
{
|
|
elog ("Failed to open netlink socket (%d:%s)\n", errno, strerror(errno));
|
|
return (0);
|
|
}
|
|
|
|
else if ( 0 > ioctl( nl_socket, FIONBIO, (char *)&on))
|
|
{
|
|
elog ("failed to set 'netlink monitor' socket non-blocking (%d:%m)\n", errno );
|
|
close (nl_socket);
|
|
nl_socket = 0 ;
|
|
}
|
|
else
|
|
{
|
|
memset ((void *) &addr, 0, sizeof (addr));
|
|
|
|
addr.nl_family = AF_NETLINK;
|
|
addr.nl_pid = getpid ();
|
|
/* addr.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR; */
|
|
addr.nl_groups = groups ; /* allow the caller to specify the groups */
|
|
|
|
if (bind (nl_socket, (struct sockaddr *) &addr, sizeof (addr)) < 0)
|
|
{
|
|
elog ( "Failed to bind netlink socket (%d:%s)\n", errno, strerror(errno));
|
|
close (nl_socket);
|
|
nl_socket = 0 ;
|
|
}
|
|
}
|
|
return (nl_socket);
|
|
}
|
|
|
|
void close_netlink_socket ( int socket )
|
|
{
|
|
if ( socket )
|
|
{
|
|
close (socket);
|
|
}
|
|
}
|