Add support for Keepalived VRRP health check

Adds functionality to generate a bash script which verifies the health of the
current keepalived instance by pinging all available and configured gateway
addresses. Both IPv4 and IPv6 are supported: netaddr is used to detect the
address version and select ping or ping6 accordingly.

DocImpact:
Added a new parameter to 'l3_agent.ini' named
'ha_vrrp_health_check_interval', which defaults to 0 (disabled).
Values > 0 enable the health check and set its interval in seconds.
Requires ICMP ECHO_REQUEST to be allowed on the gateway, because it is
disabled by default.

Co-Authored-By: Artur Korzeniewski <artur.korzeniewski@intel.com>
Change-Id: Ib4d0691f432830357ea3f113036719645bc59a62
Closes-Bug: #1365461
This commit is contained in:
Lubosz Kosnik 2016-01-28 14:44:00 +01:00 committed by Brian Haley
parent 8d3f216e24
commit 185d6cbc64
7 changed files with 298 additions and 10 deletions

View File

@ -129,7 +129,10 @@ class HaRouter(router.RouterInfo):
ha_port_cidrs,
nopreempt=True,
advert_int=self.agent_conf.ha_vrrp_advert_int,
priority=self.ha_priority)
priority=self.ha_priority,
vrrp_health_check_interval=(
self.agent_conf.ha_vrrp_health_check_interval),
ha_conf_dir=self.keepalived_manager.get_conf_dir())
instance.track_interfaces.append(interface_name)
if self.agent_conf.ha_vrrp_auth_password:

View File

@ -15,6 +15,7 @@
import errno
import itertools
import os
import six
import netaddr
from neutron_lib import exceptions
@ -35,6 +36,7 @@ KEEPALIVED_SERVICE_NAME = 'keepalived'
KEEPALIVED_EMAIL_FROM = 'neutron@openstack.local'
KEEPALIVED_ROUTER_ID = 'neutron'
GARP_MASTER_DELAY = 60
HEALTH_CHECK_NAME = 'ha_health_check'
LOG = logging.getLogger(__name__)
@ -160,7 +162,9 @@ class KeepalivedInstance(object):
def __init__(self, state, interface, vrouter_id, ha_cidrs,
priority=HA_DEFAULT_PRIORITY, advert_int=None,
mcast_src_ip=None, nopreempt=False,
garp_master_delay=GARP_MASTER_DELAY):
garp_master_delay=GARP_MASTER_DELAY,
vrrp_health_check_interval=0,
ha_conf_dir=None):
self.name = 'VR_%s' % vrouter_id
if state not in VALID_STATES:
@ -178,12 +182,17 @@ class KeepalivedInstance(object):
self.vips = []
self.virtual_routes = KeepalivedInstanceRoutes()
self.authentication = None
self.track_script = None
self.primary_vip_range = get_free_range(
parent_range=constants.PRIVATE_CIDR_RANGE,
excluded_ranges=[constants.METADATA_CIDR,
constants.DVR_FIP_LL_CIDR] + ha_cidrs,
size=PRIMARY_VIP_RANGE_SIZE)
if vrrp_health_check_interval > 0:
self.track_script = KeepalivedTrackScript(
vrrp_health_check_interval, ha_conf_dir, self.vrouter_id)
def set_authentication(self, auth_type, password):
if auth_type not in VALID_AUTH_TYPES:
raise InvalidAuthenticationTypeException(auth_type=auth_type)
@ -267,12 +276,19 @@ class KeepalivedInstance(object):
[' }'])
def build_config(self):
config = ['vrrp_instance %s {' % self.name,
' state %s' % self.state,
' interface %s' % self.interface,
' virtual_router_id %s' % self.vrouter_id,
' priority %s' % self.priority,
' garp_master_delay %s' % self.garp_master_delay]
if self.track_script:
config = self.track_script.build_config_preamble()
self.track_script.routes = self.virtual_routes.gateway_routes
self.track_script.vips = self.vips
else:
config = []
config.extend(['vrrp_instance %s {' % self.name,
' state %s' % self.state,
' interface %s' % self.interface,
' virtual_router_id %s' % self.vrouter_id,
' priority %s' % self.priority,
' garp_master_delay %s' % self.garp_master_delay])
if self.nopreempt:
config.append(' nopreempt')
@ -299,6 +315,9 @@ class KeepalivedInstance(object):
if len(self.virtual_routes):
config.extend(self.virtual_routes.build_config())
if self.track_script:
config.extend(self.track_script.build_config())
config.append('}')
return config
@ -406,6 +425,10 @@ class KeepalivedManager(object):
keepalived_pm.enable(reload_cfg=True)
for key, instance in six.iteritems(self.config.instances):
if instance.track_script:
instance.track_script.write_check_script()
self.process_monitor.register(uuid=self.resource_id,
service_name=KEEPALIVED_SERVICE_NAME,
monitored_process=keepalived_pm)
@ -453,3 +476,81 @@ class KeepalivedManager(object):
return cmd
return callback
class KeepalivedTrackScript(KeepalivedConf):
    """Track script generator for Keepalived.

    Builds the ``vrrp_script`` preamble, the ``track_script`` block placed
    inside a ``vrrp_instance`` section, and the bash script both refer to.
    The generated script pings the next hop of every entry in ``routes``
    and exits with status 1 as soon as one ping fails.
    """

    def __init__(self, interval, conf_dir, vr_id):
        """Store the parameters used to render config and script.

        :param interval: value written as ``interval`` in the vrrp_script
                         section
        :param conf_dir: directory in which the check script is written
        :param vr_id: identifier used in the vrrp_script name and in the
                      script file name
        """
        self.interval = interval
        self.conf_dir = conf_dir
        self.vr_id = vr_id
        # Both lists start empty; KeepalivedInstance.build_config() assigns
        # them before the track_script block is rendered.
        self.routes = []
        self.vips = []

    def build_config_preamble(self):
        """Return the ``vrrp_script`` section as a list of config lines.

        The section is surrounded by empty strings so that it is separated
        from neighbouring sections once the lines are joined.

        :return: list of configuration lines
        """
        config = ['',
                  'vrrp_script %s_%s {' % (HEALTH_CHECK_NAME, self.vr_id),
                  ' script "%s"' % self._get_script_location(),
                  ' interval %s' % self.interval,
                  ' fall 2',
                  ' rise 2',
                  '}',
                  '']
        return config

    def _is_needed(self):
        """Check if track script is needed by checking amount of routes.

        :return: True/False
        """
        return bool(self.routes)

    def build_config(self):
        """Return the ``track_script`` block for the vrrp_instance section.

        :return: list of configuration lines; an empty list when there are
                 no routes to check
        """
        if not self._is_needed():
            # Always return a list so callers can extend/join the result
            # without caring which branch was taken.
            return []

        return [' track_script {',
                ' %s_%s' % (HEALTH_CHECK_NAME, self.vr_id),
                ' }']

    def build_script(self):
        """Return an iterator over the lines of the bash check script.

        The lines are: the shebang, the VIP presence guard (an empty string
        when no VIPs are set), then one ping command per route that has a
        next hop.

        :return: iterator of script lines
        """
        return itertools.chain(
            ['#!/bin/bash -eu'],
            [self._check_ip_assigned()],
            (self._add_ip_addr(route.nexthop)
             for route in self.routes if route.nexthop),
        )

    def _add_ip_addr(self, ip_addr):
        """Return the ping command line checking a single address.

        :param ip_addr: IPv4 or IPv6 address to ping
        :return: shell command that exits the script with status 1 when the
                 address does not answer
        """
        # netaddr reports version 4 or 6; IPv6 addresses need ping6.
        if netaddr.IPAddress(ip_addr).version == 6:
            cmd = 'ping6'
        else:
            cmd = 'ping'
        return '%s -c 1 -w 1 %s 1>/dev/null || exit 1' % (cmd, ip_addr)

    def _check_ip_assigned(self):
        """Return the guard command that looks for the first VIP.

        The command makes the script exit with status 0 when the address of
        the first VIP is not found in the ``ip a`` output, so the pings that
        follow are only run when that address is present.

        :return: shell command, or an empty string when no VIPs are set
        """
        if not self.vips:
            return ''
        first_vip = netaddr.IPNetwork(self.vips[0].ip_address).ip
        return 'ip a | grep %s || exit 0' % first_vip

    def _get_script_str(self):
        """Generates and returns bash script to verify connectivity.

        :return: Bash script code
        """
        return '\n'.join(self.build_script())

    def _get_script_location(self):
        """Return the path of the check script inside ``conf_dir``."""
        return os.path.join(self.conf_dir,
                            'ha_check_script_%s.sh' % self.vr_id)

    def write_check_script(self):
        """Write the check script to disk; do nothing when no routes exist."""
        if not self._is_needed():
            return

        # NOTE(review): 0o520 grants the group write but not read access;
        # confirm this mode is intended.
        file_utils.replace_file(
            self._get_script_location(), self._get_script_str(), 0o520)

View File

@ -43,6 +43,18 @@ OPTS = [
'keepalived server connection requests. '
'More threads create a higher CPU load '
'on the agent node.')),
cfg.IntOpt('ha_vrrp_health_check_interval',
default=0,
help=_('The VRRP health check interval in seconds. Values > 0 '
'enable VRRP health checks. Setting it to 0 disables '
'VRRP health checks. Recommended value is 5. '
'This will cause pings to be sent to the gateway '
'IP address(es) - requires ICMP_ECHO_REQUEST '
'to be enabled on the gateway. '
'If gateway fails, all routers will be reported '
'as master, and master election will be repeated '
'in round-robin fashion, until one of the router '
'restore the gateway connection.')),
]

View File

@ -591,6 +591,16 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
ha_device = ip_lib.IPDevice(device_name, router.ha_namespace)
ha_device.link.set_down()
@staticmethod
def fail_gw_router_port(router):
    # Set the link of the router's external network bridge down.
    bridge_name = router.driver.conf.external_network_bridge
    ip_lib.IPDevice(bridge_name).link.set_down()
@staticmethod
def restore_gw_router_port(router):
    # Set the link of the router's external network bridge back up.
    bridge_name = router.driver.conf.external_network_bridge
    ip_lib.IPDevice(bridge_name).link.set_up()
@classmethod
def _get_addresses_on_device(cls, namespace, interface):
return [address['cidr'] for address in

View File

@ -336,6 +336,54 @@ class L3HATestFailover(framework.L3AgentTestFramework):
self.assertEqual(master_router, new_slave)
self.assertEqual(slave_router, new_master)
def test_ha_router_lost_gw_connection(self):
    # Turn the VRRP health check on (5 second interval) for both agents.
    for agent in (self.agent, self.failover_agent):
        agent.conf.set_override('ha_vrrp_health_check_interval', 5)

    first, second = self.create_ha_routers()
    old_master, old_slave = self._get_master_and_slave_routers(
        first, second)

    self.fail_gw_router_port(old_master)

    # NOTE: the former slave is passed first because it is the router
    # expected to have taken over as master.
    new_master, new_slave = self._get_master_and_slave_routers(
        old_slave, old_master)

    self.assertEqual(old_master, new_slave)
    self.assertEqual(old_slave, new_master)
def test_both_ha_router_lost_gw_connection(self):
    # Turn the VRRP health check on (5 second interval) for both agents.
    for agent in (self.agent, self.failover_agent):
        agent.conf.set_override('ha_vrrp_health_check_interval', 5)

    first, second = self.create_ha_routers()
    master_router, slave_router = self._get_master_and_slave_routers(
        first, second)
    both_routers = (master_router, slave_router)

    # Take the gateway port down for the master, then for the slave.
    for ha_router in both_routers:
        self.fail_gw_router_port(ha_router)

    # Wait for each router, in the same order, to report "master".
    for ha_router in both_routers:
        common_utils.wait_until_true(
            lambda r=ha_router: r.ha_state == 'master')

    self.restore_gw_router_port(master_router)

    new_master, new_slave = self._get_master_and_slave_routers(
        master_router, slave_router)

    self.assertEqual(master_router, new_master)
    self.assertEqual(slave_router, new_slave)
class LinuxBridgeL3HATestCase(L3HATestCase):
INTERFACE_DRIVER = 'neutron.agent.linux.interface.BridgeInterfaceDriver'

View File

@ -11,11 +11,16 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
from neutron_lib import constants as n_consts
import os
import mock
import testtools
import textwrap
from neutron_lib import constants as n_consts
from neutron.agent.linux import keepalived
from neutron.tests import base
@ -29,6 +34,8 @@ KEEPALIVED_GLOBAL_CONFIG = textwrap.dedent("""\
}""") % dict(
email_from=keepalived.KEEPALIVED_EMAIL_FROM,
router_id=keepalived.KEEPALIVED_ROUTER_ID)
VRRP_ID = 1
VRRP_INTERVAL = 5
class KeepalivedGetFreeRangeTestCase(base.BaseTestCase):
@ -316,7 +323,32 @@ class KeepalivedInstanceTestCase(base.BaseTestCase,
}
}""")
instance = keepalived.KeepalivedInstance(
'MASTER', 'eth0', 1, ['169.254.192.0/18'])
'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18'])
self.assertEqual(expected, os.linesep.join(instance.build_config()))
# Checks the config rendered for an instance that has a track script but no
# VIPs and no routes: the vrrp_script preamble comes before vrrp_instance.
# NOTE(review): leading whitespace and blank lines inside the expected
# literal appear to have been lost in this view - verify against the real
# file before editing the literal.
def test_build_config_no_vips_track_script(self):
expected = """
vrrp_script ha_health_check_1 {
script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"
interval 5
fall 2
rise 2
}
vrrp_instance VR_1 {
state MASTER
interface eth0
virtual_router_id 1
priority 50
garp_master_delay 60
virtual_ipaddress {
169.254.0.1/24 dev eth0
}
}"""
instance = keepalived.KeepalivedInstance(
'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18'])
# The track script is attached directly; no routes are set on it, so the
# expected output contains no track_script block inside vrrp_instance.
instance.track_script = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
self.assertEqual(expected, '\n'.join(instance.build_config()))
@ -346,3 +378,74 @@ class KeepalivedVirtualRouteTestCase(base.BaseTestCase):
def test_virtual_route_without_dev(self):
    # A route created without a device renders as "<destination> via <hop>".
    rendered = keepalived.KeepalivedVirtualRoute(
        '50.0.0.0/8', '1.2.3.4').build_config()
    self.assertEqual('50.0.0.0/8 via 1.2.3.4', rendered)
# Unit tests for keepalived.KeepalivedTrackScript: rendered config sections,
# generated script text, and writing of the script file.
# NOTE(review): leading whitespace and blank lines inside the multi-line
# expected literals appear to have been lost in this view - verify against
# the real file before editing them.
class KeepalivedTrackScriptTestCase(base.BaseTestCase):
# The preamble is the vrrp_script section wrapped in two empty strings.
def test_build_config_preamble(self):
exp_conf = [
'',
'vrrp_script ha_health_check_1 {',
' script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"',
' interval 5',
' fall 2',
' rise 2',
'}',
'']
ts = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
self.assertEqual(exp_conf, ts.build_config_preamble())
# With one route set, the track_script block naming the check is rendered.
# get_config_str() is not defined on KeepalivedTrackScript itself;
# presumably it is inherited from KeepalivedConf - TODO confirm.
def test_get_config_str(self):
ts = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
ts.routes = [
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ]
self.assertEqual(''' track_script {
ha_health_check_1
}''',
ts.get_config_str())
# With a VIP and a route: shebang, VIP presence guard, then one ping for the
# route's next hop.
def test_get_script_str(self):
ts = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
ts.routes = [
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ]
ts.vips = [
keepalived.KeepalivedVipAddress('192.168.0.3/18', 'ha-xxx'), ]
self.assertEqual("""#!/bin/bash -eu
ip a | grep 192.168.0.3 || exit 0
ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1""",
ts._get_script_str())
# No VIPs and no routes: only the shebang followed by an empty guard line.
def test_get_script_str_no_routes(self):
ts = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
self.assertEqual('#!/bin/bash -eu\n', ts._get_script_str())
# The script is written through file_utils.replace_file with mode 0o520;
# the IPv4 next hop is checked with ping and the IPv6 one with ping6.
def test_write_check_script(self):
conf_dir = '/etc/ha_confs/qrouter-x'
ts = keepalived.KeepalivedTrackScript(VRRP_INTERVAL, conf_dir, VRRP_ID)
ts.routes = [
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'),
keepalived.KeepalivedVirtualRoute('2001:db8::1', '2001:db8::1'), ]
# file_utils is patched so nothing is written to disk.
with mock.patch.object(keepalived, 'file_utils') as patched_utils:
ts.write_check_script()
patched_utils.replace_file.assert_called_with(
os.path.join(conf_dir, 'ha_check_script_1.sh'),
"""#!/bin/bash -eu
ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1
ping6 -c 1 -w 1 2001:db8::1 1>/dev/null || exit 1""",
0o520
)
# Without routes write_check_script() returns early, so no file is written.
def test_write_check_script_no_routes(self):
conf_dir = '/etc/ha_confs/qrouter-x'
ts = keepalived.KeepalivedTrackScript(
VRRP_INTERVAL, conf_dir, VRRP_ID)
with mock.patch.object(keepalived, 'file_utils') as patched_utils:
ts.write_check_script()
patched_utils.replace_file.assert_not_called()

View File

@ -0,0 +1,11 @@
---
prelude: >
Keepalived VRRP health check functionality to enable verification of
connectivity from the "master" router to all gateways.
features:
- Activation of this feature enables gateway connectivity validation and
rescheduling of the "master" router to another node when connectivity
is lost. If all routers lose connectivity to the gateways, the election
process will be repeated round-robin until one of the routers restores
its gateway connection. In the meantime, all of the routers will be
reported as "master".