From 185d6cbc648fd041402a5034b04b818da5c7136e Mon Sep 17 00:00:00 2001 From: Lubosz Kosnik Date: Thu, 28 Jan 2016 14:44:00 +0100 Subject: [PATCH] Add support for Keepalived VRRP health check Adds functionality to generate bash script which verifies health of current keepalived instance by pinging all available and configured GW addresses. This functionality supports IPv4 and IPv6 by detecting needed ping version using netaddr. DocImpact: Added a new parameter to 'l3_agent.ini' named 'ha_vrrp_health_check_interval' which is by default set to 0 (disabled). Values > 0 designate health check functionality should be enabled. Requires allowed ICMP ECHO_REQUEST because that is disabled by default. Co-Authored-By: Artur Korzeniewski Change-Id: Ib4d0691f432830357ea3f113036719645bc59a62 Closes-Bug: #1365461 --- neutron/agent/l3/ha_router.py | 5 +- neutron/agent/linux/keepalived.py | 115 ++++++++++++++++-- neutron/conf/agent/l3/ha.py | 12 ++ .../tests/functional/agent/l3/framework.py | 10 ++ .../functional/agent/l3/test_ha_router.py | 48 ++++++++ .../tests/unit/agent/linux/test_keepalived.py | 107 +++++++++++++++- ...ved-vrrp-healt-check-f23ed7c853151484.yaml | 11 ++ 7 files changed, 298 insertions(+), 10 deletions(-) create mode 100644 releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py index e02aadbe4bd..426f2c5733d 100644 --- a/neutron/agent/l3/ha_router.py +++ b/neutron/agent/l3/ha_router.py @@ -129,7 +129,10 @@ class HaRouter(router.RouterInfo): ha_port_cidrs, nopreempt=True, advert_int=self.agent_conf.ha_vrrp_advert_int, - priority=self.ha_priority) + priority=self.ha_priority, + vrrp_health_check_interval=( + self.agent_conf.ha_vrrp_health_check_interval), + ha_conf_dir=self.keepalived_manager.get_conf_dir()) instance.track_interfaces.append(interface_name) if self.agent_conf.ha_vrrp_auth_password: diff --git a/neutron/agent/linux/keepalived.py 
b/neutron/agent/linux/keepalived.py index 0ed3e8d2791..c5ba8c83f8c 100644 --- a/neutron/agent/linux/keepalived.py +++ b/neutron/agent/linux/keepalived.py @@ -15,6 +15,7 @@ import errno import itertools import os +import six import netaddr from neutron_lib import exceptions @@ -35,6 +36,7 @@ KEEPALIVED_SERVICE_NAME = 'keepalived' KEEPALIVED_EMAIL_FROM = 'neutron@openstack.local' KEEPALIVED_ROUTER_ID = 'neutron' GARP_MASTER_DELAY = 60 +HEALTH_CHECK_NAME = 'ha_health_check' LOG = logging.getLogger(__name__) @@ -160,7 +162,9 @@ class KeepalivedInstance(object): def __init__(self, state, interface, vrouter_id, ha_cidrs, priority=HA_DEFAULT_PRIORITY, advert_int=None, mcast_src_ip=None, nopreempt=False, - garp_master_delay=GARP_MASTER_DELAY): + garp_master_delay=GARP_MASTER_DELAY, + vrrp_health_check_interval=0, + ha_conf_dir=None): self.name = 'VR_%s' % vrouter_id if state not in VALID_STATES: @@ -178,12 +182,17 @@ class KeepalivedInstance(object): self.vips = [] self.virtual_routes = KeepalivedInstanceRoutes() self.authentication = None + self.track_script = None self.primary_vip_range = get_free_range( parent_range=constants.PRIVATE_CIDR_RANGE, excluded_ranges=[constants.METADATA_CIDR, constants.DVR_FIP_LL_CIDR] + ha_cidrs, size=PRIMARY_VIP_RANGE_SIZE) + if vrrp_health_check_interval > 0: + self.track_script = KeepalivedTrackScript( + vrrp_health_check_interval, ha_conf_dir, self.vrouter_id) + def set_authentication(self, auth_type, password): if auth_type not in VALID_AUTH_TYPES: raise InvalidAuthenticationTypeException(auth_type=auth_type) @@ -267,12 +276,19 @@ class KeepalivedInstance(object): [' }']) def build_config(self): - config = ['vrrp_instance %s {' % self.name, - ' state %s' % self.state, - ' interface %s' % self.interface, - ' virtual_router_id %s' % self.vrouter_id, - ' priority %s' % self.priority, - ' garp_master_delay %s' % self.garp_master_delay] + if self.track_script: + config = self.track_script.build_config_preamble() + self.track_script.routes = 
self.virtual_routes.gateway_routes + self.track_script.vips = self.vips + else: + config = [] + + config.extend(['vrrp_instance %s {' % self.name, + ' state %s' % self.state, + ' interface %s' % self.interface, + ' virtual_router_id %s' % self.vrouter_id, + ' priority %s' % self.priority, + ' garp_master_delay %s' % self.garp_master_delay]) if self.nopreempt: config.append(' nopreempt') @@ -299,6 +315,9 @@ class KeepalivedInstance(object): if len(self.virtual_routes): config.extend(self.virtual_routes.build_config()) + if self.track_script: + config.extend(self.track_script.build_config()) + config.append('}') return config @@ -406,6 +425,10 @@ class KeepalivedManager(object): keepalived_pm.enable(reload_cfg=True) + for key, instance in six.iteritems(self.config.instances): + if instance.track_script: + instance.track_script.write_check_script() + self.process_monitor.register(uuid=self.resource_id, service_name=KEEPALIVED_SERVICE_NAME, monitored_process=keepalived_pm) @@ -453,3 +476,81 @@ class KeepalivedManager(object): return cmd return callback + + +class KeepalivedTrackScript(KeepalivedConf): + """Track script generator for Keepalived""" + + def __init__(self, interval, conf_dir, vr_id): + self.interval = interval + self.conf_dir = conf_dir + self.vr_id = vr_id + self.routes = [] + self.vips = [] + + def build_config_preamble(self): + config = ['', + 'vrrp_script %s_%s {' % (HEALTH_CHECK_NAME, self.vr_id), + ' script "%s"' % self._get_script_location(), + ' interval %s' % self.interval, + ' fall 2', + ' rise 2', + '}', + ''] + + return config + + def _is_needed(self): + """Check if track script is needed by checking amount of routes. 
+ + :return: True/False + """ + return len(self.routes) > 0 + + def build_config(self): + if not self._is_needed(): + return '' + + config = [' track_script {', + ' %s_%s' % (HEALTH_CHECK_NAME, self.vr_id), + ' }'] + + return config + + def build_script(self): + return itertools.chain(['#!/bin/bash -eu'], + ['%s' % self._check_ip_assigned()], + ('%s' % self._add_ip_addr(route.nexthop) + for route in self.routes if route.nexthop), + ) + + def _add_ip_addr(self, ip_addr): + cmd = { + 4: 'ping', + 6: 'ping6', + }.get(netaddr.IPAddress(ip_addr).version) + + return '%s -c 1 -w 1 %s 1>/dev/null || exit 1' % (cmd, ip_addr) + + def _check_ip_assigned(self): + cmd = 'ip a | grep %s || exit 0' + return cmd % netaddr.IPNetwork(self.vips[0].ip_address).ip if len( + self.vips) else '' + + def _get_script_str(self): + """Generates and returns bash script to verify connectivity. + + :return: Bash script code + """ + return '\n'.join(self.build_script()) + + def _get_script_location(self): + return os.path.join(self.conf_dir, + 'ha_check_script_%s.sh' % self.vr_id) + + def write_check_script(self): + if not self._is_needed(): + return + + file_utils.replace_file( + self._get_script_location(), self._get_script_str(), 0o520) diff --git a/neutron/conf/agent/l3/ha.py b/neutron/conf/agent/l3/ha.py index 8bb539aed50..9ad20231460 100644 --- a/neutron/conf/agent/l3/ha.py +++ b/neutron/conf/agent/l3/ha.py @@ -43,6 +43,18 @@ OPTS = [ 'keepalived server connection requests. ' 'More threads create a higher CPU load ' 'on the agent node.')), + cfg.IntOpt('ha_vrrp_health_check_interval', + default=0, + help=_('The VRRP health check interval in seconds. Values > 0 ' + 'enable VRRP health checks. Setting it to 0 disables ' + 'VRRP health checks. Recommended value is 5. ' + 'This will cause pings to be sent to the gateway ' + 'IP address(es) - requires ICMP_ECHO_REQUEST ' + 'to be enabled on the gateway. 
' + 'If gateway fails, all routers will be reported ' + 'as master, and master election will be repeated ' + 'in round-robin fashion, until one of the router ' + 'restore the gateway connection.')), ] diff --git a/neutron/tests/functional/agent/l3/framework.py b/neutron/tests/functional/agent/l3/framework.py index b0481b5ac5a..015d9cb287b 100644 --- a/neutron/tests/functional/agent/l3/framework.py +++ b/neutron/tests/functional/agent/l3/framework.py @@ -591,6 +591,16 @@ class L3AgentTestFramework(base.BaseSudoTestCase): ha_device = ip_lib.IPDevice(device_name, router.ha_namespace) ha_device.link.set_down() + @staticmethod + def fail_gw_router_port(router): + r_br = ip_lib.IPDevice(router.driver.conf.external_network_bridge) + r_br.link.set_down() + + @staticmethod + def restore_gw_router_port(router): + r_br = ip_lib.IPDevice(router.driver.conf.external_network_bridge) + r_br.link.set_up() + @classmethod def _get_addresses_on_device(cls, namespace, interface): return [address['cidr'] for address in diff --git a/neutron/tests/functional/agent/l3/test_ha_router.py b/neutron/tests/functional/agent/l3/test_ha_router.py index ea2b5aa44d8..886d9af5265 100644 --- a/neutron/tests/functional/agent/l3/test_ha_router.py +++ b/neutron/tests/functional/agent/l3/test_ha_router.py @@ -336,6 +336,54 @@ class L3HATestFailover(framework.L3AgentTestFramework): self.assertEqual(master_router, new_slave) self.assertEqual(slave_router, new_master) + def test_ha_router_lost_gw_connection(self): + self.agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + self.failover_agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + + router1, router2 = self.create_ha_routers() + + master_router, slave_router = self._get_master_and_slave_routers( + router1, router2) + + self.fail_gw_router_port(master_router) + + # NOTE: passing slave_router as first argument, because we expect + # that this router should be the master + new_master, new_slave = 
self._get_master_and_slave_routers( + slave_router, master_router) + + self.assertEqual(master_router, new_slave) + self.assertEqual(slave_router, new_master) + + def test_both_ha_router_lost_gw_connection(self): + self.agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + self.failover_agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + + router1, router2 = self.create_ha_routers() + + master_router, slave_router = self._get_master_and_slave_routers( + router1, router2) + + self.fail_gw_router_port(master_router) + self.fail_gw_router_port(slave_router) + + common_utils.wait_until_true( + lambda: master_router.ha_state == 'master') + common_utils.wait_until_true( + lambda: slave_router.ha_state == 'master') + + self.restore_gw_router_port(master_router) + + new_master, new_slave = self._get_master_and_slave_routers( + master_router, slave_router) + + self.assertEqual(master_router, new_master) + self.assertEqual(slave_router, new_slave) + class LinuxBridgeL3HATestCase(L3HATestCase): INTERFACE_DRIVER = 'neutron.agent.linux.interface.BridgeInterfaceDriver' diff --git a/neutron/tests/unit/agent/linux/test_keepalived.py b/neutron/tests/unit/agent/linux/test_keepalived.py index f59dca2618b..b881f7b5d16 100644 --- a/neutron/tests/unit/agent/linux/test_keepalived.py +++ b/neutron/tests/unit/agent/linux/test_keepalived.py @@ -11,11 +11,16 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
+# -from neutron_lib import constants as n_consts +import os + +import mock import testtools import textwrap +from neutron_lib import constants as n_consts + from neutron.agent.linux import keepalived from neutron.tests import base @@ -29,6 +34,8 @@ KEEPALIVED_GLOBAL_CONFIG = textwrap.dedent("""\ }""") % dict( email_from=keepalived.KEEPALIVED_EMAIL_FROM, router_id=keepalived.KEEPALIVED_ROUTER_ID) +VRRP_ID = 1 +VRRP_INTERVAL = 5 class KeepalivedGetFreeRangeTestCase(base.BaseTestCase): @@ -316,7 +323,32 @@ class KeepalivedInstanceTestCase(base.BaseTestCase, } }""") instance = keepalived.KeepalivedInstance( - 'MASTER', 'eth0', 1, ['169.254.192.0/18']) + 'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18']) + self.assertEqual(expected, os.linesep.join(instance.build_config())) + + def test_build_config_no_vips_track_script(self): + expected = """ +vrrp_script ha_health_check_1 { + script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh" + interval 5 + fall 2 + rise 2 +} + +vrrp_instance VR_1 { + state MASTER + interface eth0 + virtual_router_id 1 + priority 50 + garp_master_delay 60 + virtual_ipaddress { + 169.254.0.1/24 dev eth0 + } +}""" + instance = keepalived.KeepalivedInstance( + 'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18']) + instance.track_script = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) self.assertEqual(expected, '\n'.join(instance.build_config())) @@ -346,3 +378,74 @@ class KeepalivedVirtualRouteTestCase(base.BaseTestCase): def test_virtual_route_without_dev(self): route = keepalived.KeepalivedVirtualRoute('50.0.0.0/8', '1.2.3.4') self.assertEqual('50.0.0.0/8 via 1.2.3.4', route.build_config()) + + +class KeepalivedTrackScriptTestCase(base.BaseTestCase): + + def test_build_config_preamble(self): + exp_conf = [ + '', + 'vrrp_script ha_health_check_1 {', + ' script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"', + ' interval 5', + ' fall 2', + ' rise 2', + '}', + ''] + ts = keepalived.KeepalivedTrackScript( + 
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + self.assertEqual(exp_conf, ts.build_config_preamble()) + + def test_get_config_str(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ] + self.assertEqual(''' track_script { + ha_health_check_1 + }''', + ts.get_config_str()) + + def test_get_script_str(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ] + ts.vips = [ + keepalived.KeepalivedVipAddress('192.168.0.3/18', 'ha-xxx'), ] + + self.assertEqual("""#!/bin/bash -eu +ip a | grep 192.168.0.3 || exit 0 +ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1""", + ts._get_script_str()) + + def test_get_script_str_no_routes(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + + self.assertEqual('#!/bin/bash -eu\n', ts._get_script_str()) + + def test_write_check_script(self): + conf_dir = '/etc/ha_confs/qrouter-x' + ts = keepalived.KeepalivedTrackScript(VRRP_INTERVAL, conf_dir, VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), + keepalived.KeepalivedVirtualRoute('2001:db8::1', '2001:db8::1'), ] + with mock.patch.object(keepalived, 'file_utils') as patched_utils: + ts.write_check_script() + patched_utils.replace_file.assert_called_with( + os.path.join(conf_dir, 'ha_check_script_1.sh'), + """#!/bin/bash -eu + +ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1 +ping6 -c 1 -w 1 2001:db8::1 1>/dev/null || exit 1""", + 0o520 + ) + + def test_write_check_script_no_routes(self): + conf_dir = '/etc/ha_confs/qrouter-x' + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, conf_dir, VRRP_ID) + with mock.patch.object(keepalived, 'file_utils') as patched_utils: + ts.write_check_script() + patched_utils.replace_file.assert_not_called() diff 
--git a/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml b/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml new file mode 100644 index 00000000000..a3eb638ddf3 --- /dev/null +++ b/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml @@ -0,0 +1,11 @@ +--- +prelude: > + Keepalived VRRP health check functionality to enable verification of + connectivity from the "master" router to all gateways. +features: + - Activation of this feature enables gateway connectivity validation and + rescheduling of the "master" router to another node when connectivity + is lost. If all routers lose connectivity to the gateways, the election + process will be repeated round-robin until one of the routers restores + its gateway connection. In the meantime, all of the routers will be + reported as "master".