diff --git a/neutron/agent/l3/ha.py b/neutron/agent/l3/ha.py index d2b1dad6a2d..3760d6e407b 100644 --- a/neutron/agent/l3/ha.py +++ b/neutron/agent/l3/ha.py @@ -53,6 +53,18 @@ OPTS = [ 'keepalived server connection requests.' 'More threads create a higher CPU load ' 'on the agent node.')), + cfg.IntOpt('ha_vrrp_health_check_interval', + default=0, + help=_('The VRRP health check interval in seconds. Values > 0 ' + 'enable VRRP health checks. Setting it to 0 disables ' + 'VRRP health checks. Recommended value is 5. ' + 'This will cause pings to be sent to the gateway ' + 'IP address(es) - requires ICMP_ECHO_REQUEST ' + 'to be enabled on the gateway. ' + 'If gateway fails, all routers will be reported ' + 'as master, and master election will be repeated ' + 'in round-robin fashion, until one of the router ' + 'restore the gateway connection.')), ] TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE, diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py index b87929faa57..4508a9ec69d 100644 --- a/neutron/agent/l3/ha_router.py +++ b/neutron/agent/l3/ha_router.py @@ -130,7 +130,10 @@ class HaRouter(router.RouterInfo): ha_port_cidrs, nopreempt=True, advert_int=self.agent_conf.ha_vrrp_advert_int, - priority=self.ha_priority) + priority=self.ha_priority, + vrrp_health_check_interval=( + self.agent_conf.ha_vrrp_health_check_interval), + ha_conf_dir=self.keepalived_manager.get_conf_dir()) instance.track_interfaces.append(interface_name) if self.agent_conf.ha_vrrp_auth_password: diff --git a/neutron/agent/linux/keepalived.py b/neutron/agent/linux/keepalived.py index 35b4a16f16b..6a10a3f55da 100644 --- a/neutron/agent/linux/keepalived.py +++ b/neutron/agent/linux/keepalived.py @@ -15,9 +15,11 @@ import errno import itertools import os +import six import netaddr from neutron_lib import exceptions +from neutron_lib.utils import file as file_utils from oslo_config import cfg from oslo_log import log as logging @@ -34,6 +36,7 @@ 
KEEPALIVED_SERVICE_NAME = 'keepalived' KEEPALIVED_EMAIL_FROM = 'neutron@openstack.local' KEEPALIVED_ROUTER_ID = 'neutron' GARP_MASTER_DELAY = 60 +HEALTH_CHECK_NAME = 'ha_health_check' LOG = logging.getLogger(__name__) @@ -159,7 +162,9 @@ class KeepalivedInstance(object): def __init__(self, state, interface, vrouter_id, ha_cidrs, priority=HA_DEFAULT_PRIORITY, advert_int=None, mcast_src_ip=None, nopreempt=False, - garp_master_delay=GARP_MASTER_DELAY): + garp_master_delay=GARP_MASTER_DELAY, + vrrp_health_check_interval=0, + ha_conf_dir=None): self.name = 'VR_%s' % vrouter_id if state not in VALID_STATES: @@ -177,12 +182,17 @@ class KeepalivedInstance(object): self.vips = [] self.virtual_routes = KeepalivedInstanceRoutes() self.authentication = None + self.track_script = None self.primary_vip_range = get_free_range( parent_range=constants.PRIVATE_CIDR_RANGE, excluded_ranges=[constants.METADATA_CIDR, constants.DVR_FIP_LL_CIDR] + ha_cidrs, size=PRIMARY_VIP_RANGE_SIZE) + if vrrp_health_check_interval > 0: + self.track_script = KeepalivedTrackScript( + vrrp_health_check_interval, ha_conf_dir, self.vrouter_id) + def set_authentication(self, auth_type, password): if auth_type not in VALID_AUTH_TYPES: raise InvalidAuthenticationTypeException(auth_type=auth_type) @@ -266,12 +276,19 @@ class KeepalivedInstance(object): [' }']) def build_config(self): - config = ['vrrp_instance %s {' % self.name, - ' state %s' % self.state, - ' interface %s' % self.interface, - ' virtual_router_id %s' % self.vrouter_id, - ' priority %s' % self.priority, - ' garp_master_delay %s' % self.garp_master_delay] + if self.track_script: + config = self.track_script.build_config_preamble() + self.track_script.routes = self.virtual_routes.gateway_routes + self.track_script.vips = self.vips + else: + config = [] + + config.extend(['vrrp_instance %s {' % self.name, + ' state %s' % self.state, + ' interface %s' % self.interface, + ' virtual_router_id %s' % self.vrouter_id, + ' priority %s' % self.priority, + 
' garp_master_delay %s' % self.garp_master_delay]) if self.nopreempt: config.append(' nopreempt') @@ -298,6 +315,9 @@ class KeepalivedInstance(object): if len(self.virtual_routes): config.extend(self.virtual_routes.build_config()) + if self.track_script: + config.extend(self.track_script.build_config()) + config.append('}') return config @@ -405,6 +425,10 @@ class KeepalivedManager(object): keepalived_pm.enable(reload_cfg=True) + for key, instance in six.iteritems(self.config.instances): + if instance.track_script: + instance.track_script.write_check_script() + self.process_monitor.register(uuid=self.resource_id, service_name=KEEPALIVED_SERVICE_NAME, monitored_process=keepalived_pm) @@ -452,3 +476,81 @@ class KeepalivedManager(object): return cmd return callback + + +class KeepalivedTrackScript(KeepalivedConf): + """Track script generator for Keepalived""" + + def __init__(self, interval, conf_dir, vr_id): + self.interval = interval + self.conf_dir = conf_dir + self.vr_id = vr_id + self.routes = [] + self.vips = [] + + def build_config_preamble(self): + config = ['', + 'vrrp_script %s_%s {' % (HEALTH_CHECK_NAME, self.vr_id), + ' script "%s"' % self._get_script_location(), + ' interval %s' % self.interval, + ' fall 2', + ' rise 2', + '}', + ''] + + return config + + def _is_needed(self): + """Check if track script is needed by checking amount of routes. 
+ + :return: True/False + """ + return len(self.routes) > 0 + + def build_config(self): + if not self._is_needed(): + return '' + + config = [' track_script {', + ' %s_%s' % (HEALTH_CHECK_NAME, self.vr_id), + ' }'] + + return config + + def build_script(self): + return itertools.chain(['#!/bin/bash -eu'], + ['%s' % self._check_ip_assigned()], + ('%s' % self._add_ip_addr(route.nexthop) + for route in self.routes if route.nexthop), + ) + + def _add_ip_addr(self, ip_addr): + cmd = { + 4: 'ping', + 6: 'ping6', + }.get(netaddr.IPAddress(ip_addr).version) + + return '%s -c 1 -w 1 %s 1>/dev/null || exit 1' % (cmd, ip_addr) + + def _check_ip_assigned(self): + cmd = 'ip a | grep %s || exit 0' + return cmd % netaddr.IPNetwork(self.vips[0].ip_address).ip if len( + self.vips) else '' + + def _get_script_str(self): + """Generates and returns bash script to verify connectivity. + + :return: Bash script code + """ + return '\n'.join(self.build_script()) + + def _get_script_location(self): + return os.path.join(self.conf_dir, + 'ha_check_script_%s.sh' % self.vr_id) + + def write_check_script(self): + if not self._is_needed(): + return + + file_utils.replace_file( + self._get_script_location(), self._get_script_str(), 0o520) diff --git a/neutron/tests/functional/agent/l3/framework.py b/neutron/tests/functional/agent/l3/framework.py index 7da9daf44b0..16f9954e0f8 100644 --- a/neutron/tests/functional/agent/l3/framework.py +++ b/neutron/tests/functional/agent/l3/framework.py @@ -587,6 +587,16 @@ class L3AgentTestFramework(base.BaseSudoTestCase): ha_device = ip_lib.IPDevice(device_name, router.ha_namespace) ha_device.link.set_down() + @staticmethod + def fail_gw_router_port(router): + r_br = ip_lib.IPDevice(router.driver.conf.external_network_bridge) + r_br.link.set_down() + + @staticmethod + def restore_gw_router_port(router): + r_br = ip_lib.IPDevice(router.driver.conf.external_network_bridge) + r_br.link.set_up() + @classmethod def _get_addresses_on_device(cls, namespace, 
interface): return [address['cidr'] for address in diff --git a/neutron/tests/functional/agent/l3/test_ha_router.py b/neutron/tests/functional/agent/l3/test_ha_router.py index d10bfc08ff9..953cda3e6cf 100644 --- a/neutron/tests/functional/agent/l3/test_ha_router.py +++ b/neutron/tests/functional/agent/l3/test_ha_router.py @@ -335,6 +335,54 @@ class L3HATestFailover(framework.L3AgentTestFramework): self.assertEqual(master_router, new_slave) self.assertEqual(slave_router, new_master) + def test_ha_router_lost_gw_connection(self): + self.agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + self.failover_agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + + router1, router2 = self.create_ha_routers() + + master_router, slave_router = self._get_master_and_slave_routers( + router1, router2) + + self.fail_gw_router_port(master_router) + + # NOTE: passing slave_router as first argument, because we expect + # that this router should be the master + new_master, new_slave = self._get_master_and_slave_routers( + slave_router, master_router) + + self.assertEqual(master_router, new_slave) + self.assertEqual(slave_router, new_master) + + def test_both_ha_router_lost_gw_connection(self): + self.agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + self.failover_agent.conf.set_override( + 'ha_vrrp_health_check_interval', 5) + + router1, router2 = self.create_ha_routers() + + master_router, slave_router = self._get_master_and_slave_routers( + router1, router2) + + self.fail_gw_router_port(master_router) + self.fail_gw_router_port(slave_router) + + common_utils.wait_until_true( + lambda: master_router.ha_state == 'master') + common_utils.wait_until_true( + lambda: slave_router.ha_state == 'master') + + self.restore_gw_router_port(master_router) + + new_master, new_slave = self._get_master_and_slave_routers( + master_router, slave_router) + + self.assertEqual(master_router, new_master) + self.assertEqual(slave_router, new_slave) + class 
LinuxBridgeL3HATestCase(L3HATestCase): INTERFACE_DRIVER = 'neutron.agent.linux.interface.BridgeInterfaceDriver' diff --git a/neutron/tests/unit/agent/linux/test_keepalived.py b/neutron/tests/unit/agent/linux/test_keepalived.py index f59dca2618b..b881f7b5d16 100644 --- a/neutron/tests/unit/agent/linux/test_keepalived.py +++ b/neutron/tests/unit/agent/linux/test_keepalived.py @@ -11,11 +11,16 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. +# -from neutron_lib import constants as n_consts +import os + +import mock import testtools import textwrap +from neutron_lib import constants as n_consts + from neutron.agent.linux import keepalived from neutron.tests import base @@ -29,6 +34,8 @@ KEEPALIVED_GLOBAL_CONFIG = textwrap.dedent("""\ }""") % dict( email_from=keepalived.KEEPALIVED_EMAIL_FROM, router_id=keepalived.KEEPALIVED_ROUTER_ID) +VRRP_ID = 1 +VRRP_INTERVAL = 5 class KeepalivedGetFreeRangeTestCase(base.BaseTestCase): @@ -316,7 +323,32 @@ class KeepalivedInstanceTestCase(base.BaseTestCase, } }""") instance = keepalived.KeepalivedInstance( - 'MASTER', 'eth0', 1, ['169.254.192.0/18']) + 'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18']) + self.assertEqual(expected, os.linesep.join(instance.build_config())) + + def test_build_config_no_vips_track_script(self): + expected = """ +vrrp_script ha_health_check_1 { + script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh" + interval 5 + fall 2 + rise 2 +} + +vrrp_instance VR_1 { + state MASTER + interface eth0 + virtual_router_id 1 + priority 50 + garp_master_delay 60 + virtual_ipaddress { + 169.254.0.1/24 dev eth0 + } +}""" + instance = keepalived.KeepalivedInstance( + 'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18']) + instance.track_script = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) self.assertEqual(expected, '\n'.join(instance.build_config())) @@ -346,3 
+378,74 @@ class KeepalivedVirtualRouteTestCase(base.BaseTestCase): def test_virtual_route_without_dev(self): route = keepalived.KeepalivedVirtualRoute('50.0.0.0/8', '1.2.3.4') self.assertEqual('50.0.0.0/8 via 1.2.3.4', route.build_config()) + + +class KeepalivedTrackScriptTestCase(base.BaseTestCase): + + def test_build_config_preamble(self): + exp_conf = [ + '', + 'vrrp_script ha_health_check_1 {', + ' script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"', + ' interval 5', + ' fall 2', + ' rise 2', + '}', + ''] + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + self.assertEqual(exp_conf, ts.build_config_preamble()) + + def test_get_config_str(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ] + self.assertEqual(''' track_script { + ha_health_check_1 + }''', + ts.get_config_str()) + + def test_get_script_str(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ] + ts.vips = [ + keepalived.KeepalivedVipAddress('192.168.0.3/18', 'ha-xxx'), ] + + self.assertEqual("""#!/bin/bash -eu +ip a | grep 192.168.0.3 || exit 0 +ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1""", + ts._get_script_str()) + + def test_get_script_str_no_routes(self): + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID) + + self.assertEqual('#!/bin/bash -eu\n', ts._get_script_str()) + + def test_write_check_script(self): + conf_dir = '/etc/ha_confs/qrouter-x' + ts = keepalived.KeepalivedTrackScript(VRRP_INTERVAL, conf_dir, VRRP_ID) + ts.routes = [ + keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), + keepalived.KeepalivedVirtualRoute('2001:db8::1', '2001:db8::1'), ] + with mock.patch.object(keepalived, 'file_utils') as patched_utils: + 
ts.write_check_script() + patched_utils.replace_file.assert_called_with( + os.path.join(conf_dir, 'ha_check_script_1.sh'), + """#!/bin/bash -eu + +ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1 +ping6 -c 1 -w 1 2001:db8::1 1>/dev/null || exit 1""", + 0o520 + ) + + def test_write_check_script_no_routes(self): + conf_dir = '/etc/ha_confs/qrouter-x' + ts = keepalived.KeepalivedTrackScript( + VRRP_INTERVAL, conf_dir, VRRP_ID) + with mock.patch.object(keepalived, 'file_utils') as patched_utils: + ts.write_check_script() + patched_utils.replace_file.assert_not_called() diff --git a/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml b/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml new file mode 100644 index 00000000000..a3eb638ddf3 --- /dev/null +++ b/releasenotes/notes/add-keepalived-vrrp-healt-check-f23ed7c853151484.yaml @@ -0,0 +1,11 @@ +--- +prelude: > + Keepalived VRRP health check functionality to enable verification of + connectivity from the "master" router to all gateways. +features: + - Activation of this feature enables gateway connectivity validation and + rescheduling of the "master" router to another node when connectivity + is lost. If all routers lose connectivity to the gateways, the election + process will be repeated round-robin until one of the routers restores + its gateway connection. In the meantime, all of the routers will be + reported as "master".