Files
neutron/neutron/agent/linux/keepalived.py
Assaf Muller 303cbc6b5b Fix L3 HA with IPv6
We currently use garp_master_repeat and garp_master_refresh
to solve bug 1453855. We need to spawn keepalived only after
all of the qr/qg ports have been wired so that the
initial GARP will be properly sent. Otherwise you get a routing
black hole. In lieu of a proper sync method, we used those two keepalived
options to send GARPs repeatedly:

a) We did not know it never stops spamming the network
b) It causes VMs to lose their IPv6 default gateway due to a keepalived
   bug, which has since been fixed, but it would need to be backported
   to every keepalived version on every distro. Here's the patch:
   https://github.com/acassen/keepalived/pull/200

The solution this patch proposes is to drop the repeat and refresh
keepalived options. This will fix the IPv6 bug but re-introduce bug
1453855. So, this patch uses the delay option instead. It turns
out keepalived sends a GARP when it transitions to MASTER, and then
it waits a number of seconds determined by the delay option, and
sends a GARP again. We'll use an aggressive 'delay' setting to make
sure that when the node boots and the L3/L2 agents start, we'll
give the L2 agent enough time to wire the ports as a stopgap solution.
Note that this only affects initial synchronization time, not failover
times. Failover times will continue to be fast because the ports
are wired ahead of time, the initial GARP after the state transition
to MASTER will be sent properly.

Change-Id: I7a086472b8742828dae08ffd915c45e94fb4b94e
Closes-Bug: #1520517
Related-Bug: #1453855
2016-01-15 16:47:39 -05:00

434 lines
15 KiB
Python

# Copyright (C) 2014 eNovance SAS <licensing@enovance.com>
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import errno
import itertools
import os
import netaddr
from oslo_config import cfg
from oslo_log import log as logging
from neutron._i18n import _
from neutron.agent.linux import external_process
from neutron.common import exceptions
from neutron.common import utils as common_utils
VALID_STATES = ['MASTER', 'BACKUP']
VALID_AUTH_TYPES = ['AH', 'PASS']
HA_DEFAULT_PRIORITY = 50
PRIMARY_VIP_RANGE_SIZE = 24
# TODO(amuller): Use L3 agent constant when new constants module is introduced.
FIP_LL_SUBNET = '169.254.30.0/23'
KEEPALIVED_SERVICE_NAME = 'keepalived'
GARP_MASTER_DELAY = 60
LOG = logging.getLogger(__name__)
def get_free_range(parent_range, excluded_ranges, size=PRIMARY_VIP_RANGE_SIZE):
"""Get a free IP range, from parent_range, of the specified size.
:param parent_range: String representing an IP range. E.g: '169.254.0.0/16'
:param excluded_ranges: A list of strings to be excluded from parent_range
:param size: What should be the size of the range returned?
:return: A string representing an IP range
"""
free_cidrs = netaddr.IPSet([parent_range]) - netaddr.IPSet(excluded_ranges)
for cidr in free_cidrs.iter_cidrs():
if cidr.prefixlen <= size:
return '%s/%s' % (cidr.network, size)
raise ValueError(_('Network of size %(size)s, from IP range '
'%(parent_range)s excluding IP ranges '
'%(excluded_ranges)s was not found.') %
{'size': size,
'parent_range': parent_range,
'excluded_ranges': excluded_ranges})
class InvalidInstanceStateException(exceptions.NeutronException):
message = _('Invalid instance state: %(state)s, valid states are: '
'%(valid_states)s')
def __init__(self, **kwargs):
if 'valid_states' not in kwargs:
kwargs['valid_states'] = ', '.join(VALID_STATES)
super(InvalidInstanceStateException, self).__init__(**kwargs)
class InvalidAuthenticationTypeException(exceptions.NeutronException):
message = _('Invalid authentication type: %(auth_type)s, '
'valid types are: %(valid_auth_types)s')
def __init__(self, **kwargs):
if 'valid_auth_types' not in kwargs:
kwargs['valid_auth_types'] = ', '.join(VALID_AUTH_TYPES)
super(InvalidAuthenticationTypeException, self).__init__(**kwargs)
class KeepalivedVipAddress(object):
"""A virtual address entry of a keepalived configuration."""
def __init__(self, ip_address, interface_name, scope=None):
self.ip_address = ip_address
self.interface_name = interface_name
self.scope = scope
def __eq__(self, other):
return (isinstance(other, KeepalivedVipAddress) and
self.ip_address == other.ip_address)
def __str__(self):
return '[%s, %s, %s]' % (self.ip_address,
self.interface_name,
self.scope)
def build_config(self):
result = '%s dev %s' % (self.ip_address, self.interface_name)
if self.scope:
result += ' scope %s' % self.scope
return result
class KeepalivedVirtualRoute(object):
"""A virtual route entry of a keepalived configuration."""
def __init__(self, destination, nexthop, interface_name=None,
scope=None):
self.destination = destination
self.nexthop = nexthop
self.interface_name = interface_name
self.scope = scope
def build_config(self):
output = self.destination
if self.nexthop:
output += ' via %s' % self.nexthop
if self.interface_name:
output += ' dev %s' % self.interface_name
if self.scope:
output += ' scope %s' % self.scope
return output
class KeepalivedInstanceRoutes(object):
def __init__(self):
self.gateway_routes = []
self.extra_routes = []
self.extra_subnets = []
def remove_routes_on_interface(self, interface_name):
self.gateway_routes = [gw_rt for gw_rt in self.gateway_routes
if gw_rt.interface_name != interface_name]
# NOTE(amuller): extra_routes are initialized from the router's
# 'routes' attribute. These routes do not have an interface
# parameter and so cannot be removed via an interface_name lookup.
self.extra_subnets = [route for route in self.extra_subnets if
route.interface_name != interface_name]
@property
def routes(self):
return self.gateway_routes + self.extra_routes + self.extra_subnets
def __len__(self):
return len(self.routes)
def build_config(self):
return itertools.chain([' virtual_routes {'],
(' %s' % route.build_config()
for route in self.routes),
[' }'])
class KeepalivedInstance(object):
"""Instance section of a keepalived configuration."""
def __init__(self, state, interface, vrouter_id, ha_cidrs,
priority=HA_DEFAULT_PRIORITY, advert_int=None,
mcast_src_ip=None, nopreempt=False,
garp_master_delay=GARP_MASTER_DELAY):
self.name = 'VR_%s' % vrouter_id
if state not in VALID_STATES:
raise InvalidInstanceStateException(state=state)
self.state = state
self.interface = interface
self.vrouter_id = vrouter_id
self.priority = priority
self.nopreempt = nopreempt
self.advert_int = advert_int
self.mcast_src_ip = mcast_src_ip
self.garp_master_delay = garp_master_delay
self.track_interfaces = []
self.vips = []
self.virtual_routes = KeepalivedInstanceRoutes()
self.authentication = None
metadata_cidr = '169.254.169.254/32'
self.primary_vip_range = get_free_range(
parent_range='169.254.0.0/16',
excluded_ranges=[metadata_cidr, FIP_LL_SUBNET] + ha_cidrs,
size=PRIMARY_VIP_RANGE_SIZE)
def set_authentication(self, auth_type, password):
if auth_type not in VALID_AUTH_TYPES:
raise InvalidAuthenticationTypeException(auth_type=auth_type)
self.authentication = (auth_type, password)
def add_vip(self, ip_cidr, interface_name, scope):
vip = KeepalivedVipAddress(ip_cidr, interface_name, scope)
if vip not in self.vips:
self.vips.append(vip)
else:
LOG.debug('VIP %s already present in %s', vip, self.vips)
def remove_vips_vroutes_by_interface(self, interface_name):
self.vips = [vip for vip in self.vips
if vip.interface_name != interface_name]
self.virtual_routes.remove_routes_on_interface(interface_name)
def remove_vip_by_ip_address(self, ip_address):
self.vips = [vip for vip in self.vips
if vip.ip_address != ip_address]
def get_existing_vip_ip_addresses(self, interface_name):
return [vip.ip_address for vip in self.vips
if vip.interface_name == interface_name]
def _build_track_interface_config(self):
return itertools.chain(
[' track_interface {'],
(' %s' % i for i in self.track_interfaces),
[' }'])
def get_primary_vip(self):
"""Return an address in the primary_vip_range CIDR, with the router's
VRID in the host section.
For example, if primary_vip_range is 169.254.0.0/24, and this router's
VRID is 5, the result is 169.254.0.5. Using the VRID assures that
the primary VIP is consistent amongst HA router instances on different
nodes.
"""
ip = (netaddr.IPNetwork(self.primary_vip_range).network +
self.vrouter_id)
return str(netaddr.IPNetwork('%s/%s' % (ip, PRIMARY_VIP_RANGE_SIZE)))
def _build_vips_config(self):
# NOTE(amuller): The primary VIP must be consistent in order to avoid
# keepalived bugs. Changing the VIP in the 'virtual_ipaddress' and
# SIGHUP'ing keepalived can remove virtual routers, including the
# router's default gateway.
# We solve this by never changing the VIP in the virtual_ipaddress
# section, herein known as the primary VIP.
# The only interface known to exist for HA routers is the HA interface
# (self.interface). We generate an IP on that device and use it as the
# primary VIP. The other VIPs (Internal interfaces IPs, the external
# interface IP and floating IPs) are placed in the
# virtual_ipaddress_excluded section.
primary = KeepalivedVipAddress(self.get_primary_vip(), self.interface)
vips_result = [' virtual_ipaddress {',
' %s' % primary.build_config(),
' }']
if self.vips:
vips_result.extend(
itertools.chain([' virtual_ipaddress_excluded {'],
(' %s' % vip.build_config()
for vip in
sorted(self.vips,
key=lambda vip: vip.ip_address)),
[' }']))
return vips_result
def _build_virtual_routes_config(self):
return itertools.chain([' virtual_routes {'],
(' %s' % route.build_config()
for route in self.virtual_routes),
[' }'])
def build_config(self):
config = ['vrrp_instance %s {' % self.name,
' state %s' % self.state,
' interface %s' % self.interface,
' virtual_router_id %s' % self.vrouter_id,
' priority %s' % self.priority,
' garp_master_delay %s' % self.garp_master_delay]
if self.nopreempt:
config.append(' nopreempt')
if self.advert_int:
config.append(' advert_int %s' % self.advert_int)
if self.authentication:
auth_type, password = self.authentication
authentication = [' authentication {',
' auth_type %s' % auth_type,
' auth_pass %s' % password,
' }']
config.extend(authentication)
if self.mcast_src_ip:
config.append(' mcast_src_ip %s' % self.mcast_src_ip)
if self.track_interfaces:
config.extend(self._build_track_interface_config())
config.extend(self._build_vips_config())
if len(self.virtual_routes):
config.extend(self.virtual_routes.build_config())
config.append('}')
return config
class KeepalivedConf(object):
"""A keepalived configuration."""
def __init__(self):
self.reset()
def reset(self):
self.instances = {}
def add_instance(self, instance):
self.instances[instance.vrouter_id] = instance
def get_instance(self, vrouter_id):
return self.instances.get(vrouter_id)
def build_config(self):
config = []
for instance in self.instances.values():
config.extend(instance.build_config())
return config
def get_config_str(self):
"""Generates and returns the keepalived configuration.
:return: Keepalived configuration string.
"""
return '\n'.join(self.build_config())
class KeepalivedManager(object):
"""Wrapper for keepalived.
This wrapper permits to write keepalived config files, to start/restart
keepalived process.
"""
def __init__(self, resource_id, config, process_monitor, conf_path='/tmp',
namespace=None):
self.resource_id = resource_id
self.config = config
self.namespace = namespace
self.process_monitor = process_monitor
self.conf_path = conf_path
def get_conf_dir(self):
confs_dir = os.path.abspath(os.path.normpath(self.conf_path))
conf_dir = os.path.join(confs_dir, self.resource_id)
return conf_dir
def get_full_config_file_path(self, filename, ensure_conf_dir=True):
conf_dir = self.get_conf_dir()
if ensure_conf_dir:
common_utils.ensure_dir(conf_dir)
return os.path.join(conf_dir, filename)
def _output_config_file(self):
config_str = self.config.get_config_str()
config_path = self.get_full_config_file_path('keepalived.conf')
common_utils.replace_file(config_path, config_str)
return config_path
def get_conf_on_disk(self):
config_path = self.get_full_config_file_path('keepalived.conf')
try:
with open(config_path) as conf:
return conf.read()
except (OSError, IOError) as e:
if e.errno != errno.ENOENT:
raise
def spawn(self):
config_path = self._output_config_file()
keepalived_pm = self.get_process()
vrrp_pm = self._get_vrrp_process(
'%s-vrrp' % keepalived_pm.get_pid_file_name())
keepalived_pm.default_cmd_callback = (
self._get_keepalived_process_callback(vrrp_pm, config_path))
keepalived_pm.enable(reload_cfg=True)
self.process_monitor.register(uuid=self.resource_id,
service_name=KEEPALIVED_SERVICE_NAME,
monitored_process=keepalived_pm)
LOG.debug('Keepalived spawned with config %s', config_path)
def disable(self):
self.process_monitor.unregister(uuid=self.resource_id,
service_name=KEEPALIVED_SERVICE_NAME)
pm = self.get_process()
pm.disable(sig='15')
def get_process(self):
return external_process.ProcessManager(
cfg.CONF,
self.resource_id,
self.namespace,
pids_path=self.conf_path)
def _get_vrrp_process(self, pid_file):
return external_process.ProcessManager(
cfg.CONF,
self.resource_id,
self.namespace,
pid_file=pid_file)
def _get_keepalived_process_callback(self, vrrp_pm, config_path):
def callback(pid_file):
# If keepalived process crashed unexpectedly, the vrrp process
# will be orphan and prevent keepalived process to be spawned.
# A check here will let the l3-agent to kill the orphan process
# and spawn keepalived successfully.
if vrrp_pm.active:
vrrp_pm.disable()
cmd = ['keepalived', '-P',
'-f', config_path,
'-p', pid_file,
'-r', '%s-vrrp' % pid_file]
return cmd
return callback