charm-neutron-openvswitch/hooks/neutron_ovs_hooks.py
Marcin Wilk d6d6570973 iptables rules for not tracking GRE/VXLAN traffic
On some busy cloud deployments, it has been reported that the nodes
hosting the neutron-openvswitch are getting their nf-conntrack tables
full and starting to drop connections. The reason is that GRE/VXLAN
use source port randomization, and the number of "unique" flows is high
from the nf-conntrack's perspective. This is no surprise since flows
are usually identified with their 5-tuple (srcip/[srcport]/dstip/
dstport/tproto) by network elements, and GRE/VXLAN are leveraging this
fact to have an even distribution in load-balancing systems in between
[1][2]. The randomization causes the nf_conntrack table to be filled
with many GRE/VXLAN-related flows, eventually leading to connection
drops in a busy environment. As there is no particular reason and
benefit to track these flows at the moment, the solution is to exclude
GRE/VXLAN traffic from nf-conntrack tracking. This can be done by
putting rules with `-j NOTRACK` jump into relevant iptables chains,
which many people already use as a solution to this problem.

This change incorporates the relevant rules to the charm code, so the
rules become present by default.

[1] https://www.rfc-editor.org/rfc/rfc8086.html#section-3.2
[2] https://www.rfc-editor.org/rfc/rfc7348.html#section-5

Closes-bug: #1978806
Change-Id: I9f6c7ca5207a3d587cc9cc2995d9938921ad88f1
Signed-off-by: Marcin Wilk <marcin.wilk@canonical.com>
2024-10-08 14:50:08 +02:00

363 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Copyright 2016 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import uuid
import shutil
from copy import deepcopy
from charmhelpers.contrib.openstack import context as os_context
from charmhelpers.contrib.openstack.utils import (
os_restart_on_change as restart_on_change,
series_upgrade_prepare,
series_upgrade_complete,
is_hook_allowed,
CompareOpenStackReleases,
os_release,
)
from charmhelpers.contrib.openstack.deferred_events import (
configure_deferred_restarts,
)
from charmhelpers.core.hookenv import (
Hooks,
UnregisteredHookError,
config,
log,
relation_set,
relation_ids,
charm_dir,
)
from charmhelpers.core.sysctl import create as create_sysctl
from charmhelpers.core.host import (
is_container,
)
from charmhelpers.core.unitdata import kv
from charmhelpers.fetch import apt_install
from charmhelpers.contrib.charmsupport import nrpe
from neutron_ovs_utils import (
DHCP_PACKAGES,
DVR_PACKAGES,
L3HA_PACKAGES,
METADATA_PACKAGES,
OVS_DEFAULT,
USE_FQDN_KEY,
configure_ovs,
get_shared_secret,
register_configs,
restart_map,
services,
use_dvr,
use_l3ha,
enable_nova_metadata,
enable_local_dhcp,
install_packages,
install_l3ha_packages,
purge_packages,
assess_status,
install_tmpfilesd,
pause_unit_helper,
resume_unit_helper,
determine_purge_packages,
purge_sriov_systemd_files,
use_fqdn_hint,
deferrable_services,
configure_iptables_rules,
)
hooks = Hooks()
CONFIGS = register_configs()
@hooks.hook('install.real')
def install():
install_packages()
# Start migration to agent registration with FQDNs for newly installed
# units with OpenStack release Stein or newer.
release = os_release('neutron-common')
if CompareOpenStackReleases(release) >= 'stein':
db = kv()
db.set(USE_FQDN_KEY, True)
db.flush()
# NOTE(wolsen): Do NOT add restart_on_change decorator without consideration
# for the implications of modifications to the /etc/default/openvswitch-switch.
@hooks.hook('upgrade-charm')
def upgrade_charm():
# Tidy up any prior installation of obsolete sriov startup
# scripts
purge_sriov_systemd_files()
if OVS_DEFAULT in restart_map():
# In the 16.10 release of the charms, the code changed from managing
# the /etc/default/openvswitch-switch file only when dpdk was enabled
# to always managing this file. Thus, an upgrade of the charm from a
# release prior to 16.10 or higher will always cause the contents of
# the file to change and will trigger a restart of the
# openvswitch-switch service, which in turn causes a temporary
# network outage. To prevent this outage, determine if the
# /etc/default/openvswitch-switch file needs to be migrated and if
# so, migrate the file but do NOT restart the openvswitch-switch
# service.
# See bug LP #1712444
with open(OVS_DEFAULT, 'r') as f:
# The 'Service restart triggered ...' line was added to the
# OVS_DEFAULT template in the 16.10 version of the charm to allow
# restarts so we use this as the key to see if the file needs
# migrating.
if 'Service restart triggered' not in f.read():
CONFIGS.write(OVS_DEFAULT)
@hooks.hook('neutron-plugin-relation-changed')
@hooks.hook('config-changed')
# NOTE(fnordahl): we need to act immediately to changes to OVS_DEFAULT in-line
# so ignore it here to avoid restarting the services twice. LP: #1906280
@restart_on_change({cfg: services
for cfg, services in restart_map().items()
if cfg != OVS_DEFAULT})
def config_changed(check_deferred_restarts=True):
configure_deferred_restarts(deferrable_services())
# policy_rcd.remove_policy_file()
# if we are paused, delay doing any config changed hooks.
# It is forced on the resume.
allowed, reason = is_hook_allowed(
'config-changed',
check_deferred_restarts=check_deferred_restarts)
if not allowed:
log(reason, "WARN")
return
install_packages()
install_tmpfilesd()
# NOTE(jamespage): purge any packages as a result of py3 switch
# at rocky.
packages_to_purge = determine_purge_packages()
request_nova_compute_restart = False
if packages_to_purge:
purge_packages(packages_to_purge)
request_nova_compute_restart = True
sysctl_settings = config('sysctl')
if not is_container() and sysctl_settings:
create_sysctl(sysctl_settings,
'/etc/sysctl.d/50-openvswitch.conf')
# NOTE(fnordahl): It is important to write config to disk and perhaps
# restart the openvswitch-swith service prior to attempting to do run-time
# configuration of OVS as we may have to pass options to `ovs-ctl` for
# `ovs-vswitchd` to run at all. LP: #1906280
# TODO: make restart_on_change use contextlib.contextmanager
@restart_on_change({cfg: services
for cfg, services in restart_map().items()
if cfg == OVS_DEFAULT})
def _restart_before_runtime_config_when_required():
CONFIGS.write_all()
_restart_before_runtime_config_when_required()
configure_ovs()
for rid in relation_ids('neutron-plugin'):
neutron_plugin_joined(
relation_id=rid,
request_restart=request_nova_compute_restart)
update_nrpe_config()
@hooks.hook('neutron-plugin-api-relation-changed')
# NOTE(fnordahl): we need to act immediately to changes to OVS_DEFAULT in-line
# so ignore it here to avoid restarting the services twice. LP: #1906280
@restart_on_change({cfg: services
for cfg, services in restart_map().items()
if cfg != OVS_DEFAULT})
def neutron_plugin_api_changed():
packages_to_purge = []
if use_dvr():
install_packages()
# per 17.08 release notes L3HA + DVR is a Newton+ feature
_os_release = os_release('neutron-common', base='icehouse')
if (use_l3ha() and
CompareOpenStackReleases(_os_release) >= 'newton'):
install_l3ha_packages()
# NOTE(hopem): don't uninstall keepalived if not using l3ha since that
# results in neutron-l3-agent also being uninstalled (see LP 1819499).
else:
packages_to_purge = deepcopy(DVR_PACKAGES)
packages_to_purge.extend(L3HA_PACKAGES)
if packages_to_purge:
purge_packages(packages_to_purge)
# NOTE(fnordahl): It is important to write config to disk and perhaps
# restart the openvswitch-swith service prior to attempting to do run-time
# configuration of OVS as we may have to pass options to `ovs-ctl` for
# `ovs-vswitchd` to run at all. LP: #1906280
# TODO: make restart_on_change use contextlib.contextmanager
@restart_on_change({cfg: service
for cfg, service in restart_map().items()
if cfg == OVS_DEFAULT})
def _restart_before_runtime_config_when_required():
CONFIGS.write_all()
_restart_before_runtime_config_when_required()
configure_ovs()
# If dvr setting has changed, need to pass that on
for rid in relation_ids('neutron-plugin'):
neutron_plugin_joined(relation_id=rid)
@hooks.hook('neutron-plugin-relation-joined')
def neutron_plugin_joined(relation_id=None, request_restart=False):
secret = None
if not is_container():
if enable_local_dhcp():
install_packages()
else:
pkgs = deepcopy(DHCP_PACKAGES)
# NOTE: only purge metadata packages if dvr is not
# in use as this will remove the l3 agent
# see https://pad.lv/1515008
if not use_dvr():
# NOTE(fnordahl) do not remove ``haproxy``, the principal
# charm may have use for it. LP: #1832739
pkgs.extend(set(METADATA_PACKAGES)-set(['haproxy']))
purge_packages(pkgs)
secret = get_shared_secret() if enable_nova_metadata() else None
rel_data = {
'metadata-shared-secret': secret,
}
host_info = os_context.HostInfoContext()()
if use_fqdn_hint() and host_info.get('host_fqdn'):
rel_data.update({'host': host_info['host_fqdn']})
if request_restart:
rel_data['restart-nonce'] = str(uuid.uuid4())
relation_set(relation_id=relation_id, **rel_data)
@hooks.hook('amqp-relation-joined')
def amqp_joined(relation_id=None):
relation_set(relation_id=relation_id,
username=config('rabbit-user'),
vhost=config('rabbit-vhost'))
@hooks.hook('amqp-relation-changed')
@hooks.hook('amqp-relation-departed')
@restart_on_change(restart_map())
def amqp_changed():
if 'amqp' not in CONFIGS.complete_contexts():
log('amqp relation incomplete. Peer not ready?')
return
CONFIGS.write_all()
@hooks.hook('neutron-control-relation-changed')
@restart_on_change(restart_map(),
stopstart=True)
def restart_check():
CONFIGS.write_all()
@hooks.hook('pre-series-upgrade')
def pre_series_upgrade():
log("Running prepare series upgrade hook", "INFO")
series_upgrade_prepare(
pause_unit_helper, CONFIGS)
@hooks.hook('post-series-upgrade')
def post_series_upgrade():
log("Running complete series upgrade hook", "INFO")
series_upgrade_complete(
resume_unit_helper, CONFIGS)
def install_nrpe_cron():
src = os.path.join(charm_dir(), "files", "ovs_vsctl", "cron_ovs_vsctl.sh")
dst = shutil.copy(src, "/usr/local/lib/nagios/plugins/")
os.chmod(dst, 0o100755)
os.chown(dst, uid=0, gid=0)
cronjob_line = "3 * * * * root {cmd}\n".format(cmd=dst)
crond_file = "/etc/cron.d/neutron_openvswitch_ovs_vsctl"
with open(crond_file, "w") as crond_fd:
crond_fd.write(cronjob_line)
return dst
def install_nrpe_plugin():
src = os.path.join(charm_dir(), "files", "ovs_vsctl", "check_ovs_vsctl.py")
dst = shutil.copy(src, "/usr/local/lib/nagios/plugins")
os.chmod(dst, 0o100755)
os.chown(dst, uid=0, gid=0)
return dst
@hooks.hook('nrpe-external-master-relation-joined',
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# python-dbus is used by check_upstart_job
apt_install('python-dbus')
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe.copy_nrpe_checks()
nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
install_nrpe_cron()
cmd = install_nrpe_plugin()
nrpe_setup.add_check(
shortname="ovs_vsctl",
description="Check ovs-vsctl list-br for predictable operation.",
check_cmd=cmd
)
nrpe_setup.write()
@hooks.hook('start')
def start_charm():
# add conntrack related iptables rules upon restart
log('Configuring iptables NOTRACK rules for GRE/VXLAN tunnels')
configure_iptables_rules()
@hooks.hook('update-status')
def dummy_update_status():
"""Dummy function to silence missing hook log entry"""
pass
def main():
try:
hooks.execute(sys.argv)
except UnregisteredHookError as e:
log('Unknown hook {} - skipping.'.format(e))
assess_status(CONFIGS)
if __name__ == '__main__':
main()