From eb4e3e3bc3f1c3833c22c82a2c45e4402bdcce91 Mon Sep 17 00:00:00 2001 From: Billy Olsen Date: Wed, 24 Mar 2021 12:52:43 -0700 Subject: [PATCH] Disable vrrp healthchecks by default VRRP healthchecks were enabled by default starting in the 19.07 charm release for network deployments which utilize l3ha or dvr+snat. The VRRP healthchecks have specific expectations that may not be satisfied in various data centers. This leads to problems with networks as failed healthchecks lead to router failovers. This change alters the default config option to disable the vrrp healthchecks by default and require users to opt in to using them. The description around the option has been updated to indicate that doing so may lead to routers failing over if ICMP pings are missed. Change-Id: Ie281a311a95ba394d72c2dfeeb0a1a0a12847e77 Closes-Bug: #192101 --- config.yaml | 17 +++++++++++------ unit_tests/test_neutron_ovs_context.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/config.yaml b/config.yaml index d9ef9cce..13825557 100644 --- a/config.yaml +++ b/config.yaml @@ -404,13 +404,18 @@ options: access. The charm will go into a blocked state if this is attempted. keepalived-healthcheck-interval: type: int - default: 30 + default: 0 description: | - By default all HA routers will check their external network gateway - by sending a ping and if that fails they trigger a vrrp transition. This - option defines how frequently this check is performed. Setting this value - to 0 will disable the healthchecks. Note that this only applies when - using l3ha and dvr_snat. + Specifies the frequency (in seconds) at which HA routers will check + their external network gateway by performing an ICMP ping between the + virtual routers. When the ping check fails, this will trigger the HA + routers to fail over to another node. A value of 0 will disable this + check. This setting only applies when using l3ha and dvr_snat. + . 
+ WARNING: Enabling the health checks should be done with caution as it + may lead to rapid failovers of HA routers. ICMP pings are low priority + and may be dropped or take longer than the 1 second afforded by neutron, + which leads to routers failing over to other nodes. of-inactivity-probe: type: int default: 10 diff --git a/unit_tests/test_neutron_ovs_context.py b/unit_tests/test_neutron_ovs_context.py index 942d6cc3..15cba3cf 100644 --- a/unit_tests/test_neutron_ovs_context.py +++ b/unit_tests/test_neutron_ovs_context.py @@ -278,7 +278,7 @@ class OVSPluginContextTest(CharmTestCase): 'nsg_log_output_base': None, 'nsg_log_rate_limit': None, 'nsg_log_burst_limit': 25, - 'keepalived_healthcheck_interval': 30, + 'keepalived_healthcheck_interval': 0, 'of_inactivity_probe': 10, 'disable_mlockall': False, }