Add L3 HA fullstack failover tests

* Generate unique internal and external router device names
  using the agent's hostname. This is to allow multiple HA
  router replicas to co-exist on the same machine, otherwise
  they'd all use the same device names and OVS would freak out.
* Add host.disconnect method that disconnects the host from
  the central internal and external bridges, simulating pulling
  the cable from the host's NIC.
* Add host.kill and host.shutdown methods to forcefully kill and
  gracefully shut down a host, simulating a host failure and a host
  shutdown. This also includes code to clean up left-over router
  namespaces.
* Add an L3 HA failover test for graceful failover, host failure and
  network disconnect.
* Improve systemd service restart handling in Process fixture to use
  `systemctl restart` and to not block the rootwrap daemon.

Co-Authored-By: Assaf Muller <amuller@redhat.com>

Change-Id: Iaaa1c2cab0341a929e368392aa7dc47c9b2399c2
Original-Change-Id: I250fa41d89dfc4f9f3ba4c03a027b52b2e8c4b4b
This commit is contained in:
Gaudenz Steinlin
2024-03-21 09:02:51 +00:00
parent 1431a08440
commit f9a54350e6
6 changed files with 282 additions and 46 deletions

View File

@@ -275,6 +275,8 @@ def create_patch_ports(source, destination):
source.add_patch_port(source_name, destination_name)
destination.add_patch_port(destination_name, source_name)
return source_name, destination_name
def create_vlan_interface(
namespace, port_name, mac_address, ip_address, vlan_tag):
@@ -406,7 +408,7 @@ class Pinger(object):
"""
stats_pattern = re.compile(
r'^(?P<trans>\d+) packets transmitted,.*(?P<recv>\d+) received.*$')
r'^(?P<trans>\d+) packets transmitted, +(?P<recv>\d+) received.*$')
unreachable_pattern = re.compile(
r'.* Destination .* Unreachable')
TIMEOUT = 15
@@ -430,7 +432,9 @@ class Pinger(object):
"Ping command hasn't ended after %d seconds." % self.TIMEOUT))
def _parse_stats(self):
output = ''
for line in self.proc.stdout:
output += line
if (not self.destination_unreachable and
self.unreachable_pattern.match(line)):
self.destination_unreachable = True
@@ -441,7 +445,9 @@ class Pinger(object):
self.received = int(result.group('recv'))
break
else:
LOG.error(f"Didn't find ping statistics:\n{output}")
raise RuntimeError("Didn't find ping statistics.")
LOG.debug(f"ping command output:\n{output}")
def start(self):
if self.proc and self.proc.is_running:

View File

@@ -120,7 +120,7 @@ class BaseFullStackTestCase(testlib_api.MySQLTestCaseMixin,
common_utils.wait_until_true(_agent_down)
def _assert_ping_during_agents_restart(
self, agents, src_namespace, ips, restart_timeout=10,
self, agents, src_namespace, ips, restart_timeout=30,
ping_timeout=1, count=10):
with net_helpers.async_ping(
src_namespace, ips, timeout=ping_timeout,
@@ -167,11 +167,12 @@ class BaseFullStackTestCase(testlib_api.MySQLTestCaseMixin,
available_ips = itertools.islice(valid_ips, initial, initial + num)
return [str(available_ip) for available_ip in available_ips]
def _create_external_vm(self, network, subnet):
def _create_external_vm(self, network, subnet, ip=None):
ip = ip or subnet['gateway_ip']
vm = self.useFixture(
machine_fixtures.FakeMachine(
self.environment.central_bridge,
common_utils.ip_to_cidr(subnet['gateway_ip'], 24)))
common_utils.ip_to_cidr(ip, 24)))
# NOTE(slaweq): as ext_net is 'vlan' network type external_vm needs to
# send packets with proper vlan also
vm.bridge.set_db_attribute(

View File

@@ -444,3 +444,10 @@ class ClientFixture(fixtures.Fixture):
def update_quota(self, project_id, tracked_resource, quota):
    """Update the quota of ``tracked_resource`` for ``project_id``.

    The request body maps the tracked resource name to the new quota
    value and is handed to ``_update_resource`` for the 'quota' resource.
    """
    quota_body = {tracked_resource: quota}
    self._update_resource('quota', project_id, quota_body)
def add_gateway_router(self, router_id, network_id):
    """Attach ``router_id`` to ``network_id`` as its gateway network.

    A cleanup is registered that removes the gateway from the router
    again.
    """
    gateway_body = {'network_id': network_id}
    self.client.add_gateway_router(router_id, gateway_body)
    remove_gateway = self.client.remove_gateway_router
    self.addCleanup(remove_gateway, router_id)

View File

@@ -12,10 +12,13 @@
# License for the specific language governing permissions and limitations
# under the License.
import signal
import fixtures
from neutron_lib import constants
from neutronclient.common import exceptions as nc_exc
from oslo_config import cfg
from oslo_log import log as logging
from neutron.agent.linux import ip_lib
from neutron.common import utils as common_utils
@@ -28,6 +31,8 @@ from neutron.tests.common import net_helpers
from neutron.tests.fullstack.resources import config
from neutron.tests.fullstack.resources import process
LOG = logging.getLogger(__name__)
class EnvironmentDescription(object):
"""A set of characteristics of an environment setup.
@@ -110,9 +115,8 @@ class Host(fixtures.Fixture):
IP address on the appropriate physical NIC. The Host class does the same
with the connect_* methods.
TODO(amuller): Add start/stop/restart methods that will start/stop/restart
all of the agents on this host. Add a kill method that stops all agents
and disconnects the host from other hosts.
TODO(amuller): Add restart method that will restart all of the agents on
this host.
"""
def __init__(self, env_desc, host_desc, test_name,
@@ -263,12 +267,14 @@ class Host(fixtures.Fixture):
veth_1.link.set_up()
veth_2.link.set_up()
self.tunnel_device = veth_1
def connect_to_central_network_via_vlans(self, host_data_bridge):
# If using VLANs as a segmentation device, it's needed to connect
# a provider bridge to a centralized, shared bridge.
net_helpers.create_patch_ports(
source, destination = net_helpers.create_patch_ports(
self.central_bridge, host_data_bridge)
self.internal_port = destination
def allocate_local_ip(self):
if not self.env_desc.network_range:
@@ -296,6 +302,32 @@ class Host(fixtures.Fixture):
self.network_bridges[network_id] = bridge
return bridge
def disconnect(self):
    """Disconnect this host from the network shared with other hosts.

    Without tunneling, the port stored in ``internal_port`` is deleted
    from ``br_phys``. With tunneling, ``addr.flush(4)`` is called on the
    tunnel device (presumably flushing its IPv4 addresses -- confirm
    against ip_lib).
    """
    if not self.env_desc.tunneling_enabled:
        self.br_phys.delete_port(self.internal_port)
    else:
        self.tunnel_device.addr.flush(4)
    LOG.info(f'Host {self.hostname} disconnected.')
def kill(self, parent=None):
    """Forcefully stop every agent on this host, then shut the host down.

    :param parent: optional fixture that registered this host's cleanup;
        it is passed through to ``shutdown``.
    """
    # First kill all the agents to prevent a graceful shutdown
    for agent in self.agents.values():
        agent.stop(kill_signal=signal.SIGKILL)

    LOG.info(f'Agents on host {self.hostname} killed.')

    self.shutdown(parent)
def shutdown(self, parent=None):
    """Run this host's cleanup now and unregister it from ``parent``.

    :param parent: optional fixture whose cleanup list contains this
        host's ``cleanUp``; the entry is removed so that the cleanup is
        not invoked a second time.
    """
    self.cleanUp()

    if parent:
        # Remove cleanup function from parent because it can't be called
        # twice
        cleanup_entry = (self.cleanUp, (), {})
        parent._cleanups._cleanups.remove(cleanup_entry)

    LOG.info(f'Host {self.hostname} shut down.')
@property
def hostname(self):
    """Host name read from the DEFAULT section of the neutron config."""
    default_section = self.neutron_config.config.DEFAULT
    return default_section.host
@@ -385,6 +417,9 @@ class Environment(fixtures.Fixture):
except nc_exc.NeutronClientException:
return False
def get_host_by_name(self, hostname):
    """Return the host of this environment whose hostname matches.

    :param hostname: host name to look for.
    :returns: the first entry of ``self.hosts`` with that hostname.
    :raises LookupError: if no host has the given hostname. A bare
        ``next()`` would leak ``StopIteration`` here, which is confusing
        for callers and is swallowed or turned into ``RuntimeError`` when
        raised inside a generator.
    """
    for host in self.hosts:
        if host.hostname == hostname:
            return host
    raise LookupError(f'No host named {hostname!r} in this environment.')
def _create_host(self, host_desc):
temp_dir = self.useFixture(fixtures.TempDir()).path
neutron_config = config.NeutronConfigFixture(

View File

@@ -81,7 +81,10 @@ class ProcessFixture(fixtures.Fixture):
systemd_run = [
'systemd-run',
'--service-type', 'exec',
'--property', 'TimeoutStopSec=30s',
# Timeout and KILL processes 5s before the timeout the restart
# tests use.
'--property', 'TimeoutStopSec=25s',
'--property', 'KillMode=mixed',
'--unit', self.unit_name,
'--setenv', f'PATH={os.environ["PATH"]}',
'--same-dir',
@@ -103,6 +106,7 @@ class ProcessFixture(fixtures.Fixture):
# run unprivileged if run_as_root is False.
run_as_root=True,
)
common_utils.wait_until_true(self.service_is_active)
LOG.debug("Process started: %s", self.process_name)
def stop(self, kill_signal=None):
@@ -120,16 +124,26 @@ class ProcessFixture(fixtures.Fixture):
msg = (f'Process killed with signal {kill_signal}: '
f'{self.process_name}')
else:
stop_cmd = ['systemctl', 'stop', self.unit_name]
stop_cmd = ['systemctl', 'stop', '--no-block', self.unit_name]
msg = f'Process stopped: {self.process_name}'
utils.execute(stop_cmd, run_as_root=True)
common_utils.wait_until_true(self.process_is_not_running)
LOG.debug(msg)
def restart(self, executor=None):
def _restart():
self.stop()
self.start()
if self.process_is_running():
restart_cmd = [
'systemctl',
'restart',
'--no-block',
self.unit_name,
]
utils.execute(restart_cmd, run_as_root=True)
common_utils.wait_until_true(self.service_is_active)
else:
self.start()
LOG.debug("Restarting process: %s", self.process_name)
@@ -138,14 +152,21 @@ class ProcessFixture(fixtures.Fixture):
else:
return executor.submit(_restart)
def process_is_running(self):
@property
def service_state(self):
cmd = ['systemctl', 'is-active', self.unit_name]
return utils.execute(
cmd,
run_as_root=True,
log_fail_as_error=False,
check_exit_code=False,
) == 'active\n'
).strip()
def service_is_active(self):
    """Return True when ``service_state`` is exactly 'active'."""
    current_state = self.service_state
    return current_state == 'active'
def process_is_running(self):
    """Return True while the service is active or changing state.

    The states 'active', 'activating' and 'deactivating' all count as
    running.
    """
    running_states = ('active', 'activating', 'deactivating')
    return self.service_state in running_states
def process_is_not_running(self):
    """Return the negation of ``process_is_running()``."""
    running = self.process_is_running()
    return not running
@@ -347,7 +368,32 @@ class LinuxBridgeAgentFixture(ServiceFixture):
)
class L3AgentFixture(ServiceFixture):
class NamespaceCleanupFixture(ServiceFixture):
    """Service fixture that deletes its agent's left-over namespaces.

    Subclasses are expected to set ``namespace_pattern`` to a compiled
    regular expression matching the namespaces their agent creates. The
    cleanup is registered in ``_setUp``, i.e. possibly before a subclass
    has assigned the pattern.
    """

    def _setUp(self):
        super(NamespaceCleanupFixture, self)._setUp()
        self.addCleanup(self.clean_namespaces)

    def clean_namespaces(self):
        """Delete all network namespaces matching ``namespace_pattern``.

        In some HA tests agents are killed while they still serve
        resources. In such a case the namespaces are not deleted by the
        agent itself, so they are found and deleted here using the
        fixture's namespace pattern.

        Does nothing when ``namespace_pattern`` has not been set, for
        example when the cleanup runs after a failed setup.
        """
        # Look the pattern up once; a missing attribute must not raise
        # because this runs as a cleanup.
        pattern = getattr(self, 'namespace_pattern', None)
        if not pattern:
            return

        for namespace in ip_lib.list_network_namespaces():
            if not pattern.match(namespace):
                continue
            try:
                ip_lib.delete_network_namespace(namespace)
            except RuntimeError:
                # Continue cleaning even if a namespace deletion fails
                LOG.debug("Failed to delete namespace: %s", namespace)
class L3AgentFixture(NamespaceCleanupFixture):
def __init__(self, env_desc, host_desc, test_name,
neutron_cfg_fixture, l3_agent_cfg_fixture,
@@ -362,6 +408,8 @@ class L3AgentFixture(ServiceFixture):
self.hostname = self.neutron_cfg_fixture.config['DEFAULT']['host']
def _setUp(self):
super(L3AgentFixture, self)._setUp()
self.plugin_config = self.l3_agent_cfg_fixture.config
config_filenames = [self.neutron_cfg_fixture.filename,
@@ -386,12 +434,15 @@ class L3AgentFixture(ServiceFixture):
namespace=self.namespace
)
)
self.namespace_pattern = re.compile(
r"qrouter-[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}@%s" %
self.get_namespace_suffix())
def get_namespace_suffix(self):
    """Return ``test_namespace_suffix`` from the plugin config DEFAULTs."""
    default_section = self.plugin_config.DEFAULT
    return default_section.test_namespace_suffix
class DhcpAgentFixture(ServiceFixture):
class DhcpAgentFixture(NamespaceCleanupFixture):
def __init__(self, env_desc, host_desc, test_name,
neutron_cfg_fixture, agent_cfg_fixture, namespace=None):
@@ -404,6 +455,8 @@ class DhcpAgentFixture(ServiceFixture):
self.namespace = namespace
def _setUp(self):
super(DhcpAgentFixture, self)._setUp()
self.plugin_config = self.agent_cfg_fixture.config
config_filenames = [self.neutron_cfg_fixture.filename,
@@ -429,10 +482,9 @@ class DhcpAgentFixture(ServiceFixture):
namespace=self.namespace
)
)
self.dhcp_namespace_pattern = re.compile(
self.namespace_pattern = re.compile(
r"qdhcp-[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}%s" %
self.get_namespace_suffix())
self.addCleanup(self.clean_dhcp_namespaces)
def get_agent_hostname(self):
    """Return the 'host' option from the neutron config DEFAULT section."""
    default_section = self.neutron_cfg_fixture.config['DEFAULT']
    return default_section['host']
@@ -442,21 +494,4 @@ class DhcpAgentFixture(ServiceFixture):
def kill(self):
self.process_fixture.stop()
self.clean_dhcp_namespaces()
def clean_dhcp_namespaces(self):
"""Delete all DHCP namespaces created by DHCP agent.
In some tests for DHCP agent HA agents are killed when handling DHCP
service for network(s). In such case DHCP namespace is not deleted by
DHCP agent and such namespaces are found and deleted using agent's
namespace suffix.
"""
for namespace in ip_lib.list_network_namespaces():
if self.dhcp_namespace_pattern.match(namespace):
try:
ip_lib.delete_network_namespace(namespace)
except RuntimeError:
# Continue cleaning even if namespace deletions fails
pass
self.clean_namespaces()

View File

@@ -16,8 +16,11 @@ import functools
import os
import time
from datetime import datetime
from neutron_lib import constants
from neutronclient.common import exceptions
from oslo_log import log as logging
from oslo_utils import uuidutils
from neutron.agent.l3 import ha_router
@@ -27,6 +30,7 @@ from neutron.agent.linux import l3_tc_lib
from neutron.common import utils as common_utils
from neutron.tests import base as tests_base
from neutron.tests.common.exclusive_resources import ip_network
from neutron.tests.common import net_helpers
from neutron.tests.fullstack import base
from neutron.tests.fullstack.resources import environment
from neutron.tests.fullstack.resources import machine
@@ -34,6 +38,8 @@ from neutron.tests.unit import testlib_api
load_tests = testlib_api.module_load_tests
LOG = logging.getLogger(__name__)
class TestL3Agent(base.BaseFullStackTestCase):
@@ -175,10 +181,13 @@ class TestL3Agent(base.BaseFullStackTestCase):
return "%s@%s" % (namespace, suffix)
def _get_l3_agents_with_ha_state(
self, l3_agents, router_id, ha_state=None):
self, router_id, ha_state=None):
l3_agents = [host.agents['l3'] for host in self.environment.hosts
if 'l3' in host.agents]
found_agents = []
agents_hosting_router = self.client.list_l3_agent_hosting_routers(
router_id)['agents']
for agent in l3_agents:
agent_host = agent.neutron_cfg_fixture.get_host()
for agent_hosting_router in agents_hosting_router:
@@ -189,6 +198,13 @@ class TestL3Agent(base.BaseFullStackTestCase):
break
return found_agents
def _get_hosts_with_ha_state(
self, router_id, ha_state=None):
return [
self.environment.get_host_by_name(agent.hostname)
for agent in self._get_l3_agents_with_ha_state(router_id, ha_state)
]
def _router_fip_qos_after_admin_state_down_up(self, ha=False):
def get_router_gw_interface():
devices = ip.get_devices()
@@ -231,9 +247,7 @@ class TestL3Agent(base.BaseFullStackTestCase):
external_vm.block_until_ping(fip['floating_ip_address'])
if ha:
l3_agents = [host.agents['l3'] for host in self.environment.hosts]
router_agent = self._get_l3_agents_with_ha_state(
l3_agents, router['id'])[0]
router_agent = self._get_l3_agents_with_ha_state(router['id'])[0]
qrouter_ns = self._get_namespace(
router['id'],
router_agent)
@@ -369,14 +383,23 @@ class TestHAL3Agent(TestL3Agent):
use_dhcp = False
def setUp(self):
    """Set up a four-host environment: two L3 hosts and two plain hosts."""
    # Two hosts with L3 agent to host HA routers
    l3_hosts = []
    for _ in range(2):
        l3_hosts.append(
            environment.HostDescription(
                l3_agent=True,
                dhcp_agent=self.use_dhcp,
                l3_agent_extensions="fip_qos"))

    # Add two hosts for FakeFullstackMachines
    machine_hosts = [environment.HostDescription() for _ in range(2)]

    env_desc = environment.EnvironmentDescription(
        network_type='vlan',
        l2_pop=True,
        agent_down_time=30,
        qos=True)
    env = environment.Environment(env_desc, l3_hosts + machine_hosts)

    super(TestHAL3Agent, self).setUp(env)
@@ -387,9 +410,6 @@ class TestHAL3Agent(TestL3Agent):
agents['agents'][0]['ha_state'] != agents['agents'][1]['ha_state'])
def test_ha_router(self):
# TODO(amuller): Test external connectivity before and after a
# failover, see: https://review.opendev.org/#/c/196393/
tenant_id = uuidutils.generate_uuid()
router = self.safe_client.create_router(tenant_id, ha=True)
@@ -405,6 +425,139 @@ class TestHAL3Agent(TestL3Agent):
router['id']),
timeout=90)
def _test_ha_router_failover(self, method):
    """Check external connectivity of an HA router across a failover.

    An HA router with an internal subnet and an external gateway is
    created, one machine is booted on each side and the internal machine
    continuously pings the external one while the host with the active
    router instance is taken out of service.

    :param method: how the active host is taken out of service, one of
        'disconnect', 'kill' or 'shutdown'.
    :raises ValueError: if ``method`` is not one of the supported values.
    """
    # Fail fast instead of silently running the test without a failover.
    if method not in ('disconnect', 'kill', 'shutdown'):
        raise ValueError(f'Unsupported failover method: {method!r}')

    tenant_id = uuidutils.generate_uuid()

    # Create router
    router = self.safe_client.create_router(tenant_id, ha=True)
    router_id = router['id']
    agents = self.client.list_l3_agent_hosting_routers(router_id)
    self.assertEqual(2, len(agents['agents']),
                     'HA router must be scheduled to both nodes')

    # Create internal subnet
    network = self.safe_client.create_network(tenant_id)
    subnet = self.safe_client.create_subnet(
        tenant_id, network['id'], '20.0.0.0/24')
    self.safe_client.add_router_interface(router_id, subnet['id'])

    # Create external network
    external_network = self.safe_client.create_network(
        tenant_id, external=True)
    self.safe_client.create_subnet(
        tenant_id, external_network['id'], '42.0.0.0/24',
        enable_dhcp=False)
    self.safe_client.add_gateway_router(
        router_id,
        external_network['id'])

    # Create internal VM
    vm = self.useFixture(
        machine.FakeFullstackMachine(
            self.environment.hosts[2],
            network['id'],
            tenant_id,
            self.safe_client))
    vm.block_until_boot()

    # Create external VM
    external = self.useFixture(
        machine.FakeFullstackMachine(
            self.environment.hosts[3],
            external_network['id'],
            tenant_id,
            self.safe_client))
    external.block_until_boot()

    common_utils.wait_until_true(
        functools.partial(
            self._is_ha_router_active_on_one_agent,
            router_id),
        timeout=90)

    # Test external connectivity, failover, test again
    pinger = net_helpers.Pinger(vm.namespace, external.ip, interval=0.1)
    pinger.start()
    try:
        # Ensure connectivity before disconnect
        vm.block_until_ping(external.ip)

        get_active_hosts = functools.partial(
            self._get_hosts_with_ha_state,
            router_id,
            'active',
        )

        active_hosts = get_active_hosts()

        # Only one host should be active
        self.assertEqual(1, len(active_hosts),
                         'Exactly one host must have the HA router active')

        active_host = active_hosts[0]
        # Only a host running an L3 agent can take over the router; do
        # not rely on the L3 hosts being listed first.
        backup_host = next(
            h for h in self.environment.hosts
            if h != active_host and 'l3' in h.agents)

        start = datetime.now()

        if method == 'disconnect':
            active_host.disconnect()
        elif method == 'kill':
            active_host.kill(parent=self.environment)
        elif method == 'shutdown':
            active_host.shutdown(parent=self.environment)

        if method != 'shutdown':
            # Ensure connectivity is shortly lost if the failover is not
            # graceful
            vm.assert_no_ping(external.ip)
            LOG.debug(f'Connectivity lost after {datetime.now() - start}')

        # Ensure connectivity is restored
        vm.block_until_ping(external.ip)
        LOG.debug(f'Connectivity restored after {datetime.now() - start}')

        # Assert the backup host got active
        timeout = self.environment.env_desc.agent_down_time * 1.2
        common_utils.wait_until_true(
            lambda: backup_host in get_active_hosts(),
            timeout=timeout,
        )
        LOG.debug(f'Active host asserted after {datetime.now() - start}')

        if method in ('kill', 'shutdown'):
            # Assert the previously active host is no longer active if it
            # was killed or shutdown. In the disconnect case both hosts
            # will stay active, but one host is disconnected from the data
            # plane.
            common_utils.wait_until_true(
                lambda: active_host not in get_active_hosts(),
                timeout=timeout,
            )
            LOG.debug(
                f'Inactive host asserted after {datetime.now() - start}')
    finally:
        # Stop probing processes, also when an assertion above failed, so
        # that no ping process is left running.
        pinger.stop()

    # With the default advert_int of 2s the keepalived master timeout is
    # about 6s. Assert less than 90 lost packets (9 seconds)
    threshold = 90

    lost = pinger.sent - pinger.received
    message = (f'Sent {pinger.sent} packets, received {pinger.received} '
               f'packets, lost {lost} packets')

    self.assertLess(lost, threshold, message)
def test_ha_router_failover_graceful(self):
    """Run the HA router failover scenario with the 'shutdown' method."""
    self._test_ha_router_failover('shutdown')
def test_ha_router_failover_host_failure(self):
    """Run the HA router failover scenario with the 'kill' method."""
    self._test_ha_router_failover('kill')
def test_ha_router_failover_disconnect(self):
    """Run the HA router failover scenario with the 'disconnect' method."""
    self._test_ha_router_failover('disconnect')
def _get_keepalived_state(self, keepalived_state_file):
with open(keepalived_state_file, "r") as fd:
return fd.read()
@@ -491,11 +644,10 @@ class TestHAL3Agent(TestL3Agent):
router_ip = router['external_gateway_info'][
'external_fixed_ips'][0]['ip_address']
l3_agents = [host.agents['l3'] for host in self.environment.hosts]
l3_standby_agents = self._get_l3_agents_with_ha_state(
l3_agents, router['id'], 'standby')
router['id'], 'standby')
l3_active_agents = self._get_l3_agents_with_ha_state(
l3_agents, router['id'], 'active')
router['id'], 'active')
self.assertEqual(1, len(l3_active_agents))
# Let's check first if connectivity from external_vm to router's