Add L3 HA fullstack failover test

* Generate unique internal and external router device names
  using the agent's hostname. This is to allow multiple HA
  router replicas to co-exist on the same machine, otherwise
  they'd all use the same device names and OVS would freak out.
* Add host.disconnect method that disconnects the host from
  the central internal and external bridges, simulating pulling
  the cable from the host's NIC.
* Add an L3 HA failover test.

Co-Authored-By: Assaf Muller <amuller@redhat.com>

Change-Id: Iaaa1c2cab0341a929e368392aa7dc47c9b2399c2
Original-Change-Id: I250fa41d89dfc4f9f3ba4c03a027b52b2e8c4b4b
This commit is contained in:
Gaudenz Steinlin 2024-03-21 09:02:51 +00:00
parent a69bd0dd60
commit bd9ba68047
No known key found for this signature in database
GPG Key ID: 640E36E7F6FC7F12
5 changed files with 123 additions and 8 deletions

View File

@ -275,6 +275,8 @@ def create_patch_ports(source, destination):
source.add_patch_port(source_name, destination_name)
destination.add_patch_port(destination_name, source_name)
return source_name, destination_name
def create_vlan_interface(
namespace, port_name, mac_address, ip_address, vlan_tag):
@ -406,7 +408,7 @@ class Pinger(object):
"""
stats_pattern = re.compile(
r'^(?P<trans>\d+) packets transmitted,.*(?P<recv>\d+) received.*$')
r'^(?P<trans>\d+) packets transmitted, +(?P<recv>\d+) received.*$')
unreachable_pattern = re.compile(
r'.* Destination .* Unreachable')
TIMEOUT = 15
@ -430,7 +432,9 @@ class Pinger(object):
"Ping command hasn't ended after %d seconds." % self.TIMEOUT))
def _parse_stats(self):
output = ''
for line in self.proc.stdout:
output += line
if (not self.destination_unreachable and
self.unreachable_pattern.match(line)):
self.destination_unreachable = True
@ -441,7 +445,9 @@ class Pinger(object):
self.received = int(result.group('recv'))
break
else:
LOG.error(f"Didn't find ping statistics:\n{output}")
raise RuntimeError("Didn't find ping statistics.")
LOG.debug(f"ping command output:\n{output}")
def start(self):
if self.proc and self.proc.is_running:

View File

@ -165,11 +165,12 @@ class BaseFullStackTestCase(testlib_api.MySQLTestCaseMixin,
available_ips = itertools.islice(valid_ips, initial, initial + num)
return [str(available_ip) for available_ip in available_ips]
def _create_external_vm(self, network, subnet):
def _create_external_vm(self, network, subnet=None, ip=None):
ip = ip or subnet['gateway_ip']
vm = self.useFixture(
machine_fixtures.FakeMachine(
self.environment.central_bridge,
common_utils.ip_to_cidr(subnet['gateway_ip'], 24)))
common_utils.ip_to_cidr(ip, 24)))
# NOTE(slaweq): as ext_net is 'vlan' network type external_vm needs to
# send packets with proper vlan also
vm.bridge.set_db_attribute(

View File

@ -444,3 +444,10 @@ class ClientFixture(fixtures.Fixture):
def update_quota(self, project_id, tracked_resource, quota):
    """Update the quota of a single tracked resource for a project.

    :param project_id: identifier passed through to ``_update_resource``.
    :param tracked_resource: name of the resource whose quota is set; used
        as the only key of the update body.
    :param quota: new quota value for ``tracked_resource``.
    """
    quota_body = {tracked_resource: quota}
    self._update_resource('quota', project_id, quota_body)
def add_gateway_router(self, router_id, network_id):
    """Attach a gateway to a router and schedule its removal on cleanup.

    :param router_id: router that receives the gateway.
    :param network_id: network placed in the gateway request body under
        the ``network_id`` key.
    """
    gateway_info = {'network_id': network_id}
    self.client.add_gateway_router(router_id, gateway_info)
    # The cleanup is registered only after the gateway call returned, so a
    # failed add does not leave a pointless removal behind.
    self.addCleanup(self.client.remove_gateway_router, router_id)

View File

@ -16,6 +16,7 @@ import fixtures
from neutron_lib import constants
from neutronclient.common import exceptions as nc_exc
from oslo_config import cfg
from oslo_log import log as logging
from neutron.agent.linux import ip_lib
from neutron.common import utils as common_utils
@ -28,6 +29,8 @@ from neutron.tests.common import net_helpers
from neutron.tests.fullstack.resources import config
from neutron.tests.fullstack.resources import process
LOG = logging.getLogger(__name__)
class EnvironmentDescription(object):
"""A set of characteristics of an environment setup.
@ -263,12 +266,14 @@ class Host(fixtures.Fixture):
veth_1.link.set_up()
veth_2.link.set_up()
self.tunnel_device = veth_1
def connect_to_central_network_via_vlans(self, host_data_bridge):
# If using VLANs as a segmentation device, it's needed to connect
# a provider bridge to a centralized, shared bridge.
net_helpers.create_patch_ports(
source, destination = net_helpers.create_patch_ports(
self.central_bridge, host_data_bridge)
self.internal_port = destination
def allocate_local_ip(self):
if not self.env_desc.network_range:
@ -296,6 +301,13 @@ class Host(fixtures.Fixture):
self.network_bridges[network_id] = bridge
return bridge
def disconnect(self):
    """Cut this host off from the shared test network.

    When tunneling is enabled the addresses on the tunnel device are
    flushed (``flush(4)`` -- presumably IPv4 only; confirm against
    ip_lib). Otherwise the port stored in ``internal_port`` is deleted
    from the physical bridge.
    """
    if not self.env_desc.tunneling_enabled:
        # VLAN case: drop the patch port towards the central bridge.
        self.br_phys.delete_port(self.internal_port)
    else:
        self.tunnel_device.addr.flush(4)

    LOG.info(f'Host {self.hostname} disconnected.')
@property
def hostname(self):
    """Host name read from the ``DEFAULT.host`` Neutron config option."""
    default_group = self.neutron_config.config.DEFAULT
    return default_group.host
@ -385,6 +397,9 @@ class Environment(fixtures.Fixture):
except nc_exc.NeutronClientException:
return False
def get_host_by_name(self, hostname):
    """Return the first host in ``self.hosts`` with the given hostname.

    :param hostname: value compared against each host's ``hostname``.
    :raises StopIteration: if no host matches.
    """
    matching_hosts = (
        candidate for candidate in self.hosts
        if candidate.hostname == hostname)
    return next(matching_hosts)
def _create_host(self, host_desc):
temp_dir = self.useFixture(fixtures.TempDir()).path
neutron_config = config.NeutronConfigFixture(

View File

@ -27,6 +27,7 @@ from neutron.agent.linux import l3_tc_lib
from neutron.common import utils as common_utils
from neutron.tests import base as tests_base
from neutron.tests.common.exclusive_resources import ip_network
from neutron.tests.common import net_helpers
from neutron.tests.fullstack import base
from neutron.tests.fullstack.resources import environment
from neutron.tests.fullstack.resources import machine
@ -231,7 +232,8 @@ class TestL3Agent(base.BaseFullStackTestCase):
external_vm.block_until_ping(fip['floating_ip_address'])
if ha:
l3_agents = [host.agents['l3'] for host in self.environment.hosts]
l3_agents = [host.agents['l3'] for host in self.environment.hosts
if 'l3' in host.agents]
router_agent = self._get_l3_agents_with_ha_state(
l3_agents, router['id'])[0]
qrouter_ns = self._get_namespace(
@ -369,11 +371,19 @@ class TestHAL3Agent(TestL3Agent):
use_dhcp = False
def setUp(self):
# Two hosts with L3 agent to host HA routers
host_descriptions = [
environment.HostDescription(l3_agent=True,
dhcp_agent=self.use_dhcp,
l3_agent_extensions="fip_qos")
for _ in range(2)]
# Add two hosts for FakeFullstackMachines
host_descriptions.extend([
environment.HostDescription()
for _ in range(2)
])
env = environment.Environment(
environment.EnvironmentDescription(
network_type='vlan', l2_pop=True,
@ -387,9 +397,6 @@ class TestHAL3Agent(TestL3Agent):
agents['agents'][0]['ha_state'] != agents['agents'][1]['ha_state'])
def test_ha_router(self):
# TODO(amuller): Test external connectivity before and after a
# failover, see: https://review.opendev.org/#/c/196393/
tenant_id = uuidutils.generate_uuid()
router = self.safe_client.create_router(tenant_id, ha=True)
@ -405,6 +412,85 @@ class TestHAL3Agent(TestL3Agent):
router['id']),
timeout=90)
def _get_host_for_active_ha_router_replica(self, router_id):
result = self.client.list_l3_agent_hosting_routers(router_id)
hostname = next(
agent['host'] for agent in result['agents'] if
agent['ha_state'] == 'active')
return self.environment.get_host_by_name(hostname)
def test_ha_router_failover(self):
    """Check that connectivity through an HA router survives a failover.

    Creates an HA router with an internal subnet and an external gateway,
    boots one fake machine on each network, pings continuously from the
    internal machine to the external one, disconnects the host that holds
    the active router replica, and asserts that the number of pings lost
    during the failover stays below the allowed budget.
    """
    tenant_id = uuidutils.generate_uuid()

    # Create router
    router = self.safe_client.create_router(tenant_id, ha=True)
    router_id = router['id']
    agents = self.client.list_l3_agent_hosting_routers(router_id)
    self.assertEqual(2, len(agents['agents']),
                     'HA router must be scheduled to both nodes')

    # Create internal subnet
    network = self.safe_client.create_network(tenant_id)
    subnet = self.safe_client.create_subnet(
        tenant_id, network['id'], '20.0.0.0/24')
    self.safe_client.add_router_interface(router_id, subnet['id'])

    # Create external network
    external_network = self.safe_client.create_network(
        tenant_id, external=True)
    self.safe_client.create_subnet(
        tenant_id, external_network['id'], '42.0.0.0/24',
        enable_dhcp=False)
    self.safe_client.add_gateway_router(
        router_id,
        external_network['id'])

    # Create internal VM
    # NOTE(review): hosts[2] and hosts[3] are presumably the agent-less
    # hosts added in setUp for the fake machines -- confirm the ordering.
    vm = self.useFixture(
        machine.FakeFullstackMachine(
            self.environment.hosts[2],
            network['id'],
            tenant_id,
            self.safe_client))
    vm.block_until_boot()

    # Create external VM
    external = self.useFixture(
        machine.FakeFullstackMachine(
            self.environment.hosts[3],
            external_network['id'],
            tenant_id,
            self.safe_client))
    external.block_until_boot()

    common_utils.wait_until_true(
        functools.partial(
            self._is_ha_router_active_on_one_agent,
            router_id),
        timeout=90)

    # Test external connectivity, failover, test again
    pinger = net_helpers.Pinger(vm.namespace, external.ip, interval=0.1)
    pinger.start()

    # Ensure connectivity before disconnect
    vm.block_until_ping(external.ip)

    active_host = self._get_host_for_active_ha_router_replica(router_id)
    active_host.disconnect()

    # Ensure connectivity is shortly lost on failover and recovers
    vm.assert_no_ping(external.ip)
    vm.block_until_ping(external.ip)

    pinger.stop()

    # With the default advert_int of 2s the keepalived master timeout is
    # about 6s. At one ping every 0.1s, assert fewer than 90 lost packets
    # (9 seconds).
    lost = pinger.sent - pinger.received
    message = (f'Sent {pinger.sent} packets, received {pinger.received} '
               f'packets, lost {lost} packets')
    # Use the test-case assertion rather than a bare ``assert`` so the
    # check is not stripped under ``python -O`` and matches the
    # ``assertEqual`` used earlier in this test.
    self.assertLess(lost, 90, message)
def _get_keepalived_state(self, keepalived_state_file):
with open(keepalived_state_file, "r") as fd:
return fd.read()