tobiko/tobiko/tests/faults/agents/test_neutron_agents.py

368 lines
15 KiB
Python

# Copyright (c) 2020 Red Hat
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import time
from oslo_log import log
import testtools
import tobiko
from tobiko.openstack import neutron
from tobiko.openstack import nova
from tobiko.openstack import stacks
from tobiko.openstack import topology
from tobiko.shell import ping
from tobiko.shell import sh
LOG = log.getLogger(__name__)
class AgentNotFoundOnHost(tobiko.TobikoException):
    '''Raised when no matching agent can be found for a given host'''

    # Message template: ``agent_type`` and ``host`` are passed as keyword
    # arguments where the exception is raised.
    message = "Agent {agent_type!s} not found on the host {host!s}"
class AgentTestMixin(object):
    '''Helpers to stop/start agent services and to track processes on hosts

    The methods rely on attributes supplied by the test case class this
    mixin is combined with: ``self.stopped_agents`` (a list that
    ``stop_service_on_agents`` appends to) and ``self.fail()``.
    '''

    def stop_service_on_agents(self, service_name, agents):
        '''Stop a systemd service on the host of every given agent

        Each agent whose service has been stopped is appended to
        ``self.stopped_agents`` so that it can be started again later.

        :param service_name: name of the systemd unit to stop
        :param agents: iterable of agent mappings providing a 'host' key
        '''
        for agent in agents:
            agent_host = topology.get_openstack_node(hostname=agent['host'])
            sh.execute(
                "sudo systemctl stop %s" % service_name,
                ssh_client=agent_host.ssh_client)
            self.stopped_agents.append(agent)

    def start_service_on_agents(self, service_name, agents):
        '''Start a systemd service on the host of every given agent

        :param service_name: name of the systemd unit to start
        :param agents: iterable of agent mappings providing a 'host' key
        '''
        for agent in agents:
            agent_host = topology.get_openstack_node(hostname=agent['host'])
            sh.execute(
                "sudo systemctl start %s" % service_name,
                ssh_client=agent_host.ssh_client)

    def get_process_pids_for_resource(self, process_name, command_filter,
                                      agents, timeout=120, interval=2):
        '''Search for PIDs that match criteria on requested hosts

        The hosts are polled one after another; ``timeout`` is a single time
        budget shared by all of them. The test fails as soon as one host has
        no matching process.

        :param process_name: command name used to list candidate processes
        :param command_filter: text that must appear in the process cmdline
        :param agents: iterable of agent mappings providing a 'host' key
        :param timeout: overall time budget in seconds
        :param interval: seconds to sleep between two attempts
        :returns: dictionary mapping each agent host to the PID found on it
        '''
        start_time = time.time()
        pids_per_agent = {}

        LOG.debug(f'Search for {process_name} processes on {agents}')
        for agent in agents:
            LOG.debug(f'Search for {process_name} process on {agent["host"]}')
            agent_host = topology.get_openstack_node(hostname=agent['host'])
            ssh_client = agent_host.ssh_client
            # Reset for every host: if the time budget is already exhausted
            # the loop below never runs, and ``pid`` would otherwise be
            # unbound (first host) or still hold the PID found on the
            # previous host, silently hiding the failure.
            pid = None
            time_left = start_time + timeout - time.time()
            while time_left > 0:
                pid = self.get_pid(ssh_client, command_filter, process_name)
                if pid:
                    pids_per_agent[agent['host']] = pid
                    LOG.debug(f'{process_name} process has {pid} PID on '
                              f'{agent["host"]} host')
                    break
                # Sleep first, then refresh the remaining time, so that the
                # loop condition is evaluated against an up-to-date value
                # (same order as in wait_processes_destroyed).
                time.sleep(interval)
                time_left = start_time + timeout - time.time()
                LOG.debug(f'Retrying, time left: {time_left}')
            if not pid:
                self.fail(f'No {process_name} process found on host '
                          f'{agent["host"]} that matches {command_filter}')
        return pids_per_agent

    def get_pid(self, ssh_client, command_filter, process_name):
        '''Return the PID of the first process matching the given criteria

        :param ssh_client: SSH client used to run commands on the host
        :param command_filter: text that must appear in the process cmdline
        :param process_name: command name used to list candidate processes
        :returns: PID of the first matching process or None if none matches
        '''
        processes = sh.list_processes(command=process_name,
                                      ssh_client=ssh_client)
        for process in processes:
            try:
                command = sh.execute(f'cat /proc/{process.pid}/cmdline',
                                     ssh_client=ssh_client)
            except sh.ShellCommandFailed:
                LOG.debug(f'Process {process.pid} has been terminated right '
                          f'after the process list has been collected')
                continue
            if command_filter in command.stdout:
                return process.pid
            LOG.debug(f'No {command_filter} has been found in details'
                      f' of the following command: {command.stdout}')
        return None

    def wait_processes_destroyed(
            self, command_filter, pids_per_agent, timeout=120, interval=2):
        '''Wait for processes to be terminated on hosts

        Make sure that all processes from the list are terminated or return
        an error otherwise. A tricky situation may happen when a different
        process is spawned with the same PID, so each PID is also checked
        against `command_filter`.

        :param command_filter: text identifying the original process cmdline
        :param pids_per_agent: dictionary mapping host name to PID
        :param timeout: overall time budget in seconds, shared by all hosts
        :param interval: seconds to sleep between two checks
        '''
        start_time = time.time()
        LOG.debug(f'Waiting for processes to be finished: {pids_per_agent}')
        for agent, pid in pids_per_agent.items():
            host = topology.get_openstack_node(hostname=agent)
            destroyed = False
            time_left = start_time + timeout - time.time()
            while time_left > 0:
                LOG.debug(f'Check if {pid} has been terminated on {agent}')
                if self.is_destroyed(pid, command_filter, host.ssh_client):
                    destroyed = True
                    break
                time.sleep(interval)
                time_left = start_time + timeout - time.time()
                LOG.debug(f'Retrying, time left: {time_left}')
            if not destroyed:
                self.fail(f'Process {pid} has not been finished in {timeout}'
                          f' sec on {agent}')
            else:
                LOG.debug(f'Process {pid} has been finished on {agent}')

    def is_destroyed(self, pid, command_filter, shell):
        '''Check if process has been terminated

        The process is considered terminated when its PID is not listed
        any more, when its cmdline can no longer be read, or when the
        cmdline does not contain `command_filter` (PID reused by a
        different process).

        :param pid: PID of the process to check
        :param command_filter: text identifying the original process cmdline
        :param shell: SSH client used to run commands on the host
        :returns: True if the original process is gone, False otherwise
        '''
        processes = sh.list_processes(ssh_client=shell)
        process = processes.with_attributes(pid=pid)
        if not process:
            LOG.debug(f'No PID {pid} has been found in process list')
            return True
        try:
            command = sh.execute(f'cat /proc/{pid}/cmdline',
                                 ssh_client=shell)
        except sh.ShellCommandFailed:
            LOG.debug(f'Process {pid} has been terminated right after the'
                      f' process list has been collected')
            return True
        if command_filter not in command.stdout:
            LOG.debug(f'Different process with same PID {pid} exist')
            return True
        return False
class DHCPAgentTest(testtools.TestCase, AgentTestMixin):
    '''Fault tests for dnsmasq and DHCP leases while DHCP agent is stopped'''

    #: Resources stack with Nova server to send messages to
    stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)

    def setUp(self):
        '''Resolve the DHCP agent service name or skip the test'''
        super(DHCPAgentTest, self).setUp()
        os_topology = topology.get_openstack_topology()
        self.agent_service_name = os_topology.get_agent_service_name(
            "neutron-dhcp-agent")
        if not self.agent_service_name:
            # skipTest() is the supported API; skip() is a deprecated alias
            self.skipTest("Neutron DHCP agent's service name not defined for "
                          "the topology %s" % os_topology)
        # Filled by stop_service_on_agents(), consumed by tearDown()
        self.stopped_agents = []

    def tearDown(self):
        '''Start again every agent service stopped by the test'''
        super(DHCPAgentTest, self).tearDown()
        # Try to start all agents which may be down during the tests
        self.start_service_on_agents(
            self.agent_service_name, self.stopped_agents)

    def test_stop_dhcp_agent(self):
        '''Test that dnsmasq processes are not broken after DHCP agent restart

        Dnsmasq processes should stay alive if DHCP agent is turned off and
        then restarted once DHCP agent is returned to active state.
        '''
        network_dhcp_agents = neutron.list_dhcp_agent_hosting_network(
            self.stack.network)
        network_dnsmasq_pids = self.get_process_pids_for_resource(
            "dnsmasq", self.stack.network, network_dhcp_agents)

        self.stop_service_on_agents(
            self.agent_service_name, network_dhcp_agents)
        # With the agent stopped the very same dnsmasq processes must remain
        self.assertEqual(
            network_dnsmasq_pids,
            self.get_process_pids_for_resource(
                "dnsmasq", self.stack.network, network_dhcp_agents))

        self.start_service_on_agents(
            self.agent_service_name, network_dhcp_agents)
        # Once the agent is back the old processes must go away and dnsmasq
        # processes for the network must be found again
        self.wait_processes_destroyed(self.stack.network, network_dnsmasq_pids)
        self.get_process_pids_for_resource(
            "dnsmasq", self.stack.network, network_dhcp_agents)

    def test_dhcp_lease_served_when_dhcp_agent_down(self):
        '''Test that DHCP lease is correctly served when DHCP agent is down

        Make sure that the VM will receive IP address after the reboot.
        DHCP agent should be down during the VM reboot. VM should receive
        the same IP address that was assigned to it before the reboot.
        '''
        ping.ping_until_received(
            self.stack.ip_address).assert_replied()

        network_dhcp_agents = neutron.list_dhcp_agent_hosting_network(
            self.stack.network)
        network_dnsmasq_pids = self.get_process_pids_for_resource(
            "dnsmasq", self.stack.network, network_dhcp_agents)
        self.stop_service_on_agents(
            self.agent_service_name, network_dhcp_agents)

        # Restart the server while the DHCP agent is down
        nova.shutoff_server(self.stack.resources.server.physical_resource_id)
        nova.activate_server(self.stack.resources.server.physical_resource_id)
        ping.ping_until_received(
            self.stack.ip_address).assert_replied()

        self.start_service_on_agents(
            self.agent_service_name, network_dhcp_agents)
        self.wait_processes_destroyed(self.stack.network, network_dnsmasq_pids)
        self.get_process_pids_for_resource(
            "dnsmasq", self.stack.network, network_dhcp_agents)
class L3AgentTest(testtools.TestCase, AgentTestMixin):
    '''Fault tests for router helper processes while L3 agent is stopped'''

    #: Resources stack with Nova server to send messages to
    stack = tobiko.required_setup_fixture(stacks.CirrosPeerServerStackFixture)

    def setUp(self):
        '''Resolve the L3 agent service name or skip the test'''
        super(L3AgentTest, self).setUp()
        os_topology = topology.get_openstack_topology()
        self.agent_service_name = os_topology.get_agent_service_name(
            "neutron-l3-agent")
        if not self.agent_service_name:
            # skipTest() is the supported API; skip() is a deprecated alias
            self.skipTest("Neutron L3 agent's service name not defined for "
                          "the topology %s" % os_topology)
        self.router_id = self.stack.network_stack.gateway_id
        # Filled by stop_service_on_agents(), consumed by tearDown()
        self.stopped_agents = []

    def tearDown(self):
        '''Start again every agent service stopped by the test'''
        super(L3AgentTest, self).tearDown()
        # Try to start all agents which may be down during the tests
        self.start_service_on_agents(
            self.agent_service_name, self.stopped_agents)

    @neutron.skip_if_missing_networking_extensions('l3_agent_scheduler')
    def test_metadata_haproxy_during_stop_L3_agent(self):
        '''Test that haproxy keeps its PIDs while L3 agent is stopped/started
        '''
        network_l3_agents = neutron.list_l3_agent_hosting_routers(
            self.router_id)
        router_haproxy_pids = self.get_process_pids_for_resource(
            "haproxy", self.router_id, network_l3_agents)
        self.stop_service_on_agents(self.agent_service_name, network_l3_agents)
        # Now check if haproxy processes are still run and have got same pids
        # like before L3 agent's stop
        self.assertEqual(
            router_haproxy_pids,
            self.get_process_pids_for_resource(
                "haproxy", self.router_id, network_l3_agents))

        self.start_service_on_agents(
            self.agent_service_name, network_l3_agents)

        # And finally check if haproxy processes are still run and have got
        # same pids like at the beginning of the test
        self.assertEqual(
            router_haproxy_pids,
            self.get_process_pids_for_resource(
                "haproxy", self.router_id, network_l3_agents))

    def _is_radvd_process_expected(self):
        '''Tell whether a radvd process is expected for the router

        :returns: True only when the network stack has an IPv6 CIDR and both
            'ipv6_ra_mode' and 'ipv6_address_mode' of its IPv6 subnet are
            one of the stateless modes
        '''
        network_stack = self.stack.network_stack
        # Check the cheap guard first so the subnet details are not looked
        # up at all when there is no IPv6 CIDR
        if not network_stack.ipv6_cidr:
            return False
        stateless_modes = ('slaac', 'dhcpv6-stateless')
        # Read the subnet details once and reuse them for both modes
        subnet_details = network_stack.ipv6_subnet_details
        return (subnet_details.get('ipv6_ra_mode') in stateless_modes and
                subnet_details.get('ipv6_address_mode') in stateless_modes)

    def test_radvd_during_stop_l3_agent(self):
        '''Test that radvd keeps its PIDs while L3 agent is stopped/started
        '''
        os_topology = topology.get_openstack_topology()
        if os_topology.has_containers:
            self.skipTest(
                "Radvd process is currently run directly in "
                "neutron-l3-agent container so it will be always killed "
                "when neutron-l3-agent container is killed and this "
                "test is not needed")

        if not self._is_radvd_process_expected():
            self.skipTest(
                "Radvd process is not expected to be run on router %s" %
                self.router_id)

        network_l3_agents = neutron.list_l3_agent_hosting_routers(
            self.router_id)
        router_radvd_pids = self.get_process_pids_for_resource(
            "radvd", self.router_id, network_l3_agents)
        self.stop_service_on_agents(self.agent_service_name, network_l3_agents)
        # Now check if radvd processes are still run and have got same pids
        # like before L3 agent's stop
        self.assertEqual(
            router_radvd_pids,
            self.get_process_pids_for_resource(
                "radvd", self.router_id, network_l3_agents))

        self.start_service_on_agents(
            self.agent_service_name, network_l3_agents)

        # And finally check if radvd processes are still run and have got
        # same pids like at the beginning of the test
        self.assertEqual(
            router_radvd_pids,
            self.get_process_pids_for_resource(
                "radvd", self.router_id, network_l3_agents))
class OvsAgentTest(testtools.TestCase, AgentTestMixin):
    '''Fault tests for VM reachability while Open vSwitch agent is stopped'''

    #: Resources stack with Nova server to send messages to
    stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)

    #: Agent type used to list the OVS agents through Neutron
    agent_type = 'Open vSwitch agent'

    def setUp(self):
        '''Resolve OVS agent service name and agents or skip the test'''
        super(OvsAgentTest, self).setUp()
        os_topology = topology.get_openstack_topology()
        self.agent_service_name = os_topology.get_agent_service_name(
            "neutron-ovs-agent")
        if not self.agent_service_name:
            # skipTest() is the supported API; skip() is a deprecated alias
            self.skipTest("Neutron OVS agent's service name not defined for "
                          "the topology %s" % os_topology)

        self.ovs_agents = neutron.list_agents(agent_type=self.agent_type)
        if not self.ovs_agents:
            self.skipTest("No Neutron OVS agents found in the cloud.")

        # Filled by stop_service_on_agents(), consumed by tearDown()
        self.stopped_agents = []

    def tearDown(self):
        '''Start again every agent service stopped by the test'''
        super(OvsAgentTest, self).tearDown()
        # Try to start all agents which may be down during the tests
        self.start_service_on_agents(
            self.agent_service_name, self.stopped_agents)

    def _get_agent_from_host(self, host):
        '''Return the OVS agent whose host short name matches the given host

        :param host: node object providing a ``name`` attribute
        :returns: the first matching agent from ``self.ovs_agents``
        :raises AgentNotFoundOnHost: if no agent matches the host
        '''
        host_shortname = tobiko.get_short_hostname(host.name)
        for agent in self.ovs_agents:
            if host_shortname == tobiko.get_short_hostname(agent['host']):
                return agent
        raise AgentNotFoundOnHost(agent_type="neutron-ovs-agent",
                                  host=host.name)

    def test_vm_reachability_during_stop_ovs_agent(self):
        '''Test that the VM replies to ping while its OVS agent is stopped'''
        # Check if vm is reachable before test
        ping.ping_until_received(
            self.stack.ip_address).assert_replied()

        vm_host = topology.get_openstack_node(
            hostname=self.stack.hypervisor_host)
        agent = self._get_agent_from_host(vm_host)
        self.stop_service_on_agents(self.agent_service_name, [agent])
        # NOTE(review): the pre-check above pings ``ip_address`` while this
        # check pings ``floating_ip_address`` - confirm this is intended
        ping.ping_until_received(
            self.stack.floating_ip_address).assert_replied()
        self.start_service_on_agents(self.agent_service_name, [agent])