Test that DHCP lease is correctly served when DHCP agent is down

* Changed flake8 to version 3.3.0 to support Python 3.6

Stop the DHCP agents and then reboot the VM. All the dnsmasq
processes should stay alive while the agents are down in order to
serve the existing DHCP leases.

Because the dnsmasq processes are restarted right after the DHCP
agent comes back up and running, we need to make sure that we wait
long enough for the new dnsmasq processes to become available.
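As a rough sketch of that waiting logic (illustrative only: `find_pids`
and `wait_for_pids` are hypothetical names; the real implementation is
`get_process_pids_for_resource` in the diff below):

    import time

    def wait_for_pids(find_pids, timeout=120, interval=2):
        # Poll until find_pids() returns a non-empty result or the
        # timeout expires; this mirrors the timeout/interval retry
        # pattern used by get_process_pids_for_resource below.
        deadline = time.time() + timeout
        while time.time() < deadline:
            pids = find_pids()
            if pids:
                return pids
            time.sleep(interval)
        raise AssertionError(
            'dnsmasq processes did not show up within %s seconds' % timeout)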

Change-Id: I06b8ad68657fef0d381fe4f9f2e85105c181d0c0
Author: Alex Katz, 2020-03-23 13:45:57 +02:00
Committed by: Federico Ressi
Parent: 52bd787e05
Commit: b12c5d5d12
2 changed files with 132 additions and 26 deletions


@@ -1,7 +1,7 @@
 # pep8 and flake8 requirements
 ansible-lint==4.2.0 # MIT
-flake8==2.5.5 # MIT
+flake8==3.3.0 # MIT
 flake8-import-order==0.12 # LGPLv3
 mypy>=0.740 # MIT
 pylint>=1.9 # GPLv2


@@ -14,15 +14,22 @@
 # under the License.
 from __future__ import absolute_import
 
+import time
+
 import testtools
 
+from oslo_log import log
+
 import tobiko
 from tobiko.openstack import neutron
+from tobiko.openstack import nova
 from tobiko.openstack import stacks
 from tobiko.openstack import topology
 from tobiko.shell import ping
 from tobiko.shell import sh
 
+LOG = log.getLogger(__name__)
+
 
 class AgentTestMixin(object):
@@ -41,36 +48,107 @@ class AgentTestMixin(object):
             "sudo systemctl start %s" % service_name,
             ssh_client=agent_host.ssh_client)
 
-    def get_process_pids_for_resource(self, process_name, resource_id, agents):
+    def get_process_pids_for_resource(self, process_name, command_filter,
+                                      agents, timeout=120, interval=2):
+        '''Search for PIDs that match criteria on requested hosts'''
+        start_time = time.time()
         pids_per_agent = {}
+        LOG.debug(f'Search for {process_name} processes on {agents}')
         for agent in agents:
+            LOG.debug(f'Search for {process_name} process on {agent["host"]}')
             agent_host = topology.get_openstack_node(hostname=agent['host'])
-            processes_on_host = sh.list_processes(
-                command=process_name, ssh_client=agent_host.ssh_client)
-            pid = self.get_pid(
-                agent_host.ssh_client, resource_id, processes_on_host)
+            ssh_client = agent_host.ssh_client
+            time_left = start_time + timeout - time.time()
+            while time_left > 0:
+                pid = self.get_pid(ssh_client, command_filter, process_name)
+                if pid:
+                    pids_per_agent[agent['host']] = pid
+                    LOG.debug(f'{process_name} process has {pid} PID on '
+                              f'{agent["host"]} host')
+                    break
+                time_left = start_time + timeout - time.time()
+                LOG.debug(f'Retrying, time left: {time_left}')
+                time.sleep(interval)
             if not pid:
-                self.fail("%(process)s process for router: %(id)s "
-                          "not found on host %(host)s" % {
-                              'process': process_name,
-                              'id': resource_id,
-                              'host': agent['host']})
-            pids_per_agent[agent['host']] = pid
+                self.fail(f'No {process_name} process found on host '
+                          f'{agent["host"]} that matches {command_filter}')
         return pids_per_agent
 
-    def get_pid(self, ssh_client, resource_id, processes):
+    def get_pid(self, ssh_client, command_filter, process_name):
+        processes = sh.list_processes(command=process_name,
+                                      ssh_client=ssh_client)
         for process in processes:
-            cmdline_result = sh.execute(
-                "cat /proc/%s/cmdline" % process.pid, ssh_client=ssh_client)
-            if resource_id in cmdline_result.stdout:
-                return process.pid
+            try:
+                command = sh.execute(f'cat /proc/{process.pid}/cmdline',
+                                     ssh_client=ssh_client)
+                if command_filter in command.stdout:
+                    return process.pid
+                else:
+                    LOG.debug(f'{command_filter} not found in the command '
+                              f'line: {command.stdout}')
+            except sh.ShellCommandFailed:
+                LOG.debug(f'Process {process.pid} has been terminated right '
+                          f'after the process list has been collected')
        return None
 
+    def wait_processes_destroyed(
+            self, command_filter, pids_per_agent, timeout=120, interval=2):
+        '''Wait for processes to be terminated on hosts
+
+        Make sure that all processes from the list are terminated, or fail
+        otherwise. A tricky situation may happen when a different process
+        is spawned with the same PID, so the PID is also checked against
+        `command_filter`.
+        '''
+        start_time = time.time()
+        LOG.debug(f'Waiting for processes to terminate: {pids_per_agent}')
+        for agent, pid in pids_per_agent.items():
+            host = topology.get_openstack_node(hostname=agent)
+            destroyed = False
+            time_left = start_time + timeout - time.time()
+            while time_left > 0:
+                LOG.debug(f'Check if {pid} has been terminated on {agent}')
+                if self.is_destroyed(pid, command_filter, host.ssh_client):
+                    destroyed = True
+                    break
+                time.sleep(interval)
+                time_left = start_time + timeout - time.time()
+                LOG.debug(f'Retrying, time left: {time_left}')
+            if not destroyed:
+                self.fail(f'Process {pid} did not terminate within '
+                          f'{timeout} sec on {agent}')
+            else:
+                LOG.debug(f'Process {pid} has finished on {agent}')
+
+    def is_destroyed(self, pid, command_filter, shell):
+        '''Check if process has been terminated'''
+        processes = sh.list_processes(ssh_client=shell)
+        process = processes.with_attributes(pid=pid)
+        process_destroyed = False
+        if not process:
+            LOG.debug(f'No PID {pid} has been found in process list')
+            process_destroyed = True
+        else:
+            try:
+                command = sh.execute(f'cat /proc/{pid}/cmdline',
+                                     ssh_client=shell)
+                if command_filter not in command.stdout:
+                    LOG.debug(f'Different process with same PID {pid} exists')
+                    process_destroyed = True
+            except sh.ShellCommandFailed:
+                LOG.debug(f'Process {pid} has been terminated right after the'
+                          f' process list has been collected')
+                process_destroyed = True
+        return process_destroyed
+
 
 class DHCPAgentTest(testtools.TestCase, AgentTestMixin):
 
     #: Resources stack with Nova server to send messages to
-    stack = tobiko.required_setup_fixture(stacks.CirrosPeerServerStackFixture)
+    stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)
 
     def setUp(self):
         super(DHCPAgentTest, self).setUp()
@@ -89,14 +167,18 @@ class DHCPAgentTest(testtools.TestCase, AgentTestMixin):
             self.agent_service_name, self.stopped_agents)
 
     def test_stop_dhcp_agent(self):
+        '''Test that dnsmasq processes are not broken after DHCP agent restart
+
+        Dnsmasq processes should stay alive while the DHCP agent is turned
+        off and then be restarted once the DHCP agent is back in the active
+        state.
+        '''
         network_dhcp_agents = neutron.list_dhcp_agent_hosting_network(
             self.stack.network)
         network_dnsmasq_pids = self.get_process_pids_for_resource(
             "dnsmasq", self.stack.network, network_dhcp_agents)
         self.stop_service_on_agents(
             self.agent_service_name, network_dhcp_agents)
+        # Check that the dnsmasq processes are still running and have the
+        # same PIDs as before the DHCP agent was stopped
         self.assertEqual(
             network_dnsmasq_pids,
             self.get_process_pids_for_resource(
@@ -104,13 +186,37 @@ class DHCPAgentTest(testtools.TestCase, AgentTestMixin):
         self.start_service_on_agents(
             self.agent_service_name, network_dhcp_agents)
 
-        # And finally check if dnsmasq processes are still run and have got
-        # same pids like at the beginning of the test
-        self.assertEqual(
-            network_dnsmasq_pids,
-            self.get_process_pids_for_resource(
-                "dnsmasq", self.stack.network, network_dhcp_agents))
+        # The dnsmasq processes are expected to be restarted with new PIDs
+        # once the DHCP agent is back up
+        self.wait_processes_destroyed(self.stack.network, network_dnsmasq_pids)
+        self.get_process_pids_for_resource(
+            "dnsmasq", self.stack.network, network_dhcp_agents)
+
+    def test_dhcp_lease_served_when_dhcp_agent_down(self):
+        '''Test that DHCP lease is correctly served when DHCP agent is down
+
+        Make sure that the VM receives an IP address after the reboot.
+        The DHCP agent should be down during the VM reboot. The VM should
+        receive the same IP address that was assigned to it before the
+        reboot.
+        '''
+        ping.ping_until_received(
+            self.stack.ip_address).assert_replied()
+        network_dhcp_agents = neutron.list_dhcp_agent_hosting_network(
+            self.stack.network)
+        network_dnsmasq_pids = self.get_process_pids_for_resource(
+            "dnsmasq", self.stack.network, network_dhcp_agents)
+        self.stop_service_on_agents(
+            self.agent_service_name, network_dhcp_agents)
+        nova.shutoff_server(self.stack.resources.server.physical_resource_id)
+        nova.activate_server(self.stack.resources.server.physical_resource_id)
+        ping.ping_until_received(
+            self.stack.ip_address).assert_replied()
+        self.start_service_on_agents(
+            self.agent_service_name, network_dhcp_agents)
+        self.wait_processes_destroyed(self.stack.network, network_dnsmasq_pids)
+        self.get_process_pids_for_resource(
+            "dnsmasq", self.stack.network, network_dhcp_agents)
 
 
 class L3AgentTest(testtools.TestCase, AgentTestMixin):
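A note on the `is_destroyed` check above: a PID disappearing is not by
itself proof that the old process terminated, because the kernel can
recycle a PID for an unrelated process. A minimal local-only sketch of
the command-line comparison that guards against this (the test performs
the same check remotely through `sh.execute`; `process_matches` is a
hypothetical helper name):

    def process_matches(pid: int, command_filter: str) -> bool:
        # /proc/<pid>/cmdline holds the NUL-separated argv of the
        # process. A missing file means the process is gone; a
        # non-matching command line means the PID was reused by an
        # unrelated process.
        try:
            with open(f'/proc/{pid}/cmdline', 'rb') as f:
                cmdline = f.read().replace(b'\0', b' ').decode()
        except FileNotFoundError:
            return False
        return command_filter in cmdline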