Add some OVN container recovery tests

OVN DB resources are restarted via 'pcs resource restart': either all of them
(the behaviour of the legacy test, which was misnamed) or only the resource
running on the master controller (new test).

The OVN controller container is restarted when its main process is killed
(the container runtime is expected to bring it back automatically).

Change-Id: I551d3acf3e6130745239a2069406077c6b506828
Author: Eduardo Olivares
Date:   2020-11-04 13:31:49 +01:00
parent a4354c5274
commit 09292c7607
6 changed files with 138 additions and 41 deletions
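The two pcs-driven disruptions added below differ only in whether a controller
name is appended to the restart command. A minimal sketch of how the
disrupt-method strings are built (the exact value of
ovn_db_pcs_resource_restart and the example hostname are assumptions; the diff
only shows that the master-only variant appends the master controller's name
to the shared command):

import typing

# Sketch only: the real constant lives in the disruption module; its exact
# value and the hostname used below are assumptions.
ovn_db_pcs_resource_restart = 'pcs resource restart ovn-dbs-bundle'

def build_ovndb_restart_cmd(master_node: typing.Optional[str] = None) -> str:
    """Return the pcs command used as disrupt_method: the whole bundle by
    default, or only the instance running on the given master controller."""
    if master_node:
        return ovn_db_pcs_resource_restart + ' ' + master_node
    return ovn_db_pcs_resource_restart

# build_ovndb_restart_cmd()               -> restart on every controller
# build_ovndb_restart_cmd('controller-0') -> restart only on the master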


@@ -104,9 +104,9 @@ def test_ovn_dbs_are_synchronized():
# declare commands
search_container_cmd = (
"sudo %s ps --format '{{.Names}}' -f name=ovn-dbs-bundle" %
"%s ps --format '{{.Names}}' -f name=ovn-dbs-bundle" %
containers.container_runtime_name)
container_cmd_prefix = ('sudo %s exec -uroot {container}' %
container_cmd_prefix = ('%s exec -uroot {container}' %
containers.container_runtime_name)
ovndb_sync_cmd = ('ovs-appctl -t /var/run/openvswitch/{ovndb_ctl_file} '
'ovsdb-server/sync-status')
@@ -124,20 +124,22 @@ def test_ovn_dbs_are_synchronized():
# obtain the container name
container_name = sh.execute(
search_container_cmd,
ssh_client=ovn_master_node.ssh_client).stdout.splitlines()[0]
ssh_client=ovn_master_node.ssh_client,
sudo=True).stdout.splitlines()[0]
for db in ('nb', 'sb'):
# check its synchronization is active
sync_cmd = (' '.join((container_cmd_prefix, ovndb_sync_cmd)).
format(container=container_name,
ovndb_ctl_file=ovndb_ctl_file_dict[db]))
sync_status = sh.execute(sync_cmd,
ssh_client=ovn_master_node.ssh_client).stdout
ssh_client=ovn_master_node.ssh_client,
sudo=True).stdout
test_case.assertIn(expected_state_active_str, sync_status)
# obtain nb and sb show output
show_cmd = (' '.join((container_cmd_prefix, ovndb_show_cmd)).
format(container=container_name, ovndb=ovndb_dict[db]))
ovn_db_show = sh.execute(
show_cmd, ssh_client=ovn_master_node.ssh_client).stdout
show_cmd, ssh_client=ovn_master_node.ssh_client, sudo=True).stdout
ovn_master_dbs_show_dict[db] = build_ovn_db_show_dict(ovn_db_show)
# ovn dbs are located on the controller nodes
@@ -147,7 +149,7 @@ def test_ovn_dbs_are_synchronized():
continue
container_name = sh.execute(
search_container_cmd,
ssh_client=node.ssh_client).stdout.splitlines()[0]
ssh_client=node.ssh_client, sudo=True).stdout.splitlines()[0]
# verify ovn nb and sb dbs are synchronized
ovn_dbs_show_dict = {}
for db in ('nb', 'sb'):
@@ -156,13 +158,14 @@ def test_ovn_dbs_are_synchronized():
format(container=container_name,
ovndb_ctl_file=ovndb_ctl_file_dict[db]))
sync_status = sh.execute(sync_cmd,
ssh_client=node.ssh_client).stdout
ssh_client=node.ssh_client,
sudo=True).stdout
test_case.assertIn(expected_state_backup_str, sync_status)
# obtain nb and sb show output
show_cmd = (' '.join((container_cmd_prefix, ovndb_show_cmd)).
format(container=container_name, ovndb=ovndb_dict[db]))
ovn_db_show = sh.execute(
show_cmd, ssh_client=node.ssh_client).stdout
show_cmd, ssh_client=node.ssh_client, sudo=True).stdout
ovn_dbs_show_dict[db] = build_ovn_db_show_dict(ovn_db_show)
test_case.assertEqual(ovn_dbs_show_dict[db],
ovn_master_dbs_show_dict[db])
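Aside from the new sudo flag, the logic above is untouched, and the same
mechanical change repeats across the remaining files: the literal 'sudo '
prefix is dropped from the command string and privilege escalation is
requested through sh.execute's sudo argument instead. A minimal sketch of the
before/after pattern, assuming Tobiko's sh helper (import path assumed) and a
node object like the ones used above:

from tobiko.shell import sh  # import path assumed

def list_containers(node):
    # before this commit, escalation was embedded in the command string:
    #   sh.execute("sudo docker ps --format '{{.Names}}'",
    #              ssh_client=node.ssh_client)
    # after it, the plain command is passed and sudo is a keyword argument:
    result = sh.execute("docker ps --format '{{.Names}}'",
                        ssh_client=node.ssh_client, sudo=True)
    return result.stdout.strip().splitlines()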


@@ -41,8 +41,8 @@ def get_filtered_node_containers(node, containers_regex):
# 'docker' is used here in order to be compatible with old OSP versions.
# On versions with podman, 'docker' command is linked to 'podman'
result = sh.execute(
'sudo docker ps --format "{{.Names}}"',
ssh_client=node.ssh_client)
'docker ps --format "{{.Names}}"',
ssh_client=node.ssh_client, sudo=True)
all_node_containers = result.stdout.strip().split('\n')
for container in all_node_containers:
container = container.strip('"')
@@ -90,9 +90,9 @@ def get_config_files(node, kolla_jsons, conf_ignorelist, scripts_to_check):
:rtype: list
"""
cmds = sh.execute(
f"sudo jq '.command' {' '.join(kolla_jsons)}",
f"jq '.command' {' '.join(kolla_jsons)}",
ssh_client=node.ssh_client,
expect_exit_status=None).stdout.strip().split('\n')
expect_exit_status=None, sudo=True).stdout.strip().split('\n')
LOG.debug(f'{node.name} run containers with commands {cmds}')
config_files = set()
for cmd in cmds:
@@ -101,11 +101,12 @@ def get_config_files(node, kolla_jsons, conf_ignorelist, scripts_to_check):
LOG.debug(f'{cmd} is recognized as script to search '
'for config files in')
oc_script_location = sh.execute(
f'sudo find /var/lib | grep {cmd} | grep -v overlay',
ssh_client=node.ssh_client).stdout.strip().split('\n')[0]
f'find /var/lib | grep {cmd} | grep -v overlay',
ssh_client=node.ssh_client,
sudo=True).stdout.strip().split('\n')[0]
cmd = sh.execute(
f'sudo cat {oc_script_location}',
ssh_client=node.ssh_client).stdout.strip()
f'cat {oc_script_location}',
ssh_client=node.ssh_client, sudo=True).stdout.strip()
cmd = cmd.strip('"')
temp_conf_files = re.findall('--config-file [^ \n]*', cmd)
for conf_file in temp_conf_files:
@@ -201,9 +202,10 @@ def get_node_logdir_from_pcs(node, container):
if pcs_resource is None:
return
logdir = None
pcs_rsrc_cmd = f'sudo pcs resource show {pcs_resource}'
pcs_rsrc_cmd = f'pcs resource show {pcs_resource}'
out_lines = sh.execute(pcs_rsrc_cmd,
ssh_client=node.ssh_client).stdout.splitlines()
ssh_client=node.ssh_client,
sudo=True).stdout.splitlines()
log_files_regex = re.compile(
r'^\s*options=.*source-dir=(.*) target-dir=.*-log-files\)$')
for line in out_lines:
@@ -221,9 +223,10 @@ def get_pacemaker_resource_logfiles(node, container):
logfiles = []
exclude_pid_files = 'ovn-controller.pid'
resource = get_pacemaker_resource_from_container(container)
pcs_rsrc_cmd = f'sudo pcs resource show {resource}'
pcs_rsrc_cmd = f'pcs resource show {resource}'
out_lines = sh.execute(pcs_rsrc_cmd,
ssh_client=node.ssh_client).stdout.splitlines()
ssh_client=node.ssh_client,
sudo=True).stdout.splitlines()
run_files_regex = re.compile(
r'^\s*options=.*source-dir=(.*) target-dir=.*-run-files\)$')
for line in out_lines:
@@ -234,12 +237,14 @@ def get_pacemaker_resource_logfiles(node, container):
ssh_client=node.ssh_client).
stdout.splitlines())
break
pids = sh.execute(f'sudo cat {" ".join(pid_files)}',
ssh_client=node.ssh_client).stdout.splitlines()
pids = sh.execute(f'cat {" ".join(pid_files)}',
ssh_client=node.ssh_client,
sudo=True).stdout.splitlines()
for pid in pids:
cmd_stdout = sh.execute(f'sudo docker exec -u root {container} '
cmd_stdout = sh.execute(f'docker exec -u root {container} '
f'cat /proc/{pid}/cmdline',
ssh_client=node.ssh_client).stdout
ssh_client=node.ssh_client,
sudo=True).stdout
for log_file in re.findall('--log-file=[^ \n\x00]*', cmd_stdout):
logfiles.append(log_file.split('=')[1])
return logfiles
@@ -266,15 +271,15 @@ def get_container_logfiles(node, container):
:rtype: list
"""
cmd = sh.execute(
f'sudo docker exec -u root {container} cat /run_command',
ssh_client=node.ssh_client)
f'docker exec -u root {container} cat /run_command',
ssh_client=node.ssh_client, sudo=True)
cmd_stdout = cmd.stdout.strip()
if 'pacemaker_remoted' in cmd_stdout:
return get_pacemaker_resource_logfiles(node, container)
if ' ' not in cmd_stdout: # probably script as no space in the command
cmd = sh.execute(
f'sudo docker exec -u root {container} cat {cmd_stdout}',
ssh_client=node.ssh_client)
f'docker exec -u root {container} cat {cmd_stdout}',
ssh_client=node.ssh_client, sudo=True)
cmd_stdout = cmd.stdout.strip()
LOG.debug(f'The following command is executed in {container} container '
f'on {node.name} node:\n{cmd_stdout}')
@@ -325,8 +330,8 @@ def log_msg(node, container, logfile, msg):
:type msg: string
"""
cmd = f"sh -c 'echo {msg} >> {logfile}'"
sh.execute(f'sudo docker exec -u root {container} {cmd}',
ssh_client=node.ssh_client)
sh.execute(f'docker exec -u root {container} {cmd}',
ssh_client=node.ssh_client, sudo=True)
def find_msg_in_file(node, logfile, message, rotated=False):
@@ -349,9 +354,9 @@ def find_msg_in_file(node, logfile, message, rotated=False):
else:
suffix = ""
LOG.debug(f'Searching for {message} in {logfile}{suffix} on {node.name}')
result = sh.execute(f'sudo grep -h {message} {logfile}{suffix}',
result = sh.execute(f'grep -h {message} {logfile}{suffix}',
ssh_client=node.ssh_client,
expect_exit_status=None)
expect_exit_status=None, sudo=True)
if result.stderr:
tobiko.fail(f'Failed to read {logfile} on {node.name}:\n'
f'{result.stderr}')
@@ -372,9 +377,9 @@ def rotate_logs(node):
tobiko.skip('No logrotate container has been found')
else:
container = containers[0]
sh.execute(f'sudo docker exec -u root {container} logrotate '
sh.execute(f'docker exec -u root {container} logrotate '
'-f /etc/logrotate-crond.conf',
ssh_client=node.ssh_client)
ssh_client=node.ssh_client, sudo=True)
def has_docker():

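One detail worth noting in get_pacemaker_resource_logfiles above:
/proc/<pid>/cmdline separates arguments with NUL bytes, which is why the
--log-file regex excludes \x00 in addition to spaces and newlines. A small
self-contained illustration of that parsing (the sample cmdline content is
made up):

import re

# made-up example of what 'cat /proc/<pid>/cmdline' could return for ovsdb-server
cmdline = ('ovsdb-server\x00-vconsole:off\x00'
           '--log-file=/var/log/openvswitch/ovsdb-server-nb.log\x00'
           '--remote=punix:/var/run/openvswitch/ovnnb_db.sock\x00')

log_files = [match.split('=')[1]
             for match in re.findall('--log-file=[^ \n\x00]*', cmdline)]
print(log_files)  # ['/var/log/openvswitch/ovsdb-server-nb.log']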

@@ -271,8 +271,20 @@ def reset_all_compute_nodes(hard_reset=False):
LOG.info('{} is up '.format(compute_checked))
def reset_ovndb_master_resource():
"""restart ovndb pacemaker resource"""
def reset_ovndb_pcs_master_resource():
"""restart ovndb pacemaker resource
this method only restarts the resource running on the controller which is
acting as Master"""
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
ovn_db_pcs_master_resource_restart = (ovn_db_pcs_resource_restart + ' ' +
node)
disrupt_node(node, disrupt_method=ovn_db_pcs_master_resource_restart)
def reset_ovndb_pcs_resource():
"""restart ovndb pacemaker resource
this method restarts the whole resource, i.e. on all the controller nodes"""
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
disrupt_node(node, disrupt_method=ovn_db_pcs_resource_restart)
@@ -281,7 +293,7 @@ def reset_ovndb_master_resource():
def reset_ovndb_master_container():
"""get and restart the ovndb master container
use of partial name : resource: ovn-dbs-bundle-0 =>
container: ovn-dbs-bundle-podman-2"""
container: ovn-dbs-bundle-podman-0 or ovn-dbs-bundle-docker-0"""
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
resource = pacemaker.get_overcloud_resource(


@@ -96,9 +96,15 @@ class DisruptTripleoNodesTest(testtools.TestCase):
# cloud_disruptions.network_undisrupt_controllers_non_main_vip()
@neutron.skip_unless_is_ovn()
def test_reset_ovndb_master_resource(self):
def test_reset_ovndb_pcs_master_resource(self):
overcloud_health_checks()
cloud_disruptions.reset_ovndb_master_resource()
cloud_disruptions.reset_ovndb_pcs_master_resource()
overcloud_health_checks()
@neutron.skip_unless_is_ovn()
def test_reset_ovndb_pcs_resource(self):
overcloud_health_checks()
cloud_disruptions.reset_ovndb_pcs_resource()
overcloud_health_checks()
@neutron.skip_unless_is_ovn()


@@ -519,6 +519,71 @@ class OvnControllerTest(BaseAgentTest):
self.get_ovn_agents_from_containers()
super(OvnControllerTest, self).setUp()
def kill_ovn_controller(self,
hosts: typing.Optional[typing.List[str]] = None,
timeout=60, interval=5):
'''Stop the OVN controller container by killing the ovn-controller process
running inside it
Docker/Podman service should restart it automatically
:param hosts: List of hostnames to stop the agent on
:type hosts: list of strings
:param timeout: Time to wait until the OVN controller is recovered
:type timeout: int
:param interval: Time to wait between attempts
:type interval: int
'''
hosts = hosts or self.hosts
self.assertNotEqual([], hosts, "Host list is empty")
if self.container_name == '':
self.container_name = topology.get_agent_container_name(
self.agent_name)
for host in hosts:
ssh_client = topology.get_openstack_node(hostname=host).ssh_client
pid = None
for directory in ('ovn', 'openvswitch'):
try:
pid = sh.execute('docker exec -uroot '
f'{self.container_name} cat '
f'/run/{directory}/ovn-controller.pid',
ssh_client=ssh_client,
sudo=True).stdout.splitlines()[0]
except sh.ShellCommandFailed:
LOG.debug(f'/run/{directory}/ovn-controller.pid cannot '
f'be accessed')
else:
LOG.debug(f'/run/{directory}/ovn-controller.pid returned '
f'pid {pid}')
break
self.assertIsNotNone(pid)
LOG.debug(f'Killing process {pid} from container '
f'{self.container_name} on host {host}')
sh.execute(f'docker exec -uroot {self.container_name} '
f'kill {pid}',
ssh_client=ssh_client,
sudo=True)
LOG.debug(f'Container {self.container_name} has been killed '
f"on host '{host}'...")
# Schedule auto-restart of service at the end of this test case
self.addCleanup(self.start_agent, hosts=[host, ])
# Verify the container is restarted automatically
for attempt in tobiko.retry(timeout=timeout, interval=interval):
search_running_ovn_cont = ("docker ps --format '{{.Names}}'"
f" -f name={self.container_name}")
output = sh.execute(search_running_ovn_cont,
ssh_client=ssh_client,
sudo=True).stdout.splitlines()
if self.container_name in output:
LOG.debug(f'{self.container_name} successfully restarted')
break
attempt.check_limits()
def test_restart_ovn_controller(self):
'''Test that OVN controller agents can be restarted successfully
'''
@@ -528,6 +593,12 @@ class OvnControllerTest(BaseAgentTest):
self.start_agent()
ping.ping_until_received(self.stack.ip_address).assert_replied()
def test_kill_ovn_controller(self):
'''Test that the OVN controller container is restarted automatically after
the ovn-controller process running inside it is killed
'''
self.kill_ovn_controller()
class MetadataAgentTest(BaseAgentTest):

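The automatic-restart verification in kill_ovn_controller relies on
tobiko.retry, polling docker ps until the container name shows up again; the
cleanup registered with addCleanup restarts the agent so later tests still
find it healthy. The generic polling pattern, stripped of the test specifics
(the wrapper name and the check callable are illustrative only):

import tobiko

def wait_until(check, timeout=60, interval=5):
    # Poll `check` until it returns True; attempt.check_limits() raises
    # once the time limit is exceeded, ending the loop with a failure.
    for attempt in tobiko.retry(timeout=timeout, interval=interval):
        if check():
            break
        attempt.check_limits()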

@@ -288,11 +288,11 @@ def run_container_config_validations():
# versions. On versions with podman, 'docker' command is
# linked to 'podman'
obtained_param = sh.execute(
"sudo docker exec -uroot "
"docker exec -uroot "
f"{config_check['container_name']} crudini "
f"--get {config_check['config_file']} "
f"{param_check['section']} {param_check['param']}",
ssh_client=node.ssh_client).stdout.strip()
ssh_client=node.ssh_client, sudo=True).stdout.strip()
if param_check['expected_value'] not in obtained_param:
tobiko.fail(f"Expected {param_check['param']} value: "
f"{param_check['expected_value']}\n"