Add some OVN containers recovery tests
OVN DB resources restarted via pcs resource restart: either all the resources (legacy test, which had the wrong name) or only the one on the master controller (new test). OVN controller container restart when its main process is killed. Change-Id: I551d3acf3e6130745239a2069406077c6b506828
This commit is contained in:
parent
a4354c5274
commit
09292c7607
|
@ -104,9 +104,9 @@ def test_ovn_dbs_are_synchronized():
|
|||
|
||||
# declare commands
|
||||
search_container_cmd = (
|
||||
"sudo %s ps --format '{{.Names}}' -f name=ovn-dbs-bundle" %
|
||||
"%s ps --format '{{.Names}}' -f name=ovn-dbs-bundle" %
|
||||
containers.container_runtime_name)
|
||||
container_cmd_prefix = ('sudo %s exec -uroot {container}' %
|
||||
container_cmd_prefix = ('%s exec -uroot {container}' %
|
||||
containers.container_runtime_name)
|
||||
ovndb_sync_cmd = ('ovs-appctl -t /var/run/openvswitch/{ovndb_ctl_file} '
|
||||
'ovsdb-server/sync-status')
|
||||
|
@ -124,20 +124,22 @@ def test_ovn_dbs_are_synchronized():
|
|||
# obtain the container name
|
||||
container_name = sh.execute(
|
||||
search_container_cmd,
|
||||
ssh_client=ovn_master_node.ssh_client).stdout.splitlines()[0]
|
||||
ssh_client=ovn_master_node.ssh_client,
|
||||
sudo=True).stdout.splitlines()[0]
|
||||
for db in ('nb', 'sb'):
|
||||
# check its synchronization is active
|
||||
sync_cmd = (' '.join((container_cmd_prefix, ovndb_sync_cmd)).
|
||||
format(container=container_name,
|
||||
ovndb_ctl_file=ovndb_ctl_file_dict[db]))
|
||||
sync_status = sh.execute(sync_cmd,
|
||||
ssh_client=ovn_master_node.ssh_client).stdout
|
||||
ssh_client=ovn_master_node.ssh_client,
|
||||
sudo=True).stdout
|
||||
test_case.assertIn(expected_state_active_str, sync_status)
|
||||
# obtain nb and sb show output
|
||||
show_cmd = (' '.join((container_cmd_prefix, ovndb_show_cmd)).
|
||||
format(container=container_name, ovndb=ovndb_dict[db]))
|
||||
ovn_db_show = sh.execute(
|
||||
show_cmd, ssh_client=ovn_master_node.ssh_client).stdout
|
||||
show_cmd, ssh_client=ovn_master_node.ssh_client, sudo=True).stdout
|
||||
ovn_master_dbs_show_dict[db] = build_ovn_db_show_dict(ovn_db_show)
|
||||
|
||||
# ovn dbs are located on the controller nodes
|
||||
|
@ -147,7 +149,7 @@ def test_ovn_dbs_are_synchronized():
|
|||
continue
|
||||
container_name = sh.execute(
|
||||
search_container_cmd,
|
||||
ssh_client=node.ssh_client).stdout.splitlines()[0]
|
||||
ssh_client=node.ssh_client, sudo=True).stdout.splitlines()[0]
|
||||
# verify ovn nb and sb dbs are synchronized
|
||||
ovn_dbs_show_dict = {}
|
||||
for db in ('nb', 'sb'):
|
||||
|
@ -156,13 +158,14 @@ def test_ovn_dbs_are_synchronized():
|
|||
format(container=container_name,
|
||||
ovndb_ctl_file=ovndb_ctl_file_dict[db]))
|
||||
sync_status = sh.execute(sync_cmd,
|
||||
ssh_client=node.ssh_client).stdout
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout
|
||||
test_case.assertIn(expected_state_backup_str, sync_status)
|
||||
# obtain nb and sb show output
|
||||
show_cmd = (' '.join((container_cmd_prefix, ovndb_show_cmd)).
|
||||
format(container=container_name, ovndb=ovndb_dict[db]))
|
||||
ovn_db_show = sh.execute(
|
||||
show_cmd, ssh_client=node.ssh_client).stdout
|
||||
show_cmd, ssh_client=node.ssh_client, sudo=True).stdout
|
||||
ovn_dbs_show_dict[db] = build_ovn_db_show_dict(ovn_db_show)
|
||||
test_case.assertEqual(ovn_dbs_show_dict[db],
|
||||
ovn_master_dbs_show_dict[db])
|
||||
|
|
|
@ -41,8 +41,8 @@ def get_filtered_node_containers(node, containers_regex):
|
|||
# 'docker' is used here in order to be compatible with old OSP versions.
|
||||
# On versions with podman, 'docker' command is linked to 'podman'
|
||||
result = sh.execute(
|
||||
'sudo docker ps --format "{{.Names}}"',
|
||||
ssh_client=node.ssh_client)
|
||||
'docker ps --format "{{.Names}}"',
|
||||
ssh_client=node.ssh_client, sudo=True)
|
||||
all_node_containers = result.stdout.strip().split('\n')
|
||||
for container in all_node_containers:
|
||||
container = container.strip('"')
|
||||
|
@ -90,9 +90,9 @@ def get_config_files(node, kolla_jsons, conf_ignorelist, scripts_to_check):
|
|||
:rtype: list
|
||||
"""
|
||||
cmds = sh.execute(
|
||||
f"sudo jq '.command' {' '.join(kolla_jsons)}",
|
||||
f"jq '.command' {' '.join(kolla_jsons)}",
|
||||
ssh_client=node.ssh_client,
|
||||
expect_exit_status=None).stdout.strip().split('\n')
|
||||
expect_exit_status=None, sudo=True).stdout.strip().split('\n')
|
||||
LOG.debug(f'{node.name} run containers with commands {cmds}')
|
||||
config_files = set()
|
||||
for cmd in cmds:
|
||||
|
@ -101,11 +101,12 @@ def get_config_files(node, kolla_jsons, conf_ignorelist, scripts_to_check):
|
|||
LOG.debug(f'{cmd} is recognized as script to search '
|
||||
'for config files in')
|
||||
oc_script_location = sh.execute(
|
||||
f'sudo find /var/lib | grep {cmd} | grep -v overlay',
|
||||
ssh_client=node.ssh_client).stdout.strip().split('\n')[0]
|
||||
f'find /var/lib | grep {cmd} | grep -v overlay',
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout.strip().split('\n')[0]
|
||||
cmd = sh.execute(
|
||||
f'sudo cat {oc_script_location}',
|
||||
ssh_client=node.ssh_client).stdout.strip()
|
||||
f'cat {oc_script_location}',
|
||||
ssh_client=node.ssh_client, sudo=True).stdout.strip()
|
||||
cmd = cmd.strip('"')
|
||||
temp_conf_files = re.findall('--config-file [^ \n]*', cmd)
|
||||
for conf_file in temp_conf_files:
|
||||
|
@ -201,9 +202,10 @@ def get_node_logdir_from_pcs(node, container):
|
|||
if pcs_resource is None:
|
||||
return
|
||||
logdir = None
|
||||
pcs_rsrc_cmd = f'sudo pcs resource show {pcs_resource}'
|
||||
pcs_rsrc_cmd = f'pcs resource show {pcs_resource}'
|
||||
out_lines = sh.execute(pcs_rsrc_cmd,
|
||||
ssh_client=node.ssh_client).stdout.splitlines()
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout.splitlines()
|
||||
log_files_regex = re.compile(
|
||||
r'^\s*options=.*source-dir=(.*) target-dir=.*-log-files\)$')
|
||||
for line in out_lines:
|
||||
|
@ -221,9 +223,10 @@ def get_pacemaker_resource_logfiles(node, container):
|
|||
logfiles = []
|
||||
exclude_pid_files = 'ovn-controller.pid'
|
||||
resource = get_pacemaker_resource_from_container(container)
|
||||
pcs_rsrc_cmd = f'sudo pcs resource show {resource}'
|
||||
pcs_rsrc_cmd = f'pcs resource show {resource}'
|
||||
out_lines = sh.execute(pcs_rsrc_cmd,
|
||||
ssh_client=node.ssh_client).stdout.splitlines()
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout.splitlines()
|
||||
run_files_regex = re.compile(
|
||||
r'^\s*options=.*source-dir=(.*) target-dir=.*-run-files\)$')
|
||||
for line in out_lines:
|
||||
|
@ -234,12 +237,14 @@ def get_pacemaker_resource_logfiles(node, container):
|
|||
ssh_client=node.ssh_client).
|
||||
stdout.splitlines())
|
||||
break
|
||||
pids = sh.execute(f'sudo cat {" ".join(pid_files)}',
|
||||
ssh_client=node.ssh_client).stdout.splitlines()
|
||||
pids = sh.execute(f'cat {" ".join(pid_files)}',
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout.splitlines()
|
||||
for pid in pids:
|
||||
cmd_stdout = sh.execute(f'sudo docker exec -u root {container} '
|
||||
cmd_stdout = sh.execute(f'docker exec -u root {container} '
|
||||
f'cat /proc/{pid}/cmdline',
|
||||
ssh_client=node.ssh_client).stdout
|
||||
ssh_client=node.ssh_client,
|
||||
sudo=True).stdout
|
||||
for log_file in re.findall('--log-file=[^ \n\x00]*', cmd_stdout):
|
||||
logfiles.append(log_file.split('=')[1])
|
||||
return logfiles
|
||||
|
@ -266,15 +271,15 @@ def get_container_logfiles(node, container):
|
|||
:rtype: list
|
||||
"""
|
||||
cmd = sh.execute(
|
||||
f'sudo docker exec -u root {container} cat /run_command',
|
||||
ssh_client=node.ssh_client)
|
||||
f'docker exec -u root {container} cat /run_command',
|
||||
ssh_client=node.ssh_client, sudo=True)
|
||||
cmd_stdout = cmd.stdout.strip()
|
||||
if 'pacemaker_remoted' in cmd_stdout:
|
||||
return get_pacemaker_resource_logfiles(node, container)
|
||||
if ' ' not in cmd_stdout: # probably script as no space in the command
|
||||
cmd = sh.execute(
|
||||
f'sudo docker exec -u root {container} cat {cmd_stdout}',
|
||||
ssh_client=node.ssh_client)
|
||||
f'docker exec -u root {container} cat {cmd_stdout}',
|
||||
ssh_client=node.ssh_client, sudo=True)
|
||||
cmd_stdout = cmd.stdout.strip()
|
||||
LOG.debug(f'The following command is executed in {container} container '
|
||||
f'on {node.name} node:\n{cmd_stdout}')
|
||||
|
@ -325,8 +330,8 @@ def log_msg(node, container, logfile, msg):
|
|||
:type msg: string
|
||||
"""
|
||||
cmd = f"sh -c 'echo {msg} >> {logfile}'"
|
||||
sh.execute(f'sudo docker exec -u root {container} {cmd}',
|
||||
ssh_client=node.ssh_client)
|
||||
sh.execute(f'docker exec -u root {container} {cmd}',
|
||||
ssh_client=node.ssh_client, sudo=True)
|
||||
|
||||
|
||||
def find_msg_in_file(node, logfile, message, rotated=False):
|
||||
|
@ -349,9 +354,9 @@ def find_msg_in_file(node, logfile, message, rotated=False):
|
|||
else:
|
||||
suffix = ""
|
||||
LOG.debug(f'Searching for {message} in {logfile}{suffix} on {node.name}')
|
||||
result = sh.execute(f'sudo grep -h {message} {logfile}{suffix}',
|
||||
result = sh.execute(f'grep -h {message} {logfile}{suffix}',
|
||||
ssh_client=node.ssh_client,
|
||||
expect_exit_status=None)
|
||||
expect_exit_status=None, sudo=True)
|
||||
if result.stderr:
|
||||
tobiko.fail(f'Failed to read {logfile} on {node.name}:\n'
|
||||
f'{result.stderr}')
|
||||
|
@ -372,9 +377,9 @@ def rotate_logs(node):
|
|||
tobiko.skip('No logrotate container has been found')
|
||||
else:
|
||||
container = containers[0]
|
||||
sh.execute(f'sudo docker exec -u root {container} logrotate '
|
||||
sh.execute(f'docker exec -u root {container} logrotate '
|
||||
'-f /etc/logrotate-crond.conf',
|
||||
ssh_client=node.ssh_client)
|
||||
ssh_client=node.ssh_client, sudo=True)
|
||||
|
||||
|
||||
def has_docker():
|
||||
|
|
|
@ -271,8 +271,20 @@ def reset_all_compute_nodes(hard_reset=False):
|
|||
LOG.info('{} is up '.format(compute_checked))
|
||||
|
||||
|
||||
def reset_ovndb_master_resource():
|
||||
"""restart ovndb pacemaker resource"""
|
||||
def reset_ovndb_pcs_master_resource():
|
||||
"""restart ovndb pacemaker resource
|
||||
this method only restarts the resource running on the controller which is
|
||||
acting as Master"""
|
||||
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
|
||||
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
|
||||
ovn_db_pcs_master_resource_restart = (ovn_db_pcs_resource_restart + ' ' +
|
||||
node)
|
||||
disrupt_node(node, disrupt_method=ovn_db_pcs_master_resource_restart)
|
||||
|
||||
|
||||
def reset_ovndb_pcs_resource():
|
||||
"""restart ovndb pacemaker resource
|
||||
this method restarts the whole resource, i.e. on all the controller nodes"""
|
||||
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
|
||||
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
|
||||
disrupt_node(node, disrupt_method=ovn_db_pcs_resource_restart)
|
||||
|
@ -281,7 +293,7 @@ def reset_ovndb_master_resource():
|
|||
def reset_ovndb_master_container():
|
||||
"""get and restart the ovndb master container
|
||||
use of partial name : resource: ovn-dbs-bundle-0 =>
|
||||
container: ovn-dbs-bundle-podman-2"""
|
||||
container: ovn-dbs-bundle-podman-0 or ovn-dbs-bundle-docker-0"""
|
||||
node = pacemaker.get_overcloud_nodes_running_pcs_resource(
|
||||
resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
|
||||
resource = pacemaker.get_overcloud_resource(
|
||||
|
|
|
@ -96,9 +96,15 @@ class DisruptTripleoNodesTest(testtools.TestCase):
|
|||
# cloud_disruptions.network_undisrupt_controllers_non_main_vip()
|
||||
|
||||
@neutron.skip_unless_is_ovn()
|
||||
def test_reset_ovndb_master_resource(self):
|
||||
def test_reset_ovndb_pcs_master_resource(self):
|
||||
overcloud_health_checks()
|
||||
cloud_disruptions.reset_ovndb_master_resource()
|
||||
cloud_disruptions.reset_ovndb_pcs_master_resource()
|
||||
overcloud_health_checks()
|
||||
|
||||
@neutron.skip_unless_is_ovn()
|
||||
def test_reset_ovndb_pcs_resource(self):
|
||||
overcloud_health_checks()
|
||||
cloud_disruptions.reset_ovndb_pcs_resource()
|
||||
overcloud_health_checks()
|
||||
|
||||
@neutron.skip_unless_is_ovn()
|
||||
|
|
|
@ -519,6 +519,71 @@ class OvnControllerTest(BaseAgentTest):
|
|||
self.get_ovn_agents_from_containers()
|
||||
super(OvnControllerTest, self).setUp()
|
||||
|
||||
def kill_ovn_controller(self,
|
||||
hosts: typing.Optional[typing.List[str]] = None,
|
||||
timeout=60, interval=5):
|
||||
'''Stop OVN controller container by killing ovn-controller process
|
||||
running inside it
|
||||
|
||||
Docker/Podman service should restart it automatically
|
||||
|
||||
:param hosts: List of hostnames to stop agent on
|
||||
:type hosts: list of strings
|
||||
:param timeout: Time to wait for the OVN controller to recover
|
||||
:type timeout: int
|
||||
:param interval: Time to wait between attempts
|
||||
:type interval: int
|
||||
'''
|
||||
hosts = hosts or self.hosts
|
||||
self.assertNotEqual([], hosts, "Host list is empty")
|
||||
|
||||
if self.container_name == '':
|
||||
self.container_name = topology.get_agent_container_name(
|
||||
self.agent_name)
|
||||
|
||||
for host in hosts:
|
||||
ssh_client = topology.get_openstack_node(hostname=host).ssh_client
|
||||
pid = None
|
||||
for directory in ('ovn', 'openvswitch'):
|
||||
try:
|
||||
pid = sh.execute('docker exec -uroot '
|
||||
f'{self.container_name} cat '
|
||||
f'/run/{directory}/ovn-controller.pid',
|
||||
ssh_client=ssh_client,
|
||||
sudo=True).stdout.splitlines()[0]
|
||||
except sh.ShellCommandFailed:
|
||||
LOG.debug(f'/run/{directory}/ovn-controller.pid cannot '
|
||||
f'be accessed')
|
||||
else:
|
||||
LOG.debug(f'/run/{directory}/ovn-controller.pid returned '
|
||||
f'pid {pid}')
|
||||
break
|
||||
|
||||
self.assertIsNotNone(pid)
|
||||
LOG.debug(f'Killing process {pid} from container '
|
||||
f'{self.container_name} on host {host}')
|
||||
sh.execute(f'docker exec -uroot {self.container_name} '
|
||||
f'kill {pid}',
|
||||
ssh_client=ssh_client,
|
||||
sudo=True)
|
||||
LOG.debug(f'Container {self.container_name} has been killed '
|
||||
f"on host '{host}'...")
|
||||
# Schedule auto-restart of service at the end of this test case
|
||||
self.addCleanup(self.start_agent, hosts=[host, ])
|
||||
|
||||
# Verify the container is restarted automatically
|
||||
for attempt in tobiko.retry(timeout=timeout, interval=interval):
|
||||
search_running_ovn_cont = ("docker ps --format '{{.Names}}'"
|
||||
f" -f name={self.container_name}")
|
||||
output = sh.execute(search_running_ovn_cont,
|
||||
ssh_client=ssh_client,
|
||||
sudo=True).stdout.splitlines()
|
||||
|
||||
if self.container_name in output:
|
||||
LOG.debug(f'{self.container_name} successfully restarted')
|
||||
break
|
||||
attempt.check_limits()
|
||||
|
||||
def test_restart_ovn_controller(self):
|
||||
'''Test that OVN controller agents can be restarted successfully
|
||||
'''
|
||||
|
@ -528,6 +593,12 @@ class OvnControllerTest(BaseAgentTest):
|
|||
self.start_agent()
|
||||
ping.ping_until_received(self.stack.ip_address).assert_replied()
|
||||
|
||||
def test_kill_ovn_controller(self):
|
||||
'''Test that OVN controller container is restarted automatically after
|
||||
ovn-controller process running inside it was killed
|
||||
'''
|
||||
self.kill_ovn_controller()
|
||||
|
||||
|
||||
class MetadataAgentTest(BaseAgentTest):
|
||||
|
||||
|
|
|
@ -288,11 +288,11 @@ def run_container_config_validations():
|
|||
# versions. On versions with podman, 'docker' command is
|
||||
# linked to 'podman'
|
||||
obtained_param = sh.execute(
|
||||
"sudo docker exec -uroot "
|
||||
"docker exec -uroot "
|
||||
f"{config_check['container_name']} crudini "
|
||||
f"--get {config_check['config_file']} "
|
||||
f"{param_check['section']} {param_check['param']}",
|
||||
ssh_client=node.ssh_client).stdout.strip()
|
||||
ssh_client=node.ssh_client, sudo=True).stdout.strip()
|
||||
if param_check['expected_value'] not in obtained_param:
|
||||
tobiko.fail(f"Expected {param_check['param']} value: "
|
||||
f"{param_check['expected_value']}\n"
|
||||
|
|
Loading…
Reference in New Issue