Confirm node is rebooted when reboot command is sent

Some tobiko cloud disruption tests are unstable because, once the reboot command is sent to an overcloud node, that node's status is validated immediately. Sometimes the reboot has not been applied yet. With this patch, the node uptime is validated after the reboot command is sent, in order to guarantee that the reboot has been completed. Change-Id: I95607924c204744674f7fa08a74dc895ddc58f89
2020-10-19 12:10:25 +02:00 · 2020-10-19 12:10:25 +02:00 · 0cafacd668
parent 71e79acb59
commit 0cafacd668
3 changed files with 106 additions and 50 deletions
--- a/tobiko/shell/sh/init.py
+++ b/tobiko/shell/sh/init.py
@ -65,6 +65,8 @@ reboot_host = _reboot.reboot_host
 RebootHostError = _reboot.RebootHostError
 RebootHostOperation = _reboot.RebootHostOperation
 RebootHostTimeoutError = _reboot.RebootHostTimeoutError
+hard_reset_method = _reboot.hard_reset_method
+soft_reset_method = _reboot.soft_reset_method

 ssh_process = _ssh.ssh_process
 ssh_execute = _ssh.ssh_execute
--- a/tobiko/shell/sh/_reboot.py
+++ b/tobiko/shell/sh/_reboot.py
@ -19,13 +19,17 @@ from oslo_log import log

 import tobiko
 from tobiko.shell.sh import _exception
-from tobiko.shell.sh import _execute
 from tobiko.shell.sh import _uptime
 from tobiko.shell import ssh


 LOG = log.getLogger(__name__)

+hard_reset_method = 'sudo chmod o+w /proc/sysrq-trigger;' \
+               'sudo echo b > /proc/sysrq-trigger'
+
+soft_reset_method = 'sudo /sbin/reboot'
+

 class RebootHostError(tobiko.TobikoException):
    message = "host {hostname!r} not rebooted: {cause}"
@ -35,9 +39,10 @@ class RebootHostTimeoutError(RebootHostError):
    message = "host {hostname!r} not rebooted after {timeout!s} seconds"


-def reboot_host(ssh_client, wait: bool = True, timeout: tobiko.Seconds = None):
+def reboot_host(ssh_client, wait: bool = True, timeout: tobiko.Seconds = None,
+                method=soft_reset_method):
    reboot = RebootHostOperation(ssh_client=ssh_client, wait=wait,
-                                 timeout=timeout)
+                                 timeout=timeout, method=method)
    return tobiko.setup_fixture(reboot)


@ -58,27 +63,26 @@ class RebootHostOperation(tobiko.Operation):
    def __init__(self,
                 ssh_client: typing.Optional[ssh.SSHClientFixture] = None,
                 wait=True,
-                 timeout: tobiko.Seconds = None):
+                 timeout: tobiko.Seconds = None,
+                 method=soft_reset_method):
        super(RebootHostOperation, self).__init__()
        if ssh_client is not None:
            self._ssh_client = ssh_client
        tobiko.check_valid_type(self.ssh_client, ssh.SSHClientFixture)
        self.wait = bool(wait)
        self.timeout = tobiko.to_seconds(timeout)
+        self.method = method

    def run_operation(self):
        ssh_client = self.ssh_client
-        ssh_client.connect(connection_timeout=self.timeout)
        with ssh_client:
            self.hostname = ssh_client.hostname
            LOG.debug(f"Rebooting host '{self.hostname}'... ")
            self.is_rebooted = False
            self.start_time = tobiko.time()
            try:
-                _execute.execute('sudo /sbin/reboot',
-                                 stdout=False,
-                                 ssh_client=ssh_client,
-                                 timeout=30.)
+                ssh_client.connect(connection_timeout=self.timeout).\
+                    exec_command(self.method)
            except _exception.ShellTimeoutExpired as ex:
                LOG.debug(f"Reboot command timeout expired: {ex}")
        if self.wait:
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@ -21,11 +21,6 @@ from tobiko.tests.faults.ha import test_cloud_recovery

 LOG = log.getLogger(__name__)

-hard_reset_method = 'sudo chmod o+w /proc/sysrq-trigger;' \
-               'sudo echo b > /proc/sysrq-trigger'
-
-soft_reset_method = 'sudo reboot'
-
 network_disruption = """
 sudo iptables-save -f /root/working.iptables.rules &&
 sudo iptables -I INPUT 1 -m state --state RELATED,ESTABLISHED -j ACCEPT &&
@ -56,15 +51,12 @@ def network_undisrupt_node(node_name, disrupt_method=undisrupt_network):
    disrupt_node(node_name, disrupt_method=disrupt_method)


-def reset_node(node_name, disrupt_method=hard_reset_method):
-    disrupt_node(node_name, disrupt_method=disrupt_method)
-
-
-def disrupt_node(node_name, disrupt_method=hard_reset_method):
+def disrupt_node(node_name, disrupt_method=network_disruption):

    # reboot all controllers and wait for ssh Up on them
    # hard reset is simultaneous while soft is sequential
-    # method : method of disruptino to use : reset | network_disruption
+    # method : method of disruption to use : network_disruption |
+    # container_restart

    # using ssh_client.connect we use a fire and forget reboot method
    node = get_node(node_name)
@ -74,6 +66,19 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
    check_overcloud_node_responsive(node)


+def reboot_node(node_name, wait=True, reboot_method=sh.hard_reset_method):
+
+    # reboot a node and wait for ssh Up on them
+    # hard reset is simultaneous while soft is sequential
+    # method : method of disruption to use : reset | network_disruption
+
+    # using ssh_client.connect we use a fire and forget reboot method
+    node = get_node(node_name)
+    sh.reboot_host(ssh_client=node.ssh_client, wait=wait, method=reboot_method)
+    LOG.info('disrupt exec: {} on server: {}'.format(reboot_method,
+                                                     node.name))
+
+
 def check_overcloud_node_responsive(node):
    node_checked = sh.execute("hostname",
                              ssh_client=node.ssh_client,
@ -89,21 +94,21 @@ def network_disrupt_all_controller_nodes(disrupt_method=network_disruption,
                                 exclude_list=exclude_list)


-def reset_all_controller_nodes(disrupt_method=hard_reset_method,
+def reset_all_controller_nodes(disrupt_method=sh.hard_reset_method,
                               exclude_list=None):
    disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                 exclude_list=exclude_list)


-def reset_all_controller_nodes_sequentially(disrupt_method=hard_reset_method,
-                                            sequentially=True,
-                                            exclude_list=None):
+def reset_all_controller_nodes_sequentially(
+        disrupt_method=sh.hard_reset_method,
+        sequentially=True, exclude_list=None):
    disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                 sequentially=sequentially,
                                 exclude_list=exclude_list)


-def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
+def disrupt_all_controller_nodes(disrupt_method=sh.hard_reset_method,
                                 sequentially=False, exclude_list=None):
    # reboot all controllers and wait for ssh Up on them
    # method : method of disruptino to use : reset | network_disruption
@ -120,13 +125,44 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
        nodes = [node for node in nodes if node.name not in exclude_list]

    for controller in nodes:
-        # using ssh_client.connect we use a fire and forget reboot method
-        controller.ssh_client.connect().exec_command(disrupt_method)
-        LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
-                                                         controller.name))
-        tobiko.cleanup_fixture(controller.ssh_client)
-        if sequentially:
+        if disrupt_method in (sh.hard_reset_method, sh.soft_reset_method):
+            reboot_node(controller.name, wait=sequentially,
+                        reboot_method=disrupt_method)
+        else:
+            # using ssh_client.connect we use a fire and forget reboot method
+            controller.ssh_client.connect().exec_command(disrupt_method)
+            LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
+                                                             controller.name))
+            tobiko.cleanup_fixture(controller.ssh_client)
+            if sequentially:
+                check_overcloud_node_responsive(controller)
+    if not sequentially:
+        for controller in topology.list_openstack_nodes(group='controller'):
            check_overcloud_node_responsive(controller)
+
+
+def reboot_all_controller_nodes(reboot_method=sh.hard_reset_method,
+                                sequentially=False, exclude_list=None):
+    # reboot all controllers and wait for ssh Up on them
+    # method : method of disruptino to use : hard or soft reset
+    # hard reset is simultaneous while soft is sequential
+    # exclude_list = list of nodes to NOT reset
+
+    controlplane_groups = ['controller', 'messaging', 'database', 'networker']
+    actual_controlplane_groups = tripleo_topology.actual_node_groups(
+        controlplane_groups)
+    nodes = topology.list_openstack_nodes(group=actual_controlplane_groups)
+
+    # remove excluded nodes from reset list
+    if exclude_list:
+        nodes = [node for node in nodes if node.name not in exclude_list]
+
+    for controller in nodes:
+        sh.reboot_host(ssh_client=controller.ssh_client, wait=sequentially,
+                       method=reboot_method)
+        LOG.info('reboot exec: {} on server: {}'.format(reboot_method,
+                                                        controller.name))
+        tobiko.cleanup_fixture(controller.ssh_client)
    if not sequentially:
        for controller in topology.list_openstack_nodes(group='controller'):
            check_overcloud_node_responsive(controller)
@ -156,7 +192,7 @@ def delete_evacuable_tagged_image():
            glance.delete_image(img.id)


-def disrupt_controller_main_vip(disrupt_method=hard_reset_method,
+def disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method,
                                inverse=False):

    # reset the controller holding the main vip (os_auth_url)
@ -166,23 +202,31 @@ def disrupt_controller_main_vip(disrupt_method=hard_reset_method,

    # find the node holding that resource via :

-    main_vim_controller = get_main_vip_controller(main_vip)
+    main_vip_controller = get_main_vip_controller(main_vip)

-    if inverse:
-        # inverse the nodes reset selection
-        disrupt_all_controller_nodes(disrupt_method=disrupt_method,
-                                     exclude_list=[main_vim_controller])
+    if disrupt_method in (sh.hard_reset_method, sh.soft_reset_method):
+        if inverse:
+            reboot_all_controller_nodes(reboot_method=disrupt_method,
+                                        exclude_list=[main_vip_controller])
+        else:
+            reboot_node(main_vip_controller, reboot_method=disrupt_method)
    else:
-        # get that node's ssh_client and reset it
-        disrupt_node(main_vim_controller, disrupt_method=disrupt_method)
+        if inverse:
+            # inverse the nodes reset selection
+            disrupt_all_controller_nodes(disrupt_method=disrupt_method,
+                                         exclude_list=[main_vip_controller])
+        else:
+            # get that node's ssh_client and reset it
+            disrupt_node(main_vip_controller, disrupt_method=disrupt_method)


 def reset_controller_main_vip():
-    disrupt_controller_main_vip(disrupt_method=hard_reset_method)
+    disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method)


 def reset_controllers_non_main_vip():
-    disrupt_controller_main_vip(disrupt_method=hard_reset_method, inverse=True)
+    disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method,
+                                inverse=True)


 def network_disrupt_controller_main_vip():
@ -210,12 +254,13 @@ def reset_all_compute_nodes(hard_reset=False):
    # reboot all computes and wait for ssh Up on them
    # hard reset is simultaneous while soft is sequential
    if hard_reset:
-        reset_method = hard_reset_method
+        reset_method = sh.hard_reset_method
    else:
-        reset_method = soft_reset_method
+        reset_method = sh.soft_reset_method
    for compute in topology.list_openstack_nodes(group='compute'):
        # using ssh_client.connect we use a fire and forget reboot method
-        compute.ssh_client.connect().exec_command(reset_method)
+        sh.reboot_host(ssh_client=compute.ssh_client, wait=False,
+                       method=reset_method)
        LOG.info('reboot exec:  {} on server: {}'.format(reset_method,
                                                         compute.name))
        tobiko.cleanup_fixture(compute.ssh_client)
@ -228,7 +273,9 @@ def reset_all_compute_nodes(hard_reset=False):

 def reset_ovndb_master_resource():
    """restart ovndb pacemaker resource"""
-    disrupt_node('controller-0', disrupt_method=ovn_db_pcs_resource_restart)
+    node = pacemaker.get_overcloud_nodes_running_pcs_resource(
+        resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
+    disrupt_node(node, disrupt_method=ovn_db_pcs_resource_restart)


 def reset_ovndb_master_container():
@ -245,10 +292,13 @@ def reset_ovndb_master_container():
                                   container_host=node)


-def evac_failover_compute(compute_host, failover_type=hard_reset_method):
+def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
    """disrupt a compute, to trigger it's instance-HA evacuation
    failover_type=hard_reset_method etc.."""
-    reset_node(compute_host, disrupt_method=failover_type)
+    if failover_type in (sh.hard_reset_method, sh.soft_reset_method):
+        reboot_node(compute_host, reboot_method=failover_type)
+    else:
+        disrupt_node(compute_host, disrupt_method=failover_type)


 def check_iha_evacuation(failover_type=None, vm_type=None):
@ -291,12 +341,12 @@ def check_iha_evacuation(failover_type=None, vm_type=None):


 def check_iha_evacuation_evac_image_vm():
-    check_iha_evacuation(failover_type=hard_reset_method,
+    check_iha_evacuation(failover_type=sh.hard_reset_method,
                         vm_type='evac_image_vm')


 def check_iha_evacuation_hard_reset():
-    check_iha_evacuation(failover_type=hard_reset_method)
+    check_iha_evacuation(failover_type=sh.hard_reset_method)


 def check_iha_evacuation_network_disruption():
@ -304,4 +354,4 @@ def check_iha_evacuation_network_disruption():


 def check_iha_evacuation_hard_reset_shutoff_instance():
-    check_iha_evacuation(failover_type=hard_reset_method, vm_type='shutoff')
+    check_iha_evacuation(failover_type=sh.hard_reset_method, vm_type='shutoff')