From 1cdce0c844cb22e31c4b2237eccb28bed4a31164 Mon Sep 17 00:00:00 2001
From: Eduardo Olivares <eolivare@redhat.com>
Date: Thu, 17 Aug 2023 17:51:16 +0200
Subject: [PATCH] Use uptime instead to verify controllers reboot has been
 completed

Use uptime instead of hostname command to validate that a node has been
rebooted and the reboot is completed.

Related-patch: https://review.opendev.org/c/x/tobiko/+/888442
Change-Id: Ib8c625fb385d8dff514dd7f83de122ab872a14c6
---
 tobiko/tests/faults/ha/cloud_disruptions.py | 38 ++++++++++++++++++---
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/tobiko/tests/faults/ha/cloud_disruptions.py b/tobiko/tests/faults/ha/cloud_disruptions.py
index 88c48c6e5..1fb981760 100644
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@@ -107,18 +107,38 @@ def network_undisrupt_node(node_name, disrupt_method=undisrupt_network):
 
 
 def disrupt_node(node_name, disrupt_method=network_disruption):
-
     # reboot all controllers and wait for ssh Up on them
     # hard reset is simultaneous while soft is sequential
     # method : method of disruption to use : network_disruption |
     # container_restart
 
+    start_time = tobiko.time()
     # using ssh_client.connect we use a fire and forget reboot method
     node = tripleo_topology.get_node(node_name)
     node.ssh_client.connect().exec_command(disrupt_method)
     LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
                                                      node.name))
-    check_overcloud_node_responsive(node)
+
+    if isinstance(disrupt_method, sh.RebootHostMethod):
+        check_overcloud_node_uptime(node.ssh_client, start_time)
+    else:
+        check_overcloud_node_responsive(node)
+
+
+def check_overcloud_node_uptime(ssh_client, start_time):
+    for attempt in tobiko.retry(timeout=600., interval=10.):
+        try:
+            uptime = sh.get_uptime(ssh_client=ssh_client, timeout=15.)
+        except (sh.ShellCommandFailed,
+                sh.ShellTimeoutExpired,
+                sh.ShellProcessTerminated):
+            uptime = None
+
+        if uptime and uptime < (tobiko.time() - start_time):
+            LOG.debug('Reboot has been completed')
+            break
+        else:
+            attempt.check_limits()
 
 
 def reboot_node(node_name, wait=True, reboot_method=sh.hard_reset_method):
@@ -182,8 +202,10 @@ def disrupt_all_controller_nodes(disrupt_method=sh.hard_reset_method,
     if exclude_list:
         nodes = [node for node in nodes if node.name not in exclude_list]
 
+    start_time = {}
     for controller in nodes:
         if isinstance(disrupt_method, sh.RebootHostMethod):
+            start_time[controller.name] = tobiko.time()
             reboot_node(controller.name, wait=sequentially,
                         reboot_method=disrupt_method)
         else:
@@ -194,9 +216,14 @@ def disrupt_all_controller_nodes(disrupt_method=sh.hard_reset_method,
             tobiko.cleanup_fixture(controller.ssh_client)
         if sequentially:
             check_overcloud_node_responsive(controller)
+
     if not sequentially:
         for controller in nodes:
-            check_overcloud_node_responsive(controller)
+            if isinstance(disrupt_method, sh.RebootHostMethod):
+                check_overcloud_node_uptime(
+                    controller.ssh_client, start_time[controller.name])
+            else:
+                check_overcloud_node_responsive(controller)
 
 
 def reboot_all_controller_nodes(reboot_method=sh.hard_reset_method,
@@ -215,7 +242,9 @@ def reboot_all_controller_nodes(reboot_method=sh.hard_reset_method,
     if exclude_list:
         nodes = [node for node in nodes if node.name not in exclude_list]
 
+    start_time = {}
     for controller in nodes:
+        start_time[controller.name] = tobiko.time()
         sh.reboot_host(ssh_client=controller.ssh_client, wait=sequentially,
                        method=reboot_method)
         LOG.info('reboot exec: {} on server: {}'.format(reboot_method,
@@ -223,7 +252,8 @@ def reboot_all_controller_nodes(reboot_method=sh.hard_reset_method,
         tobiko.cleanup_fixture(controller.ssh_client)
     if not sequentially:
         for controller in nodes:
-            check_overcloud_node_responsive(controller)
+            check_overcloud_node_uptime(
+                controller.ssh_client, start_time[controller.name])
 
 
 def is_ipv6addr_main_vip():