From 15c39e4933319f72648aad9f4abe62b68ffa57c2 Mon Sep 17 00:00:00 2001
From: Ivan Kliuk
Date: Thu, 4 Jun 2015 11:15:07 +0000
Subject: [PATCH] Add clock skew status checking

Remove redundant 'get_ceph_health' function.
Remove unused 'get_status' function.
Make 'get_osd_tree' and 'get_osd_ids' invoke the 'run_on_remote'
helper function.
Add clock skew status checking to 'get_node_fqdns_w_clock_skew'.
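
Example of how the new helpers fit together (an illustrative sketch
only, not part of the diff below; 'remote' is assumed to be an
established devops.helpers.helpers.SSHClient, and 'sync_time_on' is a
hypothetical stand-in for the environment's time re-sync routine):

    from fuelweb_test.helpers import ceph

    # Cheap check first: only query per-monitor skew details when the
    # cluster actually reports a clock skew health warning.
    if ceph.is_clock_skew(remote):
        skewed = ceph.get_node_fqdns_w_clock_skew(remote)
        sync_time_on(skewed)  # hypothetical: re-sync time on these nodes
        assert not ceph.is_clock_skew(remote)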
Change-Id: I1db0a90233ab0e3395778f5e25869dd2a8d997b8
Closes-bug: #1462217
---
 fuelweb_test/helpers/ceph.py           | 110 +++++++++++++++++++------
 fuelweb_test/models/fuel_web_client.py |  29 +++----
 2 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/fuelweb_test/helpers/ceph.py b/fuelweb_test/helpers/ceph.py
index 61539e4a5..5bd854118 100644
--- a/fuelweb_test/helpers/ceph.py
+++ b/fuelweb_test/helpers/ceph.py
@@ -12,8 +12,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-import json
-
 from proboscis.asserts import assert_equal
 
 from fuelweb_test import logger
@@ -72,12 +70,6 @@ def get_health(remote):
     return run_on_remote(remote, cmd, jsonify=True)
 
 
-def get_status(remote):
-    logger.debug("Checking Ceph cluster status on {0}".format(remote.host))
-    cmd = 'ceph status -f json'
-    return run_on_remote(remote, cmd, jsonify=True)
-
-
 def get_monitor_node_fqdns(remote):
     """Returns node FQDNs with Ceph monitor service is running.
 
@@ -92,19 +84,33 @@ def get_monitor_node_fqdns(remote):
     return fqdns
 
 
-def get_node_fqdns_w_time_skew(remote):
-    """Returns node FQDNs with a time skew.
+def is_clock_skew(remote):
+    """Checks whether there is a clock skew across the monitor nodes.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    if is_health_warn(remote):
+        if 'clock skew' in ' '.join(health_detail(remote)):
+            return True
+
+    return False
+
+
+def get_node_fqdns_w_clock_skew(remote):
+    """Returns node FQDNs with a clock skew.
 
     :param remote: devops.helpers.helpers.SSHClient
     :return: list of FQDNs
     """
-    health = get_health(remote)
-    monitors = health['timechecks']['mons']
     fqdns = []
-    for i in monitors:
+    if not is_clock_skew(remote):
+        return fqdns
+
+    for i in get_health(remote)['timechecks']['mons']:
         if abs(float(i['skew'])) >= 0.05:
             fqdns.append(i['name'] + DNS_SUFFIX)
-    logger.debug("Time skew is found on {0}".format(', '.join(fqdns)))
+    logger.debug("Clock skew found on {0}".format(', '.join(fqdns)))
     return fqdns
 
 
@@ -139,27 +145,79 @@ def check_service_ready(remote, exit_code=0):
     return False
 
 
-# TODO(ivankliuk) Remove `get_ceph_health` function.
-def get_ceph_health(remote):
-    return ''.join(remote.execute('ceph health')['stdout']).rstrip()
+def health_overall_status(remote):
+    """Returns Ceph health overall status.
+
+    Can be one of: 'HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR', ...
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: str
+
+    """
+    health = get_health(remote)
+    return health['overall_status']
 
 
-def check_ceph_health(remote, health_status=('HEALTH_OK',)):
-    ceph_health = get_ceph_health(remote)
-    if all(x in ceph_health.split() for x in health_status):
+def health_detail(remote):
+    """Returns 'detail' section of Ceph health.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+
+    """
+    health = get_health(remote)
+    return health['detail']
+
+
+def is_health_ok(remote):
+    """Checks whether Ceph health overall status is OK.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    return health_overall_status(remote) == 'HEALTH_OK'
+
+
+def is_health_warn(remote):
+    """Checks whether Ceph health overall status is WARN.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    return health_overall_status(remote) == 'HEALTH_WARN'
+
+
+def is_pgs_recovering(remote):
+    """Checks whether Ceph PGs are being recovered.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    keywords = ['degraded', 'recovery', 'osds', 'are', 'down']
+    detail = ' '.join(health_detail(remote))
+    if all(k in detail.split() for k in keywords):
         return True
-    logger.debug('Ceph health {0} doesn\'t equal to {1}'.format(
-        ceph_health, ''.join(health_status)))
+    logger.debug('Ceph PGs are not being recovered. '
+                 'Details: {0}'.format(detail))
     return False
 
 
 def get_osd_tree(remote):
-    # TODO(ivankliuk) `run_on_remote` function has to be used here.
+    """Returns OSDs according to their position in the CRUSH map.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+    """
+    logger.debug("Fetching Ceph OSD tree")
     cmd = 'ceph osd tree -f json'
-    return json.loads(''.join(remote.execute(cmd)['stdout']))
+    return run_on_remote(remote, cmd, jsonify=True)
 
 
 def get_osd_ids(remote):
-    # TODO(ivankliuk) `run_on_remote` function has to be used here.
+    """Returns all OSD ids.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+    """
+    logger.debug("Fetching Ceph OSD ids")
     cmd = 'ceph osd ls -f json'
-    return json.loads(''.join(remote.execute(cmd)['stdout']))
+    return run_on_remote(remote, cmd, jsonify=True)
diff --git a/fuelweb_test/models/fuel_web_client.py b/fuelweb_test/models/fuel_web_client.py
index 9be9fb773..4c0a8081f 100644
--- a/fuelweb_test/models/fuel_web_client.py
+++ b/fuelweb_test/models/fuel_web_client.py
@@ -1674,8 +1674,8 @@ class FuelWebClient(object):
         logger.debug("Looking up nodes with a time skew and try to fix them")
         with self.environment.d_env.get_ssh_to_remote(
                 online_ceph_nodes[0]['ip']) as remote:
-            skewed = ceph.get_node_fqdns_w_time_skew(remote)
-            if skewed:
+            if ceph.is_clock_skew(remote):
+                skewed = ceph.get_node_fqdns_w_clock_skew(remote)
                 logger.warning("Time on nodes {0} are to be "
                                "re-syncronized".format(skewed))
                 nodes_to_sync = [
@@ -1684,10 +1684,10 @@
                 self.environment.sync_time(nodes_to_sync)
 
             try:
-                wait(lambda: not ceph.get_node_fqdns_w_time_skew(remote),
+                wait(lambda: not ceph.is_clock_skew(remote),
                      timeout=120)
             except TimeoutError:
-                skewed = ceph.get_node_fqdns_w_time_skew(remote)
+                skewed = ceph.get_node_fqdns_w_clock_skew(remote)
                 logger.error("Time on Ceph nodes {0} is still skewed. "
" "Restarting Ceph monitor on these " "nodes".format(', '.join(skewed))) @@ -1703,8 +1703,7 @@ class FuelWebClient(object): "on node %s", fqdn) ceph.restart_monitor(remote_to_mon) - wait(lambda: not ceph.get_node_fqdns_w_time_skew( - remote), timeout=120) + wait(lambda: not ceph.is_clock_skew(remote), timeout=120) @logwrap def check_ceph_status(self, cluster_id, offline_nodes=(), @@ -1714,8 +1713,6 @@ class FuelWebClient(object): online_ceph_nodes = [ n for n in ceph_nodes if n['id'] not in offline_nodes] - osd_recovery_status = ['degraded', 'recovery', 'osds', 'are', 'down'] - logger.info('Waiting until Ceph service become up...') for node in online_ceph_nodes: remote = self.environment.d_env.get_ssh_to_remote(node['ip']) @@ -1733,26 +1730,24 @@ class FuelWebClient(object): node = online_ceph_nodes[0] remote = self.environment.d_env.get_ssh_to_remote(node['ip']) - health_status = ceph.get_ceph_health(remote) - if 'HEALTH_WARN' in health_status: - if ceph.check_ceph_health(remote, osd_recovery_status)\ - and len(offline_nodes) > 0: + if not ceph.is_health_ok(remote): + if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0: logger.info('Ceph is being recovered after osd node(s)' ' shutdown.') try: - wait(lambda: ceph.check_ceph_health(remote), + wait(lambda: ceph.is_health_ok(remote), interval=30, timeout=recovery_timeout) except TimeoutError: - result = ceph.get_ceph_health(remote) + result = ceph.health_detail(remote) msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format( node['name'], result) logger.error(msg) raise TimeoutError(msg) else: - result = ceph.get_ceph_health(remote) + result = ceph.health_detail(remote) msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format( node['name'], result) - assert_true(ceph.check_ceph_health(remote), msg) + assert_true(ceph.is_health_ok(remote), msg) logger.info('Checking Ceph OSD Tree...') ceph.check_disks(remote, [n['id'] for n in online_ceph_nodes]) @@ -2072,7 +2067,7 @@ class FuelWebClient(object): assert_true(ids, "osd ids for {} weren't found".format(hostname)) for id in ids: remote_ceph.execute("ceph osd out {}".format(id)) - wait(lambda: ceph.check_ceph_health(remote_ceph), + wait(lambda: ceph.is_health_ok(remote_ceph), interval=30, timeout=10 * 60) for id in ids: if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE: