Add clock skew status checking

Remove redundant 'get_ceph_health' function.
  Remove unused 'get_status' function.
  Make 'get_osd_tree' and 'get_osd_ids' invoke the 'run_on_remote' helper function.
  Add clock skew status checking to 'get_node_fqdns_w_clock_skew'.

Change-Id: I1db0a90233ab0e3395778f5e25869dd2a8d997b8
Closes-bug: #1462217
Author: Ivan Kliuk
Date:   2015-06-04 11:15:07 +00:00
parent 8f67d08d6e
commit 15c39e4933
2 changed files with 96 additions and 43 deletions
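A minimal usage sketch of how the reworked helpers compose. It is illustrative only: 'report_clock_skew' is a hypothetical wrapper, the import path is assumed to be fuelweb_test.helpers.ceph, and 'remote' is an SSHClient to a Ceph monitor node, as in the callers changed below.

    from fuelweb_test.helpers import ceph

    def report_clock_skew(remote):
        # Resolve FQDNs only once the health JSON actually reports a skew,
        # mirroring the guard added in get_node_fqdns_w_clock_skew itself.
        if ceph.is_clock_skew(remote):
            return ceph.get_node_fqdns_w_clock_skew(remote)
        return []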


@@ -12,8 +12,6 @@
# License for the specific language governing permissions and limitations
# under the License.
import json
from proboscis.asserts import assert_equal
from fuelweb_test import logger
@@ -72,12 +70,6 @@ def get_health(remote):
return run_on_remote(remote, cmd, jsonify=True)
def get_status(remote):
logger.debug("Checking Ceph cluster status on {0}".format(remote.host))
cmd = 'ceph status -f json'
return run_on_remote(remote, cmd, jsonify=True)
def get_monitor_node_fqdns(remote):
"""Returns node FQDNs with Ceph monitor service is running.
@@ -92,19 +84,33 @@ def get_monitor_node_fqdns(remote):
return fqdns
def get_node_fqdns_w_time_skew(remote):
"""Returns node FQDNs with a time skew.
def is_clock_skew(remote):
"""Checks whether clock skews across the monitor nodes.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
if is_health_warn(remote):
if 'clock skew' in ' '.join(health_detail(remote)):
return True
return False
def get_node_fqdns_w_clock_skew(remote):
"""Returns node FQDNs with a clock skew.
:param remote: devops.helpers.helpers.SSHClient
:return: list of FQDNs
"""
health = get_health(remote)
monitors = health['timechecks']['mons']
fqdns = []
for i in monitors:
if not is_clock_skew(remote):
return fqdns
for i in get_health(remote)['timechecks']['mons']:
if abs(float(i['skew'])) >= 0.05:
fqdns.append(i['name'] + DNS_SUFFIX)
logger.debug("Time skew is found on {0}".format(', '.join(fqdns)))
logger.debug("Clock skew is found on {0}".format(', '.join(fqdns)))
return fqdns
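For context, get_health returns the parsed JSON health report that the two functions above consume. Only the keys the code actually reads are sketched below; the values and exact layout are illustrative, not taken from a real cluster. The 0.05 cut-off matches Ceph's default mon_clock_drift_allowed of 0.05 seconds.

    # Illustrative shape of get_health(remote) output (only keys used above;
    # values invented):
    # {
    #     "overall_status": "HEALTH_WARN",
    #     "timechecks": {
    #         "mons": [
    #             {"name": "node-1", "skew": "0.000000"},
    #             {"name": "node-2", "skew": "0.086323"}
    #         ]
    #     },
    #     "detail": ["mon.node-2 ... clock skew ..."]
    # }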
@@ -139,27 +145,79 @@ def check_service_ready(remote, exit_code=0):
return False
# TODO(ivankliuk) Remove `get_ceph_health` function.
def get_ceph_health(remote):
return ''.join(remote.execute('ceph health')['stdout']).rstrip()
def health_overall_status(remote):
"""Returns Ceph health overall status.
Can be one of: 'HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR', ...
:param remote: devops.helpers.helpers.SSHClient
:return: str
"""
health = get_health(remote)
return health['overall_status']
def check_ceph_health(remote, health_status=('HEALTH_OK',)):
ceph_health = get_ceph_health(remote)
if all(x in ceph_health.split() for x in health_status):
def health_detail(remote):
"""Returns 'detail' section of Ceph health.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
health = get_health(remote)
return health['detail']
def is_health_ok(remote):
"""Checks whether Ceph health overall status is OK.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
return health_overall_status(remote) == 'HEALTH_OK'
def is_health_warn(remote):
"""Checks whether Ceph health overall status is WARN.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
return health_overall_status(remote) == 'HEALTH_WARN'
def is_pgs_recovering(remote):
"""Checks whether Ceph PGs are being recovered.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
keywords = ['degraded', 'recovery', 'osds', 'are', 'down']
detail = ' '.join(health_detail(remote))
if all(k in detail.split() for k in keywords):
return True
logger.debug('Ceph health {0} doesn\'t equal to {1}'.format(
ceph_health, ''.join(health_status)))
logger.debug('Ceph PGs are not being recovered. '
'Details: {0}'.format(detail))
return False
def get_osd_tree(remote):
# TODO(ivankliuk) `run_on_remote` function has to be used here.
"""Returns OSDs according to their position in the CRUSH map.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
logger.debug("Fetching Ceph OSD tree")
cmd = 'ceph osd tree -f json'
return json.loads(''.join(remote.execute(cmd)['stdout']))
return run_on_remote(remote, cmd, jsonify=True)
def get_osd_ids(remote):
# TODO(ivankliuk) `run_on_remote` function has to be used here.
"""Returns all OSD ids.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
logger.debug("Fetching Ceph OSD ids")
cmd = 'ceph osd ls -f json'
return json.loads(''.join(remote.execute(cmd)['stdout']))
return run_on_remote(remote, cmd, jsonify=True)
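Both OSD helpers now go through run_on_remote, which the dropped lines show is standing in for the manual json.loads(''.join(remote.execute(cmd)['stdout'])) pattern. A minimal sketch of the assumed behaviour; the real fuel-qa helper also checks the exit code and logs failures, and the name below is hypothetical.

    import json

    def run_on_remote_sketch(remote, cmd, jsonify=False):
        # Run the command over SSH, collect stdout, and optionally decode it
        # as JSON; roughly what the calls above rely on run_on_remote to do.
        result = remote.execute(cmd)
        if jsonify:
            return json.loads(''.join(result['stdout']))
        return result['stdout']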


@@ -1674,8 +1674,8 @@ class FuelWebClient(object):
logger.debug("Looking up nodes with a time skew and trying to fix them")
with self.environment.d_env.get_ssh_to_remote(
online_ceph_nodes[0]['ip']) as remote:
skewed = ceph.get_node_fqdns_w_time_skew(remote)
if skewed:
if ceph.is_clock_skew(remote):
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
logger.warning("Time on nodes {0} is to be "
"re-synchronized".format(skewed))
nodes_to_sync = [
@@ -1684,10 +1684,10 @@ class FuelWebClient(object):
self.environment.sync_time(nodes_to_sync)
try:
wait(lambda: not ceph.get_node_fqdns_w_time_skew(remote),
wait(lambda: not ceph.is_clock_skew(remote),
timeout=120)
except TimeoutError:
skewed = ceph.get_node_fqdns_w_time_skew(remote)
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
logger.error("Time on Ceph nodes {0} is still skewed. "
"Restarting Ceph monitor on these "
"nodes".format(', '.join(skewed)))
@@ -1703,8 +1703,7 @@ class FuelWebClient(object):
"on node %s", fqdn)
ceph.restart_monitor(remote_to_mon)
wait(lambda: not ceph.get_node_fqdns_w_time_skew(
remote), timeout=120)
wait(lambda: not ceph.is_clock_skew(remote), timeout=120)
@logwrap
def check_ceph_status(self, cluster_id, offline_nodes=(),
@@ -1714,8 +1713,6 @@ class FuelWebClient(object):
online_ceph_nodes = [
n for n in ceph_nodes if n['id'] not in offline_nodes]
osd_recovery_status = ['degraded', 'recovery', 'osds', 'are', 'down']
logger.info('Waiting until the Ceph service comes up...')
for node in online_ceph_nodes:
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
@@ -1733,26 +1730,24 @@ class FuelWebClient(object):
node = online_ceph_nodes[0]
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
health_status = ceph.get_ceph_health(remote)
if 'HEALTH_WARN' in health_status:
if ceph.check_ceph_health(remote, osd_recovery_status)\
and len(offline_nodes) > 0:
if not ceph.is_health_ok(remote):
if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0:
logger.info('Ceph is being recovered after osd node(s)'
' shutdown.')
try:
wait(lambda: ceph.check_ceph_health(remote),
wait(lambda: ceph.is_health_ok(remote),
interval=30, timeout=recovery_timeout)
except TimeoutError:
result = ceph.get_ceph_health(remote)
result = ceph.health_detail(remote)
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
node['name'], result)
logger.error(msg)
raise TimeoutError(msg)
else:
result = ceph.get_ceph_health(remote)
result = ceph.health_detail(remote)
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
node['name'], result)
assert_true(ceph.check_ceph_health(remote), msg)
assert_true(ceph.is_health_ok(remote), msg)
logger.info('Checking Ceph OSD Tree...')
ceph.check_disks(remote, [n['id'] for n in online_ceph_nodes])
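Condensed, the reworked branch of check_ceph_status above reduces to the sketch below. This is a paraphrase of the hunk, not code from the commit; 'wait' is assumed to come from devops.helpers.helpers and 'assert_true' from proboscis.asserts, as elsewhere in fuel-qa.

    from devops.helpers.helpers import wait
    from proboscis.asserts import assert_true
    from fuelweb_test.helpers import ceph

    def wait_ceph_ok(remote, offline_nodes, recovery_timeout):
        # Paraphrase of the new flow: tolerate a non-OK state only while PGs
        # are recovering after OSD nodes were taken offline.
        if ceph.is_health_ok(remote):
            return
        if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0:
            wait(lambda: ceph.is_health_ok(remote),
                 interval=30, timeout=recovery_timeout)
        else:
            assert_true(ceph.is_health_ok(remote),
                        'Ceph HEALTH is not OK. Details: {0}'.format(
                            ceph.health_detail(remote)))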
@@ -2072,7 +2067,7 @@ class FuelWebClient(object):
assert_true(ids, "osd ids for {} weren't found".format(hostname))
for id in ids:
remote_ceph.execute("ceph osd out {}".format(id))
wait(lambda: ceph.check_ceph_health(remote_ceph),
wait(lambda: ceph.is_health_ok(remote_ceph),
interval=30, timeout=10 * 60)
for id in ids:
if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE: