From 15c39e4933319f72648aad9f4abe62b68ffa57c2 Mon Sep 17 00:00:00 2001
From: Ivan Kliuk
Date: Thu, 4 Jun 2015 11:15:07 +0000
Subject: [PATCH] Add clock skew status checking

Remove redundant 'get_ceph_health' function.
Remove unused 'get_status' function.
Make 'get_osd_tree' and 'get_osd_ids' invoke the 'run_on_remote'
helper function.
Add clock skew status checking to 'get_node_fqdns_w_clock_skew'.
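
Example of how the new helpers fit together (an illustrative sketch
only, not part of the diff below; 'remote' is assumed to be an
established devops.helpers.helpers.SSHClient, and 'sync_time_on' is a
hypothetical stand-in for the environment's time re-sync routine):

    from fuelweb_test.helpers import ceph

    # Cheap check first: only query per-monitor skew details when the
    # cluster actually reports a clock skew health warning.
    if ceph.is_clock_skew(remote):
        skewed = ceph.get_node_fqdns_w_clock_skew(remote)
        sync_time_on(skewed)  # hypothetical: re-sync time on these nodes
        assert not ceph.is_clock_skew(remote)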
Change-Id: I1db0a90233ab0e3395778f5e25869dd2a8d997b8
Closes-bug: #1462217
---
 fuelweb_test/helpers/ceph.py           | 110 +++++++++++++++++++------
 fuelweb_test/models/fuel_web_client.py |  29 +++----
 2 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/fuelweb_test/helpers/ceph.py b/fuelweb_test/helpers/ceph.py
index 61539e4a5..5bd854118 100644
--- a/fuelweb_test/helpers/ceph.py
+++ b/fuelweb_test/helpers/ceph.py
@@ -12,8 +12,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-import json
-
 from proboscis.asserts import assert_equal
 
 from fuelweb_test import logger
@@ -72,12 +70,6 @@ def get_health(remote):
     return run_on_remote(remote, cmd, jsonify=True)
 
 
-def get_status(remote):
-    logger.debug("Checking Ceph cluster status on {0}".format(remote.host))
-    cmd = 'ceph status -f json'
-    return run_on_remote(remote, cmd, jsonify=True)
-
-
 def get_monitor_node_fqdns(remote):
     """Returns node FQDNs with Ceph monitor service is running.
 
@@ -92,19 +84,33 @@ def get_monitor_node_fqdns(remote):
     return fqdns
 
 
-def get_node_fqdns_w_time_skew(remote):
-    """Returns node FQDNs with a time skew.
+def is_clock_skew(remote):
+    """Checks whether there is a clock skew across the monitor nodes.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    if is_health_warn(remote):
+        if 'clock skew' in ' '.join(health_detail(remote)):
+            return True
+
+    return False
+
+
+def get_node_fqdns_w_clock_skew(remote):
+    """Returns node FQDNs with a clock skew.
 
     :param remote: devops.helpers.helpers.SSHClient
     :return: list of FQDNs
     """
-    health = get_health(remote)
-    monitors = health['timechecks']['mons']
     fqdns = []
-    for i in monitors:
+    if not is_clock_skew(remote):
+        return fqdns
+
+    for i in get_health(remote)['timechecks']['mons']:
         if abs(float(i['skew'])) >= 0.05:
             fqdns.append(i['name'] + DNS_SUFFIX)
-    logger.debug("Time skew is found on {0}".format(', '.join(fqdns)))
+    logger.debug("Clock skew found on {0}".format(', '.join(fqdns)))
     return fqdns
 
 
@@ -139,27 +145,79 @@ def check_service_ready(remote, exit_code=0):
     return False
 
 
-# TODO(ivankliuk) Remove `get_ceph_health` function.
-def get_ceph_health(remote):
-    return ''.join(remote.execute('ceph health')['stdout']).rstrip()
+def health_overall_status(remote):
+    """Returns Ceph health overall status.
+
+    Can be one of: 'HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR', ...
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: str
+
+    """
+    health = get_health(remote)
+    return health['overall_status']
 
 
-def check_ceph_health(remote, health_status=('HEALTH_OK',)):
-    ceph_health = get_ceph_health(remote)
-    if all(x in ceph_health.split() for x in health_status):
+def health_detail(remote):
+    """Returns 'detail' section of Ceph health.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+
+    """
+    health = get_health(remote)
+    return health['detail']
+
+
+def is_health_ok(remote):
+    """Checks whether Ceph health overall status is OK.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    return health_overall_status(remote) == 'HEALTH_OK'
+
+
+def is_health_warn(remote):
+    """Checks whether Ceph health overall status is WARN.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    return health_overall_status(remote) == 'HEALTH_WARN'
+
+
+def is_pgs_recovering(remote):
+    """Checks whether Ceph PGs are being recovered.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: bool
+    """
+    keywords = ['degraded', 'recovery', 'osds', 'are', 'down']
+    detail = ' '.join(health_detail(remote))
+    if all(k in detail.split() for k in keywords):
         return True
-    logger.debug('Ceph health {0} doesn\'t equal to {1}'.format(
-        ceph_health, ''.join(health_status)))
+    logger.debug('Ceph PGs are not being recovered. '
+                 'Details: {0}'.format(detail))
     return False
 
 
 def get_osd_tree(remote):
-    # TODO(ivankliuk) `run_on_remote` function has to be used here.
+    """Returns OSDs according to their position in the CRUSH map.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+    """
+    logger.debug("Fetching Ceph OSD tree")
     cmd = 'ceph osd tree -f json'
-    return json.loads(''.join(remote.execute(cmd)['stdout']))
+    return run_on_remote(remote, cmd, jsonify=True)
 
 
 def get_osd_ids(remote):
-    # TODO(ivankliuk) `run_on_remote` function has to be used here.
+    """Returns all OSD ids.
+
+    :param remote: devops.helpers.helpers.SSHClient
+    :return: JSON-like object
+    """
+    logger.debug("Fetching Ceph OSD ids")
     cmd = 'ceph osd ls -f json'
-    return json.loads(''.join(remote.execute(cmd)['stdout']))
+    return run_on_remote(remote, cmd, jsonify=True)
diff --git a/fuelweb_test/models/fuel_web_client.py b/fuelweb_test/models/fuel_web_client.py
index 9be9fb773..4c0a8081f 100644
--- a/fuelweb_test/models/fuel_web_client.py
+++ b/fuelweb_test/models/fuel_web_client.py
@@ -1674,8 +1674,8 @@ class FuelWebClient(object):
         logger.debug("Looking up nodes with a time skew and try to fix them")
         with self.environment.d_env.get_ssh_to_remote(
                 online_ceph_nodes[0]['ip']) as remote:
-            skewed = ceph.get_node_fqdns_w_time_skew(remote)
-            if skewed:
+            if ceph.is_clock_skew(remote):
+                skewed = ceph.get_node_fqdns_w_clock_skew(remote)
                 logger.warning("Time on nodes {0} are to be "
                                "re-syncronized".format(skewed))
                 nodes_to_sync = [
@@ -1684,10 +1684,10 @@
                 self.environment.sync_time(nodes_to_sync)
 
             try:
-                wait(lambda: not ceph.get_node_fqdns_w_time_skew(remote),
+                wait(lambda: not ceph.is_clock_skew(remote),
                      timeout=120)
             except TimeoutError:
-                skewed = ceph.get_node_fqdns_w_time_skew(remote)
+                skewed = ceph.get_node_fqdns_w_clock_skew(remote)
                 logger.error("Time on Ceph nodes {0} is still skewed. "
" "Restarting Ceph monitor on these " "nodes".format(', '.join(skewed))) @@ -1703,8 +1703,7 @@ class FuelWebClient(object): "on node %s", fqdn) ceph.restart_monitor(remote_to_mon) - wait(lambda: not ceph.get_node_fqdns_w_time_skew( - remote), timeout=120) + wait(lambda: not ceph.is_clock_skew(remote), timeout=120) @logwrap def check_ceph_status(self, cluster_id, offline_nodes=(), @@ -1714,8 +1713,6 @@ class FuelWebClient(object): online_ceph_nodes = [ n for n in ceph_nodes if n['id'] not in offline_nodes] - osd_recovery_status = ['degraded', 'recovery', 'osds', 'are', 'down'] - logger.info('Waiting until Ceph service become up...') for node in online_ceph_nodes: remote = self.environment.d_env.get_ssh_to_remote(node['ip']) @@ -1733,26 +1730,24 @@ class FuelWebClient(object): node = online_ceph_nodes[0] remote = self.environment.d_env.get_ssh_to_remote(node['ip']) - health_status = ceph.get_ceph_health(remote) - if 'HEALTH_WARN' in health_status: - if ceph.check_ceph_health(remote, osd_recovery_status)\ - and len(offline_nodes) > 0: + if not ceph.is_health_ok(remote): + if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0: logger.info('Ceph is being recovered after osd node(s)' ' shutdown.') try: - wait(lambda: ceph.check_ceph_health(remote), + wait(lambda: ceph.is_health_ok(remote), interval=30, timeout=recovery_timeout) except TimeoutError: - result = ceph.get_ceph_health(remote) + result = ceph.health_detail(remote) msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format( node['name'], result) logger.error(msg) raise TimeoutError(msg) else: - result = ceph.get_ceph_health(remote) + result = ceph.health_detail(remote) msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format( node['name'], result) - assert_true(ceph.check_ceph_health(remote), msg) + assert_true(ceph.is_health_ok(remote), msg) logger.info('Checking Ceph OSD Tree...') ceph.check_disks(remote, [n['id'] for n in online_ceph_nodes]) @@ -2072,7 +2067,7 @@ class FuelWebClient(object): assert_true(ids, "osd ids for {} weren't found".format(hostname)) for id in ids: remote_ceph.execute("ceph osd out {}".format(id)) - wait(lambda: ceph.check_ceph_health(remote_ceph), + wait(lambda: ceph.is_health_ok(remote_ceph), interval=30, timeout=10 * 60) for id in ids: if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE: