Add clock skew status checking

Remove redundant 'get_ceph_health' function.
  Remove unused 'get_status' function.
  Make 'get_osd_tree' and 'get_osd_ids' invoke the 'run_on_remote' helper function.
  Add clock skew status checking to 'get_node_fqdns_w_clock_skew'.

Change-Id: I1db0a90233ab0e3395778f5e25869dd2a8d997b8
Closes-bug: #1462217
Author: Ivan Kliuk
Date:   2015-06-04 11:15:07 +00:00
parent 8f67d08d6e
commit 15c39e4933
2 changed files with 96 additions and 43 deletions
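A minimal usage sketch of how the reworked helpers compose. It is illustrative only: 'report_clock_skew' is a hypothetical wrapper, the import path is assumed to be fuelweb_test.helpers.ceph, and 'remote' is an SSHClient to a Ceph monitor node, as in the callers changed below.

    from fuelweb_test.helpers import ceph

    def report_clock_skew(remote):
        # Resolve FQDNs only once the health JSON actually reports a skew,
        # mirroring the guard added in get_node_fqdns_w_clock_skew itself.
        if ceph.is_clock_skew(remote):
            return ceph.get_node_fqdns_w_clock_skew(remote)
        return []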


@@ -12,8 +12,6 @@
# License for the specific language governing permissions and limitations
# under the License.
import json
from proboscis.asserts import assert_equal
from fuelweb_test import logger
@@ -72,12 +70,6 @@ def get_health(remote):
return run_on_remote(remote, cmd, jsonify=True)
def get_status(remote):
logger.debug("Checking Ceph cluster status on {0}".format(remote.host))
cmd = 'ceph status -f json'
return run_on_remote(remote, cmd, jsonify=True)
def get_monitor_node_fqdns(remote):
"""Returns node FQDNs with Ceph monitor service is running.
@@ -92,19 +84,33 @@ def get_monitor_node_fqdns(remote):
return fqdns
def get_node_fqdns_w_time_skew(remote):
"""Returns node FQDNs with a time skew.
def is_clock_skew(remote):
"""Checks whether clock skews across the monitor nodes.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
if is_health_warn(remote):
if 'clock skew' in ' '.join(health_detail(remote)):
return True
return False
def get_node_fqdns_w_clock_skew(remote):
"""Returns node FQDNs with a clock skew.
:param remote: devops.helpers.helpers.SSHClient
:return: list of FQDNs
"""
health = get_health(remote)
monitors = health['timechecks']['mons']
fqdns = []
for i in monitors:
if not is_clock_skew(remote):
return fqdns
for i in get_health(remote)['timechecks']['mons']:
if abs(float(i['skew'])) >= 0.05:
fqdns.append(i['name'] + DNS_SUFFIX)
logger.debug("Time skew is found on {0}".format(', '.join(fqdns)))
logger.debug("Clock skew is found on {0}".format(', '.join(fqdns)))
return fqdns
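For context, get_health returns the parsed JSON health report that the two functions above consume. Only the keys the code actually reads are sketched below; the values and exact layout are illustrative, not taken from a real cluster. The 0.05 cut-off matches Ceph's default mon_clock_drift_allowed of 0.05 seconds.

    # Illustrative shape of get_health(remote) output (only keys used above;
    # values invented):
    # {
    #     "overall_status": "HEALTH_WARN",
    #     "timechecks": {
    #         "mons": [
    #             {"name": "node-1", "skew": "0.000000"},
    #             {"name": "node-2", "skew": "0.086323"}
    #         ]
    #     },
    #     "detail": ["mon.node-2 ... clock skew ..."]
    # }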
@@ -139,27 +145,79 @@ def check_service_ready(remote, exit_code=0):
return False
# TODO(ivankliuk) Remove `get_ceph_health` function.
def get_ceph_health(remote):
return ''.join(remote.execute('ceph health')['stdout']).rstrip()
def health_overall_status(remote):
"""Returns Ceph health overall status.
Can be one of: 'HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR', ...
:param remote: devops.helpers.helpers.SSHClient
:return: str
"""
health = get_health(remote)
return health['overall_status']
def check_ceph_health(remote, health_status=('HEALTH_OK',)):
ceph_health = get_ceph_health(remote)
if all(x in ceph_health.split() for x in health_status):
def health_detail(remote):
"""Returns 'detail' section of Ceph health.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
health = get_health(remote)
return health['detail']
def is_health_ok(remote):
"""Checks whether Ceph health overall status is OK.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
return health_overall_status(remote) == 'HEALTH_OK'
def is_health_warn(remote):
"""Checks whether Ceph health overall status is WARN.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
return health_overall_status(remote) == 'HEALTH_WARN'
def is_pgs_recovering(remote):
"""Checks whether Ceph PGs are being recovered.
:param remote: devops.helpers.helpers.SSHClient
:return: bool
"""
keywords = ['degraded', 'recovery', 'osds', 'are', 'down']
detail = ' '.join(health_detail(remote))
if all(k in detail.split() for k in keywords):
return True
logger.debug('Ceph health {0} doesn\'t equal to {1}'.format(
ceph_health, ''.join(health_status)))
logger.debug('Ceph PGs are not being recovered. '
'Details: {0}'.format(detail))
return False
def get_osd_tree(remote):
# TODO(ivankliuk) `run_on_remote` function has to be used here.
"""Returns OSDs according to their position in the CRUSH map.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
logger.debug("Fetching Ceph OSD tree")
cmd = 'ceph osd tree -f json'
return json.loads(''.join(remote.execute(cmd)['stdout']))
return run_on_remote(remote, cmd, jsonify=True)
def get_osd_ids(remote):
# TODO(ivankliuk) `run_on_remote` function has to be used here.
"""Returns all OSD ids.
:param remote: devops.helpers.helpers.SSHClient
:return: JSON-like object
"""
logger.debug("Fetching Ceph OSD ids")
cmd = 'ceph osd ls -f json'
return json.loads(''.join(remote.execute(cmd)['stdout']))
return run_on_remote(remote, cmd, jsonify=True)
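Both OSD helpers now go through run_on_remote, which the dropped lines show is standing in for the manual json.loads(''.join(remote.execute(cmd)['stdout'])) pattern. A minimal sketch of the assumed behaviour; the real fuel-qa helper also checks the exit code and logs failures, and the name below is hypothetical.

    import json

    def run_on_remote_sketch(remote, cmd, jsonify=False):
        # Run the command over SSH, collect stdout, and optionally decode it
        # as JSON; roughly what the calls above rely on run_on_remote to do.
        result = remote.execute(cmd)
        if jsonify:
            return json.loads(''.join(result['stdout']))
        return result['stdout']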


@@ -1674,8 +1674,8 @@ class FuelWebClient(object):
logger.debug("Looking up nodes with a time skew and trying to fix them")
with self.environment.d_env.get_ssh_to_remote(
online_ceph_nodes[0]['ip']) as remote:
skewed = ceph.get_node_fqdns_w_time_skew(remote)
if skewed:
if ceph.is_clock_skew(remote):
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
logger.warning("Time on nodes {0} is to be "
"re-synchronized".format(skewed))
nodes_to_sync = [
@@ -1684,10 +1684,10 @@ class FuelWebClient(object):
self.environment.sync_time(nodes_to_sync)
try:
wait(lambda: not ceph.get_node_fqdns_w_time_skew(remote),
wait(lambda: not ceph.is_clock_skew(remote),
timeout=120)
except TimeoutError:
skewed = ceph.get_node_fqdns_w_time_skew(remote)
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
logger.error("Time on Ceph nodes {0} is still skewed. "
"Restarting Ceph monitor on these "
"nodes".format(', '.join(skewed)))
@@ -1703,8 +1703,7 @@ class FuelWebClient(object):
"on node %s", fqdn)
ceph.restart_monitor(remote_to_mon)
wait(lambda: not ceph.get_node_fqdns_w_time_skew(
remote), timeout=120)
wait(lambda: not ceph.is_clock_skew(remote), timeout=120)
@logwrap
def check_ceph_status(self, cluster_id, offline_nodes=(),
@@ -1714,8 +1713,6 @@ class FuelWebClient(object):
online_ceph_nodes = [
n for n in ceph_nodes if n['id'] not in offline_nodes]
osd_recovery_status = ['degraded', 'recovery', 'osds', 'are', 'down']
logger.info('Waiting until the Ceph service comes up...')
for node in online_ceph_nodes:
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
@@ -1733,26 +1730,24 @@ class FuelWebClient(object):
node = online_ceph_nodes[0]
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
health_status = ceph.get_ceph_health(remote)
if 'HEALTH_WARN' in health_status:
if ceph.check_ceph_health(remote, osd_recovery_status)\
and len(offline_nodes) > 0:
if not ceph.is_health_ok(remote):
if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0:
logger.info('Ceph is being recovered after osd node(s)'
' shutdown.')
try:
wait(lambda: ceph.check_ceph_health(remote),
wait(lambda: ceph.is_health_ok(remote),
interval=30, timeout=recovery_timeout)
except TimeoutError:
result = ceph.get_ceph_health(remote)
result = ceph.health_detail(remote)
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
node['name'], result)
logger.error(msg)
raise TimeoutError(msg)
else:
result = ceph.get_ceph_health(remote)
result = ceph.health_detail(remote)
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
node['name'], result)
assert_true(ceph.check_ceph_health(remote), msg)
assert_true(ceph.is_health_ok(remote), msg)
logger.info('Checking Ceph OSD Tree...')
ceph.check_disks(remote, [n['id'] for n in online_ceph_nodes])
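Condensed, the reworked branch of check_ceph_status above reduces to the sketch below. This is a paraphrase of the hunk, not code from the commit; 'wait' is assumed to come from devops.helpers.helpers and 'assert_true' from proboscis.asserts, as elsewhere in fuel-qa.

    from devops.helpers.helpers import wait
    from proboscis.asserts import assert_true
    from fuelweb_test.helpers import ceph

    def wait_ceph_ok(remote, offline_nodes, recovery_timeout):
        # Paraphrase of the new flow: tolerate a non-OK state only while PGs
        # are recovering after OSD nodes were taken offline.
        if ceph.is_health_ok(remote):
            return
        if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0:
            wait(lambda: ceph.is_health_ok(remote),
                 interval=30, timeout=recovery_timeout)
        else:
            assert_true(ceph.is_health_ok(remote),
                        'Ceph HEALTH is not OK. Details: {0}'.format(
                            ceph.health_detail(remote)))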
@@ -2072,7 +2067,7 @@ class FuelWebClient(object):
assert_true(ids, "osd ids for {} weren't found".format(hostname))
for id in ids:
remote_ceph.execute("ceph osd out {}".format(id))
wait(lambda: ceph.check_ceph_health(remote_ceph),
wait(lambda: ceph.is_health_ok(remote_ceph),
interval=30, timeout=10 * 60)
for id in ids:
if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE: