Add clock skew status checking
Remove redundant 'get_ceph_health' function. Remove unused 'get_status' function. Make 'get_osd_tree' and 'get_osd_ids' invoke 'run_on_remote' helper function. Add clock skew status checking to 'get_node_fqdns_w_clock_skew'. Change-Id: I1db0a90233ab0e3395778f5e25869dd2a8d997b8 Closes-bug: #1462217
This commit is contained in:
@@ -12,8 +12,6 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import json
|
||||
|
||||
from proboscis.asserts import assert_equal
|
||||
|
||||
from fuelweb_test import logger
|
||||
@@ -72,12 +70,6 @@ def get_health(remote):
|
||||
return run_on_remote(remote, cmd, jsonify=True)
|
||||
|
||||
|
||||
def get_status(remote):
    """Run ``ceph status -f json`` on the remote node.

    :param remote: devops.helpers.helpers.SSHClient
    :return: result of ``run_on_remote`` called with ``jsonify=True``
        (presumably the parsed JSON document -- confirm against the helper)
    """
    logger.debug("Checking Ceph cluster status on {0}".format(remote.host))
    return run_on_remote(remote, 'ceph status -f json', jsonify=True)
|
||||
|
||||
|
||||
def get_monitor_node_fqdns(remote):
|
||||
"""Returns node FQDNs with Ceph monitor service is running.
|
||||
|
||||
@@ -92,19 +84,33 @@ def get_monitor_node_fqdns(remote):
|
||||
return fqdns
|
||||
|
||||
|
||||
def get_node_fqdns_w_time_skew(remote):
|
||||
"""Returns node FQDNs with a time skew.
|
||||
def is_clock_skew(remote):
    """Tell whether Ceph currently reports a clock skew.

    The health detail is only fetched when the overall status is
    HEALTH_WARN; its entries are joined with spaces and searched for the
    substring 'clock skew'.

    :param remote: devops.helpers.helpers.SSHClient
    :return: bool
    """
    # Guard clause: any status other than HEALTH_WARN means no skew report.
    if not is_health_warn(remote):
        return False

    detail_text = ' '.join(health_detail(remote))
    return 'clock skew' in detail_text
|
||||
|
||||
|
||||
def get_node_fqdns_w_clock_skew(remote):
    """Returns node FQDNs with a clock skew.

    Nothing is looked up unless ``is_clock_skew`` reports a skew. Otherwise
    every monitor listed under ``health['timechecks']['mons']`` whose
    absolute ``skew`` is at least 0.05 is returned, with ``DNS_SUFFIX``
    appended to its name.

    :param remote: devops.helpers.helpers.SSHClient
    :return: list of FQDNs
    """
    if not is_clock_skew(remote):
        return []

    # NOTE(review): 0.05 presumably mirrors Ceph's default allowed monitor
    # clock drift in seconds -- confirm against the cluster configuration.
    fqdns = [
        mon['name'] + DNS_SUFFIX
        for mon in get_health(remote)['timechecks']['mons']
        if abs(float(mon['skew'])) >= 0.05
    ]
    logger.debug("Clock skew is found on {0}".format(', '.join(fqdns)))
    return fqdns
|
||||
|
||||
|
||||
@@ -139,27 +145,79 @@ def check_service_ready(remote, exit_code=0):
|
||||
return False
|
||||
|
||||
|
||||
# TODO(ivankliuk) Remove `get_ceph_health` function.
|
||||
def get_ceph_health(remote):
    """Return the output of ``ceph health`` as a single string.

    The 'stdout' chunks returned by ``remote.execute`` are concatenated
    and trailing whitespace is stripped.

    :param remote: devops.helpers.helpers.SSHClient
    :return: str
    """
    stdout_chunks = remote.execute('ceph health')['stdout']
    joined = ''.join(stdout_chunks)
    return joined.rstrip()
|
||||
def health_overall_status(remote):
    """Return the 'overall_status' field of the Ceph health report.

    Can be one of: 'HEALTH_OK', 'HEALTH_WARN', 'HEALTH_ERR', ...

    :param remote: devops.helpers.helpers.SSHClient
    :return: str
    """
    return get_health(remote)['overall_status']
|
||||
|
||||
|
||||
def check_ceph_health(remote, health_status=('HEALTH_OK',)):
|
||||
ceph_health = get_ceph_health(remote)
|
||||
if all(x in ceph_health.split() for x in health_status):
|
||||
def health_detail(remote):
    """Return the 'detail' field of the Ceph health report.

    :param remote: devops.helpers.helpers.SSHClient
    :return: JSON-like object
    """
    return get_health(remote)['detail']
|
||||
|
||||
|
||||
def is_health_ok(remote):
    """Tell whether the Ceph overall health status equals 'HEALTH_OK'.

    :param remote: devops.helpers.helpers.SSHClient
    :return: bool
    """
    status = health_overall_status(remote)
    return status == 'HEALTH_OK'
|
||||
|
||||
|
||||
def is_health_warn(remote):
    """Tell whether the Ceph overall health status equals 'HEALTH_WARN'.

    :param remote: devops.helpers.helpers.SSHClient
    :return: bool
    """
    status = health_overall_status(remote)
    return status == 'HEALTH_WARN'
|
||||
|
||||
|
||||
def is_pgs_recovering(remote):
    """Checks whether Ceph PGs are being recovered.

    The health detail entries are joined with spaces; recovery is assumed
    when every one of the keywords 'degraded', 'recovery', 'osds', 'are'
    and 'down' occurs as a whitespace-separated word in that text.

    :param remote: devops.helpers.helpers.SSHClient
    :return: bool
    """
    keywords = ['degraded', 'recovery', 'osds', 'are', 'down']
    detail = ' '.join(health_detail(remote))
    # Split once instead of once per keyword; a set gives O(1) membership.
    words = set(detail.split())
    if all(k in words for k in keywords):
        return True
    logger.debug('Ceph PGs are not being recovered. '
                 'Details: {0}'.format(detail))
    return False
|
||||
|
||||
|
||||
def get_osd_tree(remote):
    """Returns OSDs according to their position in the CRUSH map.

    Runs ``ceph osd tree -f json`` through the ``run_on_remote`` helper,
    as the other Ceph query helpers in this module do.

    :param remote: devops.helpers.helpers.SSHClient
    :return: JSON-like object
    """
    logger.debug("Fetching Ceph OSD tree")
    cmd = 'ceph osd tree -f json'
    return run_on_remote(remote, cmd, jsonify=True)
|
||||
|
||||
|
||||
def get_osd_ids(remote):
    """Returns all OSD ids.

    Runs ``ceph osd ls -f json`` through the ``run_on_remote`` helper,
    as the other Ceph query helpers in this module do.

    :param remote: devops.helpers.helpers.SSHClient
    :return: JSON-like object
    """
    logger.debug("Fetching Ceph OSD ids")
    cmd = 'ceph osd ls -f json'
    return run_on_remote(remote, cmd, jsonify=True)
|
||||
|
||||
@@ -1674,8 +1674,8 @@ class FuelWebClient(object):
|
||||
logger.debug("Looking up nodes with a time skew and try to fix them")
|
||||
with self.environment.d_env.get_ssh_to_remote(
|
||||
online_ceph_nodes[0]['ip']) as remote:
|
||||
skewed = ceph.get_node_fqdns_w_time_skew(remote)
|
||||
if skewed:
|
||||
if ceph.is_clock_skew(remote):
|
||||
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
|
||||
logger.warning("Time on nodes {0} are to be "
|
||||
"re-syncronized".format(skewed))
|
||||
nodes_to_sync = [
|
||||
@@ -1684,10 +1684,10 @@ class FuelWebClient(object):
|
||||
self.environment.sync_time(nodes_to_sync)
|
||||
|
||||
try:
|
||||
wait(lambda: not ceph.get_node_fqdns_w_time_skew(remote),
|
||||
wait(lambda: not ceph.is_clock_skew(remote),
|
||||
timeout=120)
|
||||
except TimeoutError:
|
||||
skewed = ceph.get_node_fqdns_w_time_skew(remote)
|
||||
skewed = ceph.get_node_fqdns_w_clock_skew(remote)
|
||||
logger.error("Time on Ceph nodes {0} is still skewed. "
|
||||
"Restarting Ceph monitor on these "
|
||||
"nodes".format(', '.join(skewed)))
|
||||
@@ -1703,8 +1703,7 @@ class FuelWebClient(object):
|
||||
"on node %s", fqdn)
|
||||
ceph.restart_monitor(remote_to_mon)
|
||||
|
||||
wait(lambda: not ceph.get_node_fqdns_w_time_skew(
|
||||
remote), timeout=120)
|
||||
wait(lambda: not ceph.is_clock_skew(remote), timeout=120)
|
||||
|
||||
@logwrap
|
||||
def check_ceph_status(self, cluster_id, offline_nodes=(),
|
||||
@@ -1714,8 +1713,6 @@ class FuelWebClient(object):
|
||||
online_ceph_nodes = [
|
||||
n for n in ceph_nodes if n['id'] not in offline_nodes]
|
||||
|
||||
osd_recovery_status = ['degraded', 'recovery', 'osds', 'are', 'down']
|
||||
|
||||
logger.info('Waiting until Ceph service become up...')
|
||||
for node in online_ceph_nodes:
|
||||
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
|
||||
@@ -1733,26 +1730,24 @@ class FuelWebClient(object):
|
||||
|
||||
node = online_ceph_nodes[0]
|
||||
remote = self.environment.d_env.get_ssh_to_remote(node['ip'])
|
||||
health_status = ceph.get_ceph_health(remote)
|
||||
if 'HEALTH_WARN' in health_status:
|
||||
if ceph.check_ceph_health(remote, osd_recovery_status)\
|
||||
and len(offline_nodes) > 0:
|
||||
if not ceph.is_health_ok(remote):
|
||||
if ceph.is_pgs_recovering(remote) and len(offline_nodes) > 0:
|
||||
logger.info('Ceph is being recovered after osd node(s)'
|
||||
' shutdown.')
|
||||
try:
|
||||
wait(lambda: ceph.check_ceph_health(remote),
|
||||
wait(lambda: ceph.is_health_ok(remote),
|
||||
interval=30, timeout=recovery_timeout)
|
||||
except TimeoutError:
|
||||
result = ceph.get_ceph_health(remote)
|
||||
result = ceph.health_detail(remote)
|
||||
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
|
||||
node['name'], result)
|
||||
logger.error(msg)
|
||||
raise TimeoutError(msg)
|
||||
else:
|
||||
result = ceph.get_ceph_health(remote)
|
||||
result = ceph.health_detail(remote)
|
||||
msg = 'Ceph HEALTH is not OK on {0}. Details: {1}'.format(
|
||||
node['name'], result)
|
||||
assert_true(ceph.check_ceph_health(remote), msg)
|
||||
assert_true(ceph.is_health_ok(remote), msg)
|
||||
|
||||
logger.info('Checking Ceph OSD Tree...')
|
||||
ceph.check_disks(remote, [n['id'] for n in online_ceph_nodes])
|
||||
@@ -2072,7 +2067,7 @@ class FuelWebClient(object):
|
||||
assert_true(ids, "osd ids for {} weren't found".format(hostname))
|
||||
for id in ids:
|
||||
remote_ceph.execute("ceph osd out {}".format(id))
|
||||
wait(lambda: ceph.check_ceph_health(remote_ceph),
|
||||
wait(lambda: ceph.is_health_ok(remote_ceph),
|
||||
interval=30, timeout=10 * 60)
|
||||
for id in ids:
|
||||
if OPENSTACK_RELEASE_UBUNTU in OPENSTACK_RELEASE:
|
||||
|
||||
Reference in New Issue
Block a user