260 lines
10 KiB
Python
260 lines
10 KiB
Python
import json
|
|
import logging
|
|
import time
|
|
|
|
import requests
|
|
from requests.packages.urllib3 import exceptions
|
|
import six
|
|
import warnings
|
|
|
|
import monasca_agent.collector.checks as checks
|
|
|
|
|
|
# Module-level logger, named after this module, used by the check class,
# the retry decorator and the API client below.
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
class SolidFire(checks.AgentCheck):
    """SolidFire plugin for reporting cluster metrics. Reference the general
    plugin documentation for metric specifics.
    """

    def __init__(self, name, init_config, agent_config):
        super(SolidFire, self).__init__(name, init_config, agent_config)
        # API client; a fresh SolidFireLib is created on every check() run.
        self.sf = None
        self.instance = None
        # Value of the instance's 'name' setting; assigned by check().
        self.cluster = None

    def check(self, instance):
        """Pull down cluster stats and report each one as a gauge.

        :param instance: instance configuration mapping. 'mvip', 'username'
            and 'password' are required; 'name' and 'port' are optional.
        :raises Exception: if a required setting is missing, or if one of
            the API requests made through SolidFireLib fails.
        """
        self.cluster = instance.get('name')
        dimensions = {'service': 'solidfire',
                      'cluster': self.cluster}

        # Extract cluster auth information
        auth = self._pull_auth(instance)
        self.sf = SolidFireLib(auth)

        data = {}
        # Query cluster for stats
        data.update(self._get_cluster_stats())
        # Query for active cluster faults.
        data.update(self._list_cluster_faults())
        # Query for cluster capacity info
        data.update(self._get_cluster_capacity())

        # Dump data upstream, skipping anything the cluster did not report.
        num_metrics = 0
        for key, value in data.items():
            if value is None:
                continue
            self.gauge(key, value, dimensions)
            num_metrics += 1

        # Let the logging module do the formatting only if DEBUG is enabled.
        LOG.debug('Collected %s metrics', num_metrics)

    def _pull_auth(self, instance):
        """Extract auth data from instance data.

        Simple check to verify we have enough auth information to connect
        to the SolidFire cluster.

        :param instance: instance configuration mapping.
        :returns: dict with 'mvip', 'port', 'login', 'passwd' and 'url'.
        :raises Exception: if 'mvip', 'username' or 'password' is absent or
            set to None.
        """
        for k in ('mvip', 'username', 'password'):
            # A key that is present but None (for example left blank in the
            # config file) is as unusable as a missing one: it would yield a
            # URL such as 'https://None:443'. Reject it here with a clear
            # message instead of failing later in the HTTP layer.
            if instance.get(k) is None:
                msg = 'Missing config value: %s' % (k)
                LOG.error(msg)
                raise Exception(msg)

        port = instance.get('port')
        if port is None:
            # Covers both an absent key and an explicit None.
            port = 443

        auth = {'mvip': instance.get('mvip'),
                'port': port,
                'login': instance.get('username'),
                'passwd': instance.get('password')}
        auth['url'] = 'https://%s:%s' % (auth['mvip'], auth['port'])
        return auth

    def _get_cluster_stats(self):
        """Return the cluster utilization metric from GetClusterStats.

        :returns: dict with the single key 'solidfire.cluster_utilization'.
        """
        res = (self.sf.issue_api_request('GetClusterStats', {}, '8.0')
               ['result']['clusterStats'])
        # Cluster utilization is the overall load.
        data = {'solidfire.cluster_utilization': res['clusterUtilization']}
        return data

    def _get_cluster_capacity(self):
        """Return capacity, IOPS and efficiency metrics from
        GetClusterCapacity.

        The raw counters are passed through unchanged; the thin provisioning,
        deduplication, compression and overall data reduction factors are
        derived from them here.

        :returns: dict mapping metric names to values.
        :raises KeyError: if the API result lacks one of the fields read
            below.
        """
        res = (self.sf.issue_api_request('GetClusterCapacity', {}, '8.0')
               ['result']['clusterCapacity'])

        # Number of 4KiB blocks with data after the last garbage collection
        non_zero_blocks = res['nonZeroBlocks']
        # Number of 4KiB blocks without data after the last garbage collection
        zero_blocks = res['zeroBlocks']
        # Number of blocks(not always 4KiB) stored on block drives.
        unique_blocks = res['uniqueBlocks']
        # Amount of space the unique blocks take on the block drives.
        unique_blocks_space = res['uniqueBlocksUsedSpace']

        # Amount of space consumed by the block services, including cruft.
        active_block_space = res['activeBlockSpace']
        # Maximum amount of bytes allocated to the block services.
        max_block_space = res['maxUsedSpace']

        # Amount of space consumed by the metadata services.
        active_slice_space = res['usedMetadataSpace']
        # Amount of space consumed by the metadata services for snapshots.
        active_snap_space = res['usedMetadataSpaceInSnapshots']
        # Maximum amount of bytes allocated to the metadata services.
        max_slice_space = res['maxUsedMetadataSpace']

        # Volume provisioned space
        prov_space = res['provisionedSpace']
        # Max provisionable space if 100% metadata space used.
        max_prov_space = res['maxProvisionedSpace']
        # Overprovision limit.
        max_overprov_space = res['maxOverProvisionableSpace']

        # Number of active iSCSI sessions.
        iscsi_sessions = res['activeSessions']
        # Average IOPS since midnight UTC.
        avg_iops = res['averageIOPS']
        # Peak IOPS since midnight UTC.
        peak_iops = res['peakIOPS']
        # Current IOPs over the last 5 seconds.
        current_iops = res['currentIOPS']
        # Theoretical max IOPS
        max_iops = res['maxIOPS']

        # Single-node clusters can report zero values for some divisors, so
        # every factor defaults to 1 and is only computed when its divisor is
        # non-zero.
        thin_factor, dedup_factor, comp_factor = 1, 1, 1
        # Same calculations used in the SolidFire UI. float() keeps the
        # divisions true divisions on Python 2 as well.
        if non_zero_blocks:
            # Thin provisioning factor
            thin_factor = ((non_zero_blocks + zero_blocks) /
                           float(non_zero_blocks))
        if unique_blocks:
            # Data deduplication factor
            dedup_factor = non_zero_blocks / float(unique_blocks)
        if unique_blocks_space:
            # 4096 constant from our internal block size, pre-compression
            # Compression efficiency factor
            comp_factor = (unique_blocks * 4096) / float(unique_blocks_space)
        # Overall data reduction efficiency factor
        eff_factor = thin_factor * dedup_factor * comp_factor

        data = {'solidfire.num_iscsi_sessions': iscsi_sessions,
                'solidfire.iops.avg_utc': avg_iops,
                'solidfire.iops.peak_utc': peak_iops,
                'solidfire.iops.avg_5_sec': current_iops,
                'solidfire.iops.max_available': max_iops,
                'solidfire.provisioned_bytes': prov_space,
                'solidfire.max_provisioned_bytes': max_prov_space,
                'solidfire.max_overprovisioned_bytes': max_overprov_space,
                'solidfire.max_block_bytes': max_block_space,
                'solidfire.active_block_bytes': active_block_space,
                'solidfire.max_meta_bytes': max_slice_space,
                'solidfire.active_meta_bytes': active_slice_space,
                'solidfire.active_snapshot_bytes': active_snap_space,
                'solidfire.non_zero_blocks': non_zero_blocks,
                'solidfire.zero_blocks': zero_blocks,
                'solidfire.unique_blocks': unique_blocks,
                'solidfire.unique_blocks_used_bytes': unique_blocks_space,
                'solidfire.thin_provision_factor': thin_factor,
                'solidfire.deduplication_factor': dedup_factor,
                'solidfire.compression_factor': comp_factor,
                'solidfire.data_reduction_factor': eff_factor
                }
        return data

    def _list_cluster_faults(self):
        """Return the number of current cluster faults.

        :returns: dict with the single key 'solidfire.active_cluster_faults'.
        """
        # Report the number of active faults. Might be useful for an alarm?
        res = (self.sf.issue_api_request('ListClusterFaults',
                                         {'faultTypes': 'current'},
                                         '8.0')
               ['result']['faults'])
        data = {'solidfire.active_cluster_faults': len(res)}
        return data
|
|
|
|
|
|
def retry(exc_tuple, tries=5, delay=1, backoff=2):
    """Build a decorator that retries a callable with exponential backoff.

    Retry decorator used for issuing API requests.

    The wrapped callable is attempted at most ``tries - 1`` times (so with
    ``tries <= 1`` it is never called and the failure below is raised
    straight away). Exceptions that do not match ``exc_tuple`` propagate
    immediately without any retry.

    :param exc_tuple: exception class, or tuple of classes, that trigger a
        retry.
    :param tries: attempt budget; see above for how it maps to attempts.
    :param delay: seconds to sleep before the first retry.
    :param backoff: factor the sleep time is multiplied by after each retry.
    :returns: a decorator for the callable to protect.
    :raises Exception: from the wrapped callable once the budget is used up.
    """
    def retry_dec(f):
        @six.wraps(f)
        def func_retry(*args, **kwargs):
            _tries, _delay = tries, delay
            while _tries > 1:
                try:
                    return f(*args, **kwargs)
                except exc_tuple:
                    _tries -= 1
                    if _tries <= 1:
                        # Budget exhausted: sleeping now would only postpone
                        # the failure report by the longest backoff interval.
                        break
                    time.sleep(_delay)
                    _delay *= backoff
                    # The loop runs while _tries > 1, so the number of
                    # attempts still to come is one less than _tries.
                    LOG.debug('Retrying %(args)s, %(tries)s attempts '
                              'remaining...',
                              {'args': args, 'tries': _tries - 1})
            # For the bound API method this decorates, args[0] is the
            # instance and args[1] the command name. Fall back to the
            # callable's own name so that a call with fewer positional
            # arguments reports the retry failure instead of an IndexError.
            if len(args) > 1:
                command = args[1]
            else:
                command = getattr(f, '__name__', repr(f))
            msg = ('Retry count exceeded for command: %s' %
                   (command,))
            LOG.error(msg)
            raise Exception(msg)
        return func_retry
    return retry_dec
|
|
|
|
|
|
class SolidFireLib(object):
    """Gutted version of the Cinder driver.

    Just enough to communicate with a SolidFire cluster for POC.
    """

    class RetryableAPIError(Exception):
        """The cluster answered with an error listed in retryable_errors."""

    # API error names for which the request is re-issued.
    retryable_errors = ['xDBVersionMismatch',
                        'xMaxSnapshotsPerVolumeExceeded',
                        'xMaxClonesPerVolumeExceeded',
                        'xMaxSnapshotsPerNodeExceeded',
                        'xMaxClonesPerNodeExceeded',
                        'xNotReadyForIO']

    # Exceptions that make the retry decorator re-issue a request. The
    # retryable API error has to be listed here, otherwise raising it would
    # simply propagate and the request would never be retried.
    retry_exc_tuple = (RetryableAPIError,
                       requests.exceptions.ConnectionError)

    def __init__(self, auth):
        """Store the endpoint and load basic cluster information.

        :param auth: dict with at least 'url', 'login' and 'passwd'.
        """
        self.endpoint = auth
        self.active_cluster_info = {}
        self._set_active_cluster_info(auth)

    @retry(retry_exc_tuple, tries=6)
    def issue_api_request(self, method, params, version='1.0', endpoint=None):
        """POST a JSON-RPC request to the cluster and return the decoded reply.

        :param method: API method name.
        :param params: dict of method parameters; None is treated as {}.
        :param version: API version used in the request URL.
        :param endpoint: dict with 'url', 'login' and 'passwd'; defaults to
            the endpoint stored in active_cluster_info.
        :returns: the decoded JSON response.
        :raises Exception: if the response contains an 'error' member, or
            (from the retry decorator) once the retry budget is used up.
        """
        if params is None:
            params = {}
        if endpoint is None:
            endpoint = self.active_cluster_info['endpoint']

        payload = {'method': method, 'params': params}
        url = '%s/json-rpc/%s/' % (endpoint['url'], version)
        with warnings.catch_warnings():
            # NOTE(review): certificate verification is switched off and the
            # resulting warning silenced, so the credentials are sent to a
            # peer whose identity is not checked. Left unchanged here;
            # consider making this configurable.
            warnings.simplefilter("ignore", exceptions.InsecureRequestWarning)
            req = requests.post(url,
                                data=json.dumps(payload),
                                auth=(endpoint['login'], endpoint['passwd']),
                                verify=False,
                                timeout=30)
            try:
                response = req.json()
            finally:
                # Release the connection even when the body is not JSON.
                req.close()

        if 'error' in response:
            error_name = response['error']['name']
            if error_name in self.retryable_errors:
                msg = ('Retryable error (%s) encountered during '
                       'SolidFire API call.' % error_name)
                # Caught by the retry decorator, which re-issues the request.
                raise self.RetryableAPIError(msg)
            msg = ('API response: %s') % response
            raise Exception(msg)

        return response

    def _set_active_cluster_info(self, endpoint):
        """Populate active_cluster_info from the cluster.

        Stores the endpoint, copies every field of the GetClusterInfo result
        and adds the API version from GetClusterVersionInfo.

        :param endpoint: dict with 'url', 'login' and 'passwd'.
        """
        self.active_cluster_info['endpoint'] = endpoint

        cluster_info = self.issue_api_request(
            'GetClusterInfo', {})['result']['clusterInfo']
        self.active_cluster_info.update(cluster_info)

        # Add a couple extra things that are handy for us
        self.active_cluster_info['clusterAPIVersion'] = (
            self.issue_api_request('GetClusterVersionInfo',
                                   {})['result']['clusterAPIVersion'])
|