monasca-agent/monasca_agent/collector/checks_d/solidfire.py

260 lines
10 KiB
Python

import json
import logging
import time
import requests
from requests.packages.urllib3 import exceptions
import six
import warnings
import monasca_agent.collector.checks as checks
LOG = logging.getLogger(__name__)
class SolidFire(checks.AgentCheck):
"""SolidFire plugin for reporting cluster metrics. Reference the general
plugin documentation for metric specifics.
"""
def __init__(self, name, init_config, agent_config):
super(SolidFire, self).__init__(name, init_config, agent_config)
self.sf = None
self.instance = None
self.cluster = None
def check(self, instance):
"""Pull down cluster stats."""
self.cluster = instance.get('name')
dimensions = {'service': 'solidfire',
'cluster': self.cluster}
data = {}
num_metrics = 0
# Extract cluster auth information
auth = self._pull_auth(instance)
self.sf = SolidFireLib(auth)
# Query cluster for stats
data.update(self._get_cluster_stats())
# Query for active cluster faults.
data.update(self._list_cluster_faults())
# Query for cluster capacity info
data.update(self._get_cluster_capacity())
# Dump data upstream.
for key, value in data.items():
if data[key] is None:
continue
self.gauge(key, value, dimensions)
num_metrics += 1
LOG.debug('Collected %s metrics' % (num_metrics))
def _pull_auth(self, instance):
"""Extract auth data from instance data.
Simple check to verify we have enough auth information to connect
to the SolidFire cluster.
"""
for k in ['mvip', 'username', 'password']:
if k not in instance:
msg = 'Missing config value: %s' % (k)
LOG.error(msg)
raise Exception(msg)
auth = {'mvip': instance.get('mvip'),
'port': instance.get('port', 443),
'login': instance.get('username'),
'passwd': instance.get('password')}
auth['url'] = 'https://%s:%s' % (auth['mvip'],
auth['port'])
return auth
def _get_cluster_stats(self):
res = (self.sf.issue_api_request('GetClusterStats', {}, '8.0')
['result']['clusterStats'])
# Cluster utilization is the overall load.
data = {'solidfire.cluster_utilization': res['clusterUtilization']}
return data
def _get_cluster_capacity(self):
res = (self.sf.issue_api_request('GetClusterCapacity', {}, '8.0')
['result']['clusterCapacity'])
# Number of 4KiB blocks with data after the last garbage collection
non_zero_blocks = res['nonZeroBlocks']
# Number of 4KiB blocks without data after the last garbage collection
zero_blocks = res['zeroBlocks']
# Number of blocks(not always 4KiB) stored on block drives.
unique_blocks = res['uniqueBlocks']
# Amount of space the unique blocks take on the block drives.
unique_blocks_space = res['uniqueBlocksUsedSpace']
# Amount of space consumed by the block services, including cruft.
active_block_space = res['activeBlockSpace']
# Maximum amount of bytes allocated to the block services.
max_block_space = res['maxUsedSpace']
# Amount of space consumed by the metadata services.
active_slice_space = res['usedMetadataSpace']
# Amount of space consumed by the metadata services for snapshots.
active_snap_space = res['usedMetadataSpaceInSnapshots']
# Maximum amount of bytes allocated to the metadata services.
max_slice_space = res['maxUsedMetadataSpace']
# Volume provisioned space
prov_space = res['provisionedSpace']
# Max provisionable space if 100% metadata space used.
max_prov_space = res['maxProvisionedSpace']
# Overprovision limit.
max_overprov_space = res['maxOverProvisionableSpace']
# Number of active iSCSI sessions.
iscsi_sessions = res['activeSessions']
# Average IOPS since midnight UTC.
avg_iops = res['averageIOPS']
# Peak IOPS since midnight UTC.
peak_iops = res['peakIOPS']
# Current IOPs over the last 5 seconds.
current_iops = res['currentIOPS']
# Theoretical max IOPS
max_iops = res['maxIOPS']
# Single-node clusters can report zero values for some divisors.
thin_factor, dedup_factor, comp_factor = 1, 1, 1
# Same calculations used in the SolidFire UI.
if non_zero_blocks:
# Thin provisioning factor
thin_factor = ((non_zero_blocks + zero_blocks) /
float(non_zero_blocks))
if unique_blocks:
# Data deduplication factor
dedup_factor = non_zero_blocks / float(unique_blocks)
if unique_blocks_space:
# 4096 constant from our internal block size, pre-compression
# Compression efficiency factor
comp_factor = (unique_blocks * 4096) / float(unique_blocks_space)
# Overall data reduction efficiency factor
eff_factor = thin_factor * dedup_factor * comp_factor
data = {'solidfire.num_iscsi_sessions': iscsi_sessions,
'solidfire.iops.avg_utc': avg_iops,
'solidfire.iops.peak_utc': peak_iops,
'solidfire.iops.avg_5_sec': current_iops,
'solidfire.iops.max_available': max_iops,
'solidfire.provisioned_bytes': prov_space,
'solidfire.max_provisioned_bytes': max_prov_space,
'solidfire.max_overprovisioned_bytes': max_overprov_space,
'solidfire.max_block_bytes': max_block_space,
'solidfire.active_block_bytes': active_block_space,
'solidfire.max_meta_bytes': max_slice_space,
'solidfire.active_meta_bytes': active_slice_space,
'solidfire.active_snapshot_bytes': active_snap_space,
'solidfire.non_zero_blocks': non_zero_blocks,
'solidfire.zero_blocks': zero_blocks,
'solidfire.unique_blocks': unique_blocks,
'solidfire.unique_blocks_used_bytes': unique_blocks_space,
'solidfire.thin_provision_factor': thin_factor,
'solidfire.deduplication_factor': dedup_factor,
'solidfire.compression_factor': comp_factor,
'solidfire.data_reduction_factor': eff_factor
}
return data
def _list_cluster_faults(self):
# Report the number of active faults. Might be useful for an alarm?
res = (self.sf.issue_api_request('ListClusterFaults',
{'faultTypes': 'current'},
'8.0')
['result']['faults'])
data = {'solidfire.active_cluster_faults': len(res)}
return data
def retry(exc_tuple, tries=5, delay=1, backoff=2):
# Retry decorator used for issuing API requests.
def retry_dec(f):
@six.wraps(f)
def func_retry(*args, **kwargs):
_tries, _delay = tries, delay
while _tries > 1:
try:
return f(*args, **kwargs)
except exc_tuple:
time.sleep(_delay)
_tries -= 1
_delay *= backoff
LOG.debug('Retrying %(args)s, %(tries)s attempts '
'remaining...',
{'args': args, 'tries': _tries})
msg = ('Retry count exceeded for command: %s' %
(args[1]))
LOG.error(msg)
raise Exception(msg)
return func_retry
return retry_dec
class SolidFireLib(object):
"""Gutted version of the Cinder driver.
Just enough to communicate with a SolidFire cluster for POC.
"""
retryable_errors = ['xDBVersionMismatch',
'xMaxSnapshotsPerVolumeExceeded',
'xMaxClonesPerVolumeExceeded',
'xMaxSnapshotsPerNodeExceeded',
'xMaxClonesPerNodeExceeded',
'xNotReadyForIO']
retry_exc_tuple = (requests.exceptions.ConnectionError)
def __init__(self, auth):
self.endpoint = auth
self.active_cluster_info = {}
self._set_active_cluster_info(auth)
@retry(retry_exc_tuple, tries=6)
def issue_api_request(self, method, params, version='1.0', endpoint=None):
if params is None:
params = {}
if endpoint is None:
endpoint = self.active_cluster_info['endpoint']
payload = {'method': method, 'params': params}
url = '%s/json-rpc/%s/' % (endpoint['url'], version)
with warnings.catch_warnings():
warnings.simplefilter("ignore", exceptions.InsecureRequestWarning)
req = requests.post(url,
data=json.dumps(payload),
auth=(endpoint['login'], endpoint['passwd']),
verify=False,
timeout=30)
response = req.json()
req.close()
if (('error' in response) and
(response['error']['name'] in self.retryable_errors)):
msg = ('Retryable error (%s) encountered during '
'SolidFire API call.' % response['error']['name'])
raise Exception(msg)
if 'error' in response:
msg = ('API response: %s') % response
raise Exception(msg)
return response
def _set_active_cluster_info(self, endpoint):
self.active_cluster_info['endpoint'] = endpoint
for k, v in self.issue_api_request(
'GetClusterInfo',
{})['result']['clusterInfo'].items():
self.active_cluster_info[k] = v
# Add a couple extra things that are handy for us
self.active_cluster_info['clusterAPIVersion'] = (
self.issue_api_request('GetClusterVersionInfo',
{})['result']['clusterAPIVersion'])