# Copyright (c) 2016 Clinton Knight
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Performance metrics functions and cache for NetApp systems.
"""
import copy
from oslo_log import log as logging
from manila import exception
from manila.i18n import _
from manila.share.drivers.netapp.dataontap.client import api as netapp_api
LOG = logging.getLogger(__name__)
DEFAULT_UTILIZATION = 50
class PerformanceLibrary(object):
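    """Performance counter cache backing pool-level utilization metrics.

    A minimal usage sketch (illustrative only; ``client`` is assumed to be
    an already-configured Data ONTAP API client compatible with the calls
    made in this module, and the pool dicts are assumed to carry a
    'netapp_aggregate' key)::

        perf_library = PerformanceLibrary(client)
        # from the driver's periodic stats task:
        perf_library.update_performance_cache(flexvol_pools,
                                              aggregate_pools)
        utilization = perf_library.get_node_utilization_for_pool('pool1')
    """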

    def __init__(self, zapi_client):
self.zapi_client = zapi_client
self.performance_counters = {}
self.pool_utilization = {}
self._init_counter_info()

    def _init_counter_info(self):
"""Set a few counter names based on Data ONTAP version."""
self.system_object_name = None
self.avg_processor_busy_base_counter_name = None
try:
if self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS:
self.system_object_name = 'system:constituent'
self.avg_processor_busy_base_counter_name = (
self._get_base_counter_name('system:constituent',
'avg_processor_busy'))
elif self.zapi_client.features.SYSTEM_METRICS:
self.system_object_name = 'system'
self.avg_processor_busy_base_counter_name = (
self._get_base_counter_name('system',
'avg_processor_busy'))
except netapp_api.NaApiError:
if self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS:
self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time'
else:
self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time1'
LOG.exception('Could not get performance base counter '
'name. Performance-based scheduler '
'functions may not be available.')

    def update_performance_cache(self, flexvol_pools, aggregate_pools):
"""Called periodically to update per-pool node utilization metrics."""
# Nothing to do on older systems
if not (self.zapi_client.features.SYSTEM_METRICS or
self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS):
return
# Get aggregates and nodes for all known pools
aggr_names = self._get_aggregates_for_pools(flexvol_pools,
aggregate_pools)
node_names, aggr_node_map = self._get_nodes_for_aggregates(aggr_names)
# Update performance counter cache for each node
node_utilization = {}
for node_name in node_names:
if node_name not in self.performance_counters:
self.performance_counters[node_name] = []
# Get new performance counters and save only the last 10
counters = self._get_node_utilization_counters(node_name)
if not counters:
continue
self.performance_counters[node_name].append(counters)
self.performance_counters[node_name] = (
self.performance_counters[node_name][-10:])
# Update utilization for each node using newest & oldest sample
counters = self.performance_counters[node_name]
if len(counters) < 2:
node_utilization[node_name] = DEFAULT_UTILIZATION
else:
node_utilization[node_name] = self._get_node_utilization(
counters[0], counters[-1], node_name)
# Update pool utilization map atomically
pool_utilization = {}
all_pools = copy.deepcopy(flexvol_pools)
all_pools.update(aggregate_pools)
for pool_name, pool_info in all_pools.items():
aggr_name = pool_info.get('netapp_aggregate', 'unknown')
node_name = aggr_node_map.get(aggr_name)
if node_name:
pool_utilization[pool_name] = node_utilization.get(
node_name, DEFAULT_UTILIZATION)
else:
pool_utilization[pool_name] = DEFAULT_UTILIZATION
self.pool_utilization = pool_utilization
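
    # Illustrative cache shapes after an update (values are made up):
    #   self.performance_counters = {'node1': [<up to 10 counter sets>]}
    #   self.pool_utilization = {'pool1': 73.2, 'aggr0': 50}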

    def get_node_utilization_for_pool(self, pool_name):
"""Get the node utilization for the specified pool, if available."""
return self.pool_utilization.get(pool_name, DEFAULT_UTILIZATION)

    def update_for_failover(self, zapi_client, flexvol_pools,
                            aggregate_pools):
"""Change API client after a whole-backend failover event."""
self.zapi_client = zapi_client
self.update_performance_cache(flexvol_pools, aggregate_pools)

    def _get_aggregates_for_pools(self, flexvol_pools, aggregate_pools):
"""Get the set of aggregates that contain the specified pools."""
aggr_names = set()
for pool_name, pool_info in aggregate_pools.items():
aggr_names.add(pool_info.get('netapp_aggregate'))
for pool_name, pool_info in flexvol_pools.items():
aggr_names.add(pool_info.get('netapp_aggregate'))
return list(aggr_names)

    def _get_nodes_for_aggregates(self, aggr_names):
"""Get the cluster nodes that own the specified aggregates."""
node_names = set()
aggr_node_map = {}
for aggr_name in aggr_names:
node_name = self.zapi_client.get_node_for_aggregate(aggr_name)
if node_name:
node_names.add(node_name)
aggr_node_map[aggr_name] = node_name
return list(node_names), aggr_node_map
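
    # For a hypothetical two-node cluster, this might return
    # (['node1', 'node2'], {'aggr1': 'node1', 'aggr2': 'node2'}).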

    def _get_node_utilization(self, counters_t1, counters_t2, node_name):
"""Get node utilization from two sets of performance counters."""
try:
# Time spent in the single-threaded Kahuna domain
kahuna_percent = self._get_kahuna_utilization(counters_t1,
counters_t2)
# If Kahuna is using >60% of the CPU, the controller is fully busy
if kahuna_percent > 60:
return 100.0
# Average CPU busyness across all processors
avg_cpu_percent = 100.0 * self._get_average_cpu_utilization(
counters_t1, counters_t2)
# Total Consistency Point (CP) time
total_cp_time_msec = self._get_total_consistency_point_time(
counters_t1, counters_t2)
# Time spent in CP Phase 2 (buffer flush)
p2_flush_time_msec = self._get_consistency_point_p2_flush_time(
counters_t1, counters_t2)
# Wall-clock time between the two counter sets
poll_time_msec = self._get_total_time(counters_t1,
counters_t2,
'total_cp_msecs')
# If two polls happened in quick succession, use CPU utilization
if total_cp_time_msec == 0 or poll_time_msec == 0:
return max(min(100.0, avg_cpu_percent), 0)
# Adjusted Consistency Point time
adjusted_cp_time_msec = self._get_adjusted_consistency_point_time(
total_cp_time_msec, p2_flush_time_msec)
adjusted_cp_percent = (100.0 *
adjusted_cp_time_msec / poll_time_msec)
# Utilization is the greater of CPU busyness & CP time
node_utilization = max(avg_cpu_percent, adjusted_cp_percent)
return max(min(100.0, node_utilization), 0)
except Exception:
LOG.exception('Could not calculate node utilization for '
'node %s.', node_name)
return DEFAULT_UTILIZATION
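
    # Worked example for _get_node_utilization (illustrative numbers): with
    # avg_cpu_percent=45.0, total_cp_time_msec=4000, p2_flush_time_msec=3000
    # and poll_time_msec=10000, the adjusted CP time is
    # 4000 - min(3000, 0.20 * 4000) = 3200 msec, adjusted_cp_percent is
    # 32.0, and the reported utilization is max(45.0, 32.0) = 45.0.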

    def _get_kahuna_utilization(self, counters_t1, counters_t2):
"""Get time spent in the single-threaded Kahuna domain."""
# Note(cknight): Because Kahuna is single-threaded, running only on
# one CPU at a time, we can safely sum the Kahuna CPU usage
# percentages across all processors in a node.
return sum(self._get_performance_counter_average_multi_instance(
counters_t1, counters_t2, 'domain_busy:kahuna',
'processor_elapsed_time')) * 100.0

    def _get_average_cpu_utilization(self, counters_t1, counters_t2):
"""Get average CPU busyness across all processors."""
return self._get_performance_counter_average(
counters_t1, counters_t2, 'avg_processor_busy',
self.avg_processor_busy_base_counter_name)

    def _get_total_consistency_point_time(self, counters_t1, counters_t2):
"""Get time spent in Consistency Points in msecs."""
return float(self._get_performance_counter_delta(
counters_t1, counters_t2, 'total_cp_msecs'))

    def _get_consistency_point_p2_flush_time(self, counters_t1, counters_t2):
"""Get time spent in CP Phase 2 (buffer flush) in msecs."""
return float(self._get_performance_counter_delta(
counters_t1, counters_t2, 'cp_phase_times:p2_flush'))

    def _get_total_time(self, counters_t1, counters_t2, counter_name):
"""Get wall clock time between two successive counters in msecs."""
timestamp_t1 = float(self._find_performance_counter_timestamp(
counters_t1, counter_name))
timestamp_t2 = float(self._find_performance_counter_timestamp(
counters_t2, counter_name))
return (timestamp_t2 - timestamp_t1) * 1000.0

    def _get_adjusted_consistency_point_time(self, total_cp_time,
p2_flush_time):
"""Get adjusted CP time by limiting CP phase 2 flush time to 20%."""
        return total_cp_time - (min(p2_flush_time, 0.20 * total_cp_time))
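
    # For example (illustrative): with total_cp_time=1000 and
    # p2_flush_time=500, at most 0.20 * 1000 = 200 msec of flush time is
    # subtracted, yielding an adjusted CP time of 800 msec.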

    def _get_performance_counter_delta(self, counters_t1, counters_t2,
counter_name):
"""Calculate a delta value from two performance counters."""
counter_t1 = int(
self._find_performance_counter_value(counters_t1, counter_name))
counter_t2 = int(
self._find_performance_counter_value(counters_t2, counter_name))
return counter_t2 - counter_t1

    def _get_performance_counter_average(self, counters_t1, counters_t2,
counter_name, base_counter_name,
instance_name=None):
"""Calculate an average value from two performance counters."""
counter_t1 = float(self._find_performance_counter_value(
counters_t1, counter_name, instance_name))
counter_t2 = float(self._find_performance_counter_value(
counters_t2, counter_name, instance_name))
base_counter_t1 = float(self._find_performance_counter_value(
counters_t1, base_counter_name, instance_name))
base_counter_t2 = float(self._find_performance_counter_value(
counters_t2, base_counter_name, instance_name))
return (counter_t2 - counter_t1) / (base_counter_t2 - base_counter_t1)
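
    # For example (illustrative): if 'avg_processor_busy' advances by 500000
    # ticks while its base counter advances by 1000000 ticks between the two
    # samples, this returns 0.5, i.e. the node was 50% busy over that window.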

    def _get_performance_counter_average_multi_instance(self, counters_t1,
counters_t2,
counter_name,
base_counter_name):
"""Calculate an average value from multiple counter instances."""
averages = []
instance_names = []
for counter in counters_t1:
if counter_name in counter:
instance_names.append(counter['instance-name'])
for instance_name in instance_names:
average = self._get_performance_counter_average(
counters_t1, counters_t2, counter_name, base_counter_name,
instance_name)
averages.append(average)
return averages
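
    # Each counter record handled below is assumed to be a dict of the form
    # {'instance-name': ..., 'timestamp': ..., '<counter-name>': ...}, as
    # assembled from the client's performance counter queries.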

    def _find_performance_counter_value(self, counters, counter_name,
instance_name=None):
"""Given a counter set, return the value of a named instance."""
for counter in counters:
if counter_name in counter:
if (instance_name is None
or counter['instance-name'] == instance_name):
return counter[counter_name]
else:
raise exception.NotFound(_('Counter %s not found') % counter_name)

    def _find_performance_counter_timestamp(self, counters, counter_name,
instance_name=None):
"""Given a counter set, return the timestamp of a named instance."""
for counter in counters:
if counter_name in counter:
if (instance_name is None
or counter['instance-name'] == instance_name):
return counter['timestamp']
else:
raise exception.NotFound(_('Counter %s not found') % counter_name)

    def _expand_performance_array(self, object_name, counter_name, counter):
"""Get array labels and expand counter data array."""
# Get array labels for counter value
counter_info = self.zapi_client.get_performance_counter_info(
object_name, counter_name)
array_labels = [counter_name + ':' + label.lower()
for label in counter_info['labels']]
array_values = counter[counter_name].split(',')
# Combine labels and values, and then mix into existing counter
array_data = dict(zip(array_labels, array_values))
counter.update(array_data)
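
    # Illustrative example: if counter_info['labels'] were
    # ['SETUP', 'P2_FLUSH'] and counter['cp_phase_times'] were '10,20',
    # the counter dict would gain {'cp_phase_times:setup': '10',
    # 'cp_phase_times:p2_flush': '20'}.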

    def _get_base_counter_name(self, object_name, counter_name):
"""Get the name of the base counter for the specified counter."""
counter_info = self.zapi_client.get_performance_counter_info(
object_name, counter_name)
return counter_info['base-counter']

    def _get_node_utilization_counters(self, node_name):
"""Get all performance counters for calculating node utilization."""
try:
return (self._get_node_utilization_system_counters(node_name) +
self._get_node_utilization_wafl_counters(node_name) +
self._get_node_utilization_processor_counters(node_name))
except netapp_api.NaApiError:
            LOG.exception('Could not get utilization counters from node '
                          '%s.', node_name)
return None

    def _get_node_utilization_system_counters(self, node_name):
"""Get the system counters for calculating node utilization."""
system_instance_uuids = (
self.zapi_client.get_performance_instance_uuids(
self.system_object_name, node_name))
system_counter_names = [
'avg_processor_busy',
self.avg_processor_busy_base_counter_name,
]
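        # If the fallback base counter name from _init_counter_info is in
        # use, also request 'cpu_elapsed_time' so that both candidate base
        # counters are available in the results.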
if 'cpu_elapsed_time1' in system_counter_names:
system_counter_names.append('cpu_elapsed_time')
system_counters = self.zapi_client.get_performance_counters(
self.system_object_name, system_instance_uuids,
system_counter_names)
return system_counters

    def _get_node_utilization_wafl_counters(self, node_name):
"""Get the WAFL counters for calculating node utilization."""
wafl_instance_uuids = self.zapi_client.get_performance_instance_uuids(
'wafl', node_name)
wafl_counter_names = ['total_cp_msecs', 'cp_phase_times']
wafl_counters = self.zapi_client.get_performance_counters(
'wafl', wafl_instance_uuids, wafl_counter_names)
# Expand array data so we can use wafl:cp_phase_times[P2_FLUSH]
for counter in wafl_counters:
if 'cp_phase_times' in counter:
self._expand_performance_array(
'wafl', 'cp_phase_times', counter)
return wafl_counters

    def _get_node_utilization_processor_counters(self, node_name):
"""Get the processor counters for calculating node utilization."""
processor_instance_uuids = (
self.zapi_client.get_performance_instance_uuids('processor',
node_name))
processor_counter_names = ['domain_busy', 'processor_elapsed_time']
processor_counters = self.zapi_client.get_performance_counters(
'processor', processor_instance_uuids, processor_counter_names)
# Expand array data so we can use processor:domain_busy[kahuna]
for counter in processor_counters:
if 'domain_busy' in counter:
self._expand_performance_array(
'processor', 'domain_busy', counter)
return processor_counters