Add call_retry for ModelBuilder for error recovery
Add call_retry method for ModelBuilder classes along with configuration options. This allows ModelBuilder classes to reattempt any failed calls to external services such as Nova or Ironic. Change-Id: Ided697adebed957e5ff13b4c6b5b06c816f81c4a
This commit is contained in:
parent
1af7ac107c
commit
cadc000f32
9
releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
Normal file
9
releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
Normal file
@ -0,0 +1,9 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
API calls while building the Compute data model will be retried upon
|
||||
failure. The amount of failures allowed before giving up and the time before
|
||||
reattempting are configurable. The `api_call_retries` and
|
||||
`api_query_timeout` parameters in the `[collector]` group can be used to
|
||||
adjust these paremeters. 10 retries with a 1 second time in between
|
||||
reattempts is the default.
|
@ -35,6 +35,13 @@ Supported in-tree collectors include:
|
||||
Custom data model collector plugins can be defined with the
|
||||
``watcher_cluster_data_model_collectors`` extension point.
|
||||
"""),
|
||||
cfg.IntOpt('api_call_retries',
|
||||
default=10,
|
||||
help="Number of retries before giving up on external service "
|
||||
"calls."),
|
||||
cfg.IntOpt('api_query_timeout',
|
||||
default=1,
|
||||
help="Time before retry after failed call to external service.")
|
||||
]
|
||||
|
||||
|
||||
|
@ -106,6 +106,7 @@ strategies.
|
||||
import abc
|
||||
import copy
|
||||
import threading
|
||||
import time
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
@ -116,6 +117,7 @@ from watcher.common.loader import loadable
|
||||
from watcher.decision_engine.model import model_root
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
CONF = cfg.CONF
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
|
||||
@ -194,6 +196,42 @@ class BaseClusterDataModelCollector(loadable.LoadableSingleton):
|
||||
|
||||
class BaseModelBuilder(object):
|
||||
|
||||
def call_retry(self, f, *args, **kwargs):
|
||||
"""Attempts to call external service
|
||||
|
||||
Attempts to access data from the external service and handles
|
||||
exceptions. The retrieval should be retried in accordance
|
||||
to the value of api_call_retries
|
||||
:param f: The method that performs the actual querying for metrics
|
||||
:param args: Array of arguments supplied to the method
|
||||
:param kwargs: The amount of arguments supplied to the method
|
||||
:return: The value as retrieved from the external service
|
||||
"""
|
||||
|
||||
num_retries = CONF.collector.api_call_retries
|
||||
timeout = CONF.collector.api_query_timeout
|
||||
|
||||
for i in range(num_retries):
|
||||
try:
|
||||
return f(*args, **kwargs)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
self.call_retry_reset(e)
|
||||
LOG.warning("Retry {0} of {1}, error while calling service "
|
||||
"retry in {2} seconds".format(i+1, num_retries,
|
||||
timeout))
|
||||
time.sleep(timeout)
|
||||
raise
|
||||
|
||||
@abc.abstractmethod
|
||||
def call_retry_reset(self, exc):
|
||||
"""Attempt to recover after encountering an error
|
||||
|
||||
Recover from errors while calling external services, the exception
|
||||
can be used to make a better decision on how to best recover.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def execute(self, model_scope):
|
||||
"""Build the cluster data model limited to the scope and return it
|
||||
|
@ -176,9 +176,10 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
||||
This includes components which represent actual infrastructure
|
||||
hardware.
|
||||
"""
|
||||
for snode in self.cinder_helper.get_storage_node_list():
|
||||
for snode in self.call_retry(
|
||||
self.cinder_helper.get_storage_node_list):
|
||||
self.add_storage_node(snode)
|
||||
for pool in self.cinder_helper.get_storage_pool_list():
|
||||
for pool in self.call_retry(self.cinder_helper.get_storage_pool_list):
|
||||
pool = self._build_storage_pool(pool)
|
||||
self.model.add_pool(pool)
|
||||
storage_name = getattr(pool, 'name')
|
||||
@ -213,8 +214,8 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
volume_type = self.cinder_helper.get_volume_type_by_backendname(
|
||||
backend)
|
||||
volume_type = self.call_retry(
|
||||
self.cinder_helper.get_volume_type_by_backendname, backend)
|
||||
|
||||
# build up the storage node.
|
||||
node_attributes = {
|
||||
@ -262,7 +263,7 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
||||
self._add_virtual_storage()
|
||||
|
||||
def _add_virtual_storage(self):
|
||||
volumes = self.cinder_helper.get_volume_list()
|
||||
volumes = self.call_retry(self.cinder_helper.get_volume_list)
|
||||
for vol in volumes:
|
||||
volume = self._build_volume_node(vol)
|
||||
self.model.add_volume(volume)
|
||||
|
@ -105,6 +105,6 @@ class BareMetalModelBuilder(base.BaseModelBuilder):
|
||||
|
||||
def execute(self, model_scope):
|
||||
# TODO(Dantali0n): Use scope to limit size of model
|
||||
for node in self.ironic_helper.get_ironic_node_list():
|
||||
for node in self.call_retry(self.ironic_helper.get_ironic_node_list):
|
||||
self.add_ironic_node(node)
|
||||
return self.model
|
||||
|
@ -211,7 +211,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
||||
self.nova_helper = nova_helper.NovaHelper(osc=self.osc)
|
||||
|
||||
def _collect_aggregates(self, host_aggregates, _nodes):
|
||||
aggregate_list = self.nova_helper.get_aggregate_list()
|
||||
aggregate_list = self.call_retry(f=self.nova_helper.get_aggregate_list)
|
||||
aggregate_ids = [aggregate['id'] for aggregate
|
||||
in host_aggregates if 'id' in aggregate]
|
||||
aggregate_names = [aggregate['name'] for aggregate
|
||||
@ -226,7 +226,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
||||
_nodes.update(aggregate.hosts)
|
||||
|
||||
def _collect_zones(self, availability_zones, _nodes):
|
||||
service_list = self.nova_helper.get_service_list()
|
||||
service_list = self.call_retry(f=self.nova_helper.get_service_list)
|
||||
zone_names = [zone['name'] for zone
|
||||
in availability_zones]
|
||||
include_all_nodes = False
|
||||
@ -252,14 +252,15 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
||||
|
||||
if not compute_nodes:
|
||||
self.no_model_scope_flag = True
|
||||
all_nodes = self.nova_helper.get_compute_node_list()
|
||||
all_nodes = self.call_retry(
|
||||
f=self.nova_helper.get_compute_node_list)
|
||||
compute_nodes = set(
|
||||
[node.hypervisor_hostname for node in all_nodes])
|
||||
LOG.debug("compute nodes: %s", compute_nodes)
|
||||
for node_name in compute_nodes:
|
||||
cnode = self.nova_helper.get_compute_node_by_name(node_name,
|
||||
servers=True,
|
||||
detailed=True)
|
||||
cnode = self.call_retry(
|
||||
self.nova_helper.get_compute_node_by_name,
|
||||
node_name, servers=True, detailed=True)
|
||||
if cnode:
|
||||
node_info = cnode[0]
|
||||
# filter out baremetal node
|
||||
@ -339,9 +340,8 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
||||
# compute API. If we need to request more than 1000 servers,
|
||||
# we can set limit=-1. For details, please see:
|
||||
# https://bugs.launchpad.net/watcher/+bug/1834679
|
||||
instances = self.nova_helper.get_instance_list(
|
||||
filters=filters,
|
||||
limit=limit)
|
||||
instances = self.call_retry(f=self.nova_helper.get_instance_list,
|
||||
filters=filters, limit=limit)
|
||||
for inst in instances:
|
||||
# Add Node
|
||||
instance = self._build_instance_node(inst)
|
||||
|
Loading…
Reference in New Issue
Block a user