Add call_retry for ModelBuilder for error recovery
Add call_retry method for ModelBuilder classes along with configuration options. This allows ModelBuilder classes to reattempt any failed calls to external services such as Nova or Ironic. Change-Id: Ided697adebed957e5ff13b4c6b5b06c816f81c4a
This commit is contained in:
parent
1af7ac107c
commit
cadc000f32
9
releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
Normal file
9
releasenotes/notes/api-call-retry-fef741ac684c58dd.yaml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
API calls while building the Compute data model will be retried upon
|
||||||
|
failure. The number of failures allowed before giving up and the time before
|
||||||
|
reattempting are configurable. The `api_call_retries` and
|
||||||
|
`api_query_timeout` parameters in the `[collector]` group can be used to
|
||||||
|
adjust these parameters. 10 retries with a 1 second time in between
|
||||||
|
reattempts is the default.
|
@ -35,6 +35,13 @@ Supported in-tree collectors include:
|
|||||||
Custom data model collector plugins can be defined with the
|
Custom data model collector plugins can be defined with the
|
||||||
``watcher_cluster_data_model_collectors`` extension point.
|
``watcher_cluster_data_model_collectors`` extension point.
|
||||||
"""),
|
"""),
|
||||||
|
cfg.IntOpt('api_call_retries',
|
||||||
|
default=10,
|
||||||
|
help="Number of retries before giving up on external service "
|
||||||
|
"calls."),
|
||||||
|
cfg.IntOpt('api_query_timeout',
|
||||||
|
default=1,
|
||||||
|
help="Time before retry after failed call to external service.")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -106,6 +106,7 @@ strategies.
|
|||||||
import abc
|
import abc
|
||||||
import copy
|
import copy
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
from oslo_log import log
|
from oslo_log import log
|
||||||
@ -116,6 +117,7 @@ from watcher.common.loader import loadable
|
|||||||
from watcher.decision_engine.model import model_root
|
from watcher.decision_engine.model import model_root
|
||||||
|
|
||||||
LOG = log.getLogger(__name__)
|
LOG = log.getLogger(__name__)
|
||||||
|
CONF = cfg.CONF
|
||||||
|
|
||||||
|
|
||||||
@six.add_metaclass(abc.ABCMeta)
|
@six.add_metaclass(abc.ABCMeta)
|
||||||
@ -194,6 +196,42 @@ class BaseClusterDataModelCollector(loadable.LoadableSingleton):
|
|||||||
|
|
||||||
class BaseModelBuilder(object):
|
class BaseModelBuilder(object):
|
||||||
|
|
||||||
|
def call_retry(self, f, *args, **kwargs):
    """Call an external service, retrying on failure.

    Invokes ``f(*args, **kwargs)`` at most
    ``CONF.collector.api_call_retries`` times. After every failed attempt
    the exception is logged and handed to ``self.call_retry_reset``. If
    attempts remain, a warning is logged and the method waits
    ``CONF.collector.api_query_timeout`` seconds before trying again.

    :param f: The callable that performs the actual query
    :param args: Positional arguments passed through to ``f``
    :param kwargs: Keyword arguments passed through to ``f``
    :return: The value returned by the first successful call to ``f``
    :raises Exception: The exception raised by the last failed attempt
    :raises RuntimeError: If ``api_call_retries`` is lower than 1, so that
        no attempt could be made at all
    """

    num_retries = CONF.collector.api_call_retries
    timeout = CONF.collector.api_query_timeout

    # Python 3 unbinds the ``except ... as`` name when the clause ends, so
    # the exception is kept in a separate variable to re-raise it later.
    last_exc = None

    for attempt in range(1, num_retries + 1):
        try:
            return f(*args, **kwargs)
        except Exception as e:
            last_exc = e
            LOG.exception(e)
            self.call_retry_reset(e)
            if attempt == num_retries:
                # No attempts left: announcing a retry or sleeping here
                # would only delay the error and mislead the reader.
                break
            LOG.warning("Retry {0} of {1}, error while calling service "
                        "retry in {2} seconds".format(attempt, num_retries,
                                                      timeout))
            time.sleep(timeout)

    if last_exc is None:
        # The loop body never ran, so there is nothing to re-raise.
        raise RuntimeError(
            "No call attempted: api_call_retries must be at least 1")

    # A bare ``raise`` outside an ``except`` clause fails on Python 3 with
    # "No active exception to reraise"; re-raise the stored error instead.
    raise last_exc
|
||||||
|
|
||||||
|
@abc.abstractmethod
def call_retry_reset(self, exc):
    """Recovery hook run after a call to an external service has failed.

    Implementations get the chance to restore a usable state before the
    call is attempted again.

    :param exc: The exception raised by the failed call; it can be
        inspected to decide how best to recover
    """
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def execute(self, model_scope):
|
def execute(self, model_scope):
|
||||||
"""Build the cluster data model limited to the scope and return it
|
"""Build the cluster data model limited to the scope and return it
|
||||||
|
@ -176,9 +176,10 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
|||||||
This includes components which represent actual infrastructure
|
This includes components which represent actual infrastructure
|
||||||
hardware.
|
hardware.
|
||||||
"""
|
"""
|
||||||
for snode in self.cinder_helper.get_storage_node_list():
|
for snode in self.call_retry(
|
||||||
|
self.cinder_helper.get_storage_node_list):
|
||||||
self.add_storage_node(snode)
|
self.add_storage_node(snode)
|
||||||
for pool in self.cinder_helper.get_storage_pool_list():
|
for pool in self.call_retry(self.cinder_helper.get_storage_pool_list):
|
||||||
pool = self._build_storage_pool(pool)
|
pool = self._build_storage_pool(pool)
|
||||||
self.model.add_pool(pool)
|
self.model.add_pool(pool)
|
||||||
storage_name = getattr(pool, 'name')
|
storage_name = getattr(pool, 'name')
|
||||||
@ -213,8 +214,8 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
|||||||
except IndexError:
|
except IndexError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
volume_type = self.cinder_helper.get_volume_type_by_backendname(
|
volume_type = self.call_retry(
|
||||||
backend)
|
self.cinder_helper.get_volume_type_by_backendname, backend)
|
||||||
|
|
||||||
# build up the storage node.
|
# build up the storage node.
|
||||||
node_attributes = {
|
node_attributes = {
|
||||||
@ -262,7 +263,7 @@ class CinderModelBuilder(base.BaseModelBuilder):
|
|||||||
self._add_virtual_storage()
|
self._add_virtual_storage()
|
||||||
|
|
||||||
def _add_virtual_storage(self):
|
def _add_virtual_storage(self):
|
||||||
volumes = self.cinder_helper.get_volume_list()
|
volumes = self.call_retry(self.cinder_helper.get_volume_list)
|
||||||
for vol in volumes:
|
for vol in volumes:
|
||||||
volume = self._build_volume_node(vol)
|
volume = self._build_volume_node(vol)
|
||||||
self.model.add_volume(volume)
|
self.model.add_volume(volume)
|
||||||
|
@ -105,6 +105,6 @@ class BareMetalModelBuilder(base.BaseModelBuilder):
|
|||||||
|
|
||||||
def execute(self, model_scope):
|
def execute(self, model_scope):
|
||||||
# TODO(Dantali0n): Use scope to limit size of model
|
# TODO(Dantali0n): Use scope to limit size of model
|
||||||
for node in self.ironic_helper.get_ironic_node_list():
|
for node in self.call_retry(self.ironic_helper.get_ironic_node_list):
|
||||||
self.add_ironic_node(node)
|
self.add_ironic_node(node)
|
||||||
return self.model
|
return self.model
|
||||||
|
@ -211,7 +211,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
|||||||
self.nova_helper = nova_helper.NovaHelper(osc=self.osc)
|
self.nova_helper = nova_helper.NovaHelper(osc=self.osc)
|
||||||
|
|
||||||
def _collect_aggregates(self, host_aggregates, _nodes):
|
def _collect_aggregates(self, host_aggregates, _nodes):
|
||||||
aggregate_list = self.nova_helper.get_aggregate_list()
|
aggregate_list = self.call_retry(f=self.nova_helper.get_aggregate_list)
|
||||||
aggregate_ids = [aggregate['id'] for aggregate
|
aggregate_ids = [aggregate['id'] for aggregate
|
||||||
in host_aggregates if 'id' in aggregate]
|
in host_aggregates if 'id' in aggregate]
|
||||||
aggregate_names = [aggregate['name'] for aggregate
|
aggregate_names = [aggregate['name'] for aggregate
|
||||||
@ -226,7 +226,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
|||||||
_nodes.update(aggregate.hosts)
|
_nodes.update(aggregate.hosts)
|
||||||
|
|
||||||
def _collect_zones(self, availability_zones, _nodes):
|
def _collect_zones(self, availability_zones, _nodes):
|
||||||
service_list = self.nova_helper.get_service_list()
|
service_list = self.call_retry(f=self.nova_helper.get_service_list)
|
||||||
zone_names = [zone['name'] for zone
|
zone_names = [zone['name'] for zone
|
||||||
in availability_zones]
|
in availability_zones]
|
||||||
include_all_nodes = False
|
include_all_nodes = False
|
||||||
@ -252,14 +252,15 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
|||||||
|
|
||||||
if not compute_nodes:
|
if not compute_nodes:
|
||||||
self.no_model_scope_flag = True
|
self.no_model_scope_flag = True
|
||||||
all_nodes = self.nova_helper.get_compute_node_list()
|
all_nodes = self.call_retry(
|
||||||
|
f=self.nova_helper.get_compute_node_list)
|
||||||
compute_nodes = set(
|
compute_nodes = set(
|
||||||
[node.hypervisor_hostname for node in all_nodes])
|
[node.hypervisor_hostname for node in all_nodes])
|
||||||
LOG.debug("compute nodes: %s", compute_nodes)
|
LOG.debug("compute nodes: %s", compute_nodes)
|
||||||
for node_name in compute_nodes:
|
for node_name in compute_nodes:
|
||||||
cnode = self.nova_helper.get_compute_node_by_name(node_name,
|
cnode = self.call_retry(
|
||||||
servers=True,
|
self.nova_helper.get_compute_node_by_name,
|
||||||
detailed=True)
|
node_name, servers=True, detailed=True)
|
||||||
if cnode:
|
if cnode:
|
||||||
node_info = cnode[0]
|
node_info = cnode[0]
|
||||||
# filter out baremetal node
|
# filter out baremetal node
|
||||||
@ -339,9 +340,8 @@ class NovaModelBuilder(base.BaseModelBuilder):
|
|||||||
# compute API. If we need to request more than 1000 servers,
|
# compute API. If we need to request more than 1000 servers,
|
||||||
# we can set limit=-1. For details, please see:
|
# we can set limit=-1. For details, please see:
|
||||||
# https://bugs.launchpad.net/watcher/+bug/1834679
|
# https://bugs.launchpad.net/watcher/+bug/1834679
|
||||||
instances = self.nova_helper.get_instance_list(
|
instances = self.call_retry(f=self.nova_helper.get_instance_list,
|
||||||
filters=filters,
|
filters=filters, limit=limit)
|
||||||
limit=limit)
|
|
||||||
for inst in instances:
|
for inst in instances:
|
||||||
# Add Node
|
# Add Node
|
||||||
instance = self._build_instance_node(inst)
|
instance = self._build_instance_node(inst)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user