Add call_retry for ModelBuilder for error recovery

Add call_retry method for ModelBuilder classes along with configuration
options. This allows ModelBuilder classes to reattempt any failed calls
to external services such as Nova or Ironic.

Change-Id: Ided697adebed957e5ff13b4c6b5b06c816f81c4a
This commit is contained in:
Dantali0n 2019-07-11 22:07:20 +02:00
parent 1af7ac107c
commit cadc000f32
6 changed files with 70 additions and 15 deletions

View File

@ -0,0 +1,9 @@
---
features:
- |
API calls while building the Compute data model will be retried upon
failure. The amount of failures allowed before giving up and the time before
reattempting are configurable. The `api_call_retries` and
`api_query_timeout` parameters in the `[collector]` group can be used to
adjust these paremeters. 10 retries with a 1 second time in between
reattempts is the default.

View File

@ -35,6 +35,13 @@ Supported in-tree collectors include:
Custom data model collector plugins can be defined with the
``watcher_cluster_data_model_collectors`` extension point.
"""),
cfg.IntOpt('api_call_retries',
default=10,
help="Number of retries before giving up on external service "
"calls."),
cfg.IntOpt('api_query_timeout',
default=1,
help="Time before retry after failed call to external service.")
]

View File

@ -106,6 +106,7 @@ strategies.
import abc
import copy
import threading
import time
from oslo_config import cfg
from oslo_log import log
@ -116,6 +117,7 @@ from watcher.common.loader import loadable
from watcher.decision_engine.model import model_root
LOG = log.getLogger(__name__)
CONF = cfg.CONF
@six.add_metaclass(abc.ABCMeta)
@ -194,6 +196,42 @@ class BaseClusterDataModelCollector(loadable.LoadableSingleton):
class BaseModelBuilder(object):
def call_retry(self, f, *args, **kwargs):
"""Attempts to call external service
Attempts to access data from the external service and handles
exceptions. The retrieval should be retried in accordance
to the value of api_call_retries
:param f: The method that performs the actual querying for metrics
:param args: Array of arguments supplied to the method
:param kwargs: The amount of arguments supplied to the method
:return: The value as retrieved from the external service
"""
num_retries = CONF.collector.api_call_retries
timeout = CONF.collector.api_query_timeout
for i in range(num_retries):
try:
return f(*args, **kwargs)
except Exception as e:
LOG.exception(e)
self.call_retry_reset(e)
LOG.warning("Retry {0} of {1}, error while calling service "
"retry in {2} seconds".format(i+1, num_retries,
timeout))
time.sleep(timeout)
raise
@abc.abstractmethod
def call_retry_reset(self, exc):
"""Attempt to recover after encountering an error
Recover from errors while calling external services, the exception
can be used to make a better decision on how to best recover.
"""
pass
@abc.abstractmethod
def execute(self, model_scope):
"""Build the cluster data model limited to the scope and return it

View File

@ -176,9 +176,10 @@ class CinderModelBuilder(base.BaseModelBuilder):
This includes components which represent actual infrastructure
hardware.
"""
for snode in self.cinder_helper.get_storage_node_list():
for snode in self.call_retry(
self.cinder_helper.get_storage_node_list):
self.add_storage_node(snode)
for pool in self.cinder_helper.get_storage_pool_list():
for pool in self.call_retry(self.cinder_helper.get_storage_pool_list):
pool = self._build_storage_pool(pool)
self.model.add_pool(pool)
storage_name = getattr(pool, 'name')
@ -213,8 +214,8 @@ class CinderModelBuilder(base.BaseModelBuilder):
except IndexError:
pass
volume_type = self.cinder_helper.get_volume_type_by_backendname(
backend)
volume_type = self.call_retry(
self.cinder_helper.get_volume_type_by_backendname, backend)
# build up the storage node.
node_attributes = {
@ -262,7 +263,7 @@ class CinderModelBuilder(base.BaseModelBuilder):
self._add_virtual_storage()
def _add_virtual_storage(self):
volumes = self.cinder_helper.get_volume_list()
volumes = self.call_retry(self.cinder_helper.get_volume_list)
for vol in volumes:
volume = self._build_volume_node(vol)
self.model.add_volume(volume)

View File

@ -105,6 +105,6 @@ class BareMetalModelBuilder(base.BaseModelBuilder):
def execute(self, model_scope):
# TODO(Dantali0n): Use scope to limit size of model
for node in self.ironic_helper.get_ironic_node_list():
for node in self.call_retry(self.ironic_helper.get_ironic_node_list):
self.add_ironic_node(node)
return self.model

View File

@ -211,7 +211,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
self.nova_helper = nova_helper.NovaHelper(osc=self.osc)
def _collect_aggregates(self, host_aggregates, _nodes):
aggregate_list = self.nova_helper.get_aggregate_list()
aggregate_list = self.call_retry(f=self.nova_helper.get_aggregate_list)
aggregate_ids = [aggregate['id'] for aggregate
in host_aggregates if 'id' in aggregate]
aggregate_names = [aggregate['name'] for aggregate
@ -226,7 +226,7 @@ class NovaModelBuilder(base.BaseModelBuilder):
_nodes.update(aggregate.hosts)
def _collect_zones(self, availability_zones, _nodes):
service_list = self.nova_helper.get_service_list()
service_list = self.call_retry(f=self.nova_helper.get_service_list)
zone_names = [zone['name'] for zone
in availability_zones]
include_all_nodes = False
@ -252,14 +252,15 @@ class NovaModelBuilder(base.BaseModelBuilder):
if not compute_nodes:
self.no_model_scope_flag = True
all_nodes = self.nova_helper.get_compute_node_list()
all_nodes = self.call_retry(
f=self.nova_helper.get_compute_node_list)
compute_nodes = set(
[node.hypervisor_hostname for node in all_nodes])
LOG.debug("compute nodes: %s", compute_nodes)
for node_name in compute_nodes:
cnode = self.nova_helper.get_compute_node_by_name(node_name,
servers=True,
detailed=True)
cnode = self.call_retry(
self.nova_helper.get_compute_node_by_name,
node_name, servers=True, detailed=True)
if cnode:
node_info = cnode[0]
# filter out baremetal node
@ -339,9 +340,8 @@ class NovaModelBuilder(base.BaseModelBuilder):
# compute API. If we need to request more than 1000 servers,
# we can set limit=-1. For details, please see:
# https://bugs.launchpad.net/watcher/+bug/1834679
instances = self.nova_helper.get_instance_list(
filters=filters,
limit=limit)
instances = self.call_retry(f=self.nova_helper.get_instance_list,
filters=filters, limit=limit)
for inst in instances:
# Add Node
instance = self._build_instance_node(inst)