Max percentage failure support

This patch updates the new strategies so that they support a maximum
failure percentage on a play. This is useful for the tripleo roles, where
we can tolerate some percentage of failed hosts (e.g. on computes) during
a deployment.

Co-Authored-By: Alex Schultz <aschultz@redhat.com>
Co-Authored-By: Kevin Carter <kecarter@redhat.com>
Co-Authored-By: Emilien Macchi <emilien@redhat.com>

Change-Id: I30a930e4f7b8200ffa67c17b967db8a1fb60b4c0
changes/46/731846/7
Emilien Macchi 3 years ago
parent 64313d1f35
commit 87f9c93548

@ -47,6 +47,7 @@ class TripleoBase(StrategyBase):
self._play_context = None
self._strat_results = []
self.noop_task = None
self._fail_cache = {}
# these were defined in 2.9
self._has_hosts_cache = False
self._has_hosts_cache_all = False
@ -84,6 +85,50 @@ class TripleoBase(StrategyBase):
task.name = name
self._callback_sent = True
def _get_fail_percent(self, host):
    """Look up the failure threshold and role name that apply to a host.

    The variables resolved for ``host`` in the current play are read for
    ``max_fail_percentage`` (falling back to 0) and ``tripleo_role_name``
    (falling back to ``'default'``).  The resulting pair is stored in
    ``self._fail_cache`` keyed by ``host`` so the variable lookup is only
    done once per host.

    :param host: host whose variables are consulted; also the cache key
    :returns: tuple of ``(max_fail_percentage, role_name)``
    """
    cache = self._fail_cache
    # A falsy host never reads from the cache; it is always looked up.
    if host and host in cache:
        return cache[host]

    host_vars = self._variable_manager.get_vars(
        play=self._iterator._play,
        host=host,
        task=None,
    )
    entry = (
        host_vars.get('max_fail_percentage', 0),
        host_vars.get('tripleo_role_name', 'default'),
    )
    cache[host] = entry
    return entry
def _check_fail_percent(self, host, current_failures):
    """Check whether the max failure percentage was exceeded.

    When a failure occurs for a host, compare the share of failed hosts
    in the host's role against the maximum failure percentage configured
    for that host.

    :param host: the host that failed
    :param current_failures: dict mapping role name to the number of
        failed hosts in that role
    :returns: True when the failed share of the role is strictly greater
        than the allowed percentage, or when the role has no matching
        inventory group; False otherwise
    """
    percent, role = self._get_fail_percent(host)
    # A role missing from the lookup still counts as one failure.
    current_failed = current_failures.get(role, 1)
    groups = self._inventory.get_groups_dict()
    group_count = len(groups.get(role, []))
    if group_count == 0:
        # Nothing to compute a ratio against: treat the failure as fatal.
        return True
    # Variables may hand the threshold over as a string or as None; an
    # unset value means no failure is tolerated.
    allowed_percent = float(percent or 0)
    # Multiply before dividing (in floating point) so that exact ratios
    # stay exact: (7 / 100) * 100 evaluates to 7.000000000000001 and would
    # wrongly exceed a 7% threshold.  This also avoids integer truncation
    # on interpreters where "/" is floor division for ints.
    failed_percent = current_failed * 100.0 / group_count
    return failed_percent > allowed_percent
def _get_current_failures(self):
    """Count the currently failed hosts, grouped by role.

    Each host reported by the iterator as failed is resolved through the
    inventory, and its role name is taken from ``_get_fail_percent``.

    :returns: dict mapping role name to the number of failed hosts
    """
    per_role = {}
    for failed_name in self._iterator.get_failed_hosts():
        inventory_host = self._inventory.get_host(failed_name)
        # Only the role is needed here; the threshold is ignored.
        _, role_name = self._get_fail_percent(inventory_host)
        per_role[role_name] = per_role.get(role_name, 0) + 1
    return per_role
def process_includes(self, host_results, noop=False):
"""Handle includes

@ -123,10 +123,12 @@ class StrategyModule(BASE.TripleoBase):
function returns True if there were failures and False if
there are no failures.
"""
fail_lookup = self._get_current_failures()
if self._any_errors_fatal:
for res in results:
if ((res.is_failed() or res._task.action == 'meta')
and self._iterator.is_failed(res._host)):
and self._iterator.is_failed(res._host)
and self._check_fail_percent(res._host, fail_lookup)):
return True
return False

@ -334,15 +334,20 @@ class StrategyModule(BASE.TripleoBase):
failed_hosts = []
unreachable_hosts = []
fail_lookup = self._get_current_failures()
for res in self._strat_results:
if ((res.is_failed() or res._task.action == 'meta')
and self._iterator.is_failed(res._host)):
failed_hosts.append(res._host.name)
failed_hosts.append(res._host)
elif res.is_unreachable():
unreachable_hosts.append(res._host.name)
# TODO(mwhahaha): handle max_fail_percentage by tripleo role
if (self._any_errors_fatal
unreachable_hosts.append(res._host)
errored = False
for host in set(failed_hosts + unreachable_hosts):
errored = self._check_fail_percent(host, fail_lookup)
if errored:
break
if (errored and self._any_errors_fatal
and (len(failed_hosts) > 0
or len(unreachable_hosts) > 0)):
result = self._process_failures()

Loading…
Cancel
Save