Max percentage failure support

This patch updates the new strategies so that they support a maximum failure
percentage on a play. This is useful for the TripleO roles, because it lets us
tolerate some percentage of failures (e.g. on computes) during a deployment.

Co-Authored-By: Alex Schultz <aschultz@redhat.com>
Co-Authored-By: Kevin Carter <kecarter@redhat.com>
Co-Authored-By: Emilien Macchi <emilien@redhat.com>

Change-Id: I30a930e4f7b8200ffa67c17b967db8a1fb60b4c0
This commit is contained in:
Emilien Macchi 2020-05-29 17:04:15 -04:00
parent 64313d1f35
commit 87f9c93548
3 changed files with 57 additions and 5 deletions

View File

@ -47,6 +47,7 @@ class TripleoBase(StrategyBase):
self._play_context = None self._play_context = None
self._strat_results = [] self._strat_results = []
self.noop_task = None self.noop_task = None
self._fail_cache = {}
# these were defined in 2.9 # these were defined in 2.9
self._has_hosts_cache = False self._has_hosts_cache = False
self._has_hosts_cache_all = False self._has_hosts_cache_all = False
@ -84,6 +85,50 @@ class TripleoBase(StrategyBase):
task.name = name task.name = name
self._callback_sent = True self._callback_sent = True
def _get_fail_percent(self, host):
    """Return the failure tolerance settings that apply to a host.

    Reads ``max_fail_percentage`` (default ``0``) and
    ``tripleo_role_name`` (default ``'default'``) from the variables
    resolved for *host* in the current play. The result is memoized per
    host in ``self._fail_cache``.

    :param host: host to resolve variables for; may be falsy, in which
                 case the lookup is done without caching.
    :returns: tuple ``(percent, role)``.
    """
    if host and host in self._fail_cache:
        return self._fail_cache[host]

    fail_vars = self._variable_manager.get_vars(play=self._iterator._play,
                                                host=host,
                                                task=None)
    result = (fail_vars.get('max_fail_percentage', 0),
              fail_vars.get('tripleo_role_name', 'default'))
    # Only memoize real hosts: the cache lookup above never reads a falsy
    # key, so storing one would just leave a dead entry behind.
    if host:
        self._fail_cache[host] = result
    return result
def _check_fail_percent(self, host, current_failures):
    """Check whether the max failure percentage was exceeded.

    When a failure occurs for a host, compare the share of failed hosts
    in that host's role against the threshold configured for it.

    :param host: host whose threshold and role are looked up through
                 ``self._get_fail_percent``.
    :param current_failures: mapping of role name to number of failed
                             hosts; a role missing from the mapping is
                             counted as one failure.
    :returns: ``True`` when the failed share of the role's group is
              strictly greater than the threshold, or when no inventory
              group with hosts matches the role; ``False`` otherwise.
    :raises ValueError: if the configured threshold is not numeric.
    """
    percent, role = self._get_fail_percent(host)
    # The threshold comes straight from the variable lookup and is not
    # guaranteed to be a number (it could be a quoted value or null);
    # normalize it so the comparison below cannot raise TypeError.
    threshold = 0.0 if percent is None else float(percent)
    current_failed = current_failures.get(role, 1)

    groups = self._inventory.get_groups_dict()
    group_count = len(groups.get(role, []))
    if group_count == 0:
        # No ratio can be computed without group members, so the failure
        # is reported as over the limit.
        return True

    # Multiply before dividing: the product of the counts is exact, so a
    # whole-number percentage is not nudged over the threshold by float
    # rounding (0.07 * 100 evaluates to 7.000000000000001). The float
    # literal also keeps this a true division on every interpreter.
    failed_percent = 100.0 * current_failed / group_count
    return failed_percent > threshold
def _get_current_failures(self):
    """Return the number of failed hosts per role.

    :returns: dict mapping each role name, as reported by
              ``self._get_fail_percent``, to the number of hosts the
              iterator currently reports as failed.
    """
    failures = {}
    # Only the keys (host names) of the failed-hosts mapping are needed.
    for name in self._iterator.get_failed_hosts():
        host_obj = self._inventory.get_host(name)
        _, role = self._get_fail_percent(host_obj)
        failures[role] = failures.get(role, 0) + 1
    return failures
def process_includes(self, host_results, noop=False): def process_includes(self, host_results, noop=False):
"""Handle includes """Handle includes

View File

@ -123,10 +123,12 @@ class StrategyModule(BASE.TripleoBase):
function returns True if there were failures and False if function returns True if there were failures and False if
there are no failures. there are no failures.
""" """
fail_lookup = self._get_current_failures()
if self._any_errors_fatal: if self._any_errors_fatal:
for res in results: for res in results:
if ((res.is_failed() or res._task.action == 'meta') if ((res.is_failed() or res._task.action == 'meta')
and self._iterator.is_failed(res._host)): and self._iterator.is_failed(res._host)
and self._check_fail_percent(res._host, fail_lookup)):
return True return True
return False return False

View File

@ -334,15 +334,20 @@ class StrategyModule(BASE.TripleoBase):
failed_hosts = [] failed_hosts = []
unreachable_hosts = [] unreachable_hosts = []
fail_lookup = self._get_current_failures()
for res in self._strat_results: for res in self._strat_results:
if ((res.is_failed() or res._task.action == 'meta') if ((res.is_failed() or res._task.action == 'meta')
and self._iterator.is_failed(res._host)): and self._iterator.is_failed(res._host)):
failed_hosts.append(res._host.name) failed_hosts.append(res._host)
elif res.is_unreachable(): elif res.is_unreachable():
unreachable_hosts.append(res._host.name) unreachable_hosts.append(res._host)
# TODO(mwhahaha): handle max_fail_percentage by tripleo role errored = False
if (self._any_errors_fatal for host in set(failed_hosts + unreachable_hosts):
errored = self._check_fail_percent(host, fail_lookup)
if errored:
break
if (errored and self._any_errors_fatal
and (len(failed_hosts) > 0 and (len(failed_hosts) > 0
or len(unreachable_hosts) > 0)): or len(unreachable_hosts) > 0)):
result = self._process_failures() result = self._process_failures()