Skip test cases when nova scheduler hints are not respected

THE CHANGE

This also introduces some new features:
 - allows overriding a stack verification step that is executed
   at the end of stack creation to check that the stack is in a
   healthy condition and, if it is not, retry creating it
 - restructure fail safe and concurrent creation loop
   to make it easier to understand and maintain
 - this should fix some different-host or same-host test case
   failures typically seen in downstream jobs by checking that
   nova scheduler hints are respected and, if they are not, skipping
   the neutron test cases that are meant to check connectivity between
   VMs running on different (or the same) hypervisor hosts.

THE PROBLEM

When scheduler hints are given to nova to choose where to allocate
new VMs, there is no guarantee that it actually follows them.
In particular, it has been seen many times that it fails to do so,
especially when many VMs are being created at the same time, for
example when running tobiko from a machine with many CPU cores.
In such a case the test runner decides to spawn many parallel workers
(typically 2 for every core when executing on hyperthreading CPUs).
It has been seen that the more test runner workers are executing tests,
the higher the probability that these hints are not met by Nova.

THE WORKAROUND

To work around this problem, with this change we verify that the
scheduler hints have been met. If they have not, it retries creating
the stack up to two times before raising a SkipException to skip the
current test case.

Change-Id: I8c0cff5ca69680aba3842bd7738f27651a677633
This commit is contained in:
Federico Ressi 2019-12-11 17:24:43 +01:00
parent 88cbd3847f
commit 3c2155cb4d
5 changed files with 162 additions and 58 deletions

View File

@ -31,3 +31,12 @@ HeatTemplateFileFixture = _template.HeatTemplateFileFixture
HeatStackFixture = _stack.HeatStackFixture
heat_stack_parameters = _stack.heat_stack_parameters
INIT_IN_PROGRESS = _stack.INIT_IN_PROGRESS
INIT_COMPLETE = _stack.INIT_COMPLETE
INIT_IN_PROGRESS = _stack.INIT_IN_PROGRESS
CREATE_IN_PROGRESS = _stack.CREATE_IN_PROGRESS
CREATE_COMPLETE = _stack.CREATE_COMPLETE
CREATE_FAILED = _stack.CREATE_FAILED
DELETE_IN_PROGRESS = _stack.DELETE_IN_PROGRESS
DELETE_COMPLETE = _stack.DELETE_COMPLETE
DELETE_FAILED = _stack.DELETE_FAILED

View File

@ -14,6 +14,7 @@
from __future__ import absolute_import
import collections
import random
import time
import typing # noqa
@ -109,62 +110,87 @@ class HeatStackFixture(tobiko.SharedFixture):
def get_stack_parameters(self):
return tobiko.reset_fixture(self.parameters).values
retry_create_min_sleep = 0.1
retry_create_max_sleep = 3.
def create_stack(self, retry=None):
"""Creates stack based on passed parameters."""
created_stack_ids = set()
retry = self._get_retry_value(retry)
while True:
attempts_count = self._get_retry_value(retry)
if attempts_count:
for attempt_number in range(1, attempts_count):
try:
LOG.debug('Creating stack %r: attempt %d of %d',
self.stack_name, attempt_number, attempts_count)
self.try_create_stack()
return self.validate_created_stack()
except tobiko.TobikoException:
# I use random time sleep to make conflicting concurrent
# creations less probable to occur
sleep_time = random_sleep_time(
min_time=self.retry_create_min_sleep,
max_time=self.retry_create_max_sleep)
LOG.debug('Failed creating stack %r (attempt %d of %d). '
'Will retry in %s seconds',
self.stack_name, attempt_number, attempts_count,
sleep_time, exc_info=1)
time.sleep(sleep_time)
LOG.debug('Creating stack %r: attempt %d of %d',
self.stack_name, attempts_count, attempts_count)
self.try_create_stack()
return self.validate_created_stack()
#: statuses the stack is expected to be in after exiting from the
# create_stack method
expected_creted_status = {CREATE_IN_PROGRESS, CREATE_COMPLETE}
def validate_created_stack(self):
    """Wait for the stack to reach one of the expected created statuses.

    The accepted statuses are read from ``self.expected_creted_status`` so
    that subclasses can narrow them.

    :returns: whatever ``wait_for_stack_status`` returns (presumably the
        stack object -- confirm against that method)
    """
    stack = self.wait_for_stack_status(
        check=True, expected_status=self.expected_creted_status)
    return stack
def try_create_stack(self):
stack = self.wait_for_stack_status(
expected_status={CREATE_COMPLETE, CREATE_FAILED,
CREATE_IN_PROGRESS, DELETE_COMPLETE,
DELETE_FAILED})
stack_status = getattr(stack, 'stack_status', DELETE_COMPLETE)
if stack_status in {CREATE_IN_PROGRESS, CREATE_COMPLETE}:
LOG.debug('Stack created: %r (id=%r)', self.stack_name, stack.id)
return stack
if stack_status.endswith('_FAILED'):
LOG.debug('Delete existing failed stack: %r (id=%r)',
self.stack_name, stack.id)
self.delete_stack(stack_id=stack.id)
stack = self.wait_for_stack_status(
expected_status={CREATE_COMPLETE, CREATE_FAILED,
CREATE_IN_PROGRESS, DELETE_COMPLETE,
DELETE_FAILED})
stack_status = getattr(stack, 'stack_status', DELETE_COMPLETE)
expected_status = {CREATE_COMPLETE, CREATE_IN_PROGRESS}
if stack_status in expected_status:
LOG.debug('Stack created: %r (id=%r)', self.stack_name,
stack.id)
for stack_id in created_stack_ids:
if self.stack.id != stack_id:
LOG.warning("Concurrent stack creation: delete "
"duplicated stack is %r (id=%r).",
self.stack_name, stack_id)
self.delete_stack(stack_id)
expected_status={DELETE_COMPLETE})
return stack
# Cleanup cached objects
self.stack = self._outputs = self._resources = None
if not retry:
status_reason = getattr(stack, 'stack_status_reason', None)
raise HeatStackCreationFailed(name=self.stack_name,
observed=stack_status,
expected=expected_status,
status_reason=status_reason)
# Compile template parameters
parameters = self.get_stack_parameters()
LOG.debug('Begin creating stack %r...', self.stack_name)
try:
created_stack_id = self.client.stacks.create(
stack_name=self.stack_name,
template=self.template.template_yaml,
parameters=parameters)['stack']['id']
except exc.HTTPConflict:
LOG.debug('Stack %r already exists.', self.stack_name)
created_stack_id = None
retry -= 1
if stack_status.endswith('_FAILED'):
LOG.debug('Delete existing failed stack: %r (id=%r)',
self.stack_name, stack.id)
self.delete_stack()
stack = self.wait_for_stack_status(
expected_status={DELETE_COMPLETE})
stack = self.wait_for_stack_status(
expected_status={CREATE_IN_PROGRESS, CREATE_COMPLETE})
if created_stack_id and stack.id != created_stack_id:
LOG.debug('Concurrent stack creation: delete duplicate stack %r '
'(id=%r)', self.stack_name, created_stack_id)
self.delete_stack(stack_id=created_stack_id)
return stack
# Cleanup cached objects
self.stack = self._outputs = self._resources = None
# Compile template parameters
parameters = self.get_stack_parameters()
try:
LOG.debug('Creating stack %r (re-tries left %d)...',
self.stack_name, retry)
stack_id = self.client.stacks.create(
stack_name=self.stack_name,
template=self.template.template_yaml,
parameters=parameters)['stack']['id']
except exc.HTTPConflict:
LOG.debug('Stack %r already exists.', self.stack_name)
else:
created_stack_ids.add(stack_id)
LOG.debug('Creating stack %r (id=%r)...', self.stack_name,
stack_id)
def validate_stack(self):
return self.stack
_resources = None
@ -184,9 +210,9 @@ class HeatStackFixture(tobiko.SharedFixture):
def delete_stack(self, stack_id=None):
"""Deletes stack."""
self.stack = self._outputs = self._resources = None
if not stack_id:
stack_id = self.stack_id
self.stack = self._outputs = self._resources = None
try:
self.client.stacks.delete(stack_id)
except exc.NotFound:
@ -212,7 +238,7 @@ class HeatStackFixture(tobiko.SharedFixture):
except exc.HTTPNotFound:
self.stack = stack = None
finally:
self._outputs = None
self._outputs = self._resources = None
return stack
def wait_for_create_complete(self, check=True):
@ -442,3 +468,8 @@ class HeatStackResourceFixture(HeatStackNamespaceFixture):
@property
def fixture_name(self):
return self.stack_name + '.resources'
def random_sleep_time(min_time, max_time):
    """Return a random sleep interval lying between the given bounds.

    :param min_time: lower bound of the interval
    :param max_time: upper bound of the interval
    :returns: a number ``t`` such that ``min_time <= t <= max_time``
    :raises AssertionError: if ``min_time`` is greater than ``max_time``
    """
    # The bounds must be ordered, otherwise the computed value would fall
    # outside of the requested interval.
    assert min_time <= max_time, (
        "min_time ({!r}) must not be greater than max_time ({!r})".format(
            min_time, max_time))
    return (max_time - min_time) * random.random() + min_time

View File

@ -31,6 +31,8 @@ nova_client = _client.nova_client
NovaClientFixture = _client.NovaClientFixture
skip_if_missing_hypervisors = _hypervisor.skip_if_missing_hypervisors
get_same_host_hypervisors = _hypervisor.get_same_host_hypervisors
get_different_host_hypervisors = _hypervisor.get_different_host_hypervisors
find_server_ip_address = _server.find_server_ip_address
HasServerMixin = _server.HasServerMixin

View File

@ -13,6 +13,8 @@
# under the License.
from __future__ import absolute_import
import collections
import tobiko
from tobiko.openstack.nova import _client
@ -43,10 +45,6 @@ def missing_hypervisors(count=1, **params):
return max(0, count - len(agents))
def has_networking_agents(count=1, **params):
return not missing_hypervisors(count=count, **params)
def skip_if_missing_hypervisors(count=1, **params):
message = "missing {return_value!r} hypervisor(s)"
if params:
@ -54,3 +52,28 @@ def skip_if_missing_hypervisors(count=1, **params):
', '.join("{!s}={!r}".format(k, v) for k, v in params.items()))
return tobiko.skip_if(message, missing_hypervisors, count=count,
**params)
def get_same_host_hypervisors(server_ids, hypervisor):
    """Select, among the given servers, those hosted by a hypervisor.

    :param server_ids: IDs of the servers to look up
    :param hypervisor: hypervisor host to match
    :returns: ``{hypervisor: [server_id, ...]}`` when at least one of the
        servers is found on the given hypervisor, an empty dict otherwise
    """
    servers_by_host = get_servers_hypervisors(server_ids)
    matching_server_ids = servers_by_host.pop(hypervisor, None)
    if not matching_server_ids:
        return {}
    return {hypervisor: matching_server_ids}
def get_different_host_hypervisors(server_ids, hypervisor):
    """Group the given servers by host, leaving out one hypervisor.

    :param server_ids: IDs of the servers to look up
    :param hypervisor: hypervisor host to exclude from the result
    :returns: mapping of hypervisor host to list of server IDs, for every
        host other than the given one
    """
    servers_by_host = get_servers_hypervisors(server_ids)
    if hypervisor in servers_by_host:
        del servers_by_host[hypervisor]
    return servers_by_host
def get_servers_hypervisors(server_ids):
    """Group server IDs by the value of their 'OS-EXT-SRV-ATTR:host' field.

    :param server_ids: iterable of server IDs; ``None`` or an empty
        iterable produces an empty mapping
    :returns: ``collections.defaultdict(list)`` mapping each host to the
        list of IDs of the servers found on it
    """
    hypervisors = collections.defaultdict(list)
    # A single "or ()" guard handles both None and empty input
    for server_id in server_ids or ():
        server = _client.get_server(server_id)
        hypervisor = getattr(server, 'OS-EXT-SRV-ATTR:host')
        hypervisors[hypervisor].append(server_id)
    return hypervisors

View File

@ -165,11 +165,50 @@ class ServerStackFixture(heat.HeatStackFixture):
def scheduler_hints(self):
scheduler_hints = {}
if self.different_host:
scheduler_hints.update(different_host=self.different_host)
scheduler_hints.update(different_host=list(self.different_host))
if self.same_host:
scheduler_hints.update(same_host=self.same_host)
scheduler_hints.update(same_host=list(self.same_host))
return scheduler_hints
#: allows retrying server creation in case scheduler hints are not respected
retry_create = 3
expected_creted_status = {heat.CREATE_COMPLETE}
def validate_created_stack(self):
    """Validate the created stack, then check the scheduler hints.

    Runs the parent class validation first and afterwards
    ``validate_scheduler_hints``.

    :returns: the value returned by the parent class validation
    """
    validated_stack = super(
        ServerStackFixture, self).validate_created_stack()
    self.validate_scheduler_hints()
    return validated_stack
@property
def hypervisor_host(self):
    """Value of the 'OS-EXT-SRV-ATTR:host' attribute of the server details.

    :raises AttributeError: if the server details lack that attribute
    """
    details = self.server_details
    return getattr(details, 'OS-EXT-SRV-ATTR:host')
def validate_scheduler_hints(self):
    """Check same-host and different-host hints against the actual host.

    Nothing is done when no scheduler hints are configured.
    """
    if not self.scheduler_hints:
        return
    actual_host = self.hypervisor_host
    self.validate_same_host_scheduler_hints(hypervisor=actual_host)
    self.validate_different_host_scheduler_hints(hypervisor=actual_host)
def validate_same_host_scheduler_hints(self, hypervisor):
    """Call ``tobiko.skip`` when the same_host hint has not been honored.

    :param hypervisor: hypervisor host this stack's server is running on
    """
    if not self.same_host:
        return
    # Servers listed in same_host that ended up on some other hypervisor
    misplaced = nova.get_different_host_hypervisors(
        self.same_host, hypervisor)
    if misplaced:
        tobiko.skip("server {!r} of stack {!r} created on "
                    "different hypervisor host from servers:\n{!r}",
                    self.server_id, self.stack_name, misplaced)
def validate_different_host_scheduler_hints(self, hypervisor):
    """Call ``tobiko.skip`` when the different_host hint was not honored.

    :param hypervisor: hypervisor host this stack's server is running on
    """
    if not self.different_host:
        return
    # Servers listed in different_host that share this server's hypervisor
    colocated = nova.get_same_host_hypervisors(
        self.different_host, hypervisor)
    if colocated:
        tobiko.skip("server {!r} of stack {!r} created on the same "
                    "hypervisor host as servers:\n{!r}",
                    self.server_id, self.stack_name, colocated)
@property
def server_details(self):
    """Result of ``nova.get_server`` for this stack's ``server_id``.

    The lookup is performed again on every access.
    """
    get_server = nova.get_server
    return get_server(self.server_id)