Add workaround for the race between creating rooter and booting vm

In case when VM is booted before Neutron L3 agent will prepare router
namespace and metadata proxy inside that router, VM may not be able to
get metadata from Nova thus e.g. ssh-key will not be configured in such
case.
To workaround that bug, this patch adds check if guest OS finished
booting and if yes, check if SSH to the VM is possible. In case that SSH
is not possible, Tobiko will reboot vm as, if it was due to bug [1] then
during second boot metadata service should be available and ssh key
should be configured properly.

[1] https://bugs.launchpad.net/neutron/+bug/1813787

Related-Bug: #1813787
Change-Id: I1bd7b86e64ea7083f365ac84ebe1fff34cb52036
This commit is contained in:
Slawek Kaplonski 2021-11-04 12:47:52 +01:00
parent 23718d05f2
commit a39fc56e27
2 changed files with 54 additions and 0 deletions

View File

@ -29,6 +29,8 @@ heat_template_file = _template.heat_template_file
HeatTemplateFixture = _template.HeatTemplateFixture
HeatTemplateFileFixture = _template.HeatTemplateFileFixture
InvalidStackError = _stack.InvalidStackError
HeatStackFixture = _stack.HeatStackFixture
heat_stack_parameters = _stack.heat_stack_parameters
INIT_IN_PROGRESS = _stack.INIT_IN_PROGRESS

View File

@ -247,9 +247,52 @@ class ServerStackFixture(heat.HeatStackFixture, abc.ABC):
retry_create = 3
expected_creted_status = {heat.CREATE_COMPLETE}
#: String which is used to check if guest OS booting process was finished
booting_end_pattern = "login:"
guest_boot_timeout = 600 # seconds
def wait_for_guest_boot_finished(self):
# NOTE(slaweq): in that simple check we can look for something like
# "login:" in the console log. If this is already there, it means
# that vm was booted properly already
for attempt in tobiko.retry(timeout=self.guest_boot_timeout,
interval=1):
if self.booting_end_pattern in self.console_output:
return
LOG.debug(f"Server {self.server_id} seems that is not boot yet")
if attempt.is_last:
raise heat.InvalidStackError(
f"Server {self.server_id} didn't boot properly.")
def validate_created_stack(self):
stack = super(ServerStackFixture, self).validate_created_stack()
self.validate_scheduler_hints()
if not self.config_drive:
# NOTE(slaweq): Because of the bug
# https://bugs.launchpad.net/neutron/+bug/1813787 there can be
# race condition between Neutron L3 agent and VM boot. As a result
# VM can be booted and tries to get e.g. ssh-key from the metadata
# service before router is really ready and can provide metadata
# for that VM.
# As a workaround for that issue for now, Tobiko checks if guest
# OS boot finished and checks if SSH to the VM is possible.
# If booting was finished but SSH isn't possible it will reboot VM.
# If we hit the issue from the bug
# https://bugs.launchpad.net/neutron/+bug/1813787 during reboot of
# the VM, ssh-key should be properly configured inside VM as
# metadata service should be already available in the Neutron's
# router.
try:
self.wait_for_guest_boot_finished()
except heat.InvalidStackError:
return
if not self.ssh_works():
LOG.warning(
f"SSH to the server '{self.server_id}' is not "
f"working properly. Trying to reboot VM to "
f"check if it is maybe caused by the bug "
f"https://bugs.launchpad.net/neutron/+bug/1813787")
nova.reboot_server(self.server_id)
return stack
@property
@ -382,6 +425,15 @@ class ServerStackFixture(heat.HeatStackFixture, abc.ABC):
ssh_client=ssh_client,
timeout=timeout)
def ssh_works(self):
try:
# We don't need to retry many times or wait long time for this,
# it either works as it should or not
self.ssh_client.connect(retry_count=1, retry_timeout=5)
return True
except Exception:
return False
user_data = None