Retry launches in statemachine driver

This adds retry handling to the statemachine driver and removes it
from the individual driver adapters.

This allows the adapters to be simpler, as they no longer need to
implement any internal retry handling.  It also lets us choose to
retry in response to events handled by the statemachine driver itself.

One of those events is a failed keyscan.  In the old OpenStack driver,
a failed keyscan would result in a retry rather than a launch failure,
but the migration of the OpenStack driver to statemachine inadvertently
changed that behavior.  This restores the old behavior, applies it
to all drivers, and codifies it with a unit test.

Several logging lines have been changed to pass values as logging-call
arguments instead of pre-formatting them into the message string, for
consistency and safety.

Change-Id: I670ef8d762558cf8346f071be282af8dfc803747
This commit is contained in:
James E. Blair
2023-05-03 14:12:05 -07:00
parent 3cbb00cdeb
commit df5cf68711
9 changed files with 185 additions and 145 deletions

View File

@@ -24,6 +24,7 @@ import cachetools.func
from nodepool.driver.utils import QuotaInformation, RateLimiter
from nodepool.driver import statemachine
from nodepool import exceptions
from . import azul
@@ -173,17 +174,15 @@ class AzureCreateStateMachine(statemachine.StateMachine):
PIP_CREATING = 'creating pip'
NIC_CREATING = 'creating nic'
VM_CREATING = 'creating vm'
VM_RETRY = 'retrying vm creation'
NIC_QUERY = 'querying nic'
PIP_QUERY = 'querying pip'
COMPLETE = 'complete'
def __init__(self, adapter, hostname, label, image_external_id,
metadata, retries, request, log):
metadata, request, log):
super().__init__()
self.log = log
self.adapter = adapter
self.retries = retries
self.attempts = 0
self.image_external_id = image_external_id
self.image_reference = None
@@ -270,21 +269,10 @@ class AzureCreateStateMachine(statemachine.StateMachine):
if self.adapter._succeeded(self.vm):
self.state = self.NIC_QUERY
elif self.adapter._failed(self.vm):
if self.attempts >= self.retries:
raise Exception("Too many retries")
self.attempts += 1
self.vm = self.adapter._deleteVirtualMachine(
self.external_id)
self.state = self.VM_RETRY
raise exceptions.LaunchStatusException("VM in failed state")
else:
return
if self.state == self.VM_RETRY:
self.vm = self.adapter._refresh_delete(self.vm)
if self.vm is None:
self.state = self.NIC_CREATING
return
if self.state == self.NIC_QUERY:
self.nic = self.adapter._refresh(self.nic, force=True)
all_found = True
@@ -351,11 +339,11 @@ class AzureAdapter(statemachine.Adapter):
self._getSKUs()
def getCreateStateMachine(self, hostname, label,
image_external_id, metadata, retries,
image_external_id, metadata,
request, az, log):
return AzureCreateStateMachine(self, hostname, label,
image_external_id, metadata,
retries, request, log)
request, log)
def getDeleteStateMachine(self, external_id, log):
return AzureDeleteStateMachine(self, external_id)