Merge "Azure: fix race in leaked resources"
This commit is contained in:
commit
fe624b2ef2
|
@ -74,6 +74,13 @@ class AzureInstance(statemachine.Instance):
|
|||
return quota_info_from_sku(self.sku)
|
||||
|
||||
|
||||
class AzureResource(statemachine.Resource):
|
||||
def __init__(self, metadata, type, name):
|
||||
super().__init__(metadata)
|
||||
self.type = type
|
||||
self.name = name
|
||||
|
||||
|
||||
class AzureDeleteStateMachine(statemachine.StateMachine):
|
||||
VM_DELETING = 'deleting vm'
|
||||
NIC_DELETING = 'deleting nic'
|
||||
|
@ -302,41 +309,32 @@ class AzureAdapter(statemachine.Adapter):
|
|||
def getDeleteStateMachine(self, external_id):
|
||||
return AzureDeleteStateMachine(self, external_id)
|
||||
|
||||
def cleanupLeakedResources(self, known_nodes, known_uploads, metadata):
|
||||
def listResources(self):
|
||||
for vm in self._listVirtualMachines():
|
||||
node_id, upload_id = self._metadataMatches(vm, metadata)
|
||||
if (node_id and node_id not in known_nodes):
|
||||
self.log.info(f"Deleting leaked vm: {vm['name']}")
|
||||
with self.rate_limiter:
|
||||
self.azul.virtual_machines.delete(
|
||||
self.resource_group, vm['name'])
|
||||
yield AzureResource(vm.get('tags', {}), 'vm', vm['name'])
|
||||
for nic in self._listNetworkInterfaces():
|
||||
node_id, upload_id = self._metadataMatches(nic, metadata)
|
||||
if (node_id and node_id not in known_nodes):
|
||||
self.log.info(f"Deleting leaked nic: {nic['name']}")
|
||||
with self.rate_limiter:
|
||||
self.azul.network_interfaces.delete(
|
||||
self.resource_group, nic['name'])
|
||||
yield AzureResource(nic.get('tags', {}), 'nic', nic['name'])
|
||||
for pip in self._listPublicIPAddresses():
|
||||
node_id, upload_id = self._metadataMatches(pip, metadata)
|
||||
if (node_id and node_id not in known_nodes):
|
||||
self.log.info(f"Deleting leaked pip: {pip['name']}")
|
||||
with self.rate_limiter:
|
||||
self.azul.public_ip_addresses.delete(
|
||||
self.resource_group, pip['name'])
|
||||
yield AzureResource(pip.get('tags', {}), 'pip', pip['name'])
|
||||
for disk in self._listDisks():
|
||||
node_id, upload_id = self._metadataMatches(disk, metadata)
|
||||
if ((node_id and node_id not in known_nodes) or
|
||||
(upload_id and upload_id not in known_uploads)):
|
||||
self.log.info(f"Deleting leaked disk: {disk['name']}")
|
||||
with self.rate_limiter:
|
||||
self.azul.disks.delete(self.resource_group, disk['name'])
|
||||
yield AzureResource(disk.get('tags', {}), 'disk', disk['name'])
|
||||
for image in self._listImages():
|
||||
node_id, upload_id = self._metadataMatches(image, metadata)
|
||||
if (upload_id and upload_id not in known_uploads):
|
||||
self.log.info(f"Deleting leaked image: {image['name']}")
|
||||
with self.rate_limiter:
|
||||
self.azul.images.delete(self.resource_group, image['name'])
|
||||
yield AzureResource(image.get('tags', {}), 'image', image['name'])
|
||||
|
||||
def deleteResource(self, resource):
|
||||
self.log.info(f"Deleting leaked {resource.type}: {resource.name}")
|
||||
if resource.type == 'vm':
|
||||
crud = self.azul.virtual_machines
|
||||
elif resource.type == 'nic':
|
||||
crud = self.azul.network_interfaces
|
||||
elif resource.type == 'pip':
|
||||
crud = self.azul.public_ip_addresses
|
||||
elif resource.type == 'disk':
|
||||
crud = self.azul.disks
|
||||
elif resource.type == 'image':
|
||||
crud = self.azul.images
|
||||
with self.rate_limiter:
|
||||
crud.delete(self.resource_group, resource.name)
|
||||
|
||||
def listInstances(self):
|
||||
for vm in self._listVirtualMachines():
|
||||
|
|
|
@ -448,6 +448,8 @@ class StateMachineProvider(Provider, QuotaSupport):
|
|||
num_labels = sum([len(pool.labels)
|
||||
for pool in provider.pools.values()])
|
||||
self.label_quota_cache = cachetools.LRUCache(num_labels)
|
||||
self.possibly_leaked_nodes = {}
|
||||
self.possibly_leaked_uploads = {}
|
||||
|
||||
def start(self, zk_conn):
|
||||
super().start(zk_conn)
|
||||
|
@ -586,9 +588,36 @@ class StateMachineProvider(Provider, QuotaSupport):
|
|||
for upload in build:
|
||||
known_uploads.add(upload.id)
|
||||
|
||||
metadata = {'nodepool_provider_name': self.provider.name}
|
||||
self.adapter.cleanupLeakedResources(known_nodes,
|
||||
known_uploads, metadata)
|
||||
newly_leaked_nodes = {}
|
||||
newly_leaked_uploads = {}
|
||||
for resource in self.adapter.listResources():
|
||||
pn = resource.metadata.get('nodepool_provider_name')
|
||||
if pn != self.provider.name:
|
||||
continue
|
||||
node_id = resource.metadata.get('nodepool_node_id')
|
||||
upload_id = resource.metadata.get('nodepool_upload_id')
|
||||
if node_id and node_id not in known_nodes:
|
||||
newly_leaked_nodes[node_id] = resource
|
||||
if node_id in self.possibly_leaked_nodes:
|
||||
# We've seen this twice now, so it's not a race
|
||||
# condition.
|
||||
try:
|
||||
self.adapter.deleteResource(resource)
|
||||
except Exception:
|
||||
self.log.exception("Unable to delete leaked "
|
||||
f"resource for node {node_id}")
|
||||
if upload_id and upload_id not in known_uploads:
|
||||
newly_leaked_uploads[upload_id] = resource
|
||||
if upload_id in self.possibly_leaked_uploads:
|
||||
# We've seen this twice now, so it's not a race
|
||||
# condition.
|
||||
try:
|
||||
self.adapter.deleteResource(resource)
|
||||
except Exception:
|
||||
self.log.exception("Unable to delete leaked "
|
||||
f"resource for upload {upload_id}")
|
||||
self.possibly_leaked_nodes = newly_leaked_nodes
|
||||
self.possibly_leaked_uploads = newly_leaked_uploads
|
||||
|
||||
# Image handling
|
||||
|
||||
|
@ -695,6 +724,28 @@ class Instance:
|
|||
raise NotImplementedError()
|
||||
|
||||
|
||||
class Resource:
|
||||
"""Represents a cloud resource
|
||||
|
||||
This could be an instance, a disk, a floating IP, or anything
|
||||
else. It is used by the driver to detect leaked resources so the
|
||||
adapter can clean them up. There is one required attribute,
|
||||
`metadata`.
|
||||
|
||||
This is a dictionary of key/value pairs initially supplied by the
|
||||
driver to the adapter when an instance or image was created. This
|
||||
is used by the driver to detect leaked resources. The adapter may
|
||||
add any other information to this instance for its own bookeeping
|
||||
(resource type, id, etc).
|
||||
|
||||
:param dict metadata: A dictionary of metadata for the resource.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, metadata):
|
||||
self.metadata = metadata
|
||||
|
||||
|
||||
class StateMachine:
|
||||
START = 'start'
|
||||
|
||||
|
@ -772,28 +823,27 @@ class Adapter:
|
|||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def cleanupLeakedResources(self, known_nodes, known_uploads, metadata):
|
||||
"""Delete any leaked resources
|
||||
def listResources(self):
|
||||
"""Return a list of resources accessible to this provider.
|
||||
|
||||
Use the supplied metadata to delete any leaked resources.
|
||||
Typically you would list all of the instances in the provider,
|
||||
then list any other resources (floating IPs, etc). Any
|
||||
resources with matching metadata not associated with an
|
||||
instance should be deleted, and any instances with matching
|
||||
metadata and whose node id is not in known_nodes should also
|
||||
be deleted.
|
||||
The yielded values should represent all resources accessible
|
||||
to this provider, not only those under the control of this
|
||||
adapter, but all visible instances in order for the driver to
|
||||
identify leaked resources and instruct the adapter to remove
|
||||
them.
|
||||
|
||||
Look up the `nodepool_node_id` metadata attribute on cloud
|
||||
instances to compare to known_nodes, and `nodepool_upload_id`
|
||||
for known_uploads.
|
||||
:returns: A generator of :py:class:`Resource` objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
:param set known_nodes: A set of string values representing
|
||||
known node ids.
|
||||
:param set known_uploads: A set of string values representing
|
||||
known image upload ids.
|
||||
:param dict metadata: Metadata values to compare with cloud
|
||||
instances.
|
||||
def deleteResource(self, resource):
|
||||
"""Delete the supplied resource
|
||||
|
||||
The driver has identified a leaked resource and the adapter
|
||||
should delete it.
|
||||
|
||||
:param Resource resource: A Resource object previously
|
||||
returned by 'listResources'.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
Loading…
Reference in New Issue