Files
zuul/zuul/provider/__init__.py
T
James E. Blair a14971778a Fix endpoint races
This change fixes two endpoint-related races:

* We now shut down the aws driver delete thread rather than setting
  it to daemon.  This provides more predictable behavior in tests.

  Some tests started launcher endpoints out-of-band (i.e., they didn't
  really use the launcher, but it still started and stopped outside
  of the main action of the test).  In order to ensure that every
  endpoint that we start threads for is also correctly stopped,
  standard stop/start methods are added to endpoints and locks are
  used to ensure that the start/stop sequence is predictable.

  This also lets us use endpoints in the scheduler/web without
  starting background threads for them.

* It was possible to create multiple endpoints for a given region
  if they were created around the same time since we did not use
  a lock around the critical section of the cache.  To correct that,
  surround the cache lookup with a lock and move the behavior to a
  mixin class for reusability.  (A cachetools decorator around
  getEndpoint was considered but the provider object in the method
  signature makes that difficult).

Change-Id: I5e8ca06e76fced1bb342250a953ecda576168874
2024-10-28 15:37:42 -07:00

584 lines
19 KiB
Python

# Copyright 2024 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import abc
import json
import math
import threading
import urllib.parse
from zuul.lib.voluputil import Required, Optional, Nullable, assemble
from zuul import model
from zuul.driver.util import QuotaInformation
from zuul.zk import zkobject
import zuul.provider.schema as provider_schema
import voluptuous as vs
class CNameMixin:
    """Mixin providing a ``canonical_name`` for project-scoped objects.

    The host class must expose ``project_canonical_name`` and ``name``
    attributes.
    """

    @property
    def canonical_name(self):
        """Return "<project>/<name>" with each part URL-quoted.

        Both components are passed through ``quote_plus`` so that any
        "/" inside a component is escaped and the single separator
        remains unambiguous.
        """
        project_part = urllib.parse.quote_plus(self.project_canonical_name)
        name_part = urllib.parse.quote_plus(self.name)
        return f"{project_part}/{name_part}"
class BaseProviderImage(CNameMixin, metaclass=abc.ABCMeta):
    """Configuration object for an image within a provider.

    The validated configuration values are stored directly as
    instance attributes.
    """

    # Keys which, when absent from the image config, are copied from
    # the enclosing provider config.
    inheritable_schema = assemble(
        provider_schema.common_image,
    )
    # Schema applied to the merged image configuration.
    schema = assemble(
        provider_schema.common_image,
        provider_schema.base_image,
    )

    def __init__(self, image_config, provider_config):
        """Merge inherited provider values and validate the result.

        :param dict image_config: The image's own configuration; it
            is copied, not modified.
        :param dict provider_config: The provider configuration that
            supplies values for inheritable keys the image omits.
        """
        merged = image_config.copy()
        for key in self.inheritable_schema.schema:
            # Values set on the image itself always win; only fill in
            # keys that the provider actually defines.
            if key in merged or key not in provider_config:
                continue
            merged[key] = provider_config[key]
        vars(self).update(self.schema(merged))
        # TODO: generate this automatically from config
        self.format = 'raw'
class BaseProviderFlavor(CNameMixin, metaclass=abc.ABCMeta):
    """Configuration object for a flavor within a provider.

    The validated configuration values are stored directly as
    instance attributes.
    """

    # Keys which, when absent from the flavor config, are copied from
    # the enclosing provider config (none in the base class).
    inheritable_schema = assemble()
    # Schema applied to the merged flavor configuration.
    schema = assemble(
        provider_schema.base_flavor,
    )

    def __init__(self, flavor_config, provider_config):
        """Merge inherited provider values and validate the result.

        :param dict flavor_config: The flavor's own configuration; it
            is copied, not modified.
        :param dict provider_config: The provider configuration that
            supplies values for inheritable keys the flavor omits.
        """
        merged = flavor_config.copy()
        for key in self.inheritable_schema.schema:
            # Values set on the flavor itself always win; only fill in
            # keys that the provider actually defines.
            if key in merged or key not in provider_config:
                continue
            merged[key] = provider_config[key]
        vars(self).update(self.schema(merged))
class BaseProviderLabel(CNameMixin, metaclass=abc.ABCMeta):
    """Configuration object for a label within a provider.

    The validated configuration values are stored directly as
    instance attributes.
    """

    # Keys which, when absent from the label config, are copied from
    # the enclosing provider config (none in the base class).
    inheritable_schema = assemble()
    # Schema applied to the merged label configuration.
    schema = assemble(
        provider_schema.base_label,
    )
    # Attributes a label may inherit from its flavor or image; see
    # inheritFrom().
    image_flavor_inheritable_schema = assemble()

    def __init__(self, label_config, provider_config):
        """Merge inherited provider values and validate the result.

        :param dict label_config: The label's own configuration; it
            is copied, not modified.
        :param dict provider_config: The provider configuration that
            supplies values for inheritable keys the label omits.
        """
        merged = label_config.copy()
        for key in self.inheritable_schema.schema:
            # Values set on the label itself always win; only fill in
            # keys that the provider actually defines.
            if key in merged or key not in provider_config:
                continue
            merged[key] = provider_config[key]
        vars(self).update(self.schema(merged))

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"<{cls_name} canonical_name={self.canonical_name} >"

    def inheritFrom(self, image, flavor):
        """Fill unset label attributes from the flavor or the image.

        Some label attributes should default to values specified in a
        flavor or image (for example, volume types) but can be
        overridden by a label.  For every attribute listed in
        image_flavor_inheritable_schema whose value on the label is
        missing or None, the value is taken from the flavor if the
        flavor has that attribute, otherwise from the image, otherwise
        it is set to None.

        :param image: The image object referenced by this label.
        :param flavor: The flavor object referenced by this label.
        """
        for marker in self.image_flavor_inheritable_schema.schema:
            # Schema keys may be marker objects carrying the final
            # attribute name in their "output" attribute.
            name = marker.output if hasattr(marker, 'output') else marker
            if getattr(self, name, None) is not None:
                continue
            image_value = getattr(image, name, None)
            setattr(self, name, getattr(flavor, name, image_value))
class BaseProviderEndpoint(metaclass=abc.ABCMeta):
    """Base class for provider endpoints.

    Providers and Sections are combined to describe clouds, and they
    may not correspond exactly with the cloud's topology.  To
    reconcile this, the Endpoint class is used for storing information
    about what we would typically call a region of a cloud.  This is
    the unit of visibility of instances, VPCs, images, etc.

    Callers use start() and stop().  Subclasses implement the
    handleStart() and handleStop() hooks (or override startEndpoint()
    and stopEndpoint() directly).
    """

    def __init__(self, driver, connection, name):
        """Store the endpoint identity and initialize lifecycle state.

        :param driver: The driver this endpoint belongs to.
        :param connection: The connection for this endpoint; its
            ``connection_name`` is used to build the canonical name.
        :param str name: The name of the endpoint.
        """
        self.driver = driver
        self.connection = connection
        self.name = name
        # Held for the duration of start() and stop() so that the two
        # cannot interleave.
        self.start_lock = threading.Lock()
        self.started = False
        self.stopped = False

    def start(self):
        """Start the endpoint unless it has been started or stopped.

        The start hook runs at most once; once stop() has been called
        the endpoint will never start.
        """
        with self.start_lock:
            if not self.stopped and not self.started:
                self.startEndpoint()
                self.started = True

    def stop(self):
        """Stop the endpoint and prevent any later start.

        The stop hook runs only if the endpoint was started and has
        not already been stopped, so repeated calls are harmless.
        """
        with self.start_lock:
            if self.started and not self.stopped:
                self.stopEndpoint()
            # Set the stopped flag regardless of whether we started so
            # that we won't start after stopping.
            self.stopped = True

    @property
    def canonical_name(self):
        """Return "<connection name>/<endpoint name>", URL-quoted."""
        return '/'.join([
            urllib.parse.quote_plus(self.connection.connection_name),
            urllib.parse.quote_plus(self.name),
        ])

    def startEndpoint(self):
        """Hook invoked by start() while holding the start lock.

        The default implementation delegates to handleStart() so that
        subclasses may implement either method.
        """
        self.handleStart()

    def stopEndpoint(self):
        """Hook invoked by stop() while holding the start lock.

        The default implementation delegates to handleStop() so that
        subclasses may implement either method.
        """
        self.handleStop()

    def handleStart(self):
        """Start the endpoint

        This method may start any threads necessary for the endpoint.

        :raises NotImplementedError: if the subclass does not
            override it.
        """
        raise NotImplementedError()

    def handleStop(self):
        """Stop the endpoint

        This method must stop all endpoint threads.

        :raises NotImplementedError: if the subclass does not
            override it.
        """
        raise NotImplementedError()
class BaseProviderSchema(metaclass=abc.ABCMeta):
    """Builder for the voluptuous schema of a provider definition.

    The label, image and flavor schemas are obtained through separate
    accessor methods so that a subclass can substitute its own.
    """

    def getLabelSchema(self):
        """Return the schema used for each entry under 'labels'."""
        return BaseProviderLabel.schema

    def getImageSchema(self):
        """Return the schema used for each entry under 'images'."""
        return BaseProviderImage.schema

    def getFlavorSchema(self):
        """Return the schema used for each entry under 'flavors'."""
        return BaseProviderFlavor.schema

    def getProviderSchema(self):
        """Return a schema describing a complete provider definition."""
        label_schema = self.getLabelSchema()
        image_schema = self.getImageSchema()
        flavor_schema = self.getFlavorSchema()
        fields = {
            '_source_context': model.SourceContext,
            '_start_mark': model.ZuulMark,
            Required('name'): str,
            Required('section'): str,
            Required('labels'): [label_schema],
            Required('images'): [image_schema],
            Required('flavors'): [flavor_schema],
            Optional('abstract', default=False): Nullable(bool),
            Optional('parent'): Nullable(str),
            Required('connection'): str,
            Optional('boot-timeout'): Nullable(int),
            Optional('launch-timeout'): Nullable(int),
            Optional('launch-attempts', default=3): int,
        }
        return vs.Schema(fields)
class BaseProvider(zkobject.PolymorphicZKObjectMixin,
                   zkobject.ShardedZKObject):
    """Base class for provider.

    A provider holds the parsed images, flavors and labels for one
    provider definition together with the driver and connection used
    to reach the cloud.  Drivers subclass this and implement the
    parse*/getEndpoint methods and whichever of the cloud operations
    below they support.
    """

    schema = BaseProviderSchema().getProviderSchema()

    def __init__(self, *args):
        """Create a provider, optionally from a configuration.

        When called with no arguments only the base initializer runs.
        Otherwise exactly five positional arguments are expected:
        ``(driver, connection, tenant_name, canonical_name, config)``.

        :raises KeyError: if config lacks the '_source_context' or
            '_start_mark' keys.
        """
        super().__init__()
        if args:
            (driver, connection, tenant_name, canonical_name, config) = args
            # Work on a copy so the caller's dict keeps its source
            # markers; these are removed from the stored config.
            config = config.copy()
            config.pop('_source_context')
            config.pop('_start_mark')
            parsed_config = self.parseConfig(config, connection)
            # The connection object is passed explicitly below, so
            # drop the connection name from the parsed values.
            parsed_config.pop('connection')
            self._set(
                driver=driver,
                connection=connection,
                connection_name=connection.connection_name,
                tenant_name=tenant_name,
                canonical_name=canonical_name,
                config=config,
                **parsed_config,
            )

    def __repr__(self):
        return (f"<{self.__class__.__name__} "
                f"canonical_name={self.canonical_name}>")

    @classmethod
    def fromZK(cls, context, path, connections):
        """Deserialize a Provider (subclass) from ZK.

        To deserialize a Provider from ZK, pass the connection
        registry as the "connections" argument.

        The Provider subclass will automatically be deserialized and
        the connection/driver attributes updated from the connection
        registry.
        """
        raw_data, zstat = cls._loadData(context, path)
        extra = {'connections': connections}
        obj = cls._fromRaw(raw_data, zstat, extra)
        connection = connections.connections[obj.connection_name]
        obj._set(connection=connection,
                 driver=connection.driver)
        return obj

    def getProviderSchema(self):
        """Return the schema used to validate the provider config."""
        return self.schema

    def parseProviderConfig(self, config):
        """Parse the provider config without any images/labels/flavors
        so that the other objects can collect any information they
        need from the cloud region when they are parsed"""
        schema = self.getProviderSchema()
        ret = schema(config)
        return ret

    def parseFullConfig(self, config, connection=None):
        """Parse the provider config including images/flavors/labels.

        :param dict config: The provider configuration.
        :param connection: The connection handed to the image, flavor
            and label parsers.  If omitted, this provider's own
            ``connection`` attribute is used when it is set.
        :returns: The validated provider config with 'images',
            'flavors' and 'labels' replaced by dictionaries of parsed
            objects keyed by name.
        """
        if connection is None:
            # The parse* helpers require a connection argument; fall
            # back to the one already attached to this provider.
            connection = getattr(self, 'connection', None)
        ret = self.parseProviderConfig(config)
        ret.update(dict(
            images=self.parseImages(config, connection),
            flavors=self.parseFlavors(config, connection),
            labels=self.parseLabels(config, connection),
        ))
        return ret

    def parseConfig(self, config, connection):
        """Validate the config and build image/flavor/label objects.

        After parsing, each label inherits unset attributes from the
        image and flavor it references.

        :param dict config: The provider configuration.
        :param connection: The connection handed to the image, flavor
            and label parsers.
        :returns: The validated provider config with 'images',
            'flavors' and 'labels' replaced by dictionaries of parsed
            objects keyed by name.
        :raises KeyError: if a label references an image or flavor
            that is not defined in this provider.
        """
        schema = self.getProviderSchema()
        ret = schema(config)
        images = self.parseImages(config, connection)
        flavors = self.parseFlavors(config, connection)
        labels = self.parseLabels(config, connection)
        for label in labels.values():
            label.inheritFrom(images[label.image], flavors[label.flavor])
        ret.update(dict(
            images=images,
            flavors=flavors,
            labels=labels,
        ))
        return ret

    def deserialize(self, raw, context, extra):
        """Rebuild the provider attributes from serialized data.

        :param raw: The serialized data.
        :param context: The ZK context.
        :param dict extra: Must contain the connection registry under
            the 'connections' key.
        :returns: The attribute dictionary, including the connection,
            driver and the re-parsed configuration.
        """
        data = super().deserialize(raw, context)
        connections = extra['connections']
        connection = connections.connections[data['connection_name']]
        data['connection'] = connection
        data['driver'] = connection.driver
        data.update(self.parseConfig(data['config'], connection))
        return data

    def serialize(self, context):
        """Return the provider as UTF-8 encoded JSON.

        Only the tenant name, canonical name, raw config and
        connection name are stored; parsed objects are rebuilt by
        deserialize().
        """
        data = dict(
            tenant_name=self.tenant_name,
            canonical_name=self.canonical_name,
            config=self.config,
            connection_name=self.connection.connection_name,
        )
        return json.dumps(data, sort_keys=True).encode("utf8")

    @property
    def tenant_scoped_name(self):
        return f'{self.tenant_name}-{self.name}'

    def parseImages(self, config, connection):
        """Return a dict of parsed images keyed by image name."""
        images = {}
        for image_config in config.get('images', []):
            image = self.parseImage(image_config, config, connection)
            images[image.name] = image
        return images

    def parseFlavors(self, config, connection):
        """Return a dict of parsed flavors keyed by flavor name."""
        flavors = {}
        for flavor_config in config.get('flavors', []):
            flavor = self.parseFlavor(flavor_config, config, connection)
            flavors[flavor.name] = flavor
        return flavors

    def parseLabels(self, config, connection):
        """Return a dict of parsed labels keyed by label name."""
        labels = {}
        for label_config in config.get('labels', []):
            label = self.parseLabel(label_config, config, connection)
            labels[label.name] = label
        return labels

    @abc.abstractmethod
    def parseLabel(self, label_config, provider_config, connection=None):
        """Instantiate a ProviderLabel subclass

        :param dict label_config: The configuration of one label.
        :param dict provider_config: The enclosing provider config.
        :param connection: The connection passed by parseLabels().

        :returns: a ProviderLabel subclass
        :rtype: ProviderLabel
        """
        pass

    @abc.abstractmethod
    def parseFlavor(self, flavor_config, provider_config, connection=None):
        """Instantiate a ProviderFlavor subclass

        :param dict flavor_config: The configuration of one flavor.
        :param dict provider_config: The enclosing provider config.
        :param connection: The connection passed by parseFlavors().

        :returns: a ProviderFlavor subclass
        :rtype: ProviderFlavor
        """
        pass

    @abc.abstractmethod
    def parseImage(self, image_config, provider_config, connection=None):
        """Instantiate a ProviderImage subclass

        :param dict image_config: The configuration of one image.
        :param dict provider_config: The enclosing provider config.
        :param connection: The connection passed by parseImages().

        :returns: a ProviderImage subclass
        :rtype: ProviderImage
        """
        pass

    @abc.abstractmethod
    def getEndpoint(self):
        """Get an endpoint for this provider"""
        pass

    def getPath(self):
        """Return the ZK path under which this provider is stored."""
        path = (f'/zuul/tenant/{self.tenant_name}'
                f'/provider/{self.canonical_name}/config')
        return path

    def hasLabel(self, label):
        """Return whether a label with this name is defined."""
        return label in self.labels

    def getNodeTags(self, system_id, label, node_uuid,
                    provider=None, request=None):
        """Return the tags that should be stored with the node

        Tag values configured on the label are format strings that
        are expanded with the request id and tenant name.  A tag that
        fails to format is logged and omitted.  The fixed
        'zuul_system_id' and 'zuul_node_uuid' tags are always present
        and override label tags of the same name.

        :param str system_id: The Zuul system uuid
        :param ProviderLabel label: The node label
        :param str node_uuid: The node uuid
        :param Provider provider: The cloud provider or None
        :param NodesetRequest request: The node request or None

        :returns: A dictionary of tag names to values.
        """
        tags = dict()
        # TODO: add other potentially useful attrs from nodepool
        attributes = model.Attributes(
            request_id=request.uuid if request else None,
            tenant_name=provider.tenant_name if provider else None,
        )
        for k, v in label.tags.items():
            try:
                tags[k] = v.format(**attributes)
            except Exception:
                # NOTE(review): self.log is not defined in this class
                # as shown; presumably it is supplied by a base class
                # or subclass -- confirm.
                self.log.exception("Error formatting metadata %s", k)
        fixed = {
            'zuul_system_id': system_id,
            'zuul_node_uuid': node_uuid,
        }
        tags.update(fixed)
        return tags

    def getCreateStateMachine(self, node,
                              image_external_id,
                              log):
        """Return a state machine suitable for creating an instance

        This method should return a new state machine object
        initialized to create the described node.

        :param ProviderNode node: The node object.
        :param str image_external_id: If provided, the external id of
            a previously uploaded image; if None, then the adapter should
            look up a cloud image based on the label.
        :param log Logger: A logger instance for emitting annotated
            logs related to the request.

        :returns: A :py:class:`StateMachine` object.
        """
        raise NotImplementedError()

    def getDeleteStateMachine(self, node, log):
        """Return a state machine suitable for deleting an instance

        This method should return a new state machine object
        initialized to delete the described instance.

        :param node ProviderNode: The node that should be deleted.
        :param log Logger: A logger instance for emitting annotated
            logs related to the request.
        """
        raise NotImplementedError()

    def listInstances(self):
        """Return an iterator of instances accessible to this provider.

        The yielded values should represent all instances accessible
        to this provider, not only those under the control of this
        adapter, but all visible instances in order to achieve accurate
        quota calculation.

        :returns: A generator of :py:class:`Instance` objects.
        """
        raise NotImplementedError()

    def listResources(self):
        """Return a list of resources accessible to this provider.

        The yielded values should represent all resources accessible
        to this provider, not only those under the control of this
        adapter, but all visible instances in order for the driver to
        identify leaked resources and instruct the adapter to remove
        them.

        :returns: A generator of :py:class:`Resource` objects.
        """
        raise NotImplementedError()

    def deleteResource(self, resource):
        """Delete the supplied resource

        The driver has identified a leaked resource and the adapter
        should delete it.

        :param Resource resource: A Resource object previously
            returned by 'listResources'.
        """
        raise NotImplementedError()

    def getQuotaLimits(self):
        """Return the quota limits for this provider

        The default implementation returns a simple QuotaInformation
        with no limits.  Override this to provide accurate
        information.

        :returns: A :py:class:`QuotaInformation` object.
        """
        return QuotaInformation(default=math.inf)

    def getQuotaForLabel(self, label):
        """Return information about the quota used for a label

        The default implementation returns a simple QuotaInformation
        for one instance; override this to return more detailed
        information including cores and RAM.

        :param ProviderLabel label: A config object describing
            a label for an instance.

        :returns: A :py:class:`QuotaInformation` object.
        """
        return QuotaInformation(instances=1)

    def getAZs(self):
        """Return a list of availability zones for this provider

        One of these will be selected at random and supplied to the
        create state machine.  If a request handler is building a node
        set from an existing ready node, then the AZ from that node
        will be used instead of the results of this method.

        :returns: A list of availability zone names.
        """
        return [None]

    def labelReady(self, label):
        """Indicate whether a label is ready in the provided cloud

        This is used by the launcher to determine whether it should
        consider a label to be in-service for a provider.  If this
        returns False, the label will be ignored for this provider.

        This does not need to consider whether a diskimage is ready;
        the launcher handles that itself.  Instead, this can be used
        to determine whether a cloud-image is available.

        :param ProviderLabel label: A config object describing a label
            for an instance.

        :returns: A bool indicating whether the label is ready.
        """
        return True

    # The following methods must be implemented only if image
    # management is supported:

    def uploadImage(self, provider_image, image_name, filename,
                    image_format=None, metadata=None, md5=None,
                    sha256=None):
        """Upload the image to the cloud

        :param provider_image ProviderImageConfig:
            The provider's config for this image
        :param image_name str: The name of the image
        :param filename str: The path to the local file to be uploaded
        :param image_format str: The format of the image (e.g., "qcow")
        :param metadata dict: A dictionary of metadata that must be
            stored on the image in the cloud.
        :param md5 str: The md5 hash of the image file
        :param sha256 str: The sha256 hash of the image file

        :return: The external id of the image in the cloud
        """
        raise NotImplementedError()

    def deleteImage(self, external_id):
        """Delete an image from the cloud

        :param external_id str: The external id of the image to delete
        """
        raise NotImplementedError()

    # The following methods are optional

    def getConsoleLog(self, label, node):
        """Return the console log from the specified server

        :param label ConfigLabel: The label config for the node
        :param ProviderNode node: The node of the server
        """
        raise NotImplementedError()

    def notifyNodescanFailure(self, label, node):
        """Notify the adapter of a nodescan failure

        The default implementation does nothing.

        :param label ConfigLabel: The label config for the node
        :param ProviderNode node: The node of the server
        """
        pass
class EndpointCacheMixin:
    """Mixin that caches endpoint objects by id behind a lock.

    The host class must define ``_endpoint_class``, which is called
    with the supplied creation arguments to build a new endpoint.
    """

    def __init__(self, *args, **kw):
        super().__init__(*args, **kw)
        # Maps endpoint id -> endpoint instance.
        self.endpoints = {}
        # Guards every access to self.endpoints.
        self.endpoints_lock = threading.Lock()

    def getEndpointById(self, endpoint_id, create_args):
        """Return the endpoint cached under endpoint_id, creating it
        if necessary.

        The lookup and the creation happen under one lock
        acquisition, so concurrent callers asking for the same id
        receive the same object.

        :param endpoint_id: The cache key for the endpoint.
        :param create_args: Positional arguments passed to
            ``_endpoint_class`` when no endpoint is cached yet.
        :returns: The cached or newly created endpoint.
        """
        with self.endpoints_lock:
            if endpoint_id not in self.endpoints:
                self.endpoints[endpoint_id] = self._endpoint_class(
                    *create_args)
            return self.endpoints[endpoint_id]

    def stopEndpoints(self):
        """Call stop() on every cached endpoint while holding the lock."""
        with self.endpoints_lock:
            for cached in self.endpoints.values():
                cached.stop()