Improve max-servers handling for GCE

The quota handling for the simple driver (used by GCE) only handled
max-servers, and even then it did not take currently building servers
into account.  If a number of simultaneous requests were received, it
would try to build them all and eventually return node failures for
the ones that the cloud refused to build.

The OpenStack driver has a lot of nice quota handling methods which
do take currently building nodes into account.  This change moves
some of those methods into a new Provider mixin class for quota
support.  This class implements some handy methods which perform
the calculations and provides some abstract methods which providers
will need to implement in order to supply information.

The simple driver is updated to use this system, though it still
only supports max-servers for the moment.

Change-Id: I0ce742452914301552f4af5e92a3e36304a7e291
This commit is contained in:
James E. Blair 2020-06-20 09:51:21 -07:00
parent 004f60eb8b
commit 9e9a5b9bfd
6 changed files with 179 additions and 103 deletions

View File

@ -147,7 +147,7 @@ class ProviderNotifications(object):
pass
class Provider(ProviderNotifications, metaclass=abc.ABCMeta):
class Provider(ProviderNotifications):
"""The Provider interface
Drivers implement this interface to supply Providers. Each
@ -160,6 +160,9 @@ class Provider(ProviderNotifications, metaclass=abc.ABCMeta):
The class or instance attribute **name** must be provided as a string.
"""
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
@abc.abstractmethod
def start(self, zk_conn):
"""Start this provider

View File

@ -152,7 +152,7 @@ class OpenStackNodeLauncher(NodeLauncher):
self.node.image_id = image_id
pool = self.handler.provider.pools.get(self.node.pool)
resources = self.handler.manager.quotaNeededByNodeType(
resources = self.handler.manager.quotaNeededByLabel(
self.node.type[0], pool)
self.node.resources = resources.quota['compute']
if username:
@ -340,7 +340,7 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
return True
def hasRemainingQuota(self, ntype):
needed_quota = self.manager.quotaNeededByNodeType(ntype, self.pool)
needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
if not self.pool.ignore_provider_quota:
# Calculate remaining quota which is calculated as:
@ -374,7 +374,7 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
for ntype in node_types:
needed_quota.add(
self.manager.quotaNeededByNodeType(ntype, self.pool))
self.manager.quotaNeededByLabel(ntype, self.pool))
if not self.pool.ignore_provider_quota:
cloud_quota = self.manager.estimatedNodepoolQuota()

View File

@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
import operator
import os
@ -24,7 +23,7 @@ import openstack
from nodepool import exceptions
from nodepool.driver import Provider
from nodepool.driver.utils import QuotaInformation
from nodepool.driver.utils import QuotaInformation, QuotaSupport
from nodepool.nodeutils import iterate_timeout
from nodepool import stats
from nodepool import version
@ -35,19 +34,18 @@ from nodepool.driver.openstack import handler
IPS_LIST_AGE = 5 # How long to keep a cached copy of the ip list
MAX_QUOTA_AGE = 5 * 60 # How long to keep the quota information cached
class OpenStackProvider(Provider):
class OpenStackProvider(Provider, QuotaSupport):
log = logging.getLogger("nodepool.driver.openstack.OpenStackProvider")
def __init__(self, provider):
super().__init__()
self.provider = provider
self._images = {}
self._networks = {}
self.__flavors = {} # TODO(gtema): caching
self.__azs = None
self._current_nodepool_quota = None
self._zk = None
self._down_ports = set()
self._last_port_cleanup = None
@ -91,7 +89,11 @@ class OpenStackProvider(Provider):
app_version=version.version_info.version_string()
)
def quotaNeededByNodeType(self, ntype, pool):
def getProviderLimits(self):
    """Return the compute limits reported by the cloud client.

    :return: QuotaInformation constructed from the client's compute limits
    """
    return QuotaInformation.construct_from_limits(
        self._client.get_compute_limits())
def quotaNeededByLabel(self, ntype, pool):
provider_label = pool.labels[ntype]
flavor = self.findFlavor(provider_label.flavor_name,
@ -99,86 +101,6 @@ class OpenStackProvider(Provider):
return QuotaInformation.construct_from_flavor(flavor)
def estimatedNodepoolQuota(self):
'''
Determine how much quota is available for nodepool managed resources.
This needs to take into account the quota of the tenant, resources
used outside of nodepool and the currently used resources by nodepool,
max settings in nodepool config. This is cached for MAX_QUOTA_AGE
seconds.
:return: Total amount of resources available which is currently
available to nodepool including currently existing nodes.
'''
if self._current_nodepool_quota:
now = time.time()
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota['quota'])
limits = self._client.get_compute_limits()
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
nodepool_quota = QuotaInformation.construct_from_limits(limits)
self.log.debug("Provider quota for %s: %s",
self.provider.name, nodepool_quota)
# Subtract the unmanaged quota usage from nodepool_max
# to get the quota available for us.
nodepool_quota.subtract(self.unmanagedQuotaUsed())
self._current_nodepool_quota = {
'quota': nodepool_quota,
'timestamp': time.time()
}
self.log.debug("Available quota for %s: %s",
self.provider.name, nodepool_quota)
return copy.deepcopy(nodepool_quota)
def invalidateQuotaCache(self):
self._current_nodepool_quota['timestamp'] = 0
def estimatedNodepoolQuotaUsed(self, pool=None):
'''
Sums up the quota used (or planned) currently by nodepool. If pool is
given it is filtered by the pool.
:param pool: If given, filtered by the pool.
:return: Calculated quota in use by nodepool
'''
used_quota = QuotaInformation()
for node in self._zk.nodeIterator():
if node.provider == self.provider.name:
try:
if pool and not node.pool == pool.name:
continue
provider_pool = self.provider.pools.get(node.pool)
if not provider_pool:
self.log.warning(
"Cannot find provider pool for node %s" % node)
# This node is in a funny state we log it for debugging
# but move on and don't account it as we can't properly
# calculate its cost without pool info.
continue
if node.type[0] not in provider_pool.labels:
self.log.warning("Node type is not in provider pool "
"for node %s" % node)
# This node is also in a funny state; the config
# may have changed under it. It should settle out
# eventually when it's deleted.
continue
node_resources = self.quotaNeededByNodeType(
node.type[0], provider_pool)
used_quota.add(node_resources)
except Exception:
self.log.exception("Couldn't consider invalid node %s "
"for quota:" % node)
return used_quota
def unmanagedQuotaUsed(self):
'''
Sums up the quota used by servers unmanaged by nodepool.

View File

@ -18,7 +18,7 @@ import math
from nodepool.driver.taskmanager import BaseTaskManagerProvider, Task
from nodepool.driver import Driver, NodeRequestHandler
from nodepool.driver.utils import NodeLauncher, QuotaInformation
from nodepool.driver.utils import NodeLauncher, QuotaInformation, QuotaSupport
from nodepool.nodeutils import iterate_timeout, nodescan
from nodepool import exceptions
from nodepool import zk
@ -199,18 +199,31 @@ class SimpleTaskManagerHandler(NodeRequestHandler):
:param ntype: node type for the quota check
:return: True if there is enough quota, False otherwise
'''
# TODO: Add support for real quota handling; this only handles
# max_servers.
needed_quota = QuotaInformation(cores=1, instances=1, ram=1, default=1)
n_running = self.manager.countNodes(self.provider.name, self.pool.name)
pool_quota = QuotaInformation(
cores=math.inf,
instances=self.pool.max_servers - n_running,
ram=math.inf,
default=math.inf)
needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
# Calculate remaining quota which is calculated as:
# quota = <total nodepool quota> - <used quota> - <quota for node>
cloud_quota = self.manager.estimatedNodepoolQuota()
cloud_quota.subtract(
self.manager.estimatedNodepoolQuotaUsed())
cloud_quota.subtract(needed_quota)
self.log.debug("Predicted remaining provider quota: %s",
cloud_quota)
if not cloud_quota.non_negative():
return False
# Now calculate pool specific quota. Values indicating no quota default
# to math.inf representing infinity that can be calculated with.
# TODO: add cores, ram
pool_quota = QuotaInformation(instances=self.pool.max_servers,
default=math.inf)
pool_quota.subtract(
self.manager.estimatedNodepoolQuotaUsed(self.pool))
self.log.debug("Current pool quota: %s" % pool_quota)
pool_quota.subtract(needed_quota)
self.log.debug("hasRemainingQuota({},{}) = {}".format(
self.pool, ntype, pool_quota))
self.log.debug("Predicted remaining pool quota: %s", pool_quota)
return pool_quota.non_negative()
def launchesComplete(self):
@ -243,7 +256,7 @@ class SimpleTaskManagerHandler(NodeRequestHandler):
self._threads.append(thd)
class SimpleTaskManagerProvider(BaseTaskManagerProvider):
class SimpleTaskManagerProvider(BaseTaskManagerProvider, QuotaSupport):
"""The Provider implementation for the SimpleTaskManager driver
framework"""
def __init__(self, adapter, provider):
@ -263,6 +276,22 @@ class SimpleTaskManagerProvider(BaseTaskManagerProvider):
def labelReady(self, label):
return True
def getProviderLimits(self):
    """Return the provider limits; every resource is reported as unlimited.

    :return: QuotaInformation with all values set to infinity
    """
    # TODO: query the api to get real limits
    unlimited = math.inf
    return QuotaInformation(cores=unlimited,
                            instances=unlimited,
                            ram=unlimited,
                            default=unlimited)
def quotaNeededByLabel(self, ntype, pool):
    """Return the quota needed by one node of the given label.

    Both arguments are currently ignored: every label is costed at one
    unit of each resource.

    :return: QuotaInformation with every value set to 1
    """
    # TODO: return real quota information about a label
    flat_cost = QuotaInformation(cores=1, instances=1, ram=1, default=1)
    return flat_cost
def unmanagedQuotaUsed(self):
    """Return the quota used by servers unmanaged by nodepool.

    Nothing is queried yet; a default-constructed QuotaInformation is
    returned.
    """
    # TODO: return real quota information about quota
    no_usage = QuotaInformation()
    return no_usage
def cleanupNode(self, external_id):
instance = self.getInstance(external_id)
if (not instance) or instance.deleted:

View File

@ -164,6 +164,7 @@ class BaseTaskManagerProvider(Provider):
log = logging.getLogger("nodepool.driver.taskmanager.TaskManagerProvider")
def __init__(self, provider):
super().__init__()
self.provider = provider
self.thread = None
self.task_manager = TaskManager(provider.name, provider.rate_limit)

View File

@ -15,6 +15,7 @@
# limitations under the License.
import abc
import copy
import logging
import math
import threading
@ -28,6 +29,9 @@ from nodepool import zk
from nodepool.logconfig import get_annotated_logger
MAX_QUOTA_AGE = 5 * 60 # How long to keep the quota information cached
class NodeLauncher(threading.Thread,
stats.StatsReporter,
metaclass=abc.ABCMeta):
@ -171,3 +175,120 @@ class QuotaInformation:
def __str__(self):
return str(self.quota)
class QuotaSupport:
    """A mix-in class for providers to supply quota support methods.

    The concrete methods here read ``self.log``, ``self.provider`` and
    ``self._zk``; the class that mixes this in must supply those
    attributes, and must implement the abstract methods below.
    """

    def __init__(self, *args, **kw):
        # Cooperative init: forward everything so the mix-in can be
        # combined with other base classes.
        super().__init__(*args, **kw)
        # None until the first estimate has been made; afterwards a dict
        # with 'quota' and 'timestamp' keys.
        self._current_nodepool_quota = None

    @abc.abstractmethod
    def quotaNeededByLabel(self, label, pool):
        """Return quota information about a label

        :param str label: The label name
        :param ProviderPool pool: A ProviderPool config object with the label

        :return: QuotaInformation about the label
        """

    @abc.abstractmethod
    def unmanagedQuotaUsed(self):
        """Sum up the quota used by servers unmanaged by nodepool.

        :return: Calculated quota in use by unmanaged servers
        """

    @abc.abstractmethod
    def getProviderLimits(self):
        """Get the resource limits from the provider.

        :return: QuotaInformation about the provider's limits
        """

    def invalidateQuotaCache(self):
        """Force the next estimatedNodepoolQuota() call to recompute.

        This is a no-op when nothing has been cached yet.
        """
        # The cache is None until the first estimate; subscripting it
        # unconditionally would raise TypeError.
        if self._current_nodepool_quota is not None:
            self._current_nodepool_quota['timestamp'] = 0

    def estimatedNodepoolQuota(self):
        """Determine how much quota is available for nodepool managed
        resources.

        This needs to take into account the quota of the tenant, resources
        used outside of nodepool and the currently used resources by
        nodepool, max settings in nodepool config. This is cached for
        MAX_QUOTA_AGE seconds.

        :return: Total amount of resources available which is currently
                 available to nodepool including currently existing nodes.
        """
        cached = self._current_nodepool_quota
        if cached:
            now = time.time()
            if now < cached['timestamp'] + MAX_QUOTA_AGE:
                # Hand out a copy so callers can subtract from it freely.
                return copy.deepcopy(cached['quota'])

        # This is initialized with the full tenant quota and later becomes
        # the quota available for nodepool.
        nodepool_quota = self.getProviderLimits()

        self.log.debug("Provider quota for %s: %s",
                       self.provider.name, nodepool_quota)

        # Subtract the unmanaged quota usage from nodepool_max
        # to get the quota available for us.
        nodepool_quota.subtract(self.unmanagedQuotaUsed())

        self._current_nodepool_quota = {
            'quota': nodepool_quota,
            'timestamp': time.time()
        }

        self.log.debug("Available quota for %s: %s",
                       self.provider.name, nodepool_quota)

        return copy.deepcopy(nodepool_quota)

    def estimatedNodepoolQuotaUsed(self, pool=None):
        """Sum up the quota used (or planned) currently by nodepool.

        Every node returned by the ZooKeeper node iterator that belongs
        to this provider is counted.

        :param pool: If given, only nodes in this pool are counted.

        :return: Calculated quota in use by nodepool
        """
        used_quota = QuotaInformation()

        for node in self._zk.nodeIterator():
            if node.provider != self.provider.name:
                continue
            try:
                if pool and node.pool != pool.name:
                    continue
                provider_pool = self.provider.pools.get(node.pool)
                if not provider_pool:
                    self.log.warning(
                        "Cannot find provider pool for node %s", node)
                    # This node is in a funny state we log it for debugging
                    # but move on and don't account it as we can't properly
                    # calculate its cost without pool info.
                    continue
                if node.type[0] not in provider_pool.labels:
                    self.log.warning("Node type is not in provider pool "
                                     "for node %s", node)
                    # This node is also in a funny state; the config
                    # may have changed under it. It should settle out
                    # eventually when it's deleted.
                    continue
                node_resources = self.quotaNeededByLabel(
                    node.type[0], provider_pool)
                used_quota.add(node_resources)
            except Exception:
                # Best effort: one bad node must not break the whole
                # quota calculation.
                self.log.exception("Couldn't consider invalid node %s "
                                   "for quota:", node)
        return used_quota