Consider all node types when adjusting label quota

Since nodes can have multiple other labels apart from the requested
type, we need to adjust the available quota for all labels of nodes that
were allocated to a request.

This fixes a problem where a static driver pool could pause because the
requested node types were no longer available, but the request was still
accepted due to an incorrect label quota.

Change-Id: Ia9626ec26a66870574019ecc3f119a18e6c5022d
This commit is contained in:
Simon Westphahl
2022-08-24 15:28:52 +02:00
parent dfdf9d3684
commit 7b4ce1be8e
3 changed files with 55 additions and 5 deletions

View File

@@ -247,15 +247,20 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
# Got a lock, so assign it
log.info("Assigning node request %s" % req)
rh = pm.getRequestHandler(self, req)
rh.run()
if has_quota_support:
# Adjust the label quota so we don't accept more requests
# than we have labels available.
for label in req.node_types:
with contextlib.suppress(KeyError):
label_quota[label] -= 1
# Since nodes can have multiple other labels apart from the
# requested type, we need to adjust the quota for all labels
# of nodes that are allocated to the request.
for node in rh.nodeset:
for node_type in node.type:
with contextlib.suppress(KeyError):
label_quota[node_type] -= 1
rh = pm.getRequestHandler(self, req)
rh.run()
if rh.paused:
self.paused_handler = rh
self.request_handlers.append(rh)

View File

@@ -11,6 +11,8 @@ zookeeper-tls:
labels:
- name: fake-label
- name: fake-label2
- name: label-host-1
- name: label-host-2
providers:
- name: static-provider
@@ -22,6 +24,7 @@ providers:
labels:
- fake-label
- fake-label2
- label-host-1
host-key: ssh-rsa FAKEKEY
timeout: 13
connection-port: 22022
@@ -29,6 +32,7 @@ providers:
- name: fake-host-2
labels:
- fake-label2
- label-host-2
host-key: ssh-rsa FAKEKEY
timeout: 13
connection-port: 22022

View File

@@ -326,6 +326,47 @@ class TestDriverStatic(tests.DBTestCase):
self.waitForNodeDeletion(node)
self.waitForNodeRequest(req_waiting, zk.FULFILLED)
def test_label_quota(self):
    """Verify the launcher adjusts quota for every label of allocated nodes.

    Two requests consume both static nodes via their host-specific
    labels; a third request for a shared label must then wait, proving
    that quota was decremented for all labels of the allocated nodes,
    not just the requested ones.
    """
    configfile = self.setup_config('static-2-nodes-multilabel.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    pool.start()

    def _new_request(label):
        # Build (but do not yet submit) a request for a single label.
        request = zk.NodeRequest()
        request.state = zk.REQUESTED
        request.node_types.append(label)
        return request

    req1 = _new_request('label-host-1')
    req2 = _new_request('label-host-2')
    # Request a label that is no longer available, but wasn't requested
    # by any of the previous requests.
    req_waiting = _new_request('fake-label2')

    for request in (req1, req2, req_waiting):
        self.zk.storeNodeRequest(request)

    req1 = self.waitForNodeRequest(req1, zk.FULFILLED)
    busy_node = self.zk.getNode(req1.nodes[0])
    self.zk.lockNode(busy_node)
    busy_node.state = zk.USED
    self.zk.storeNode(busy_node)

    req2 = self.waitForNodeRequest(req2, zk.FULFILLED)

    # Assert that the request was not accepted, which means that
    # the label quota was correctly adjusted.
    req_waiting = self.zk.getNodeRequest(req_waiting.id)
    self.assertEqual(req_waiting.state, zk.REQUESTED)
    self.assertEqual(req_waiting.declined_by, [])

    self.zk.unlockNode(busy_node)
    self.waitForNodeDeletion(busy_node)
    self.waitForNodeRequest(req_waiting, zk.FULFILLED)
def test_static_ignore_assigned_ready_nodes(self):
"""Regression test to not touch assigned READY nodes"""
configfile = self.setup_config('static-basic.yaml')