Merge "Add a global limit on the number of allocation candidates"

Zuul 2025-01-08 21:43:26 +00:00 committed by Gerrit Code Review
commit 7d195137b6
4 changed files with 189 additions and 5 deletions

View File

@@ -26,11 +26,14 @@ placement_opts = [
default=False,
help="""
If True, when limiting allocation candidate results, the results will be
a random sampling of the full result set. If False, allocation candidates
are returned in a deterministic but undefined order. That is, all things
being equal, two requests for allocation candidates will return the same
results in the same order; but no guarantees are made as to how that order
is determined.
a random sampling of the full result set. The
[placement]max_allocation_candidates config might limit the size of the full
set used as the input of the sampling.
If False, allocation candidates are returned in a deterministic but undefined
order. That is, all things being equal, two requests for allocation candidates
will return the same results in the same order; but no guarantees are made as
to how that order is determined.
"""),
cfg.StrOpt(
'incomplete_consumer_project_id',
@@ -59,6 +62,28 @@ doesn't provide.
The number of times to retry, server-side, writing allocations when there is
a resource provider generation conflict. Raising this value may be useful
when many concurrent allocations to the same resource provider are expected.
"""),
cfg.IntOpt(
'max_allocation_candidates',
default=-1,
help="""
The maximum number of allocation candidates placement generates for a single
request. This is a global limit to avoid excessive memory use and query
runtime. If set to -1, the number of generated candidates is limited only by
the number and structure of the resource providers and the content of the
allocation_candidates query.
Note that the limit param of the allocation_candidates query is applied after
all the viable candidates have been generated, so that param alone is not
enough to restrict the runtime or memory consumption of the query.
In a deployment with thousands of resource providers, or with wide and
symmetric provider trees, i.e. multiple child providers under the same root
having inventory from the same resource class (e.g. in case of nova's mdev
GPU or PCI in Placement features), we recommend tuning this config option
based on the memory available for the placement service and the client-side
timeout setting. A good initial value could be around 100000.
"""),
]
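Taken together with the per-request limit query parameter and
randomize_allocation_candidates, the new option caps how many candidates are
generated at all, and only afterwards is the request's limit (optionally with
random sampling) applied. The following is a minimal sketch of that ordering;
the helper names (generate_candidates, apply_request_limit) are illustrative
only and not part of the patch.

import itertools
import random

def generate_candidates(all_possible, max_allocation_candidates):
    # Generation stops once the global cap is reached; -1 means unlimited.
    if max_allocation_candidates < 0:
        return list(all_possible)
    return list(itertools.islice(all_possible, max_allocation_candidates))

def apply_request_limit(candidates, limit, randomize):
    # The per-request ?limit= parameter is applied only after generation.
    if randomize:
        # Sampling draws from the already capped full set.
        return random.sample(candidates, min(limit, len(candidates)))
    return candidates[:limit]

# e.g. 8192 viable candidates, max_allocation_candidates=8000, ?limit=9000:
# 8000 candidates are generated and all 8000 are returned.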

View File

@@ -731,6 +731,7 @@ def _merge_candidates(candidates, rw_ctx):
areqs = set()
all_suffixes = set(candidates)
num_granular_groups = len(all_suffixes - set(['']))
max_a_c = rw_ctx.config.placement.max_allocation_candidates
for areq_lists_by_suffix in areq_lists_by_anchor.values():
# Filter out any entries that don't have allocation requests for
# *all* suffixes (i.e. all RequestGroups)
@@ -754,6 +755,10 @@ def _merge_candidates(candidates, rw_ctx):
# [areq__B, areq_1_B, ..., areq_42_B], return.
# ...,
# ]
# This loops over each merged candidate, where a candidate is
# represented by the areq_list containing the allocations that fulfill
# each request group.
for areq_list in itertools.product(
*list(areq_lists_by_suffix.values())):
# At this point, each AllocationRequest in areq_list is still
@@ -790,6 +795,14 @@ def _merge_candidates(candidates, rw_ctx):
continue
areqs.add(areq)
if max_a_c >= 0 and len(areqs) >= max_a_c:
# This duplicated check will go away in the next patch that
# refactors this logic a bit.
break
if max_a_c >= 0 and len(areqs) >= max_a_c:
break
# It's possible we've filtered out everything. If so, short out.
if not areqs:
return [], []
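The new cap is enforced while the merged candidates are being collected, so
generation stops as soon as max_allocation_candidates distinct allocation
requests exist, rather than after the full product has been materialized.
Below is a stripped-down sketch of that break-out-of-nested-loops pattern,
assuming a simplified data shape; the real code works on AllocationRequest
objects keyed by anchor provider.

import itertools

def merge_with_cap(areq_lists_by_anchor, max_a_c):
    # Collect merged candidates, stopping once the global cap is hit.
    # A cap of -1 (the default) means unlimited.
    areqs = set()
    for areq_lists_by_suffix in areq_lists_by_anchor.values():
        # Each element of the product combines one allocation request per
        # request group suffix into a single merged candidate.
        for areq_list in itertools.product(*areq_lists_by_suffix.values()):
            areqs.add(areq_list)
            if max_a_c >= 0 and len(areqs) >= max_a_c:
                break
        if max_a_c >= 0 and len(areqs) >= max_a_c:
            break
    return areqs

# Two anchors, each with one unsuffixed group and a granular group with two
# choices: 2 merged candidates per anchor, 4 in total, but only 3 are
# generated with max_a_c=3.
by_anchor = {
    'root1': {'': [('root1', 'VCPU')], '1': [('pf1', 'VF'), ('pf2', 'VF')]},
    'root2': {'': [('root2', 'VCPU')], '1': [('pf3', 'VF'), ('pf4', 'VF')]},
}
print(len(merge_with_cap(by_anchor, max_a_c=3)))  # -> 3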

View File

@@ -373,6 +373,10 @@ class RequestWideSearchContext(object):
return True
return False
@property
def config(self):
return self._ctx.config
@db_api.placement_context_manager.reader
def provider_ids_from_uuid(context, uuid):

View File

@@ -0,0 +1,142 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
from placement import direct
from placement.tests.functional import base
from placement.tests.functional.db import test_base as tb
class TestWideTreeAllocationCandidateExplosion(base.TestCase):
"""Test candidate generation ordering and limiting in wide symmetric trees,
i.e. with trees of many similar child RPs.
"""
def setUp(self):
super().setUp()
self.headers = {
'x-auth-token': 'admin',
'content-type': 'application/json',
'OpenStack-API-Version': 'placement 1.38',
'X_ROLES': 'admin,service'
}
self.conf_fixture.conf.set_override(
"max_allocation_candidates", 100000, group="placement")
def create_tree(self, num_roots, num_child, num_res_per_child):
self.roots = {}
for i in range(num_roots):
compute = tb.create_provider(
self.context, f'compute{i}')
self.roots[compute.uuid] = compute.name
tb.add_inventory(compute, 'VCPU', 8)
tb.add_inventory(compute, 'MEMORY_MB', 4096)
tb.add_inventory(compute, 'DISK_GB', 500)
for j in range(num_child):
child = tb.create_provider(
self.context, f'compute{i}:PF{j}', parent=compute.uuid)
tb.add_inventory(child, 'CUSTOM_VF', num_res_per_child)
@staticmethod
def get_candidate_query(num_groups, num_res, limit):
query = ("/allocation_candidates?"
"resources=DISK_GB%3A20%2CMEMORY_MB%3A2048%2CVCPU%3A2")
for g in range(num_groups):
query += f"&resources{g}=CUSTOM_VF%3A{num_res}"
query += "&group_policy=none"
query += f"&limit={limit}"
return query
def _test_num_candidates_and_computes(
self, computes, pfs, vfs_per_pf, req_groups, req_res_per_group,
req_limit, expected_candidates, expected_computes_with_candidates
):
self.create_tree(
num_roots=computes, num_child=pfs, num_res_per_child=vfs_per_pf)
conf = self.conf_fixture.conf
with direct.PlacementDirect(conf) as client:
resp = client.get(
self.get_candidate_query(
num_groups=req_groups, num_res=req_res_per_group,
limit=req_limit),
headers=self.headers)
self.assertEqual(200, resp.status_code)
body = resp.json()
self.assertEqual(expected_candidates, len(body["allocation_requests"]))
root_rps = set(self.roots.keys())
roots_with_candidates = set()
nr_of_candidates_per_compute = collections.Counter()
for ar in body["allocation_requests"]:
allocated_rps = set(ar["allocations"].keys())
root_allocated_rps = allocated_rps.intersection(root_rps)
roots_with_candidates |= root_allocated_rps
nr_of_candidates_per_compute.update(root_allocated_rps)
self.assertEqual(
expected_computes_with_candidates, len(roots_with_candidates))
def test_all_candidates_generated_and_returned(self):
self._test_num_candidates_and_computes(
computes=2, pfs=8, vfs_per_pf=8, req_groups=2, req_res_per_group=1,
req_limit=1000,
expected_candidates=2 * 64, expected_computes_with_candidates=2,)
def test_requested_limit_is_hit_result_balanced(self):
# 8192 possible candidates, all generated, 1000 returned; the result
# is balanced across computes due to the use of Python sets
self._test_num_candidates_and_computes(
computes=2, pfs=8, vfs_per_pf=8, req_groups=4, req_res_per_group=1,
req_limit=1000,
expected_candidates=1000, expected_computes_with_candidates=2)
def test_too_many_candidates_global_limit_is_hit_result_unbalanced(self):
# With max_allocation_candidates set to 100k this test now runs in
# reasonable time (10 sec on my machine); without that limit it would
# time out.
# However, with the global limit in place only the first compute gets
# candidates.
# There are 524288 valid candidates, generation stops at 100k, and only
# 1000 are returned; the result is unbalanced as the first 100k
# candidates always come from the first compute.
self._test_num_candidates_and_computes(
computes=2, pfs=8, vfs_per_pf=8, req_groups=6, req_res_per_group=1,
req_limit=1000,
expected_candidates=1000, expected_computes_with_candidates=1)
def test_global_limit_hit(self):
# 8192 possible candidates, the global limit is set to 8000 and the
# request limit is higher, so the number of candidates is limited by
# the global limit
self.conf_fixture.conf.set_override(
"max_allocation_candidates", 8000, group="placement")
self._test_num_candidates_and_computes(
computes=2, pfs=8, vfs_per_pf=8, req_groups=4, req_res_per_group=1,
req_limit=9000,
expected_candidates=8000, expected_computes_with_candidates=2)
def test_no_global_limit(self):
# 8192 possible candidates, no global limit, and a high request limit,
# so all candidates are returned
self.conf_fixture.conf.set_override(
"max_allocation_candidates", -1, group="placement")
self._test_num_candidates_and_computes(
computes=2, pfs=8, vfs_per_pf=8, req_groups=4, req_res_per_group=1,
req_limit=9000,
expected_candidates=8192, expected_computes_with_candidates=2)
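The candidate counts asserted in the comments above follow from the tree
shape: with group_policy=none and one VF requested per granular group, each
of the req_groups groups can be satisfied by any of the pfs child providers
under a root, so each compute contributes pfs ** req_groups merged
candidates. A quick sketch of that arithmetic (not part of the test module):

def possible_candidates(computes, pfs, req_groups):
    # Each root contributes pfs ** req_groups merged candidates.
    return computes * pfs ** req_groups

print(possible_candidates(2, 8, 2))  # 128 = 2 * 64
print(possible_candidates(2, 8, 4))  # 8192
print(possible_candidates(2, 8, 6))  # 524288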