Make allocation candidates available for scheduler filters

This patch extends the HostState object with an allocation_candidates
list populated by the scheduler manager. It also changes the generic
scheduler logic to allocate a candidate for the selected host based on
the candidates stored in the host state.
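
As a rough, condensed sketch of the new manager-side flow (the explicit
alloc_reqs_by_rp_uuid argument and the claim_first_remaining_candidate
helper are simplifications for illustration; the real change nests this
logic in the scheduler manager, see the diff below):

    import copy

    def hosts_with_alloc_reqs(hosts_gen, alloc_reqs_by_rp_uuid):
        # Give every HostState its own copy of the allocation candidates
        # so filters can prune the per-host list without affecting other
        # hosts or the shared dict.
        for host in hosts_gen:
            host.allocation_candidates = copy.deepcopy(
                alloc_reqs_by_rp_uuid[host.uuid])
            yield host

    def claim_first_remaining_candidate(selected_host):
        # After filtering, any candidate still attached to the selected
        # host is valid; the scheduler claims the first one. An empty
        # list means every candidate was filtered out and the host is
        # skipped.
        if not selected_host.allocation_candidates:
            return None
        return selected_host.allocation_candidates[0]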

After this patch, scheduler filters can be extended to filter the
allocation_candidates list of the HostState object while processing a
host, restricting which candidates can be allocated if the host passes
all the filters. Multiple consecutive filters can potentially remove
every candidate, making the host a non-viable scheduling target.
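
For example, a filter could prune the per-host candidate list roughly
like this (the filter name and the acceptance predicate are
hypothetical, not part of this change):

    from nova.scheduler import filters

    class ExampleCandidateFilter(filters.BaseHostFilter):
        """Hypothetical sketch of a filter restricting which allocation
        candidates of a host may be used.
        """

        # No candidates are generated for rebuild, so this filter cannot
        # run there (see the RUN_ON_REBUILD note below).
        RUN_ON_REBUILD = False

        def host_passes(self, host_state, spec_obj):
            # Keep only the candidates accepted by the (made-up)
            # predicate.
            host_state.allocation_candidates = [
                candidate for candidate in host_state.allocation_candidates
                if self._candidate_is_acceptable(candidate, spec_obj)
            ]
            # If every candidate is removed the host is no longer a
            # viable scheduling target.
            return bool(host_state.allocation_candidates)

        def _candidate_is_acceptable(self, candidate, spec_obj):
            # Placeholder predicate; a real filter would inspect the
            # candidate's allocations and mappings against the request
            # spec.
            return True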

blueprint: pci-device-tracking-in-placement
Change-Id: Id0afff271d345a94aa83fc886e9c3231c3ff2570
Balazs Gibizer 2022-08-17 16:12:24 +02:00
parent e96601c606
commit 3d818c3473
4 changed files with 866 additions and 122 deletions


@@ -28,6 +28,9 @@ class BaseHostFilter(filters.BaseFilter):
# other parameters. We care about running policy filters (i.e.
# ImagePropertiesFilter) but not things that check usage on the
# existing compute node, etc.
# This also means that filters marked with RUN_ON_REBUILD = True cannot
# filter on allocation candidates, or they need to handle the rebuild
# case specially.
RUN_ON_REBUILD = False
def _filter_one(self, obj, spec):


@@ -153,6 +153,8 @@ class HostState(object):
self.updated = None
self.allocation_candidates = []
def update(self, compute=None, service=None, aggregates=None,
inst_dict=None):
"""Update all information about a host."""
@@ -314,13 +316,21 @@ class HostState(object):
self.num_io_ops += 1
def __repr__(self):
return ("(%(host)s, %(node)s) ram: %(free_ram)sMB "
"disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
"instances: %(num_instances)s" %
{'host': self.host, 'node': self.nodename,
'free_ram': self.free_ram_mb, 'free_disk': self.free_disk_mb,
'num_io_ops': self.num_io_ops,
'num_instances': self.num_instances})
return (
"(%(host)s, %(node)s) ram: %(free_ram)sMB "
"disk: %(free_disk)sMB io_ops: %(num_io_ops)s "
"instances: %(num_instances)s, "
"allocation_candidates: %(num_a_c)s"
% {
"host": self.host,
"node": self.nodename,
"free_ram": self.free_ram_mb,
"free_disk": self.free_disk_mb,
"num_io_ops": self.num_io_ops,
"num_instances": self.num_instances,
"num_a_c": len(self.allocation_candidates),
}
)
class HostManager(object):


@@ -20,6 +20,7 @@ Scheduler Service
"""
import collections
import copy
import random
from oslo_log import log as logging
@@ -299,12 +300,29 @@ class SchedulerManager(manager.Manager):
# host, we virtually consume resources on it so subsequent
# selections can adjust accordingly.
def hosts_with_alloc_reqs(hosts_gen):
"""Extend the HostState objects returned by the generator with
the allocation requests of that host
"""
for host in hosts_gen:
host.allocation_candidates = copy.deepcopy(
alloc_reqs_by_rp_uuid[host.uuid])
yield host
# Note: remember, we are using a generator-iterator here. So only
# traverse this list once. This can bite you if the hosts
# are being scanned in a filter or weighing function.
hosts = self._get_all_host_states(
elevated, spec_obj, provider_summaries)
# alloc_reqs_by_rp_uuid is None during rebuild, so this means we cannot
# run filters that use allocation candidates during rebuild
if alloc_reqs_by_rp_uuid is not None:
# wrap the generator to extend the HostState objects with the
# allocation requests for that given host. This is needed to
# support scheduler filters filtering on allocation candidates.
hosts = hosts_with_alloc_reqs(hosts)
# NOTE(sbauza): The RequestSpec.num_instances field contains the number
# of instances created when the RequestSpec was used to first boot some
# instances. This is incorrect when doing a move or resize operation,
@@ -332,6 +350,13 @@ class SchedulerManager(manager.Manager):
# the older dict format representing HostState objects.
# TODO(stephenfin): Remove this when we bump scheduler the RPC API
# version to 5.0
# NOTE(gibi): We cannot remove this branch as it is actively used
# when nova calls the scheduler during rebuild (not evacuate) to
# check if the current host is still good for the new image used
# for the rebuild. In this case placement cannot be used to
# generate candidates as that would require space on the current
# compute for double allocation. So no allocation candidates for
# rebuild and therefore alloc_reqs_by_rp_uuid is None
return self._legacy_find_hosts(
context, num_instances, spec_obj, hosts, num_alts,
instance_uuids=instance_uuids)
@@ -345,6 +370,9 @@ class SchedulerManager(manager.Manager):
# The list of hosts that have been selected (and claimed).
claimed_hosts = []
# The allocation request allocated on the given claimed host
claimed_alloc_reqs = []
for num, instance_uuid in enumerate(instance_uuids):
# In a multi-create request, the first request spec from the list
# is passed to the scheduler and that request spec's instance_uuid
@@ -371,21 +399,20 @@ class SchedulerManager(manager.Manager):
# resource provider UUID
claimed_host = None
for host in hosts:
cn_uuid = host.uuid
if cn_uuid not in alloc_reqs_by_rp_uuid:
msg = ("A host state with uuid = '%s' that did not have a "
"matching allocation_request was encountered while "
"scheduling. This host was skipped.")
LOG.debug(msg, cn_uuid)
if not host.allocation_candidates:
LOG.debug(
"The nova scheduler removed every allocation candidate"
"for host %s so this host was skipped.",
host
)
continue
alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
# TODO(jaypipes): Loop through all allocation_requests instead
# of just trying the first one. For now, since we'll likely
# want to order the allocation_requests in the future based on
# information in the provider summaries, we'll just try to
# claim resources using the first allocation_request
alloc_req = alloc_reqs[0]
alloc_req = host.allocation_candidates[0]
if utils.claim_resources(
elevated, self.placement_client, spec_obj, instance_uuid,
alloc_req,
@@ -405,6 +432,15 @@ class SchedulerManager(manager.Manager):
claimed_instance_uuids.append(instance_uuid)
claimed_hosts.append(claimed_host)
claimed_alloc_reqs.append(alloc_req)
# update the provider mapping in the request spec based
# on the allocated candidate, as _consume_selected_host depends
# on this information to temporarily consume PCI devices tracked in
# placement
for request_group in spec_obj.requested_resources:
request_group.provider_uuids = alloc_req[
'mappings'][request_group.requester_id]
# Now consume the resources so the filter/weights will change for
# the next instance.
@@ -416,11 +452,19 @@ class SchedulerManager(manager.Manager):
self._ensure_sufficient_hosts(
context, claimed_hosts, num_instances, claimed_instance_uuids)
# We have selected and claimed hosts for each instance. Now we need to
# find alternates for each host.
# We have selected and claimed hosts for each instance along with a
# claimed allocation request. Now we need to find alternates for each
# host.
return self._get_alternate_hosts(
claimed_hosts, spec_obj, hosts, num, num_alts,
alloc_reqs_by_rp_uuid, allocation_request_version)
claimed_hosts,
spec_obj,
hosts,
num,
num_alts,
alloc_reqs_by_rp_uuid,
allocation_request_version,
claimed_alloc_reqs,
)
def _ensure_sufficient_hosts(
self, context, hosts, required_count, claimed_uuids=None,
@@ -532,7 +576,21 @@ class SchedulerManager(manager.Manager):
def _get_alternate_hosts(
self, selected_hosts, spec_obj, hosts, index, num_alts,
alloc_reqs_by_rp_uuid=None, allocation_request_version=None,
selected_alloc_reqs=None,
):
"""Generate the main Selection and possible alternate Selection
objects for each "instance".
:param selected_hosts: This is a list of HostState objects. Each
HostState represents the main selection for a given instance being
scheduled (we can have multiple instances during multi create).
:param selected_alloc_reqs: This is a list of allocation requests that
are already allocated in placement for the main Selection for each
instance. This list matches selected_hosts by index: for the first
instance the selected host is selected_hosts[0] and the already
allocated placement candidate is selected_alloc_reqs[0].
"""
# We only need to filter/weigh the hosts again if we're dealing with
# more than one instance and are going to be picking alternates.
if index > 0 and num_alts > 0:
@@ -546,11 +604,10 @@ class SchedulerManager(manager.Manager):
# representing the selected host along with alternates from the same
# cell.
selections_to_return = []
for selected_host in selected_hosts:
for i, selected_host in enumerate(selected_hosts):
# This is the list of hosts for one particular instance.
if alloc_reqs_by_rp_uuid:
selected_alloc_req = alloc_reqs_by_rp_uuid.get(
selected_host.uuid)[0]
selected_alloc_req = selected_alloc_reqs[i]
else:
selected_alloc_req = None
@@ -571,15 +628,17 @@ class SchedulerManager(manager.Manager):
if len(selected_plus_alts) >= num_alts + 1:
break
# TODO(gibi): In theory we could generate alternatives on the
# same host if that host has different possible allocation
# candidates for the request. But we don't do that today
if host.cell_uuid == cell_uuid and host not in selected_hosts:
if alloc_reqs_by_rp_uuid is not None:
alt_uuid = host.uuid
if alt_uuid not in alloc_reqs_by_rp_uuid:
if not host.allocation_candidates:
msg = ("A host state with uuid = '%s' that did "
"not have a matching allocation_request "
"not have any remaining allocation_request "
"was encountered while scheduling. This "
"host was skipped.")
LOG.debug(msg, alt_uuid)
LOG.debug(msg, host.uuid)
continue
# TODO(jaypipes): Loop through all allocation_requests
@@ -588,7 +647,13 @@ class SchedulerManager(manager.Manager):
# the future based on information in the provider
# summaries, we'll just try to claim resources using
# the first allocation_request
alloc_req = alloc_reqs_by_rp_uuid[alt_uuid][0]
# NOTE(gibi): we are using, and re-using, allocation
# candidates for alternatives here. This is OK as
# these candidates are not yet allocated in placement
# and we don't know if an alternate will ever be used.
# To increase our chance of success we could try to use a
# different candidate for each alternative, though.
alloc_req = host.allocation_candidates[0]
alt_selection = objects.Selection.from_host_state(
host, alloc_req, allocation_request_version)
else:

File diff suppressed because it is too large