From d6795e1393a7f9cc781c0d149220e6b5dacf716e Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Thu, 5 Sep 2019 11:34:13 +0200 Subject: [PATCH] Add min service level check for migrate with bandwidth During resize and cold migrate the dest compute service needs to update the port binding based on the re-calculated port - resource provider mapping. This update happens in finish_resize. To do that the dest compute service needs to be at least on service level 39. The calculation is based on the RequestSpec. The RequestSpec is sent to the dest compute in pre_resize but the dest compute only sends it to the source compute in resize_instance if the compute rpc api version is at least 5.2. Also the source compute only sends the RequestSpec to the dest compute in the finish_resize if the rpc api version is at least 5.2. So migration with bandwidth only works if both compute talks at least 5.2 which means that the min service level is at least 39. Change-Id: Ia500b105b9ec70c0d8bd38faa084270b825476eb blueprint: support-move-ops-with-qos-ports --- nova/conductor/tasks/migrate.py | 112 +++- .../unit/conductor/tasks/test_migrate.py | 532 ++++++++++++++++++ 2 files changed, 641 insertions(+), 3 deletions(-) diff --git a/nova/conductor/tasks/migrate.py b/nova/conductor/tasks/migrate.py index 38b280d6c80a..d0b09f8e7963 100644 --- a/nova/conductor/tasks/migrate.py +++ b/nova/conductor/tasks/migrate.py @@ -185,6 +185,101 @@ class MigrationTask(base.TaskBase): self.request_spec.requested_destination = objects.Destination( cell=instance_mapping.cell_mapping) + def _support_resource_request(self, selection): + """Returns true if the host is new enough to support resource request + during migration. + """ + svc = objects.Service.get_by_host_and_binary( + self.context, selection.service_host, 'nova-compute') + return svc.version >= 39 + + # TODO(gibi): Remove this compat code when nova doesn't need to support + # Train computes any more. + def _get_host_supporting_request(self, selection_list): + """Return the first compute selection from the selection_list where + the service is new enough to support resource request during migration + and the resources claimed successfully. + + :param selection_list: a list of Selection objects returned by the + scheduler + :return: A two tuple. The first item is a Selection object + representing the host that supports the request. The second item + is a list of Selection objects representing the remaining alternate + hosts. + :raises MaxRetriesExceeded: if none of the hosts in the selection_list + is new enough to support the request or we cannot claim resource + on any of the hosts that are new enough. + """ + + if not self.request_spec.requested_resources: + return selection_list[0], selection_list[1:] + + # Scheduler allocated resources on the first host. So check if the + # first host is new enough + if self._support_resource_request(selection_list[0]): + return selection_list[0], selection_list[1:] + + # First host is old, so we need to use an alternate. Therefore we have + # to remove the allocation from the first host. + self.reportclient.delete_allocation_for_instance( + self.context, self.instance.uuid) + LOG.debug( + 'Scheduler returned host %(host)s as a possible migration target ' + 'but that host is not new enough to support the migration with ' + 'resource request %(request)s. 
Trying alternate hosts.', + {'host': selection_list[0].service_host, + 'request': self.request_spec.requested_resources}, + instance=self.instance) + + alternates = selection_list[1:] + + for i, selection in enumerate(alternates): + if self._support_resource_request(selection): + # this host is new enough so we need to try to claim resources + # on it + if selection.allocation_request: + alloc_req = jsonutils.loads( + selection.allocation_request) + resource_claimed = scheduler_utils.claim_resources( + self.context, self.reportclient, self.request_spec, + self.instance.uuid, alloc_req, + selection.allocation_request_version) + + if not resource_claimed: + LOG.debug( + 'Scheduler returned alternate host %(host)s as a ' + 'possible migration target but resource claim ' + 'failed on that host. Trying another alternate.', + {'host': selection.service_host}, + instance=self.instance) + else: + return selection, alternates[i + 1:] + + else: + # Some deployments use different schedulers that do not + # use Placement, so they will not have an + # allocation_request to claim with. For those cases, + # there is no concept of claiming, so just assume that + # the resources are available. + return selection, alternates[i + 1:] + + else: + LOG.debug( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target but that host is not new enough to ' + 'support the migration with resource request %(request)s. ' + 'Trying another alternate.', + {'host': selection.service_host, + 'request': self.request_spec.requested_resources}, + instance=self.instance) + + # if we reach this point then none of the hosts was new enough for the + # request or we failed to claim resources on every alternate + reason = ("Exhausted all hosts available during compute service level " + "check for instance %(instance_uuid)s." % + {"instance_uuid": self.instance.uuid}) + raise exception.MaxRetriesExceeded(reason=reason) + def _execute(self): # TODO(sbauza): Remove once all the scheduler.utils methods accept a # RequestSpec object in the signature. @@ -277,9 +372,9 @@ class MigrationTask(base.TaskBase): # Since there is only ever one instance to migrate per call, we # just need the first returned element. selection_list = selection_lists[0] - # The selected host is the first item in the list, with the - # alternates being the remainder of the list. - selection, self.host_list = selection_list[0], selection_list[1:] + + selection, self.host_list = self._get_host_supporting_request( + selection_list) scheduler_utils.fill_provider_mapping( self.context, self.reportclient, self.request_spec, selection) @@ -295,6 +390,17 @@ class MigrationTask(base.TaskBase): selection = None while self.host_list and not host_available: selection = self.host_list.pop(0) + if (self.request_spec.requested_resources and not + self._support_resource_request(selection)): + LOG.debug( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target for re-schedule but that host is not ' + 'new enough to support the migration with resource ' + 'request %(request)s. 
Trying another alternate.', + {'host': selection.service_host, + 'request': self.request_spec.requested_resources}, + instance=self.instance) + continue if selection.allocation_request: alloc_req = jsonutils.loads(selection.allocation_request) else: diff --git a/nova/tests/unit/conductor/tasks/test_migrate.py b/nova/tests/unit/conductor/tasks/test_migrate.py index f2a77b00ddfc..307773c61dd3 100644 --- a/nova/tests/unit/conductor/tasks/test_migrate.py +++ b/nova/tests/unit/conductor/tasks/test_migrate.py @@ -292,6 +292,538 @@ class MigrationTaskTestCase(test.NoDBTestCase): self.instance.uuid, alloc_req, '1.19') mock_fill_provider_mapping.assert_not_called() + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_no_resource_request( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources): + # no resource request so we expect the first host is simply returned + self.request_spec.requested_resources = [] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + alternate = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + selection_list = [first, alternate] + + selected, alternates = task._get_host_supporting_request( + selection_list) + + self.assertEqual(first, selected) + self.assertEqual([alternate], alternates) + mock_get_service.assert_not_called() + # The first host was good and the scheduler made allocation on that + # host. So we don't expect any resource claim manipulation + mock_delete_allocation.assert_not_called() + mock_claim_resources.assert_not_called() + + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+ 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_first_host_is_new( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + alternate = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + selection_list = [first, alternate] + + first_service = objects.Service(service_host='host1') + first_service.version = 39 + mock_get_service.return_value = first_service + + selected, alternates = task._get_host_supporting_request( + selection_list) + + self.assertEqual(first, selected) + self.assertEqual([alternate], alternates) + mock_get_service.assert_called_once_with( + task.context, 'host1', 'nova-compute') + # The first host was good and the scheduler made allocation on that + # host. So we don't expect any resource claim manipulation + mock_delete_allocation.assert_not_called() + mock_claim_resources.assert_not_called() + + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_first_host_is_old_no_alternates( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + selection_list = [first] + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + mock_get_service.return_value = first_service + + self.assertRaises( + exception.MaxRetriesExceeded, task._get_host_supporting_request, + selection_list) + + mock_get_service.assert_called_once_with( + task.context, 'host1', 'nova-compute') + mock_delete_allocation.assert_called_once_with( + task.context, self.instance.uuid) + mock_claim_resources.assert_not_called() + + @mock.patch.object(migrate.LOG, 'debug') + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+ 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_first_host_is_old_second_good( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources, mock_debug): + + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + second = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + third = objects.Selection( + service_host="host3", + nodename="node3", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host3: resources}}), + allocation_request_version='1.19') + selection_list = [first, second, third] + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + second_service = objects.Service(service_host='host2') + second_service.version = 39 + mock_get_service.side_effect = [first_service, second_service] + + selected, alternates = task._get_host_supporting_request( + selection_list) + + self.assertEqual(second, selected) + self.assertEqual([third], alternates) + mock_get_service.assert_has_calls([ + mock.call(task.context, 'host1', 'nova-compute'), + mock.call(task.context, 'host2', 'nova-compute'), + ]) + mock_delete_allocation.assert_called_once_with( + task.context, self.instance.uuid) + mock_claim_resources.assert_called_once_with( + self.context, task.reportclient, task.request_spec, + self.instance.uuid, {"allocations": {uuids.host2: resources}}, + '1.19') + + mock_debug.assert_called_once_with( + 'Scheduler returned host %(host)s as a possible migration target ' + 'but that host is not new enough to support the migration with ' + 'resource request %(request)s. Trying alternate hosts.', + {'host': 'host1', + 'request': self.request_spec.requested_resources}, + instance=self.instance) + + @mock.patch.object(migrate.LOG, 'debug') + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+ 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_first_host_is_old_second_claim_fails( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources, mock_debug): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + second = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + third = objects.Selection( + service_host="host3", + nodename="node3", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host3: resources}}), + allocation_request_version='1.19') + fourth = objects.Selection( + service_host="host4", + nodename="node4", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host4: resources}}), + allocation_request_version='1.19') + selection_list = [first, second, third, fourth] + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + second_service = objects.Service(service_host='host2') + second_service.version = 39 + third_service = objects.Service(service_host='host3') + third_service.version = 39 + mock_get_service.side_effect = [ + first_service, second_service, third_service] + # not called for the first host but called for the second and third + # make the second claim fail to force the selection of the third + mock_claim_resources.side_effect = [False, True] + + selected, alternates = task._get_host_supporting_request( + selection_list) + + self.assertEqual(third, selected) + self.assertEqual([fourth], alternates) + mock_get_service.assert_has_calls([ + mock.call(task.context, 'host1', 'nova-compute'), + mock.call(task.context, 'host2', 'nova-compute'), + mock.call(task.context, 'host3', 'nova-compute'), + ]) + mock_delete_allocation.assert_called_once_with( + task.context, self.instance.uuid) + mock_claim_resources.assert_has_calls([ + mock.call( + self.context, task.reportclient, task.request_spec, + self.instance.uuid, + {"allocations": {uuids.host2: resources}}, '1.19'), + mock.call( + self.context, task.reportclient, task.request_spec, + self.instance.uuid, + {"allocations": {uuids.host3: resources}}, '1.19'), + ]) + mock_debug.assert_has_calls([ + mock.call( + 'Scheduler returned host %(host)s as a possible migration ' + 'target but that host is not new enough to support the ' + 'migration with resource request %(request)s. Trying ' + 'alternate hosts.', + {'host': 'host1', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + mock.call( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target but resource claim ' + 'failed on that host. Trying another alternate.', + {'host': 'host2'}, + instance=self.instance), + ]) + + @mock.patch.object(migrate.LOG, 'debug') + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+ 'delete_allocation_for_instance') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_get_host_supporting_request_both_first_and_second_too_old( + self, mock_get_service, mock_delete_allocation, + mock_claim_resources, mock_debug): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + second = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + third = objects.Selection( + service_host="host3", + nodename="node3", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host3: resources}}), + allocation_request_version='1.19') + fourth = objects.Selection( + service_host="host4", + nodename="node4", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host4: resources}}), + allocation_request_version='1.19') + selection_list = [first, second, third, fourth] + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + second_service = objects.Service(service_host='host2') + second_service.version = 38 + third_service = objects.Service(service_host='host3') + third_service.version = 39 + mock_get_service.side_effect = [ + first_service, second_service, third_service] + # not called for the first and second hosts but called for the third + mock_claim_resources.side_effect = [True] + + selected, alternates = task._get_host_supporting_request( + selection_list) + + self.assertEqual(third, selected) + self.assertEqual([fourth], alternates) + mock_get_service.assert_has_calls([ + mock.call(task.context, 'host1', 'nova-compute'), + mock.call(task.context, 'host2', 'nova-compute'), + mock.call(task.context, 'host3', 'nova-compute'), + ]) + mock_delete_allocation.assert_called_once_with( + task.context, self.instance.uuid) + mock_claim_resources.assert_called_once_with( + self.context, task.reportclient, task.request_spec, + self.instance.uuid, + {"allocations": {uuids.host3: resources}}, '1.19') + mock_debug.assert_has_calls([ + mock.call( + 'Scheduler returned host %(host)s as a possible migration ' + 'target but that host is not new enough to support the ' + 'migration with resource request %(request)s. Trying ' + 'alternate hosts.', + {'host': 'host1', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + mock.call( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target but that host is not new enough to support ' + 'the migration with resource request %(request)s. 
Trying ' + 'another alternate.', + {'host': 'host2', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + ]) + + @mock.patch.object(migrate.LOG, 'debug') + @mock.patch('nova.scheduler.utils.fill_provider_mapping') + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_reschedule_old_compute_skipped( + self, mock_get_service, mock_claim_resources, mock_fill_mapping, + mock_debug): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + second = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + second_service = objects.Service(service_host='host2') + second_service.version = 39 + mock_get_service.side_effect = [first_service, second_service] + + # set up task for re-schedule + task.host_list = [first, second] + + selected = task._reschedule() + + self.assertEqual(second, selected) + self.assertEqual([], task.host_list) + mock_get_service.assert_has_calls([ + mock.call(task.context, 'host1', 'nova-compute'), + mock.call(task.context, 'host2', 'nova-compute'), + ]) + mock_claim_resources.assert_called_once_with( + self.context.elevated(), task.reportclient, task.request_spec, + self.instance.uuid, + {"allocations": {uuids.host2: resources}}, '1.19') + mock_fill_mapping.assert_called_once_with( + task.context, task.reportclient, task.request_spec, second) + mock_debug.assert_has_calls([ + mock.call( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target for re-schedule but that host is not ' + 'new enough to support the migration with resource ' + 'request %(request)s. 
Trying another alternate.', + {'host': 'host1', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + ]) + + @mock.patch.object(migrate.LOG, 'debug') + @mock.patch('nova.scheduler.utils.fill_provider_mapping') + @mock.patch('nova.scheduler.utils.claim_resources') + @mock.patch('nova.objects.Service.get_by_host_and_binary') + def test_reschedule_old_computes_no_more_alternates( + self, mock_get_service, mock_claim_resources, mock_fill_mapping, + mock_debug): + self.request_spec.requested_resources = [ + objects.RequestGroup() + ] + task = self._generate_task() + resources = { + "resources": { + "VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100}} + + first = objects.Selection( + service_host="host1", + nodename="node1", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host1: resources}}), + allocation_request_version='1.19') + second = objects.Selection( + service_host="host2", + nodename="node2", + cell_uuid=uuids.cell1, + allocation_request=jsonutils.dumps( + {"allocations": {uuids.host2: resources}}), + allocation_request_version='1.19') + + first_service = objects.Service(service_host='host1') + first_service.version = 38 + second_service = objects.Service(service_host='host2') + second_service.version = 38 + mock_get_service.side_effect = [first_service, second_service] + + # set up task for re-schedule + task.host_list = [first, second] + + self.assertRaises(exception.MaxRetriesExceeded, task._reschedule) + + self.assertEqual([], task.host_list) + mock_get_service.assert_has_calls([ + mock.call(task.context, 'host1', 'nova-compute'), + mock.call(task.context, 'host2', 'nova-compute'), + ]) + mock_claim_resources.assert_not_called() + mock_fill_mapping.assert_not_called() + mock_debug.assert_has_calls([ + mock.call( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target for re-schedule but that host is not ' + 'new enough to support the migration with resource ' + 'request %(request)s. Trying another alternate.', + {'host': 'host1', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + mock.call( + 'Scheduler returned alternate host %(host)s as a possible ' + 'migration target for re-schedule but that host is not ' + 'new enough to support the migration with resource ' + 'request %(request)s. Trying another alternate.', + {'host': 'host2', + 'request': self.request_spec.requested_resources}, + instance=self.instance), + ]) + class MigrationTaskAllocationUtils(test.NoDBTestCase): @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
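
For reference, a minimal standalone sketch of the compat-check flow that the conductor change above implements: prefer the scheduler's first host if its compute service is new enough (>= 39), otherwise walk the alternates, skipping hosts that are too old and claiming resources on candidates as we go. This is illustrative only and not part of the patch: Selection, Service version handling and claim_resources() here are simplified stand-ins for the real Nova objects and the Placement client, and the sketch omits deleting the first host's allocation before falling back to alternates.

    # illustrative sketch, not Nova code
    MIN_SERVICE_LEVEL = 39  # compute service version that supports resource
                            # requests during migration


    class Selection(object):
        def __init__(self, host, version, claim_ok=True):
            self.host = host
            self.version = version    # stand-in for the host's compute service version
            self.claim_ok = claim_ok  # whether a resource claim on this host would succeed


    def claim_resources(selection):
        # stand-in for scheduler_utils.claim_resources() against Placement
        return selection.claim_ok


    def get_host_supporting_request(selections, has_resource_request):
        """Return (selected, remaining_alternates), or raise if no host qualifies."""
        if not has_resource_request:
            # nothing port-related to check; keep the scheduler's first choice
            return selections[0], selections[1:]

        # the scheduler already claimed on the first host, so only the
        # service version needs checking there
        if selections[0].version >= MIN_SERVICE_LEVEL:
            return selections[0], selections[1:]

        # first host is too old: try the alternates, claiming as we go
        # (the real code also deletes the first host's allocation here)
        for i, alt in enumerate(selections[1:], start=1):
            if alt.version >= MIN_SERVICE_LEVEL and claim_resources(alt):
                return alt, selections[i + 1:]

        raise RuntimeError('Exhausted all hosts during service level check')


    if __name__ == '__main__':
        hosts = [Selection('host1', 38), Selection('host2', 39)]
        selected, alternates = get_host_supporting_request(hosts, True)
        print(selected.host, [a.host for a in alternates])  # host2 []

The same version gate is what the re-schedule path uses to skip old alternates, which is why both the happy path and the re-schedule path in the patch log and move past hosts below service level 39.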