OpenStack Compute (Nova)
Functional test for the libvirt driver's VGPU placement-inventory reshape.

228 lines
10KB

  1. #
  2. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  3. # not use this file except in compliance with the License. You may obtain
  4. # a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  10. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  11. # License for the specific language governing permissions and limitations
  12. # under the License.
  13. import io
  14. import mock
  15. from oslo_config import cfg
  16. from oslo_log import log as logging
  17. from nova import context
  18. from nova import objects
  19. from nova.tests.functional.libvirt import base
  20. from nova.tests.unit.virt.libvirt import fakelibvirt
  21. from nova.virt.libvirt import utils
# Standard Nova module-level handles: global oslo.config registry and a
# per-module logger. Not referenced in the visible portion of this file;
# kept for consistency with the rest of the test tree.
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class VGPUReshapeTests(base.ServersTestBase):
    """Functional tests for the libvirt driver's VGPU inventory reshape.

    A "reshape" moves VGPU inventory/allocations from the root compute
    resource provider (the legacy layout) onto per-pGPU child providers.
    """

    # the minimum libvirt version needed for vgpu (3.4.0 encoded as an int)
    MIN_LIBVIRT_MDEV_SUPPORT = 3004000

    # The three patches below fake out host-side probing so the libvirt
    # driver can start against the fakelibvirt connection:
    # - disk stats are hard-coded,
    # - hostname validation always passes,
    # - three file_open calls get empty in-memory files.
    @mock.patch('nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
                return_value={'total': 128,
                              'used': 44,
                              'free': 84})
    @mock.patch('nova.virt.libvirt.driver.libvirt_utils.is_valid_hostname',
                return_value=True)
    @mock.patch('nova.virt.libvirt.driver.libvirt_utils.file_open',
                side_effect=[io.BytesIO(b''), io.BytesIO(b''),
                             io.BytesIO(b'')])
    def test_create_servers_with_vgpu(
            self, mock_file_open, mock_valid_hostname, mock_get_fs_info):
        """Verify that vgpu reshape works with libvirt driver.

        1) create two servers with an old tree where the VGPU resource is on
           the compute provider
        2) trigger a reshape
        3) check that the allocations of the servers are still valid
        4) create another server now against the new tree
        """
        # NOTE(gibi): We cannot simply ask the virt driver to create an old
        # RP tree with vgpu on the root RP as that code path does not exist
        # any more. So we have to hack a "bit". We will create a compute
        # service without vgpu support to have the compute RP ready then we
        # manually add the VGPU resources to that RP in placement. Also we make
        # sure that during the instance claim the virt driver does not detect
        # the old tree as that would be a bad time for reshape. Later when the
        # compute service is restarted the driver will do the reshape.

        fake_connection = self._get_connection(
            # We need more RAM or the 3rd server won't be created
            host_info=fakelibvirt.HostInfo(kB_mem=8192),
            libvirt_version=self.MIN_LIBVIRT_MDEV_SUPPORT,
            mdev_info=fakelibvirt.HostMdevDevicesInfo())
        self.mock_conn.return_value = fake_connection

        # start a compute with vgpu support disabled so the driver will
        # ignore the content of the above HostMdevDeviceInfo
        self.flags(enabled_vgpu_types='', group='devices')
        self.compute = self.start_service('compute', host='compute1')

        # create the VGPU resource in placement manually, emulating the
        # legacy tree layout (VGPU on the root compute provider)
        compute_rp_uuid = self.placement_api.get(
            '/resource_providers?name=compute1').body[
            'resource_providers'][0]['uuid']
        inventories = self.placement_api.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body
        inventories['inventories']['VGPU'] = {
            'allocation_ratio': 1.0,
            'max_unit': 3,
            'min_unit': 1,
            'reserved': 0,
            'step_size': 1,
            'total': 3}
        self.placement_api.put(
            '/resource_providers/%s/inventories' % compute_rp_uuid,
            inventories)

        # now we boot two servers with vgpu
        extra_spec = {"resources:VGPU": 1}
        flavor_id = self._create_flavor(extra_spec=extra_spec)
        server_req = self._build_server(flavor_id)

        # NOTE(gibi): during instance_claim() there is a
        # driver.update_provider_tree() call that would detect the old tree and
        # would fail as this is not a good time to reshape. To avoid that we
        # temporarily mock update_provider_tree here.
        with mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                        'update_provider_tree'):
            created_server1 = self.api.post_server({'server': server_req})
            server1 = self._wait_for_state_change(created_server1, 'ACTIVE')
            created_server2 = self.api.post_server({'server': server_req})
            server2 = self._wait_for_state_change(created_server2, 'ACTIVE')

        # Determine which device is associated with which instance
        # { inst.uuid: pgpu_name }
        inst_to_pgpu = {}
        ctx = context.get_admin_context()
        for server in (server1, server2):
            inst = objects.Instance.get_by_uuid(ctx, server['id'])
            # list() over the returned mapping yields the assigned mdev
            # identifier(s) for this instance; exactly one is expected
            mdevs = list(
                self.compute.driver._get_all_assigned_mediated_devices(inst))
            self.assertEqual(1, len(mdevs))
            mdev_uuid = mdevs[0]
            mdev_info = self.compute.driver._get_mediated_device_information(
                utils.mdev_uuid2name(mdev_uuid))
            # 'parent' is the physical GPU the mediated device lives on
            inst_to_pgpu[inst.uuid] = mdev_info['parent']
        # The VGPUs should have come from different pGPUs
        self.assertNotEqual(*list(inst_to_pgpu.values()))

        # verify that the inventory, usages and allocation are correct before
        # the reshape: everything is still on the root compute provider
        compute_inventory = self.placement_api.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body[
            'inventories']
        self.assertEqual(3, compute_inventory['VGPU']['total'])
        compute_usages = self.placement_api.get(
            '/resource_providers/%s/usages' % compute_rp_uuid).body[
            'usages']
        self.assertEqual(2, compute_usages['VGPU'])
        for server in (server1, server2):
            allocations = self.placement_api.get(
                '/allocations/%s' % server['id']).body['allocations']
            # the flavor has disk=10 and ephemeral=10
            self.assertEqual(
                {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2, 'VGPU': 1},
                allocations[compute_rp_uuid]['resources'])

        # enabled vgpu support
        self.flags(
            enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
            group='devices')
        # restart compute which will trigger a reshape
        self.compute = self.restart_compute_service(self.compute)

        # verify that the inventory, usages and allocation are correct after
        # the reshape: VGPU must be gone from the root provider and live on
        # per-pGPU child providers instead
        compute_inventory = self.placement_api.get(
            '/resource_providers/%s/inventories' % compute_rp_uuid).body[
            'inventories']
        self.assertNotIn('VGPU', compute_inventory)

        # NOTE(sbauza): The two instances will use two different pGPUs
        # That said, we need to check all the pGPU inventories for knowing
        # which ones are used.
        usages = {}
        pgpu_uuid_to_name = {}
        for pci_device in [fakelibvirt.PGPU1_PCI_ADDR,
                           fakelibvirt.PGPU2_PCI_ADDR,
                           fakelibvirt.PGPU3_PCI_ADDR]:
            # child providers are named "<hostname>_<pci address>"
            gpu_rp_uuid = self.placement_api.get(
                '/resource_providers?name=compute1_%s' % pci_device).body[
                'resource_providers'][0]['uuid']
            pgpu_uuid_to_name[gpu_rp_uuid] = pci_device
            gpu_inventory = self.placement_api.get(
                '/resource_providers/%s/inventories' % gpu_rp_uuid).body[
                'inventories']
            self.assertEqual(1, gpu_inventory['VGPU']['total'])

            gpu_usages = self.placement_api.get(
                '/resource_providers/%s/usages' % gpu_rp_uuid).body[
                'usages']
            usages[pci_device] = gpu_usages['VGPU']
        # Make sure that both instances are using different pGPUs
        used_devices = [dev for dev, usage in usages.items() if usage == 1]
        avail_devices = list(set(usages.keys()) - set(used_devices))
        self.assertEqual(2, len(used_devices))

        # Make sure that both instances are using the correct pGPUs
        for server in [server1, server2]:
            allocations = self.placement_api.get(
                '/allocations/%s' % server['id']).body[
                'allocations']
            # root provider now holds only the non-VGPU resources
            self.assertEqual(
                {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
                allocations[compute_rp_uuid]['resources'])
            rp_uuids = list(allocations.keys())
            # We only have two RPs, the compute RP (the root) and the child
            # pGPU RP
            gpu_rp_uuid = (rp_uuids[1] if rp_uuids[0] == compute_rp_uuid
                           else rp_uuids[0])
            self.assertEqual(
                {'VGPU': 1},
                allocations[gpu_rp_uuid]['resources'])
            # The pGPU's RP name contains the pGPU name
            self.assertIn(inst_to_pgpu[server['id']],
                          pgpu_uuid_to_name[gpu_rp_uuid])

        # now create one more instance with vgpu against the reshaped tree
        created_server = self.api.post_server({'server': server_req})
        server3 = self._wait_for_state_change(created_server, 'ACTIVE')

        # find the pGPU that wasn't used before we created the third instance
        # It should have taken the previously available pGPU
        device = avail_devices[0]
        gpu_rp_uuid = self.placement_api.get(
            '/resource_providers?name=compute1_%s' % device).body[
            'resource_providers'][0]['uuid']
        gpu_usages = self.placement_api.get(
            '/resource_providers/%s/usages' % gpu_rp_uuid).body[
            'usages']
        self.assertEqual(1, gpu_usages['VGPU'])
        allocations = self.placement_api.get(
            '/allocations/%s' % server3['id']).body[
            'allocations']
        self.assertEqual(
            {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2},
            allocations[compute_rp_uuid]['resources'])
        self.assertEqual(
            {'VGPU': 1},
            allocations[gpu_rp_uuid]['resources'])