Browse Source

Merge "Block rebuild when NUMA topology changed" into stable/rocky

changes/12/747612/1
Zuul 1 month ago
committed by Gerrit Code Review
parent
commit
a6e1bcce5f
9 changed files with 375 additions and 17 deletions
  1. +7
    -0
      nova/api/openstack/compute/servers.py
  2. +50
    -0
      nova/compute/api.py
  3. +6
    -0
      nova/exception.py
  4. +2
    -1
      nova/tests/functional/integrated_helpers.py
  5. +136
    -4
      nova/tests/functional/libvirt/test_numa_servers.py
  6. +35
    -11
      nova/tests/unit/api/openstack/compute/test_server_actions.py
  7. +6
    -1
      nova/tests/unit/compute/test_compute.py
  8. +117
    -0
      nova/tests/unit/compute/test_compute_api.py
  9. +16
    -0
      releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml

+ 7
- 0
nova/api/openstack/compute/servers.py View File

@@ -980,6 +980,13 @@ class ServersController(wsgi.Controller):
exception.AutoDiskConfigDisabledByImage,
exception.CertificateValidationFailed) as error:
raise exc.HTTPBadRequest(explanation=error.format_message())
# NOTE(sean-k-mooney): due to the lack of the flavor and image
# properties validation I06fad233006c7bab14749a51ffa226c3801f951b in
# stable/rocky we explicitly catch the NUMA rebuild conflict instead of
# extending the INVALID_FLAVOR_IMAGE_EXCEPTIONS tuple which was
# introduced in the stein release.
except exception.ImageNUMATopologyRebuildConflict as error:
raise exc.HTTPBadRequest(explanation=error.format_message())

instance = self._get_server(context, req, id, is_detail=True)



+ 50
- 0
nova/compute/api.py View File

@@ -3260,6 +3260,14 @@ class API(base.Base):
self._checks_for_create_and_rebuild(context, image_id, image,
flavor, metadata, files_to_inject, root_bdm)

# NOTE(sean-k-mooney): When we rebuild with a new image we need to
# validate that the NUMA topology does not change as we do a NOOP claim
# in resource tracker. As such we cannot allow the resource usage or
# assignment to change as a result of a new image altering the
# numa constraints.
if orig_image_ref != image_href:
self._validate_numa_rebuild(instance, image, flavor)

kernel_id, ramdisk_id = self._handle_kernel_and_ramdisk(
context, None, None, image)

@@ -3359,6 +3367,48 @@ class API(base.Base):
request_spec=request_spec,
kwargs=kwargs)

@staticmethod
def _validate_numa_rebuild(instance, image, flavor):
"""validates that the NUMA constraints do not change on rebuild.

:param instance: nova.objects.instance.Instance object
:param image: the new image the instance will be rebuilt with.
:param flavor: the flavor of the current instance.

:returns: True or raises on failure.
:raises: nova.exception.ImageNUMATopologyRebuildConflict
"""

# NOTE(sean-k-mooney): currently it is not possible to express
# a PCI NUMA affinity policy via flavor or image but that will
# change in the future. we pull out the image metadata into
# separate variable to make future testing of this easier.
old_image_meta = instance.image_meta
new_image_meta = objects.ImageMeta.from_dict(image)
old_constraints = hardware.numa_get_constraints(flavor, old_image_meta)
new_constraints = hardware.numa_get_constraints(flavor, new_image_meta)

# early out for non NUMA instances
if old_constraints is None and new_constraints is None:
# return true for easy unit testing
return True

# if only one of the constrains are non-None (or 'set') then the
# constraints changed so raise an exception.
if old_constraints is None or new_constraints is None:
raise exception.ImageNUMATopologyRebuildConflict()

# otherwise since both the old a new constrains are non none compare
# them as dictionaries.
old = old_constraints.obj_to_primitive()
new = new_constraints.obj_to_primitive()
if old != new:
raise exception.ImageNUMATopologyRebuildConflict()
# TODO(sean-k-mooney): add PCI NUMA affinity policy check.

# return true for easy unit testing
return True

@staticmethod
def _check_quota_for_upsize(context, instance, current_flavor, new_flavor):
project_id, user_id = quotas_obj.ids_from_instance(context,


+ 6
- 0
nova/exception.py View File

@@ -1911,6 +1911,12 @@ class ImageNUMATopologyForbidden(Forbidden):
"NUMA configuration set against the flavor")


class ImageNUMATopologyRebuildConflict(Invalid):
msg_fmt = _(
"An instance's NUMA typology cannot be changed as part of a rebuild. "
"The image provided is invalid for this instance.")


class ImageNUMATopologyAsymmetric(Invalid):
msg_fmt = _("Instance CPUs and/or memory cannot be evenly distributed "
"across instance NUMA nodes. Explicit assignment of CPUs "


+ 2
- 1
nova/tests/functional/integrated_helpers.py View File

@@ -81,7 +81,8 @@ class _IntegratedTestBase(test.TestCase):
# TODO(mriedem): Fix the functional tests to work with Neutron.
self.flags(use_neutron=self.USE_NEUTRON)

nova.tests.unit.image.fake.stub_out_image_service(self)
self.fake_image_service =\
nova.tests.unit.image.fake.stub_out_image_service(self)

self.useFixture(cast_as_call.CastAsCall(self))
self.useFixture(nova_fixtures.Database(database='placement'))


+ 136
- 4
nova/tests/functional/libvirt/test_numa_servers.py View File

@@ -13,10 +13,10 @@
# License for the specific language governing permissions and limitations
# under the License.

import six

import fixtures
import mock
import six

from oslo_config import cfg
from oslo_log import log as logging

@@ -27,11 +27,10 @@ from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client
from nova.tests.functional.test_servers import ServersTestBase
from nova.tests.unit import fake_notifier
from nova.tests.unit.virt.libvirt import fake_imagebackend
from nova.tests.unit.virt.libvirt import fake_libvirt_utils
from nova.tests.unit.virt.libvirt import fakelibvirt


CONF = cfg.CONF
LOG = logging.getLogger(__name__)

@@ -568,3 +567,136 @@ class NUMAServersWithNetworksTest(NUMAServersTestBase):
network_metadata = args[1].network_metadata
self.assertIsNotNone(network_metadata)
self.assertEqual(set(['foo']), network_metadata.physnets)


class NUMAServersRebuildTests(NUMAServersTestBase):

def setUp(self):
super(NUMAServersRebuildTests, self).setUp()
images = self.api.get_images()
# save references to first two image for server create and rebuild
self.image_ref_0 = images[0]['id']
self.image_ref_1 = images[1]['id']

fake_notifier.stub_notifier(self)
self.addCleanup(fake_notifier.reset)

# NOTE(sean-k-mooney): I3b4c1153bebdeab2eb8bc2108aa177732f5c6a97
# moved mocking of the libvirt connection to the setup of
# NUMAServersTestBase but that is not backported so we just add back
# the mocking to enable backporting of
# I0322d872bdff68936033a6f5a54e8296a6fb3434
_p = mock.patch('nova.virt.libvirt.host.Host.get_connection')
self.mock_conn = _p.start()
self.addCleanup(_p.stop)

def _create_active_server(self, server_args=None):
basic_server = {
'flavorRef': 1,
'name': 'numa_server',
'networks': [{
'uuid': nova_fixtures.NeutronFixture.network_1['id']
}],
'imageRef': self.image_ref_0
}
if server_args:
basic_server.update(server_args)
server = self.api.post_server({'server': basic_server})
return self._wait_for_state_change(server, 'BUILD')

def _rebuild_server(self, active_server, image_ref):
args = {"rebuild": {"imageRef": image_ref}}
self.api.api_post(
'servers/%s/action' % active_server['id'], args)
fake_notifier.wait_for_versioned_notifications('instance.rebuild.end')
return self._wait_for_state_change(active_server, 'REBUILD')

def test_rebuild_server_with_numa(self):
"""Create a NUMA instance and ensure it can be rebuilt.
"""

# Create a flavor consuming 2 pinned cpus with an implicit
# numa topology of 1 virtual numa node.
extra_spec = {'hw:cpu_policy': 'dedicated'}
flavor_id = self._create_flavor(extra_spec=extra_spec)

# Create a host with 4 physical cpus to allow rebuild leveraging
# the free space to ensure the numa topology filter does not
# eliminate the host.
host_info = fakelibvirt.NUMAHostInfo(cpu_nodes=1, cpu_sockets=1,
cpu_cores=4, kB_mem=15740000)
fake_connection = self._get_connection(host_info=host_info)
self.mock_conn.return_value = fake_connection
self.compute = self.start_service('compute', host='compute1')

server = self._create_active_server(
server_args={"flavorRef": flavor_id})

# this should succeed as the NUMA topology has not changed
# and we have enough resources on the host. We rebuild with
# a different image to force the rebuild to query the scheduler
# to validate the host.
self._rebuild_server(server, self.image_ref_1)

def test_rebuild_server_with_numa_inplace_fails(self):
"""Create a NUMA instance and ensure in place rebuild fails.
"""

# Create a flavor consuming 2 pinned cpus with an implicit
# numa topology of 1 virtual numa node.
extra_spec = {'hw:cpu_policy': 'dedicated'}
flavor_id = self._create_flavor(extra_spec=extra_spec)

# cpu_cores is set to 2 to ensure that
# we have enough space to boot the vm but not enough space to rebuild
# by doubling the resource use during scheduling.
host_info = fakelibvirt.NUMAHostInfo(
cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000)
fake_connection = self._get_connection(host_info=host_info)
self.mock_conn.return_value = fake_connection
self.compute = self.start_service('compute', host='compute1')

server = self._create_active_server(
server_args={"flavorRef": flavor_id})

# TODO(sean-k-mooney): this should pass but i currently expect it to
# fail because the NUMA topology filter does not support in place
# rebuild and we have used all the resources on the compute node.
self.assertRaises(
client.OpenStackApiException, self._rebuild_server,
server, self.image_ref_1)

def test_rebuild_server_with_different_numa_topology_fails(self):
"""Create a NUMA instance and ensure inplace rebuild fails.
"""

# Create a flavor consuming 2 pinned cpus with an implicit
# numa topology of 1 virtual numa node.
extra_spec = {'hw:cpu_policy': 'dedicated'}
flavor_id = self._create_flavor(extra_spec=extra_spec)

host_info = fakelibvirt.NUMAHostInfo(
cpu_nodes=2, cpu_sockets=1, cpu_cores=4, kB_mem=15740000)
fake_connection = self._get_connection(host_info=host_info)
self.mock_conn.return_value = fake_connection
self.compute = self.start_service('compute', host='compute1')

server = self._create_active_server(
server_args={"flavorRef": flavor_id})

# the original vm had an implicit numa topology of 1 virtual numa nodes
# so we alter the requested numa topology in image_ref_1 to request
# 2 virtual numa nodes.
ctx = nova_context.get_admin_context()
image_meta = {'properties': {'hw_numa_nodes': 2}}
self.fake_image_service.update(ctx, self.image_ref_1, image_meta)

# NOTE(sean-k-mooney): this should fail because rebuild uses noop
# claims therefor it is not allowed for the numa topology or resource
# usage to change during a rebuild.
ex = self.assertRaises(
client.OpenStackApiException, self._rebuild_server,
server, self.image_ref_1)
self.assertEqual(400, ex.response.status_code)
self.assertIn("An instance's NUMA typology cannot be changed",
six.text_type(ex))

+ 35
- 11
nova/tests/unit/api/openstack/compute/test_server_actions.py View File

@@ -540,11 +540,22 @@ class ServerActionsControllerTestV21(test.TestCase):
def test_rebuild_when_kernel_not_exists(self):

def return_image_meta(*args, **kwargs):
# NOTE(sean-k-mooney): To enable
# I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
# without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
# fake image data to use valid UUIDs
image_meta_table = {
'2': {'id': 2, 'status': 'active', 'container_format': 'ari'},
'155d900f-4e14-4e4c-a73d-069cbf4541e6':
{'id': 3, 'status': 'active', 'container_format': 'raw',
'properties': {'kernel_id': 1, 'ramdisk_id': 2}},
uuids.ramdisk_image_id: {
'id': uuids.ramdisk_image_id, 'status': 'active',
'container_format': 'ari'},
'155d900f-4e14-4e4c-a73d-069cbf4541e6': {
'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
'status': 'active', 'container_format': 'raw',
'properties': {
'kernel_id': uuids.missing_image_id,
'ramdisk_id': uuids.ramdisk_image_id,
},
},
}
image_id = args[2]
try:
@@ -582,12 +593,25 @@ class ServerActionsControllerTestV21(test.TestCase):
instance_meta[key] = instance[key]

def return_image_meta(*args, **kwargs):
# NOTE(sean-k-mooney): To enable
# I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
# without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
# fake image data to use valid UUIDs
image_meta_table = {
'1': {'id': 1, 'status': 'active', 'container_format': 'aki'},
'2': {'id': 2, 'status': 'active', 'container_format': 'ari'},
'155d900f-4e14-4e4c-a73d-069cbf4541e6':
{'id': 3, 'status': 'active', 'container_format': 'raw',
'properties': {'kernel_id': 1, 'ramdisk_id': 2}},
uuids.kernel_image_id: {
'id': uuids.kernel_image_id, 'status': 'active',
'container_format': 'aki'},
uuids.ramdisk_image_id: {
'id': uuids.ramdisk_image_id, 'status': 'active',
'container_format': 'ari'},
'155d900f-4e14-4e4c-a73d-069cbf4541e6': {
'id': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
'status': 'active', 'container_format': 'raw',
'properties': {
'kernel_id': uuids.kernel_image_id,
'ramdisk_id': uuids.ramdisk_image_id,
},
},
}
image_id = args[2]
try:
@@ -607,8 +631,8 @@ class ServerActionsControllerTestV21(test.TestCase):
},
}
self.controller._action_rebuild(self.req, FAKE_UUID, body=body).obj
self.assertEqual(instance_meta['kernel_id'], '1')
self.assertEqual(instance_meta['ramdisk_id'], '2')
self.assertEqual(instance_meta['kernel_id'], uuids.kernel_image_id)
self.assertEqual(instance_meta['ramdisk_id'], uuids.ramdisk_image_id)

@mock.patch.object(compute_api.API, 'rebuild')
def test_rebuild_instance_raise_auto_disk_config_exc(self, mock_rebuild):


+ 6
- 1
nova/tests/unit/compute/test_compute.py View File

@@ -9037,7 +9037,12 @@ class ComputeAPITestCase(BaseTestCase):

with mock.patch.object(self.compute_api.compute_task_api,
'rebuild_instance', fake_rpc_rebuild):
image_ref = instance["image_ref"] + '-new_image_ref'
# NOTE(sean-k-mooney): to enable
# I0322d872bdff68936033a6f5a54e8296a6fb3434 to be backported
# without I34ffaf285718059b55f90e812b57f1e11d566c6f we update the
# image ref to use uuids.new_image_ref to pass the UUIDField
# validation requirements
image_ref = uuids.new_image_ref
password = "new_password"

instance.vm_state = vm_state


+ 117
- 0
nova/tests/unit/compute/test_compute_api.py View File

@@ -6198,6 +6198,123 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
self.assertEqual(['context-for-%s' % c for c in compute_api.CELLS],
cells)

def test__validate_numa_rebuild_non_numa(self):
"""This test asserts that a rebuild of an instance without a NUMA
topology passes validation.
"""
flavor = objects.Flavor(
id=42, vcpus=1, memory_mb=512, root_gb=1, extra_specs={})
instance = self._create_instance_obj(flavor=flavor)
# we use a dict instead of image metadata object as
# _validate_numa_rebuild constructs the object internally
image = {
'id': uuids.image_id, 'status': 'foo',
'properties': {}}
self.compute_api._validate_numa_rebuild(instance, image, flavor)

def test__validate_numa_rebuild_no_conflict(self):
"""This test asserts that a rebuild of an instance without a change
in NUMA topology passes validation.
"""
flavor = objects.Flavor(
id=42, vcpus=1, memory_mb=512, root_gb=1,
extra_specs={"hw:numa_nodes": 1})
instance = self._create_instance_obj(flavor=flavor)
# we use a dict instead of image metadata object as
# _validate_numa_rebuild constructs the object internally
image = {
'id': uuids.image_id, 'status': 'foo',
'properties': {}}
# The flavor creates a NUMA topology but the default image and the
# rebuild image do not have any image properties so there will
# be no conflict.
self.compute_api._validate_numa_rebuild(instance, image, flavor)

def test__validate_numa_rebuild_add_numa_toplogy(self):
"""This test asserts that a rebuild of an instance with a new image
that requests a NUMA topology when the original instance did not
have a NUMA topology is invalid.
"""

flavor = objects.Flavor(
id=42, vcpus=1, memory_mb=512, root_gb=1,
extra_specs={})
instance = self._create_instance_obj(flavor=flavor)
# we use a dict instead of image metadata object as
# _validate_numa_rebuild constructs the object internally
image = {
'id': uuids.image_id, 'status': 'foo',
'properties': {"hw_numa_nodes": 1}}
# The flavor and default image have no NUMA topology defined. The image
# used to rebuild requests a NUMA topology which is not allowed as it
# would alter the NUMA constrains.
self.assertRaises(
exception.ImageNUMATopologyRebuildConflict,
self.compute_api._validate_numa_rebuild, instance, image, flavor)

def test__validate_numa_rebuild_remove_numa_toplogy(self):
"""This test asserts that a rebuild of an instance with a new image
that does not request a NUMA topology when the original image did
is invalid if it would alter the instances topology as a result.
"""

flavor = objects.Flavor(
id=42, vcpus=1, memory_mb=512, root_gb=1,
extra_specs={})
instance = self._create_instance_obj(flavor=flavor)
# we use a dict instead of image metadata object as
# _validate_numa_rebuild constructs the object internally
old_image = {
'id': uuidutils.generate_uuid(), 'status': 'foo',
'properties': {"hw_numa_nodes": 1}}
old_image_meta = objects.ImageMeta.from_dict(old_image)
image = {
'id': uuidutils.generate_uuid(), 'status': 'foo',
'properties': {}}
with mock.patch(
'nova.objects.instance.Instance.image_meta',
new_callable=mock.PropertyMock(return_value=old_image_meta)):
# The old image has a NUMA topology defined but the new image
# used to rebuild does not. This would alter the NUMA constrains
# and therefor should raise.
self.assertRaises(
exception.ImageNUMATopologyRebuildConflict,
self.compute_api._validate_numa_rebuild, instance,
image, flavor)

def test__validate_numa_rebuild_alter_numa_toplogy(self):
"""This test asserts that a rebuild of an instance with a new image
that requests a different NUMA topology than the original image
is invalid.
"""

# NOTE(sean-k-mooney): we need to use 2 vcpus here or we will fail
# with a different exception ImageNUMATopologyAsymmetric when we
# construct the NUMA constrains as the rebuild image would result
# in an invalid topology.
flavor = objects.Flavor(
id=42, vcpus=2, memory_mb=512, root_gb=1,
extra_specs={})
instance = self._create_instance_obj(flavor=flavor)
# we use a dict instead of image metadata object as
# _validate_numa_rebuild constructs the object internally
old_image = {
'id': uuidutils.generate_uuid(), 'status': 'foo',
'properties': {"hw_numa_nodes": 1}}
old_image_meta = objects.ImageMeta.from_dict(old_image)
image = {
'id': uuidutils.generate_uuid(), 'status': 'foo',
'properties': {"hw_numa_nodes": 2}}
with mock.patch(
'nova.objects.instance.Instance.image_meta',
new_callable=mock.PropertyMock(return_value=old_image_meta)):
# the original image requested 1 NUMA node and the image used
# for rebuild requests 2 so assert an error is raised.
self.assertRaises(
exception.ImageNUMATopologyRebuildConflict,
self.compute_api._validate_numa_rebuild, instance,
image, flavor)

def test_validate_and_build_base_options_translate_neutron_secgroup(self):
"""Tests that _check_requested_secgroups will return a uuid for a
requested Neutron security group and that will be returned from


+ 16
- 0
releasenotes/notes/numa-rebuild-b75f9a1966f576ea.yaml View File

@@ -0,0 +1,16 @@
---
fixes:
- |
An instance can be rebuilt in-place with the original image or a new
image. Instance resource usage cannot be altered during a rebuild.
Previously Nova would have ignored the NUMA topology of the new image
continuing to use the NUMA topology of the existing instance until a move
operation was performed. As Nova did not explicitly guard against
inadvertent changes in resource request contained in a new image,
it was possible to rebuild with an image that would violate this requirement
`bug #1763766`_. This resulted in an inconsistent state as the instance that
was running did not match the instance that was requested. Nova now explicitly
checks if a rebuild would alter the requested NUMA topology of an instance
and rejects the rebuild.

.. _`bug #1763766`: https://bugs.launchpad.net/nova/+bug/1763766

Loading…
Cancel
Save