Merge "use cinder migrate for swap volume" into stable/2024.2
releasenotes/notes/bug-2112187-763bae283e0b736d.yaml (new file, 47 lines)
@@ -0,0 +1,47 @@
+---
+security:
+  - |
+    Watcher no longer forges requests on behalf of a tenant when
+    swapping volumes. Prior to this release Watcher had two
+    implementations for moving a volume: it could use Cinder's volume
+    migrate API, or its own internal implementation that directly called
+    the Nova volume attachment update API. The former is safe and the
+    recommended way to move volumes between Cinder storage backends;
+    the internal implementation was insecure, fragile due to a lack of
+    error handling, and capable of deleting user data.
+
+    Insecure: the internal volume migration operation created a new
+    keystone user with a weak name and password and added it to the
+    tenant's project with the admin role. It then used that user to
+    forge requests on behalf of the tenant, with admin rights, to swap
+    the volume. If the applier was restarted during the execution of
+    this operation, the temporary user would never be cleaned up.
+
+    Fragile: the error handling was minimal. The swap volume API is
+    asynchronous, so Watcher has to poll for completion; there was no
+    support to resume polling if it was interrupted or the timeout was
+    exceeded.
+
+    Data loss: while the internal polling logic returned success or
+    failure, Watcher did not check the result; once the function
+    returned, it unconditionally deleted the source volume. For larger
+    volumes this could result in irretrievable data loss.
+
+    Finally, if a volume was swapped using the internal workflow, it
+    left the Nova instance in an out-of-sync state. If the VM was live
+    migrated after the swap volume completed successfully, but prior to
+    a hard reboot, the migration would either fail or succeed and break
+    tenant isolation.
+
+    See https://bugs.launchpad.net/nova/+bug/2112187 for details.
+fixes:
+  - |
+    All code related to creating keystone users and granting roles has
+    been removed. The internal swap volume implementation has been
+    removed and replaced by Cinder's volume migrate API. Note that, as
+    part of this change, Watcher will no longer attempt volume
+    migrations or retypes if the instance is in the `Verify Resize`
+    task state. This resolves several issues related to volume migration
+    in the zone migration and storage capacity balance strategies. While
+    efforts have been made to maintain backward compatibility, these
+    changes are required to address a security weakness in Watcher's
+    prior approach.
+
+    See https://bugs.launchpad.net/nova/+bug/2112187 for more context.
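The fix replaces client-side copy-and-swap with Cinder's server-side volume migrate API. As a point of reference, a minimal sketch of that call using python-cinderclient is shown below; it is illustrative only, and the helper name and session handling are assumptions, not code from this commit:

```python
# Sketch: server-side volume migration, the approach this fix adopts.
# Cinder (with nova's help for in-use volumes) performs the data copy and
# retires the source itself, so no temporary keystone user is created and
# the client never deletes the source volume.
from cinderclient import client as cinder_client


def migrate_volume(session, volume_id, dest_host):
    # `session` is assumed to be a keystoneauth1 session for a user that
    # is allowed to migrate volumes (typically an operator).
    cinder = cinder_client.Client('3', session=session)
    volume = cinder.volumes.get(volume_id)
    # lock_volume=True asks cinder to block state changes mid-migration.
    cinder.volumes.migrate_volume(
        volume, dest_host, force_host_copy=False, lock_volume=True)
    return volume
```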
@@ -17,14 +17,11 @@ import jsonschema
 
 from oslo_log import log
 
-from cinderclient import client as cinder_client
 from watcher._i18n import _
 from watcher.applier.actions import base
 from watcher.common import cinder_helper
 from watcher.common import exception
-from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils
 from watcher import conf
 
 CONF = conf.CONF
@@ -70,8 +67,6 @@ class VolumeMigrate(base.BaseAction):
 
     def __init__(self, config, osc=None):
         super(VolumeMigrate, self).__init__(config)
-        self.temp_username = utils.random_string(10)
-        self.temp_password = utils.random_string(10)
         self.cinder_util = cinder_helper.CinderHelper(osc=self.osc)
         self.nova_util = nova_helper.NovaHelper(osc=self.osc)
 
@@ -134,83 +129,42 @@ class VolumeMigrate(base.BaseAction):
 
     def _can_swap(self, volume):
         """Judge volume can be swapped"""
+        # TODO(sean-k-mooney): rename this to _can_migrate and update
+        # tests to reflect that.
+
+        # cinder volume migration can migrate volumes that are not
+        # attached to instances or nova can migrate the data for cinder
+        # if the volume is in-use. If the volume has no attachments
+        # allow cinder to decide if it can be migrated.
         if not volume.attachments:
-            return False
+            LOG.debug(f"volume: {volume.id} has no attachments")
+            return True
+        # since it has attachments we need to validate nova's constraints
         instance_id = volume.attachments[0]['server_id']
         instance_status = self.nova_util.find_instance(instance_id).status
-
-        if (volume.status == 'in-use' and
-                instance_status in ('ACTIVE', 'PAUSED', 'RESIZED')):
-            return True
-
-        return False
-
-    def _create_user(self, volume, user):
-        """Create user with volume attribute and user information"""
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        project_id = getattr(volume, 'os-vol-tenant-attr:tenant_id')
-        user['project'] = project_id
-        user['domain'] = keystone_util.get_project(project_id).domain_id
-        user['roles'] = ['admin']
-        return keystone_util.create_user(user)
-
-    def _get_cinder_client(self, session):
-        """Get cinder client by session"""
-        return cinder_client.Client(
-            CONF.cinder_client.api_version,
-            session=session,
-            endpoint_type=CONF.cinder_client.endpoint_type)
-
-    def _swap_volume(self, volume, dest_type):
-        """Swap volume to dest_type
-
-        Limitation note: only for compute libvirt driver
-        """
-        if not dest_type:
-            raise exception.Invalid(
-                message=(_("destination type is required when "
-                           "migration type is swap")))
-
-        if not self._can_swap(volume):
-            raise exception.Invalid(
-                message=(_("Invalid state for swapping volume")))
-
-        user_info = {
-            'name': self.temp_username,
-            'password': self.temp_password}
-        user = self._create_user(volume, user_info)
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        try:
-            session = keystone_util.create_session(
-                user.id, self.temp_password)
-            temp_cinder = self._get_cinder_client(session)
-
-            # swap volume
-            new_volume = self.cinder_util.create_volume(
-                temp_cinder, volume, dest_type)
-            self.nova_util.swap_volume(volume, new_volume)
-
-            # delete old volume
-            self.cinder_util.delete_volume(volume)
-
-        finally:
-            keystone_util.delete_user(user)
-
-        return True
+        LOG.debug(
+            f"volume: {volume.id} is attached to instance: {instance_id} "
+            f"in instance status: {instance_status}")
+        # NOTE(sean-k-mooney): This used to allow RESIZED which
+        # is the resize_verify task state, that is not an acceptable time
+        # to migrate volumes, if nova does not block this in the API
+        # today that is probably a bug. PAUSED is also questionable but
+        # it should generally be safe.
+        return (volume.status == 'in-use' and
                instance_status in ('ACTIVE', 'PAUSED'))
 
     def _migrate(self, volume_id, dest_node, dest_type):
 
         try:
             volume = self.cinder_util.get_volume(volume_id)
-            if self.migration_type == self.SWAP:
-                if dest_node:
-                    LOG.warning("dest_node is ignored")
-                return self._swap_volume(volume, dest_type)
+            # for backward compatibility map swap to migrate.
+            if self.migration_type in (self.SWAP, self.MIGRATE):
+                if not self._can_swap(volume):
+                    raise exception.Invalid(
+                        message=(_("Invalid state for swapping volume")))
+                return self.cinder_util.migrate(volume, dest_node)
            elif self.migration_type == self.RETYPE:
                 return self.cinder_util.retype(volume, dest_type)
-            elif self.migration_type == self.MIGRATE:
-                return self.cinder_util.migrate(volume, dest_node)
             else:
                 raise exception.Invalid(
                     message=(_("Migration of type '%(migration_type)s' is not "
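The release note stresses that volume migration is asynchronous, so the caller has to poll for completion and must not delete anything until the outcome is known. A hedged sketch of such a polling loop follows; the function name, attribute handling, and timeouts are illustrative assumptions, not watcher's actual cinder_helper internals:

```python
# Sketch: poll an async cinder volume migration to completion instead of
# assuming success. All names and intervals here are illustrative.
import time


def wait_for_volume_migration(cinder, volume_id, dest_host,
                              timeout=600, interval=5):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        volume = cinder.volumes.get(volume_id)
        if volume.status == 'error':
            raise RuntimeError(f"migration of volume {volume_id} failed")
        # cinder exposes the owning backend via an extended attribute;
        # the '#pool' suffix is stripped before comparing hosts.
        host = getattr(volume, 'os-vol-host-attr:host', '') or ''
        if (host.split('#')[0] == dest_host
                and volume.status in ('available', 'in-use')):
            return volume
        time.sleep(interval)
    raise TimeoutError(f"migration of volume {volume_id} timed out")
```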
@@ -15,8 +15,6 @@
 from oslo_log import log
 
 from keystoneauth1.exceptions import http as ks_exceptions
-from keystoneauth1 import loading
-from keystoneauth1 import session
 from watcher._i18n import _
 from watcher.common import clients
 from watcher.common import exception
@@ -90,35 +88,3 @@ class KeystoneHelper(object):
                 message=(_("Domain name seems ambiguous: %s") %
                          name_or_id))
         return domains[0]
-
-    def create_session(self, user_id, password):
-        user = self.get_user(user_id)
-        loader = loading.get_plugin_loader('password')
-        auth = loader.load_from_options(
-            auth_url=CONF.watcher_clients_auth.auth_url,
-            password=password,
-            user_id=user_id,
-            project_id=user.default_project_id)
-        return session.Session(auth=auth)
-
-    def create_user(self, user):
-        project = self.get_project(user['project'])
-        domain = self.get_domain(user['domain'])
-        _user = self.keystone.users.create(
-            user['name'],
-            password=user['password'],
-            domain=domain,
-            project=project,
-        )
-        for role in user['roles']:
-            role = self.get_role(role)
-            self.keystone.roles.grant(
-                role.id, user=_user.id, project=project.id)
-        return _user
-
-    def delete_user(self, user):
-        try:
-            user = self.get_user(user)
-            self.keystone.users.delete(user)
-        except exception.Invalid:
-            pass
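The removed `create_session` built a password-authenticated keystone session for the throwaway user, which is what let the applier act as the tenant. For reference, an equivalent construction with keystoneauth1's identity plugin looks roughly like this (illustrative; the removed code used the `loading` plugin interface shown above instead):

```python
# Sketch: what a password-scoped session amounts to. Because the
# throwaway user held the admin role on the tenant's project, any client
# built on this session acted as that tenant with admin rights.
from keystoneauth1.identity import v3
from keystoneauth1 import session as ks_session


def password_session(auth_url, user_id, password, project_id):
    auth = v3.Password(auth_url=auth_url, user_id=user_id,
                       password=password, project_id=project_id)
    return ks_session.Session(auth=auth)
```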
@@ -19,9 +19,7 @@
 import asyncio
 import datetime
 import inspect
-import random
 import re
-import string
 
 from croniter import croniter
 import eventlet
@@ -160,14 +158,10 @@ def extend_with_strict_schema(validator_class):
 StrictDefaultValidatingDraft4Validator = extend_with_default(
     extend_with_strict_schema(validators.Draft4Validator))
 
 
 Draft4Validator = validators.Draft4Validator
 
 
-def random_string(n):
-    return ''.join([random.choice(
-        string.ascii_letters + string.digits) for i in range(n)])
-
-
 # Some clients (e.g. MAAS) use asyncio, which isn't compatible with Eventlet.
 # As a workaround, we're delegating such calls to a native thread.
 def async_compat_call(f, *args, **kwargs):
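The removed helper above is part of why the release note calls the generated credentials weak: `random.choice` is not a cryptographically secure source, so the temporary usernames and passwords were predictable. If a random credential were ever needed again, the stdlib `secrets` module is the appropriate building block; this is a general illustration, not code from this commit:

```python
# Illustration only: a CSPRNG-backed equivalent of the removed helper.
import secrets
import string


def strong_random_string(n):
    alphabet = string.ascii_letters + string.digits
    return ''.join(secrets.choice(alphabet) for _ in range(n))
```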
@@ -57,8 +57,19 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
         self.planned_cold_count = 0
         self.volume_count = 0
         self.planned_volume_count = 0
-        self.volume_update_count = 0
-        self.planned_volume_update_count = 0
+
+    # TODO(sean-k-mooney): This is backward compatibility
+    # for calling the swap code paths. Swap is now an alias
+    # for migrate, we should clean this up in a future
+    # cycle.
+    @property
+    def volume_update_count(self):
+        return self.volume_count
+
+    # same as above clean up later.
+    @property
+    def planned_volume_update_count(self):
+        return self.planned_volume_count
 
     @classmethod
     def get_name(cls):
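The two properties added above follow a common backward-compatibility pattern: the legacy counter names stay readable while delegating to the new counter. A minimal standalone illustration of the pattern (a hypothetical class, not watcher code):

```python
# Illustration: alias a legacy attribute name to its replacement with a
# read-only property so existing callers keep working.
class MigrationCounters:
    def __init__(self):
        self.volume_count = 0

    @property
    def volume_update_count(self):
        # legacy name retained for existing reporting code
        return self.volume_count


counters = MigrationCounters()
counters.volume_count += 1
assert counters.volume_update_count == 1
```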
@@ -312,8 +323,8 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             planned_cold_migrate_instance_count=self.planned_cold_count,
             volume_migrate_count=self.volume_count,
             planned_volume_migrate_count=self.planned_volume_count,
-            volume_update_count=self.volume_update_count,
-            planned_volume_update_count=self.planned_volume_update_count
+            volume_update_count=self.volume_count,
+            planned_volume_update_count=self.planned_volume_count
         )
 
     def set_migration_count(self, targets):
@@ -328,10 +339,7 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             elif self.is_cold(instance):
                 self.cold_count += 1
         for volume in targets.get('volume', []):
-            if self.is_available(volume):
-                self.volume_count += 1
-            elif self.is_in_use(volume):
-                self.volume_update_count += 1
+            self.volume_count += 1
 
     def is_live(self, instance):
         status = getattr(instance, 'status')
@@ -404,19 +412,16 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
 
             LOG.debug(src_type)
             LOG.debug("%s %s", dst_pool, dst_type)
 
-            if self.is_available(volume):
-                if src_type == dst_type:
-                    self._volume_migrate(volume, dst_pool)
-                else:
-                    self._volume_retype(volume, dst_type)
-            elif self.is_in_use(volume):
-                self._volume_update(volume, dst_type)
+            if src_type == dst_type:
+                self._volume_migrate(volume, dst_pool)
+            else:
+                self._volume_retype(volume, dst_type)
 
-                # if with_attached_volume is True, migrate attaching instances
-                if self.with_attached_volume:
-                    instances = [self.nova.find_instance(dic.get('server_id'))
-                                 for dic in volume.attachments]
-                    self.instances_migration(instances, action_counter)
+            # if with_attached_volume is True, migrate attaching instances
+            if self.with_attached_volume:
+                instances = [self.nova.find_instance(dic.get('server_id'))
+                             for dic in volume.attachments]
+                self.instances_migration(instances, action_counter)
 
             action_counter.add_pool(pool)
@@ -464,16 +469,6 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             input_parameters=parameters)
         self.planned_cold_count += 1
 
-    def _volume_update(self, volume, dst_type):
-        parameters = {"migration_type": "swap",
-                      "destination_type": dst_type,
-                      "resource_name": volume.name}
-        self.solution.add_action(
-            action_type="volume_migrate",
-            resource_id=volume.id,
-            input_parameters=parameters)
-        self.planned_volume_update_count += 1
-
     def _volume_migrate(self, volume, dst_pool):
         parameters = {"migration_type": "migrate",
                       "destination_node": dst_pool,
@@ -22,7 +22,6 @@ from watcher.common import cinder_helper
 from watcher.common import clients
-from watcher.common import keystone_helper
 from watcher.common import nova_helper
 from watcher.common import utils as w_utils
 from watcher.tests import base
 
 
@@ -102,12 +101,15 @@ class TestMigration(base.TestCase):
 
     @staticmethod
     def fake_volume(**kwargs):
+        # FIXME(sean-k-mooney): we should be using real objects in this
+        # test or at least something more representative of the real data
         volume = mock.MagicMock()
         volume.id = kwargs.get('id', TestMigration.VOLUME_UUID)
         volume.size = kwargs.get('size', '1')
         volume.status = kwargs.get('status', 'available')
         volume.snapshot_id = kwargs.get('snapshot_id', None)
         volume.availability_zone = kwargs.get('availability_zone', 'nova')
+        volume.attachments = kwargs.get('attachments', [])
         return volume
 
     @staticmethod
@@ -175,42 +177,14 @@ class TestMigration(base.TestCase):
             "storage1-typename",
         )
 
-    def test_swap_success(self):
-        volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
-        self.m_n_helper.find_instance.return_value = self.fake_instance()
-
-        new_volume = self.fake_volume(id=w_utils.generate_uuid())
-        user = mock.Mock()
-        session = mock.MagicMock()
-        self.m_k_helper.create_user.return_value = user
-        self.m_k_helper.create_session.return_value = session
-        self.m_c_helper.get_volume.return_value = volume
-        self.m_c_helper.create_volume.return_value = new_volume
-
-        result = self.action_swap.execute()
-        self.assertTrue(result)
-
-        self.m_n_helper.swap_volume.assert_called_once_with(
-            volume,
-            new_volume
-        )
-        self.m_k_helper.delete_user.assert_called_once_with(user)
-
-    def test_swap_fail(self):
-        # _can_swap fail
-        instance = self.fake_instance(status='STOPPED')
-        self.m_n_helper.find_instance.return_value = instance
-
-        result = self.action_swap.execute()
-        self.assertFalse(result)
-
     def test_can_swap_success(self):
         volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
-        instance = self.fake_instance()
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
+
+        instance = self.fake_instance()
         self.m_n_helper.find_instance.return_value = instance
+
         result = self.action_swap._can_swap(volume)
         self.assertTrue(result)
@@ -219,16 +193,33 @@ class TestMigration(base.TestCase):
         result = self.action_swap._can_swap(volume)
         self.assertTrue(result)
 
-        instance = self.fake_instance(status='RESIZED')
-        self.m_n_helper.find_instance.return_value = instance
-        result = self.action_swap._can_swap(volume)
-        self.assertTrue(result)
-
     def test_can_swap_fail(self):
         volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
         instance = self.fake_instance(status='STOPPED')
         self.m_n_helper.find_instance.return_value = instance
         result = self.action_swap._can_swap(volume)
         self.assertFalse(result)
 
+        instance = self.fake_instance(status='RESIZED')
+        self.m_n_helper.find_instance.return_value = instance
+        result = self.action_swap._can_swap(volume)
+        self.assertFalse(result)
+
+    def test_swap_success(self):
+        volume = self.fake_volume(
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
+        self.m_c_helper.get_volume.return_value = volume
+
+        instance = self.fake_instance()
+        self.m_n_helper.find_instance.return_value = instance
+
+        result = self.action_swap.execute()
+        self.assertTrue(result)
+        self.m_c_helper.migrate.assert_called_once_with(
+            volume,
+            "storage1-poolname"
+        )
@@ -320,7 +320,10 @@ class TestZoneMigration(TestBaseStrategy):
 
         migration_types = collections.Counter(
             [action.get('input_parameters')['migration_type']
              for action in solution.actions])
-        self.assertEqual(1, migration_types.get("swap", 0))
+        # watcher no longer implements swap. it is now an
+        # alias for migrate.
+        self.assertEqual(0, migration_types.get("swap", 0))
+        self.assertEqual(1, migration_types.get("migrate", 1))
         global_efficacy_value = solution.global_efficacy[3].get('value', 0)
         self.assertEqual(100, global_efficacy_value)