Support online volume resize

Trove now supports resizing a volume without downtime. To use this
feature, Nova and Cinder must both be at the Pike release or later, and
the config option ``cinder_service_type`` must be set to ``volumev3``.
The cloud admin can disable this feature by setting
``online_volume_resize=False``; it is enabled by default.

Change-Id: I000a4e90800454972dd39f2f82d286571bc0b96c
This commit is contained in:
Lingxian Kong 2020-07-22 15:41:21 +12:00
parent 39b0df0a6b
commit ba046b2a14
12 changed files with 103 additions and 138 deletions

View File

@ -0,0 +1,7 @@
---
features:
- Trove now supports resizing a volume without downtime. To use this feature,
Nova and Cinder must both be at the Pike release or later, and the config
option ``cinder_service_type`` must be set to ``volumev3``. The cloud admin
can disable this feature by setting ``online_volume_resize=False``; it is
enabled by default.

View File

@ -93,7 +93,7 @@ common_opts = [
cfg.BoolOpt('neutron_api_insecure', default=False,
help="Allow to perform insecure SSL requests to neutron."),
cfg.URIOpt('cinder_url', help='URL without the tenant segment.'),
cfg.StrOpt('cinder_service_type', default='volumev2',
cfg.StrOpt('cinder_service_type', default='volumev3',
help='Service type to use when searching catalog.'),
cfg.StrOpt('cinder_endpoint_type', default='publicURL',
help='Service endpoint type to use when searching catalog.'),
@ -475,7 +475,10 @@ common_opts = [
help='The docker image used for backup and restore.'),
cfg.ListOpt('reserved_network_cidrs', default=[],
help='Network CIDRs reserved for Trove guest instance '
'management.')
'management.'),
cfg.BoolOpt(
'online_volume_resize', default=True,
help='If online volume resize is supported.')
]

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from cinderclient.v2 import client as CinderClient
from cinderclient import client as CinderClient
import glanceclient
from keystoneauth1 import loading
from keystoneauth1 import session
@ -95,13 +95,17 @@ def cinder_client_trove_admin(context, region_name=None):
LOG.debug('Re-use admin cinder client')
return ADMIN_CINDER_CLIENT
version = CONF.cinder_service_type.split('v')[-1] or '3'
ks_session = get_keystone_session()
ADMIN_CINDER_CLIENT = CinderClient.Client(
version,
session=ks_session,
service_type=CONF.cinder_service_type,
region_name=region_name or CONF.service_credentials.region_name,
insecure=CONF.cinder_api_insecure,
endpoint_type=CONF.cinder_endpoint_type)
endpoint_type=CONF.cinder_endpoint_type,
additional_headers={'OpenStack-API-Version': 'volumev3 latest'})
if CONF.cinder_url and CONF.service_credentials.project_id:
ADMIN_CINDER_CLIENT.client.management_url = "%s/%s/" % (

View File

@ -205,23 +205,6 @@ class TroveInstanceCreate(TroveCommonTraits):
super(TroveInstanceCreate, self).notify('create')
class TroveInstanceModifyVolume(TroveCommonTraits):
'''
Additional traits for trove.instance.create notifications that describe
instance action events
This class should correspond to trove_instance_modify_volume in
ceilometer/event_definitions.yaml
'''
def __init__(self, **kwargs):
super(TroveInstanceModifyVolume, self).__init__(**kwargs)
def notify(self):
super(TroveInstanceModifyVolume, self).notify('modify_volume')
class TroveInstanceModifyFlavor(TroveCommonTraits):
'''

View File

@ -473,7 +473,7 @@ class API(object):
self.agent_low_timeout, version=version,
device_path=device_path, mount_point=mount_point)
def resize_fs(self, device_path=None, mount_point=None):
def resize_fs(self, device_path=None, mount_point=None, online=False):
"""Resize the filesystem."""
LOG.debug("Resize device %(device)s on instance %(id)s.", {
'device': device_path, 'id': self.id})
@ -481,7 +481,8 @@ class API(object):
self._call("resize_fs",
self.agent_high_timeout, version=version,
device_path=device_path, mount_point=mount_point)
device_path=device_path, mount_point=mount_point,
online=online)
def update_overrides(self, overrides, remove=False):
"""Update the overrides."""

View File

@ -364,10 +364,11 @@ class Manager(periodic_task.PeriodicTasks):
device = volume.VolumeDevice(device_path)
device.unmount(mount_point)
def resize_fs(self, context, device_path=None, mount_point=None):
LOG.debug("Resizing the filesystem at %s.", mount_point)
def resize_fs(self, context, device_path=None, mount_point=None,
online=False):
LOG.info(f"Resizing the filesystem at {mount_point}, online: {online}")
device = volume.VolumeDevice(device_path)
device.resize_fs(mount_point)
device.resize_fs(mount_point, online=online)
###############
# Configuration

View File

@ -71,7 +71,7 @@ class FSBase(object):
"""
@abc.abstractmethod
def resize(self, device_path):
def resize(self, device_path, online=False):
"""
Resize the filesystem on device
"""
@ -113,9 +113,10 @@ class FSExt(FSBase):
exc_fmt = _("Volume '%s' was not formatted.")
log_and_raise(log_fmt, exc_fmt, device_path)
def resize(self, device_path):
utils.execute("e2fsck", "-f", "-p", device_path,
run_as_root=True, root_helper="sudo")
def resize(self, device_path, online=False):
if not online:
utils.execute("e2fsck", "-f", "-p", device_path,
run_as_root=True, root_helper="sudo")
utils.execute("resize2fs", device_path,
run_as_root=True, root_helper="sudo")
@ -158,7 +159,7 @@ class FSXFS(FSBase):
device_path)
raise exception.GuestError(original_message=msg)
def resize(self, device_path):
def resize(self, device_path, online=False):
utils.execute("xfs_repair", device_path,
run_as_root=True, root_helper="sudo")
utils.execute("mount", device_path,
@ -263,18 +264,18 @@ class VolumeDevice(object):
return True
def resize_fs(self, mount_point):
def resize_fs(self, mount_point, online=False):
"""Resize the filesystem on the specified device."""
self._check_device_exists()
# Some OS's will mount a file systems after it's attached if
# an entry is put in the fstab file (like Trove does).
# Thus it may be necessary to wait for the mount and then unmount
# the fs again (since the volume was just attached).
if self._wait_for_mount(mount_point, timeout=2):
if not online and self._wait_for_mount(mount_point, timeout=2):
LOG.debug("Unmounting '%s' before resizing.", mount_point)
self.unmount(mount_point)
try:
self.volume_fs.resize(self.device_path)
self.volume_fs.resize(self.device_path, online=online)
except exception.ProcessExecutionError:
log_fmt = "Error resizing the filesystem with device '%s'."
exc_fmt = _("Error resizing the filesystem with device '%s'.")

View File

@ -17,7 +17,6 @@ import os.path
import time
import traceback
from cinderclient import exceptions as cinder_exceptions
from eventlet import greenthread
from eventlet.timeout import Timeout
from oslo_log import log as logging
@ -55,7 +54,6 @@ from trove.common.notification import EndNotification
from trove.common.notification import StartNotification
from trove.common.notification import TroveInstanceCreate
from trove.common.notification import TroveInstanceModifyFlavor
from trove.common.notification import TroveInstanceModifyVolume
from trove.common.strategies.cluster import strategy
from trove.common.utils import try_recover
from trove.extensions.mysql import models as mysql_models
@ -1512,11 +1510,11 @@ class ResizeVolumeAction(object):
return self.instance.device_path
def _fail(self, orig_func):
LOG.exception("%(func)s encountered an error when "
"attempting to resize the volume for "
"instance %(id)s. Setting service "
"status to failed.", {'func': orig_func.__name__,
'id': self.instance.id})
LOG.error("%(func)s encountered an error when "
"attempting to resize the volume for "
"instance %(id)s. Setting service "
"status to failed.", {'func': orig_func.__name__,
'id': self.instance.id})
service = InstanceServiceStatus.find_by(instance_id=self.instance.id)
service.set_status(srvstatus.ServiceStatuses.FAILED)
service.save()
@ -1539,12 +1537,12 @@ class ResizeVolumeAction(object):
self.instance.restart()
def _recover_full(self, orig_func):
LOG.exception("%(func)s encountered an error when attempting to "
"resize the volume for instance %(id)s. Trying to "
"recover by attaching and"
" mounting the volume and then restarting the "
"guest.", {'func': orig_func.__name__,
'id': self.instance.id})
LOG.error("%(func)s encountered an error when attempting to "
"resize the volume for instance %(id)s. Trying to "
"recover by attaching and"
" mounting the volume and then restarting the "
"guest.", {'func': orig_func.__name__,
'id': self.instance.id})
self._attach_volume()
self._mount_volume()
self.instance.restart()
@ -1609,16 +1607,16 @@ class ResizeVolumeAction(object):
'id': self.instance.id})
@try_recover
def _resize_fs(self):
LOG.debug("Resizing the filesystem for instance %(id)s", {
'id': self.instance.id})
def _resize_fs(self, online=False):
LOG.info(f"Resizing the filesystem for instance {self.instance.id}, "
f"online: {online}")
mount_point = self.get_mount_point()
device_path = self.get_device_path()
self.instance.guest.resize_fs(device_path=device_path,
mount_point=mount_point)
LOG.debug("Successfully resized volume %(vol_id)s filesystem for "
"instance %(id)s", {'vol_id': self.instance.volume_id,
'id': self.instance.id})
mount_point=mount_point,
online=online)
LOG.debug(f"Successfully resized volume {self.instance.volume_id} "
f"filesystem for instance {self.instance.id}")
@try_recover
def _mount_volume(self):
@ -1634,10 +1632,8 @@ class ResizeVolumeAction(object):
@try_recover
def _extend(self):
LOG.debug("Extending volume %(vol_id)s for instance %(id)s to "
"size %(size)s", {'vol_id': self.instance.volume_id,
'id': self.instance.id,
'size': self.new_size})
LOG.info(f"Calling Cinder to extend volume {self.instance.volume_id} "
f"for instance {self.instance.id} to size {self.new_size}")
self.instance.volume_client.volumes.extend(self.instance.volume_id,
self.new_size)
LOG.debug("Successfully extended the volume %(vol_id)s for instance "
@ -1649,9 +1645,8 @@ class ResizeVolumeAction(object):
volume = self.instance.volume_client.volumes.get(
self.instance.volume_id)
if not volume:
msg = (_('Failed to get volume %(vol_id)s') % {
'vol_id': self.instance.volume_id})
raise cinder_exceptions.ClientException(msg)
msg = f'Failed to get volume {self.instance.volume_id}'
raise exception.TroveError(msg)
def volume_is_new_size():
volume = self.instance.volume_client.volumes.get(
@ -1659,34 +1654,46 @@ class ResizeVolumeAction(object):
return volume.size == self.new_size
utils.poll_until(volume_is_new_size,
sleep_time=2,
sleep_time=5,
time_out=CONF.volume_time_out)
self.instance.update_db(volume_size=self.new_size)
except PollTimeOut:
LOG.exception("Timeout trying to extend the volume %(vol_id)s "
"for instance %(id)s",
{'vol_id': self.instance.volume_id,
'id': self.instance.id})
LOG.error("Timeout trying to extend the volume %(vol_id)s "
"for instance %(id)s",
{'vol_id': self.instance.volume_id,
'id': self.instance.id})
volume = self.instance.volume_client.volumes.get(
self.instance.volume_id)
if volume.status == 'extending':
self._fail(self._verify_extend)
elif volume.size != self.new_size:
self.instance.update_db(volume_size=volume.size)
self._recover_full(self._verify_extend)
if not CONF.online_volume_resize:
self._recover_full(self._verify_extend)
raise
except Exception:
LOG.exception("Error encountered trying to verify extend for "
"the volume %(vol_id)s for instance %(id)s",
{'vol_id': self.instance.volume_id,
'id': self.instance.id})
self._recover_full(self._verify_extend)
except Exception as e:
LOG.error("Error encountered trying to verify extend for "
"the volume %(vol_id)s for instance %(id)s, "
"error: %(error)s",
{'vol_id': self.instance.volume_id,
'id': self.instance.id,
'error': str(e)})
if not CONF.online_volume_resize:
self._recover_full(self._verify_extend)
raise
def _resize_active_volume(self):
LOG.debug("Begin _resize_active_volume for id: %(id)s", {
'id': self.instance.id})
if CONF.online_volume_resize:
try:
self._extend()
except Exception as e:
LOG.error(f'Failed to extend volume, error: {str(e)}')
self._verify_extend()
self._resize_fs(recover_func=self._fail, online=True)
return
self._stop_db()
self._unmount_volume(recover_func=self._recover_restart)
self._detach_volume(recover_func=self._recover_mount_restart)
@ -1694,11 +1701,9 @@ class ResizeVolumeAction(object):
self._verify_extend()
# if anything fails after this point, recovery is futile
self._attach_volume(recover_func=self._fail)
self._resize_fs(recover_func=self._fail)
self._resize_fs(recover_func=self._fail, online=False)
self._mount_volume(recover_func=self._fail)
self.instance.restart()
LOG.debug("End _resize_active_volume for id: %(id)s", {
'id': self.instance.id})
def execute(self):
LOG.debug("%(gt)s: Resizing instance %(id)s volume for server "
@ -1711,19 +1716,11 @@ class ResizeVolumeAction(object):
if self.instance.server.status in [InstanceStatus.ACTIVE,
InstanceStatus.HEALTHY]:
self._resize_active_volume()
self.instance.reset_task_status()
# send usage event for size reported by cinder
volume = self.instance.volume_client.volumes.get(
self.instance.volume_id)
launched_time = timeutils.isotime(self.instance.updated)
modified_time = timeutils.isotime(self.instance.updated)
TroveInstanceModifyVolume(instance=self.instance,
old_volume_size=self.old_size,
launched_at=launched_time,
modify_at=modified_time,
volume_size=volume.size,
).notify()
try:
self._resize_active_volume()
finally:
self.instance.reset_task_status()
else:
self.instance.reset_task_status()
msg = (

View File

@ -546,7 +546,6 @@ class ResizeInstanceVolumeTest(ActionTestBase):
self.new_volume_size)
@test(depends_on=[test_volume_resize])
@time_out(300)
def test_volume_resize_success(self):
"""test_volume_resize_success"""
@ -559,7 +558,8 @@ class ResizeInstanceVolumeTest(ActionTestBase):
else:
asserts.fail("Status should not be %s" % instance.status)
poll_until(check_resize_status, sleep_time=2, time_out=300)
poll_until(check_resize_status, sleep_time=5, time_out=300,
initial_delay=5)
instance = instance_info.dbaas.instances.get(instance_info.id)
asserts.assert_equal(instance.volume['size'], self.new_volume_size)

View File

@ -330,7 +330,7 @@ class FakeGuest(object):
def unmount_volume(self, device_path=None, mount_point=None):
pass
def resize_fs(self, device_path=None, mount_point=None):
def resize_fs(self, device_path=None, mount_point=None, online=False):
pass
def update_overrides(self, overrides, remove=False):

View File

@ -13,17 +13,19 @@
# License for the specific language governing permissions and limitations
# under the License.
#
from unittest.mock import Mock, patch
from unittest.mock import Mock
from unittest.mock import patch
from oslo_utils import timeutils
from trove import rpc
from trove.common import cfg
from trove.common.context import TroveContext
from trove.common import exception
from trove.common import notification
from trove.common.notification import EndNotification, StartNotification
from trove.common.context import TroveContext
from trove.common.notification import EndNotification
from trove.common.notification import StartNotification
from trove.conductor import api as conductor_api
from trove import rpc
from trove.tests.unittests import trove_testtools
@ -227,30 +229,6 @@ class TestTroveInstanceDelete(trove_testtools.TestCase):
self.assertTrue(notifier().info.called)
class TestTroveInstanceModifyVolume(trove_testtools.TestCase):
def setUp(self):
super(TestTroveInstanceModifyVolume, self).setUp()
self.instance = Mock(db_info=Mock(created=timeutils.utcnow()))
@patch.object(cfg.CONF, 'get', Mock())
@patch.object(rpc, 'get_notifier')
def test_notification(self, notifier):
notification.TroveInstanceModifyVolume(instance=self.instance).notify()
self.assertTrue(notifier().info.called)
@patch.object(cfg.CONF, 'get', Mock())
@patch.object(rpc, 'get_notifier')
def test_notification_after_serialization(self, notifier):
orig_notify = notification.TroveInstanceModifyVolume(
instance=self.instance)
serialized = orig_notify.serialize(None)
new_notify = notification.TroveInstanceModifyVolume().deserialize(
None, serialized)
new_notify.notify()
self.assertTrue(notifier().info.called)
class TestTroveInstanceModifyFlavor(trove_testtools.TestCase):
def setUp(self):

View File

@ -39,10 +39,10 @@ import trove.backup.models
from trove.common import timeutils
from trove.common import utils
import trove.common.context
from trove.common import exception
from trove.common.exception import GuestError
from trove.common.exception import PollTimeOut
from trove.common.exception import TroveError
from trove.common.notification import TroveInstanceModifyVolume
import trove.common.template as template
from trove.datastore import models as datastore_models
import trove.db.models
@ -627,11 +627,10 @@ class ResizeVolumeTest(trove_testtools.TestCase):
self.instance.volume_client.volumes.extend.side_effect = None
self.instance.reset_mock()
@patch('trove.taskmanager.models.LOG')
def test_resize_volume_verify_extend_no_volume(self, mock_logging):
def test_resize_volume_verify_extend_no_volume(self):
self.instance.volume_client.volumes.get = Mock(
return_value=None)
self.assertRaises(cinder_exceptions.ClientException,
self.assertRaises(exception.TroveError,
self.action._verify_extend)
self.instance.reset_mock()
@ -643,29 +642,20 @@ class ResizeVolumeTest(trove_testtools.TestCase):
utils.poll_until.side_effect = None
self.instance.reset_mock()
@patch.object(TroveInstanceModifyVolume, 'notify')
def test_resize_volume_active_server_succeeds(self, *args):
server = Mock(status=InstanceStatus.ACTIVE)
self.instance.attach_mock(server, 'server')
self.action.execute()
self.assertEqual(1, self.instance.guest.stop_db.call_count)
self.assertEqual(1, self.instance.guest.unmount_volume.call_count)
detach_count = (
self.instance.nova_client.volumes.delete_server_volume.call_count)
self.assertEqual(1, detach_count)
extend_count = self.instance.volume_client.volumes.extend.call_count
self.assertEqual(1, extend_count)
attach_count = (
self.instance.nova_client.volumes.create_server_volume.call_count)
self.assertEqual(1, attach_count)
self.assertEqual(1, self.instance.guest.resize_fs.call_count)
self.assertEqual(1, self.instance.guest.mount_volume.call_count)
self.assertEqual(1, self.instance.restart.call_count)
self.instance.reset_mock()
def test_resize_volume_server_error_fails(self):
server = Mock(status=InstanceStatus.ERROR)
self.instance.attach_mock(server, 'server')
self.assertRaises(TroveError, self.action.execute)
self.instance.reset_mock()