diff --git a/releasenotes/notes/victoria-support-online-resize.yaml b/releasenotes/notes/victoria-support-online-resize.yaml new file mode 100644 index 0000000000..f5218013cf --- /dev/null +++ b/releasenotes/notes/victoria-support-online-resize.yaml @@ -0,0 +1,7 @@ +--- +features: + - Trove now supports to resize volume without downtime. To use this feature, + the version of Nova and Cinder needs to be at least Pike, the config option + ``cinder_service_type`` needs to be set to ``volumev3``. The cloud admin + can disable this feature by setting ``online_volume_resize=False``, default + is enabled. \ No newline at end of file diff --git a/trove/common/cfg.py b/trove/common/cfg.py index 2be700561e..ed099961ba 100644 --- a/trove/common/cfg.py +++ b/trove/common/cfg.py @@ -93,7 +93,7 @@ common_opts = [ cfg.BoolOpt('neutron_api_insecure', default=False, help="Allow to perform insecure SSL requests to neutron."), cfg.URIOpt('cinder_url', help='URL without the tenant segment.'), - cfg.StrOpt('cinder_service_type', default='volumev2', + cfg.StrOpt('cinder_service_type', default='volumev3', help='Service type to use when searching catalog.'), cfg.StrOpt('cinder_endpoint_type', default='publicURL', help='Service endpoint type to use when searching catalog.'), @@ -475,7 +475,10 @@ common_opts = [ help='The docker image used for backup and restore.'), cfg.ListOpt('reserved_network_cidrs', default=[], help='Network CIDRs reserved for Trove guest instance ' - 'management.') + 'management.'), + cfg.BoolOpt( + 'online_volume_resize', default=True, + help='If online volume resize is supported.') ] diff --git a/trove/common/clients_admin.py b/trove/common/clients_admin.py index 9f89d97fee..af0bb1fb97 100644 --- a/trove/common/clients_admin.py +++ b/trove/common/clients_admin.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cinderclient.v2 import client as CinderClient +from cinderclient import client as CinderClient import glanceclient from keystoneauth1 import loading from keystoneauth1 import session @@ -95,13 +95,17 @@ def cinder_client_trove_admin(context, region_name=None): LOG.debug('Re-use admin cinder client') return ADMIN_CINDER_CLIENT + version = CONF.cinder_service_type.split('v')[-1] or '3' + ks_session = get_keystone_session() ADMIN_CINDER_CLIENT = CinderClient.Client( + version, session=ks_session, service_type=CONF.cinder_service_type, region_name=region_name or CONF.service_credentials.region_name, insecure=CONF.cinder_api_insecure, - endpoint_type=CONF.cinder_endpoint_type) + endpoint_type=CONF.cinder_endpoint_type, + additional_headers={'OpenStack-API-Version': 'volumev3 latest'}) if CONF.cinder_url and CONF.service_credentials.project_id: ADMIN_CINDER_CLIENT.client.management_url = "%s/%s/" % ( diff --git a/trove/common/notification.py b/trove/common/notification.py index c5fa57e27b..a050e0a787 100644 --- a/trove/common/notification.py +++ b/trove/common/notification.py @@ -205,23 +205,6 @@ class TroveInstanceCreate(TroveCommonTraits): super(TroveInstanceCreate, self).notify('create') -class TroveInstanceModifyVolume(TroveCommonTraits): - - ''' - Additional traits for trove.instance.create notifications that describe - instance action events - - This class should correspond to trove_instance_modify_volume in - ceilometer/event_definitions.yaml - ''' - - def __init__(self, **kwargs): - super(TroveInstanceModifyVolume, self).__init__(**kwargs) - - def notify(self): - super(TroveInstanceModifyVolume, self).notify('modify_volume') - - class TroveInstanceModifyFlavor(TroveCommonTraits): ''' diff --git a/trove/guestagent/api.py b/trove/guestagent/api.py index 67c3baa544..329980b48d 100644 --- a/trove/guestagent/api.py +++ b/trove/guestagent/api.py @@ -473,7 +473,7 @@ class API(object): self.agent_low_timeout, version=version, device_path=device_path, mount_point=mount_point) - def resize_fs(self, device_path=None, mount_point=None): + def resize_fs(self, device_path=None, mount_point=None, online=False): """Resize the filesystem.""" LOG.debug("Resize device %(device)s on instance %(id)s.", { 'device': device_path, 'id': self.id}) @@ -481,7 +481,8 @@ class API(object): self._call("resize_fs", self.agent_high_timeout, version=version, - device_path=device_path, mount_point=mount_point) + device_path=device_path, mount_point=mount_point, + online=online) def update_overrides(self, overrides, remove=False): """Update the overrides.""" diff --git a/trove/guestagent/datastore/manager.py b/trove/guestagent/datastore/manager.py index a7e126f124..f1bec02e07 100644 --- a/trove/guestagent/datastore/manager.py +++ b/trove/guestagent/datastore/manager.py @@ -364,10 +364,11 @@ class Manager(periodic_task.PeriodicTasks): device = volume.VolumeDevice(device_path) device.unmount(mount_point) - def resize_fs(self, context, device_path=None, mount_point=None): - LOG.debug("Resizing the filesystem at %s.", mount_point) + def resize_fs(self, context, device_path=None, mount_point=None, + online=False): + LOG.info(f"Resizing the filesystem at {mount_point}, online: {online}") device = volume.VolumeDevice(device_path) - device.resize_fs(mount_point) + device.resize_fs(mount_point, online=online) ############### # Configuration diff --git a/trove/guestagent/volume.py b/trove/guestagent/volume.py index 47b75726d6..0239607345 100644 --- a/trove/guestagent/volume.py +++ b/trove/guestagent/volume.py @@ -71,7 +71,7 @@ class FSBase(object): """ @abc.abstractmethod - def resize(self, device_path): + def resize(self, device_path, online=False): """ Resize the filesystem on device """ @@ -113,9 +113,10 @@ class FSExt(FSBase): exc_fmt = _("Volume '%s' was not formatted.") log_and_raise(log_fmt, exc_fmt, device_path) - def resize(self, device_path): - utils.execute("e2fsck", "-f", "-p", device_path, - run_as_root=True, root_helper="sudo") + def resize(self, device_path, online=False): + if not online: + utils.execute("e2fsck", "-f", "-p", device_path, + run_as_root=True, root_helper="sudo") utils.execute("resize2fs", device_path, run_as_root=True, root_helper="sudo") @@ -158,7 +159,7 @@ class FSXFS(FSBase): device_path) raise exception.GuestError(original_message=msg) - def resize(self, device_path): + def resize(self, device_path, online=False): utils.execute("xfs_repair", device_path, run_as_root=True, root_helper="sudo") utils.execute("mount", device_path, @@ -263,18 +264,18 @@ class VolumeDevice(object): return True - def resize_fs(self, mount_point): + def resize_fs(self, mount_point, online=False): """Resize the filesystem on the specified device.""" self._check_device_exists() # Some OS's will mount a file systems after it's attached if # an entry is put in the fstab file (like Trove does). # Thus it may be necessary to wait for the mount and then unmount # the fs again (since the volume was just attached). - if self._wait_for_mount(mount_point, timeout=2): + if not online and self._wait_for_mount(mount_point, timeout=2): LOG.debug("Unmounting '%s' before resizing.", mount_point) self.unmount(mount_point) try: - self.volume_fs.resize(self.device_path) + self.volume_fs.resize(self.device_path, online=online) except exception.ProcessExecutionError: log_fmt = "Error resizing the filesystem with device '%s'." exc_fmt = _("Error resizing the filesystem with device '%s'.") diff --git a/trove/taskmanager/models.py b/trove/taskmanager/models.py index 7adfb3377e..44e6577a38 100755 --- a/trove/taskmanager/models.py +++ b/trove/taskmanager/models.py @@ -17,7 +17,6 @@ import os.path import time import traceback -from cinderclient import exceptions as cinder_exceptions from eventlet import greenthread from eventlet.timeout import Timeout from oslo_log import log as logging @@ -55,7 +54,6 @@ from trove.common.notification import EndNotification from trove.common.notification import StartNotification from trove.common.notification import TroveInstanceCreate from trove.common.notification import TroveInstanceModifyFlavor -from trove.common.notification import TroveInstanceModifyVolume from trove.common.strategies.cluster import strategy from trove.common.utils import try_recover from trove.extensions.mysql import models as mysql_models @@ -1512,11 +1510,11 @@ class ResizeVolumeAction(object): return self.instance.device_path def _fail(self, orig_func): - LOG.exception("%(func)s encountered an error when " - "attempting to resize the volume for " - "instance %(id)s. Setting service " - "status to failed.", {'func': orig_func.__name__, - 'id': self.instance.id}) + LOG.error("%(func)s encountered an error when " + "attempting to resize the volume for " + "instance %(id)s. Setting service " + "status to failed.", {'func': orig_func.__name__, + 'id': self.instance.id}) service = InstanceServiceStatus.find_by(instance_id=self.instance.id) service.set_status(srvstatus.ServiceStatuses.FAILED) service.save() @@ -1539,12 +1537,12 @@ class ResizeVolumeAction(object): self.instance.restart() def _recover_full(self, orig_func): - LOG.exception("%(func)s encountered an error when attempting to " - "resize the volume for instance %(id)s. Trying to " - "recover by attaching and" - " mounting the volume and then restarting the " - "guest.", {'func': orig_func.__name__, - 'id': self.instance.id}) + LOG.error("%(func)s encountered an error when attempting to " + "resize the volume for instance %(id)s. Trying to " + "recover by attaching and" + " mounting the volume and then restarting the " + "guest.", {'func': orig_func.__name__, + 'id': self.instance.id}) self._attach_volume() self._mount_volume() self.instance.restart() @@ -1609,16 +1607,16 @@ class ResizeVolumeAction(object): 'id': self.instance.id}) @try_recover - def _resize_fs(self): - LOG.debug("Resizing the filesystem for instance %(id)s", { - 'id': self.instance.id}) + def _resize_fs(self, online=False): + LOG.info(f"Resizing the filesystem for instance {self.instance.id}, " + f"online: {online}") mount_point = self.get_mount_point() device_path = self.get_device_path() self.instance.guest.resize_fs(device_path=device_path, - mount_point=mount_point) - LOG.debug("Successfully resized volume %(vol_id)s filesystem for " - "instance %(id)s", {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) + mount_point=mount_point, + online=online) + LOG.debug(f"Successfully resized volume {self.instance.volume_id} " + f"filesystem for instance {self.instance.id}") @try_recover def _mount_volume(self): @@ -1634,10 +1632,8 @@ class ResizeVolumeAction(object): @try_recover def _extend(self): - LOG.debug("Extending volume %(vol_id)s for instance %(id)s to " - "size %(size)s", {'vol_id': self.instance.volume_id, - 'id': self.instance.id, - 'size': self.new_size}) + LOG.info(f"Calling Cinder to extend volume {self.instance.volume_id} " + f"for instance {self.instance.id} to size {self.new_size}") self.instance.volume_client.volumes.extend(self.instance.volume_id, self.new_size) LOG.debug("Successfully extended the volume %(vol_id)s for instance " @@ -1649,9 +1645,8 @@ class ResizeVolumeAction(object): volume = self.instance.volume_client.volumes.get( self.instance.volume_id) if not volume: - msg = (_('Failed to get volume %(vol_id)s') % { - 'vol_id': self.instance.volume_id}) - raise cinder_exceptions.ClientException(msg) + msg = f'Failed to get volume {self.instance.volume_id}' + raise exception.TroveError(msg) def volume_is_new_size(): volume = self.instance.volume_client.volumes.get( @@ -1659,34 +1654,46 @@ class ResizeVolumeAction(object): return volume.size == self.new_size utils.poll_until(volume_is_new_size, - sleep_time=2, + sleep_time=5, time_out=CONF.volume_time_out) self.instance.update_db(volume_size=self.new_size) except PollTimeOut: - LOG.exception("Timeout trying to extend the volume %(vol_id)s " - "for instance %(id)s", - {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) + LOG.error("Timeout trying to extend the volume %(vol_id)s " + "for instance %(id)s", + {'vol_id': self.instance.volume_id, + 'id': self.instance.id}) volume = self.instance.volume_client.volumes.get( self.instance.volume_id) if volume.status == 'extending': self._fail(self._verify_extend) elif volume.size != self.new_size: self.instance.update_db(volume_size=volume.size) - self._recover_full(self._verify_extend) + if not CONF.online_volume_resize: + self._recover_full(self._verify_extend) raise - except Exception: - LOG.exception("Error encountered trying to verify extend for " - "the volume %(vol_id)s for instance %(id)s", - {'vol_id': self.instance.volume_id, - 'id': self.instance.id}) - self._recover_full(self._verify_extend) + except Exception as e: + LOG.error("Error encountered trying to verify extend for " + "the volume %(vol_id)s for instance %(id)s, " + "error: %(error)s", + {'vol_id': self.instance.volume_id, + 'id': self.instance.id, + 'error': str(e)}) + if not CONF.online_volume_resize: + self._recover_full(self._verify_extend) raise def _resize_active_volume(self): - LOG.debug("Begin _resize_active_volume for id: %(id)s", { - 'id': self.instance.id}) + if CONF.online_volume_resize: + try: + self._extend() + except Exception as e: + LOG.error(f'Failed to extend volume, error: {str(e)}') + + self._verify_extend() + self._resize_fs(recover_func=self._fail, online=True) + return + self._stop_db() self._unmount_volume(recover_func=self._recover_restart) self._detach_volume(recover_func=self._recover_mount_restart) @@ -1694,11 +1701,9 @@ class ResizeVolumeAction(object): self._verify_extend() # if anything fails after this point, recovery is futile self._attach_volume(recover_func=self._fail) - self._resize_fs(recover_func=self._fail) + self._resize_fs(recover_func=self._fail, online=False) self._mount_volume(recover_func=self._fail) self.instance.restart() - LOG.debug("End _resize_active_volume for id: %(id)s", { - 'id': self.instance.id}) def execute(self): LOG.debug("%(gt)s: Resizing instance %(id)s volume for server " @@ -1711,19 +1716,11 @@ class ResizeVolumeAction(object): if self.instance.server.status in [InstanceStatus.ACTIVE, InstanceStatus.HEALTHY]: - self._resize_active_volume() - self.instance.reset_task_status() - # send usage event for size reported by cinder - volume = self.instance.volume_client.volumes.get( - self.instance.volume_id) - launched_time = timeutils.isotime(self.instance.updated) - modified_time = timeutils.isotime(self.instance.updated) - TroveInstanceModifyVolume(instance=self.instance, - old_volume_size=self.old_size, - launched_at=launched_time, - modify_at=modified_time, - volume_size=volume.size, - ).notify() + try: + self._resize_active_volume() + finally: + self.instance.reset_task_status() + else: self.instance.reset_task_status() msg = ( diff --git a/trove/tests/api/instances_actions.py b/trove/tests/api/instances_actions.py index 608e827e1c..4d8d2b25c3 100644 --- a/trove/tests/api/instances_actions.py +++ b/trove/tests/api/instances_actions.py @@ -546,7 +546,6 @@ class ResizeInstanceVolumeTest(ActionTestBase): self.new_volume_size) @test(depends_on=[test_volume_resize]) - @time_out(300) def test_volume_resize_success(self): """test_volume_resize_success""" @@ -559,7 +558,8 @@ class ResizeInstanceVolumeTest(ActionTestBase): else: asserts.fail("Status should not be %s" % instance.status) - poll_until(check_resize_status, sleep_time=2, time_out=300) + poll_until(check_resize_status, sleep_time=5, time_out=300, + initial_delay=5) instance = instance_info.dbaas.instances.get(instance_info.id) asserts.assert_equal(instance.volume['size'], self.new_volume_size) diff --git a/trove/tests/fakes/guestagent.py b/trove/tests/fakes/guestagent.py index a711058069..dc22bb536d 100644 --- a/trove/tests/fakes/guestagent.py +++ b/trove/tests/fakes/guestagent.py @@ -330,7 +330,7 @@ class FakeGuest(object): def unmount_volume(self, device_path=None, mount_point=None): pass - def resize_fs(self, device_path=None, mount_point=None): + def resize_fs(self, device_path=None, mount_point=None, online=False): pass def update_overrides(self, overrides, remove=False): diff --git a/trove/tests/unittests/common/test_notification.py b/trove/tests/unittests/common/test_notification.py index 3f27c71ec0..dd914a2386 100644 --- a/trove/tests/unittests/common/test_notification.py +++ b/trove/tests/unittests/common/test_notification.py @@ -13,17 +13,19 @@ # License for the specific language governing permissions and limitations # under the License. # -from unittest.mock import Mock, patch +from unittest.mock import Mock +from unittest.mock import patch from oslo_utils import timeutils +from trove import rpc from trove.common import cfg -from trove.common.context import TroveContext from trove.common import exception from trove.common import notification -from trove.common.notification import EndNotification, StartNotification +from trove.common.context import TroveContext +from trove.common.notification import EndNotification +from trove.common.notification import StartNotification from trove.conductor import api as conductor_api -from trove import rpc from trove.tests.unittests import trove_testtools @@ -227,30 +229,6 @@ class TestTroveInstanceDelete(trove_testtools.TestCase): self.assertTrue(notifier().info.called) -class TestTroveInstanceModifyVolume(trove_testtools.TestCase): - - def setUp(self): - super(TestTroveInstanceModifyVolume, self).setUp() - self.instance = Mock(db_info=Mock(created=timeutils.utcnow())) - - @patch.object(cfg.CONF, 'get', Mock()) - @patch.object(rpc, 'get_notifier') - def test_notification(self, notifier): - notification.TroveInstanceModifyVolume(instance=self.instance).notify() - self.assertTrue(notifier().info.called) - - @patch.object(cfg.CONF, 'get', Mock()) - @patch.object(rpc, 'get_notifier') - def test_notification_after_serialization(self, notifier): - orig_notify = notification.TroveInstanceModifyVolume( - instance=self.instance) - serialized = orig_notify.serialize(None) - new_notify = notification.TroveInstanceModifyVolume().deserialize( - None, serialized) - new_notify.notify() - self.assertTrue(notifier().info.called) - - class TestTroveInstanceModifyFlavor(trove_testtools.TestCase): def setUp(self): diff --git a/trove/tests/unittests/taskmanager/test_models.py b/trove/tests/unittests/taskmanager/test_models.py index e9dad0d626..d542b34bec 100644 --- a/trove/tests/unittests/taskmanager/test_models.py +++ b/trove/tests/unittests/taskmanager/test_models.py @@ -39,10 +39,10 @@ import trove.backup.models from trove.common import timeutils from trove.common import utils import trove.common.context +from trove.common import exception from trove.common.exception import GuestError from trove.common.exception import PollTimeOut from trove.common.exception import TroveError -from trove.common.notification import TroveInstanceModifyVolume import trove.common.template as template from trove.datastore import models as datastore_models import trove.db.models @@ -627,11 +627,10 @@ class ResizeVolumeTest(trove_testtools.TestCase): self.instance.volume_client.volumes.extend.side_effect = None self.instance.reset_mock() - @patch('trove.taskmanager.models.LOG') - def test_resize_volume_verify_extend_no_volume(self, mock_logging): + def test_resize_volume_verify_extend_no_volume(self): self.instance.volume_client.volumes.get = Mock( return_value=None) - self.assertRaises(cinder_exceptions.ClientException, + self.assertRaises(exception.TroveError, self.action._verify_extend) self.instance.reset_mock() @@ -643,29 +642,20 @@ class ResizeVolumeTest(trove_testtools.TestCase): utils.poll_until.side_effect = None self.instance.reset_mock() - @patch.object(TroveInstanceModifyVolume, 'notify') def test_resize_volume_active_server_succeeds(self, *args): server = Mock(status=InstanceStatus.ACTIVE) self.instance.attach_mock(server, 'server') + self.action.execute() - self.assertEqual(1, self.instance.guest.stop_db.call_count) - self.assertEqual(1, self.instance.guest.unmount_volume.call_count) - detach_count = ( - self.instance.nova_client.volumes.delete_server_volume.call_count) - self.assertEqual(1, detach_count) + extend_count = self.instance.volume_client.volumes.extend.call_count self.assertEqual(1, extend_count) - attach_count = ( - self.instance.nova_client.volumes.create_server_volume.call_count) - self.assertEqual(1, attach_count) - self.assertEqual(1, self.instance.guest.resize_fs.call_count) - self.assertEqual(1, self.instance.guest.mount_volume.call_count) - self.assertEqual(1, self.instance.restart.call_count) self.instance.reset_mock() def test_resize_volume_server_error_fails(self): server = Mock(status=InstanceStatus.ERROR) self.instance.attach_mock(server, 'server') + self.assertRaises(TroveError, self.action.execute) self.instance.reset_mock()