Fix AZ not matching backend after migration

When we migrate, or retype with migration, a volume between availability
zones, we set the AZ of the migrated volume to the same AZ as the source
volume, so we end up with a volume whose AZ does not match the backend's
AZ.

This patch fixes this issue for the generic migration code as well as
the optimized driver migration.

Change-Id: Ia1bf3ae0b7cd6d209354024848aef6029179d8e4
Closes-Bug: #1747949
Gorka Eguileor 2018-02-07 17:07:47 +01:00
parent 28bb3e9b23
commit fb8085894b
4 changed files with 95 additions and 37 deletions
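
Both code paths in the diffs below boil down to the same idea: instead of
copying the source volume's availability_zone, look up the Service record
that runs the destination backend and take its AZ. The sketch below is only
an illustration of that pattern under assumed names; destination_az() and
its standalone layout are not the actual manager code (the real change adds
a VolumeManager._get_service() helper, shown in the first manager hunk, and
reads availability_zone from the service it returns).

# Illustrative sketch only: shows how the destination backend's AZ can be
# resolved from its Service record. "destination_az" is a hypothetical
# helper name, not part of the commit.
from cinder import context
from cinder import objects
from cinder.common import constants
from cinder.volume import utils as vol_utils


def destination_az(dest_host):
    """Return the availability zone of the destination backend's service."""
    ctxt = context.get_admin_context()
    # Strip pool information so we match the backend-level service host.
    svc_host = vol_utils.extract_host(dest_host, 'backend')
    service = objects.Service.get_by_args(ctxt, svc_host,
                                          constants.VOLUME_BINARY)
    return service.availability_zone


# On a successful migration the volume would then pick up that AZ, e.g.:
# updates = {'host': dest_host,
#            'availability_zone': destination_az(dest_host),
#            'migration_status': 'success'}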

@@ -483,6 +483,7 @@ def default_service_values():
         'topic': 'fake_topic',
         'report_count': 3,
         'disabled': False,
+        'availability_zone': 'nova',
     }

@@ -24,6 +24,7 @@ from oslo_concurrency import processutils
 from oslo_config import cfg
 from oslo_utils import imageutils
 
+from cinder.common import constants
 from cinder import context
 from cinder import db
 from cinder import exception
@@ -75,6 +76,9 @@ class VolumeMigrationTestCase(base.BaseVolumeTestCase):
                                        autospec=True)
         self._clear_patch.start()
         self.expected_status = 'available'
+        self._service = tests_utils.create_service(
+            self.context,
+            values={'host': 'newhost', 'binary': constants.VOLUME_BINARY})
 
     def tearDown(self):
         super(VolumeMigrationTestCase, self).tearDown()
@@ -99,6 +103,28 @@ class VolumeMigrationTestCase(base.BaseVolumeTestCase):
         self.assertEqual('newhost', volume.host)
         self.assertEqual('success', volume.migration_status)
 
+    def test_migrate_volume_driver_cross_az(self):
+        """Test volume migration done by driver."""
+        # Mock driver and rpc functions
+        self.mock_object(self.volume.driver, 'migrate_volume',
+                         lambda x, y, z, new_type_id=None: (
+                             True, {'user_id': fake.USER_ID}))
+        dst_az = 'AZ2'
+        db.service_update(self.context, self._service.id,
+                          {'availability_zone': dst_az})
+
+        volume = tests_utils.create_volume(self.context, size=0,
+                                           host=CONF.host,
+                                           migration_status='migrating')
+        host_obj = {'host': 'newhost', 'capabilities': {}}
+        self.volume.migrate_volume(self.context, volume, host_obj, False)
+
+        # check volume properties
+        volume.refresh()
+        self.assertEqual('newhost', volume.host)
+        self.assertEqual('success', volume.migration_status)
+        self.assertEqual(dst_az, volume.availability_zone)
+
     def _fake_create_volume(self, ctxt, volume, req_spec, filters,
                             allow_reschedule=True):
         return db.volume_update(ctxt, volume['id'],
@@ -154,6 +180,41 @@ class VolumeMigrationTestCase(base.BaseVolumeTestCase):
             self.context, volume, new_volume_obj, error=False)
         self.assertFalse(update_server_volume.called)
 
+    @mock.patch('cinder.compute.API')
+    @mock.patch('cinder.volume.manager.VolumeManager.'
+                'migrate_volume_completion')
+    def test_migrate_volume_generic_cross_az(self, migrate_volume_completion,
+                                             nova_api):
+        """Test that we set the right AZ in cross AZ migrations."""
+        original_create = objects.Volume.create
+        dst_az = 'AZ2'
+        db.service_update(self.context, self._service.id,
+                          {'availability_zone': dst_az})
+
+        def my_create(self, *args, **kwargs):
+            self.status = 'available'
+            original_create(self, *args, **kwargs)
+
+        volume = tests_utils.create_volume(self.context, size=1,
+                                           host=CONF.host)
+        host_obj = {'host': 'newhost', 'capabilities': {}}
+
+        create_vol = self.patch('cinder.objects.Volume.create',
+                                side_effect=my_create, autospec=True)
+
+        with mock.patch.object(self.volume, '_copy_volume_data') as copy_mock:
+            self.volume._migrate_volume_generic(self.context, volume, host_obj,
                                                 None)
+        copy_mock.assert_called_with(self.context, volume, mock.ANY,
+                                     remote='dest')
+        migrate_volume_completion.assert_called_with(
+            self.context, volume, mock.ANY, error=False)
+
+        nova_api.return_value.update_server_volume.assert_not_called()
+
+        self.assertEqual(dst_az,
+                         create_vol.call_args[0][0]['availability_zone'])
+
     @mock.patch('cinder.compute.API')
     @mock.patch('cinder.volume.manager.VolumeManager.'
                 'migrate_volume_completion')

@@ -198,6 +198,12 @@ class VolumeManager(manager.CleanableManager,
         'attach_status', 'migration_status', 'volume_type',
         'consistencygroup', 'volume_attachment', 'group'}
 
+    def _get_service(self, host=None, binary=constants.VOLUME_BINARY):
+        host = host or self.host
+        ctxt = context.get_admin_context()
+        svc_host = vol_utils.extract_host(host, 'backend')
+        return objects.Service.get_by_args(ctxt, svc_host, binary)
+
     def __init__(self, volume_driver=None, service_name=None,
                  *args, **kwargs):
         """Load the driver from the one specified in args, or from flags."""
@@ -227,12 +233,8 @@ class VolumeManager(manager.CleanableManager,
         # We pass the current setting for service.active_backend_id to
         # the driver on init, in case there was a restart or something
         curr_active_backend_id = None
-        svc_host = vol_utils.extract_host(self.host, 'backend')
         try:
-            service = objects.Service.get_by_args(
-                context.get_admin_context(),
-                svc_host,
-                constants.VOLUME_BINARY)
+            service = self._get_service()
         except exception.ServiceNotFound:
             # NOTE(jdg): This is to solve problems with unit tests
             LOG.info("Service not found for updating "
@@ -530,12 +532,8 @@ class VolumeManager(manager.CleanableManager,
             return
 
         stats = self.driver.get_volume_stats(refresh=True)
-        svc_host = vol_utils.extract_host(self.host, 'backend')
         try:
-            service = objects.Service.get_by_args(
-                context.get_admin_context(),
-                svc_host,
-                constants.VOLUME_BINARY)
+            service = self._get_service()
         except exception.ServiceNotFound:
             with excutils.save_and_reraise_exception():
                 LOG.error("Service not found for updating replication_status.")
@@ -2068,8 +2066,10 @@ class VolumeManager(manager.CleanableManager,
         # Create new volume on remote host
         tmp_skip = {'snapshot_id', 'source_volid'}
-        skip = self._VOLUME_CLONE_SKIP_PROPERTIES | tmp_skip | {'host',
-                                                                'cluster_name'}
+        skip = {'host', 'cluster_name', 'availability_zone'}
+        skip.update(tmp_skip)
+        skip.update(self._VOLUME_CLONE_SKIP_PROPERTIES)
         new_vol_values = {k: volume[k] for k in set(volume.keys()) - skip}
         if new_type_id:
             new_vol_values['volume_type_id'] = new_type_id
@@ -2079,9 +2079,11 @@ class VolumeManager(manager.CleanableManager,
                 ctxt, self.key_manager, new_type_id)
             new_vol_values['encryption_key_id'] = encryption_key_id
 
+        dst_service = self._get_service(backend['host'])
         new_volume = objects.Volume(
             context=ctxt,
             host=backend['host'],
+            availability_zone=dst_service.availability_zone,
             cluster_name=backend.get('cluster_name'),
             status='creating',
             attach_status=fields.VolumeAttachStatus.DETACHED,
@@ -2368,10 +2370,14 @@ class VolumeManager(manager.CleanableManager,
                                                                  volume,
                                                                  host)
                 if moved:
-                    updates = {'host': host['host'],
-                               'cluster_name': host.get('cluster_name'),
-                               'migration_status': 'success',
-                               'previous_status': volume.status}
+                    dst_service = self._get_service(host['host'])
+                    updates = {
+                        'host': host['host'],
+                        'cluster_name': host.get('cluster_name'),
+                        'migration_status': 'success',
+                        'availability_zone': dst_service.availability_zone,
+                        'previous_status': volume.status,
+                    }
                     if status_update:
                         updates.update(status_update)
                     if model_update:
@@ -2404,13 +2410,9 @@ class VolumeManager(manager.CleanableManager,
         # value isn't set (we didn't restart services), so we'll go ahead
        # and make this a part of the service periodic
         if not self.service_uuid:
-            svc_host = vol_utils.extract_host(self.host, 'backend')
             # We hack this with a try/except for unit tests temporarily
             try:
-                service = objects.Service.get_by_args(
-                    context,
-                    svc_host,
-                    constants.VOLUME_BINARY)
+                service = self._get_service()
                 self.service_uuid = service.uuid
             except exception.ServiceNotFound:
                 LOG.warning("Attempt to update service_uuid "
@@ -3994,9 +3996,7 @@ class VolumeManager(manager.CleanableManager,
         updates = {}
         repl_status = fields.ReplicationStatus
 
-        svc_host = vol_utils.extract_host(self.host, 'backend')
-        service = objects.Service.get_by_args(context, svc_host,
-                                              constants.VOLUME_BINARY)
+        service = self._get_service()
 
         # TODO(geguileo): We should optimize these updates by doing them
         # directly on the DB with just 3 queries, one to change the volumes
@@ -4168,9 +4168,7 @@ class VolumeManager(manager.CleanableManager,
         doing the failover of the volumes after finished processing the
         volumes.
         """
-        svc_host = vol_utils.extract_host(self.host, 'backend')
-        service = objects.Service.get_by_args(context, svc_host,
-                                              constants.VOLUME_BINARY)
+        service = self._get_service()
         service.update(updates)
         try:
             self.driver.failover_completed(context, service.active_backend_id)
@@ -4206,12 +4204,8 @@ class VolumeManager(manager.CleanableManager,
             LOG.warning('Error encountered on Cinder backend during '
                         'freeze operation, service is frozen, however '
                         'notification to driver has failed.')
-        svc_host = vol_utils.extract_host(self.host, 'backend')
 
-        service = objects.Service.get_by_args(
-            context,
-            svc_host,
-            constants.VOLUME_BINARY)
+        service = self._get_service()
         service.disabled = True
         service.disabled_reason = "frozen"
         service.save()
@@ -4239,12 +4233,8 @@ class VolumeManager(manager.CleanableManager,
             LOG.error('Error encountered on Cinder backend during '
                       'thaw operation, service will remain frozen.')
             return False
-        svc_host = vol_utils.extract_host(self.host, 'backend')
 
-        service = objects.Service.get_by_args(
-            context,
-            svc_host,
-            constants.VOLUME_BINARY)
+        service = self._get_service()
         service.disabled = False
         service.disabled_reason = ""
         service.save()

@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Resolve issue with cross AZ migrations and retypes where the destination
+    volume kept the source volume's AZ, so we ended up with a volume where the
+    AZ does not match the backend. (bug 1747949)