libvirt: support management of downtime during migration

Currently live migration runs with the default maximum downtime
setting defined by QEMU. This is often inadequate to allow
migration of large VMs to ever complete. Rather than trying to
invent a new policy for changing downtime in OpenStack, copy
the existing logic that has been successfully battle-tested by
the oVirt project in VDSM.

Note that setting the downtime step delay based on guest RAM size
is an inexact science, as RAM size is only one factor influencing
the success of migration. Just as important is the rate at which
the guest dirties data, but that depends on the guest workload,
which Nova has no visibility into. The bottleneck is the network,
which needs to be able to keep up with the rate at which the
guest dirties data. The greater the overall RAM size, the more
time is required to transfer the total guest memory. So for
larger guests, we need to allow more time for migration to
attempt to complete before increasing the max downtime. Scaling
the downtime step delay according to the overall guest RAM size
is a reasonable, albeit not foolproof, way to tune migration and
increase its chances of success.
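
To give a rough sense of the scaling with the defaults introduced
by this change: the step delay defaults to 75 seconds per GiB of
guest RAM, so an 8 GiB guest would wait 8 * 75 = 600 seconds
between each increase of the downtime limit, while a guest at the
2 GiB lower bound would wait only 150 seconds.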

This adds three host-level config parameters which admins can
use to control the base downtime value and the rate at which
the downtime limit is allowed to increase during migration.
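
As a rough standalone sketch of the step calculation this change
introduces (illustrative only: the function name below is made up
for the example, the real implementation is
LibvirtDriver._migration_downtime_steps in the diff that follows,
and the defaults mirror the new nova.conf options):

    def downtime_steps(data_gb, max_downtime=500, steps=10, delay=75):
        # max_downtime mirrors live_migration_downtime (milliseconds),
        # steps mirrors live_migration_downtime_steps, and delay
        # mirrors live_migration_downtime_delay (seconds per GiB).
        delay = int(delay * data_gb)
        offset = max_downtime / float(steps + 1)
        base = (max_downtime - offset) ** (1 / float(steps))
        for i in range(steps + 1):
            # (elapsed seconds at which to apply, downtime in ms)
            yield (int(delay * i), int(offset + base ** i))

    # e.g. list(downtime_steps(3.0)) gives 11 (elapsed, downtime) pairs
    # spaced 225 seconds apart, rising from ~46ms towards the 500ms max.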

Related-bug: #1429220
DocImpact: three new libvirt configuration parameters in
           nova.conf allow the administrator to control
           the maximum permitted downtime for migration,
           making migration more likely to complete for
           large VMs.
Change-Id: I1992ffe9d3b2ff8d436cf1c419af9a238a8fecd8
Author: Daniel P. Berrange
Date:   2015-03-06 16:12:28 +00:00
commit 07c7e5caf2 (parent bccd178461)
3 changed files with 223 additions and 12 deletions

@@ -591,6 +591,9 @@ class Domain(object):
                 error_code=VIR_ERR_INTERNAL_ERROR,
                 error_domain=VIR_FROM_QEMU)

+    def migrateSetMaxDowntime(self, downtime):
+        pass
+
     def attachDevice(self, xml):
         disk_info = _parse_disk_info(etree.fromstring(xml))
         disk_info['_attached'] = True

@@ -6225,6 +6225,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
         self.assertFalse(mock_exist.called)
         self.assertFalse(mock_shutil.called)

+    @mock.patch.object(time, "time")
     @mock.patch.object(time, "sleep",
                        side_effect=lambda x: eventlet.sleep(0))
     @mock.patch.object(host.DomainJobInfo, "for_domain")
@@ -6232,11 +6233,13 @@
     @mock.patch.object(fakelibvirt.Connection, "_mark_running")
     def _test_live_migration_monitoring(self,
                                         job_info_records,
+                                        time_records,
                                         expect_success,
                                         mock_running,
                                         mock_save,
                                         mock_job_info,
-                                        mock_sleep):
+                                        mock_sleep,
+                                        mock_time):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
         dom = fakelibvirt.Domain(drvr._get_connection(), "<domain/>", True)
@@ -6245,17 +6248,29 @@
         def fake_job_info(hostself):
             while True:
                 self.assertTrue(len(job_info_records) > 0)
-                rec = job_info_records.pop()
+                rec = job_info_records.pop(0)

                 if type(rec) == str:
                     if rec == "thread-finish":
                         finish_event.send()
                     elif rec == "domain-stop":
                         dom.destroy()
                 else:
+                    if len(time_records) > 0:
+                        time_records.pop(0)
                     return rec

+        def fake_time():
+            if len(time_records) > 0:
+                return time_records[0]
+            else:
+                return int(
+                    datetime.datetime(2001, 1, 20, 20, 1, 0)
+                    .strftime('%s'))
+
         mock_job_info.side_effect = fake_job_info
+        mock_time.side_effect = fake_time

         dest = mock.sentinel.migrate_dest
         migrate_data = mock.sentinel.migrate_data
@@ -6299,7 +6314,7 @@
                 type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
         ]

-        self._test_live_migration_monitoring(domain_info_records, True)
+        self._test_live_migration_monitoring(domain_info_records, [], True)

     def test_live_migration_monitor_success_race(self):
         # A normalish sequence but we're too slow to see the
@@ -6319,7 +6334,7 @@
                 type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
         ]

-        self._test_live_migration_monitoring(domain_info_records, True)
+        self._test_live_migration_monitoring(domain_info_records, [], True)

     def test_live_migration_monitor_failed(self):
         # A failed sequence where we see all the expected events
@@ -6337,7 +6352,7 @@
                 type=fakelibvirt.VIR_DOMAIN_JOB_FAILED),
         ]

-        self._test_live_migration_monitoring(domain_info_records, False)
+        self._test_live_migration_monitoring(domain_info_records, [], False)

     def test_live_migration_monitor_failed_race(self):
         # A failed sequence where we are too slow to see the
@@ -6356,7 +6371,7 @@
                 type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
         ]

-        self._test_live_migration_monitoring(domain_info_records, False)
+        self._test_live_migration_monitoring(domain_info_records, [], False)

     def test_live_migration_monitor_cancelled(self):
         # A cancelled sequence where we see all the events
@@ -6375,7 +6390,78 @@
                 type=fakelibvirt.VIR_DOMAIN_JOB_CANCELLED),
         ]

-        self._test_live_migration_monitoring(domain_info_records, False)
+        self._test_live_migration_monitoring(domain_info_records, [], False)
+
+    @mock.patch.object(fakelibvirt.virDomain, "migrateSetMaxDowntime")
+    @mock.patch.object(libvirt_driver.LibvirtDriver,
+                       "_migration_downtime_steps")
+    def test_live_migration_monitor_downtime(self, mock_downtime_steps,
+                                             mock_set_downtime):
+        # We've setup 4 fake downtime steps - first value is the
+        # time delay, second is the downtime value
+        downtime_steps = [
+            (90, 10),
+            (180, 50),
+            (270, 200),
+            (500, 300),
+        ]
+        mock_downtime_steps.return_value = downtime_steps
+
+        # Each one of these fake times is used for time.time()
+        # when a new domain_info_records entry is consumed.
+        # Times are chosen so that only the first 3 downtime
+        # steps are needed.
+        fake_times = [0, 1, 30, 95, 150, 200, 300]
+
+        # A normal sequence where see all the normal job states
+        domain_info_records = [
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
+            "thread-finish",
+            "domain-stop",
+            host.DomainJobInfo(
+                type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
+        ]
+
+        self._test_live_migration_monitoring(domain_info_records,
+                                             fake_times, True)
+
+        mock_set_downtime.assert_has_calls([mock.call(10),
+                                            mock.call(50),
+                                            mock.call(200)])
+
+    def test_live_migration_downtime_steps(self):
+        self.flags(live_migration_downtime=400, group='libvirt')
+        self.flags(live_migration_downtime_steps=10, group='libvirt')
+        self.flags(live_migration_downtime_delay=30, group='libvirt')
+
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+
+        steps = drvr._migration_downtime_steps(3.0)
+
+        self.assertEqual([
+            (0, 37),
+            (90, 38),
+            (180, 39),
+            (270, 42),
+            (360, 46),
+            (450, 55),
+            (540, 70),
+            (630, 98),
+            (720, 148),
+            (810, 238),
+            (900, 400),
+        ], list(steps))
+
     @mock.patch.object(utils, "spawn")
     @mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor")

@@ -112,6 +112,13 @@ libvirt = None

 LOG = logging.getLogger(__name__)

+# Downtime period in milliseconds
+LIVE_MIGRATION_DOWNTIME_MIN = 100
+# Step count
+LIVE_MIGRATION_DOWNTIME_STEPS_MIN = 3
+# Delay in seconds
+LIVE_MIGRATION_DOWNTIME_DELAY_MIN = 10
+
 libvirt_opts = [
     cfg.StrOpt('rescue_image_id',
                help='Rescue ami image. This will not be used if an image id '
@@ -160,6 +167,23 @@ libvirt_opts = [
     cfg.IntOpt('live_migration_bandwidth',
                default=0,
                help='Maximum bandwidth to be used during migration, in Mbps'),
+    cfg.IntOpt('live_migration_downtime',
+               default=500,
+               help='Maximum permitted downtime, in milliseconds, for live '
+                    'migration switchover. Will be rounded up to a minimum '
+                    'of %dms. Use a large value if guest liveness is '
+                    'unimportant.' % LIVE_MIGRATION_DOWNTIME_MIN),
+    cfg.IntOpt('live_migration_downtime_steps',
+               default=10,
+               help='Number of incremental steps to reach max downtime value. '
+                    'Will be rounded up to a minimum of %d steps' %
+                    LIVE_MIGRATION_DOWNTIME_STEPS_MIN),
+    cfg.IntOpt('live_migration_downtime_delay',
+               default=75,
+               help='Time to wait, in seconds, between each step increase '
+                    'of the migration downtime. Minimum delay is %d seconds. '
+                    'Value is per GiB of guest RAM, with lower bound of a '
+                    'minimum of 2 GiB' % LIVE_MIGRATION_DOWNTIME_DELAY_MIN),
     cfg.StrOpt('snapshot_image_format',
                choices=('raw', 'qcow2', 'vmdk', 'vdi'),
                help='Snapshot image format. Defaults to same as source image'),
@@ -5583,10 +5607,85 @@ class LibvirtDriver(driver.ComputeDriver):
         LOG.debug("Migration operation thread has finished",
                   instance=instance)

+    @staticmethod
+    def _migration_downtime_steps(data_gb):
+        '''Calculate downtime value steps and time between increases.
+
+        :param data_gb: total GB of RAM and disk to transfer
+
+        This looks at the total downtime steps and upper bound
+        downtime value and uses an exponential backoff. So initially
+        max downtime is increased by small amounts, and as time goes
+        by it is increased by ever larger amounts
+
+        For example, with 10 steps, 30 second step delay, 3 GB
+        of RAM and 400ms target maximum downtime, the downtime will
+        be increased every 90 seconds in the following progression:
+
+        - 0 seconds -> set downtime to 37ms
+        - 90 seconds -> set downtime to 38ms
+        - 180 seconds -> set downtime to 39ms
+        - 270 seconds -> set downtime to 42ms
+        - 360 seconds -> set downtime to 46ms
+        - 450 seconds -> set downtime to 55ms
+        - 540 seconds -> set downtime to 70ms
+        - 630 seconds -> set downtime to 98ms
+        - 720 seconds -> set downtime to 148ms
+        - 810 seconds -> set downtime to 238ms
+        - 900 seconds -> set downtime to 400ms
+
+        This allows the guest a good chance to complete migration
+        with a small downtime value.
+        '''
+        downtime = CONF.libvirt.live_migration_downtime
+        steps = CONF.libvirt.live_migration_downtime_steps
+        delay = CONF.libvirt.live_migration_downtime_delay
+
+        if downtime < LIVE_MIGRATION_DOWNTIME_MIN:
+            downtime = LIVE_MIGRATION_DOWNTIME_MIN
+        if steps < LIVE_MIGRATION_DOWNTIME_STEPS_MIN:
+            steps = LIVE_MIGRATION_DOWNTIME_STEPS_MIN
+        if delay < LIVE_MIGRATION_DOWNTIME_DELAY_MIN:
+            delay = LIVE_MIGRATION_DOWNTIME_DELAY_MIN
+        delay = int(delay * data_gb)
+
+        offset = downtime / float(steps + 1)
+        base = (downtime - offset) ** (1 / float(steps))
+
+        for i in range(steps + 1):
+            yield (int(delay * i), int(offset + base ** i))
+
+    def _live_migration_data_gb(self, instance):
+        '''Calculate total amount of data to be transferred
+
+        :param instance: the nova.objects.Instance being migrated
+
+        Calculates the total amount of data that needs to be
+        transferred during the live migration. The actual
+        amount copied will be larger than this, due to the
+        guest OS continuing to dirty RAM while the migration
+        is taking place. So this value represents the minimal
+        data size possible.
+
+        :returns: data size to be copied in GB
+        '''
+
+        ram_gb = instance.flavor.memory_mb * units.Mi / units.Gi
+        if ram_gb < 2:
+            ram_gb = 2
+
+        # TODO(berrange) calculate size of any disks when doing
+        # a block migration
+
+        return ram_gb
+
     def _live_migration_monitor(self, context, instance, dest, post_method,
                                 recover_method, block_migration,
                                 migrate_data, dom, finish_event):
+        data_gb = self._live_migration_data_gb(instance)
+        downtime_steps = list(self._migration_downtime_steps(data_gb))
+
         n = 0
+        start = time.time()
         while True:
             info = host.DomainJobInfo.for_domain(dom)

@@ -5637,6 +5736,34 @@
                     LOG.debug("Migration not running yet",
                               instance=instance)
             elif info.type == libvirt.VIR_DOMAIN_JOB_UNBOUNDED:
+                # Migration is still running
+                #
+                # This is where we wire up calls to change live
+                # migration status. eg change max downtime, cancel
+                # the operation, change max bandwidth
+                now = time.time()
+                elapsed = now - start
+
+                # See if we need to increase the max downtime. We
+                # ignore failures, since we'd rather continue trying
+                # to migrate
+                if (len(downtime_steps) > 0 and
+                    elapsed > downtime_steps[0][0]):
+                    downtime = downtime_steps.pop(0)
+                    LOG.info(_LI("Increasing downtime to %(downtime)dms "
+                                 "after %(waittime)d sec elapsed time"),
+                             {"downtime": downtime[1],
+                              "waittime": downtime[0]},
+                             instance=instance)
+
+                    try:
+                        dom.migrateSetMaxDowntime(downtime[1])
+                    except libvirt.libvirtError as e:
+                        LOG.warn(
+                            _LW("Unable to increase max downtime to %(time)d"
+                                "ms: %(e)s"),
+                            {"time": downtime[1], "e": e}, instance=instance)
+
                 # We loop every 500ms, so don't log on every
                 # iteration to avoid spamming logs for long
                 # running migrations. Just once every 5 secs
@@ -5675,11 +5802,6 @@
                              "remaining_memory": info.memory_remaining,
                              "total_memory": info.memory_total}, instance=instance)

-                # Migration is still running
-                #
-                # This is where we'd wire up calls to change live
-                # migration status. eg change max downtime, cancel
-                # the operation, change max bandwidth
                 n = n + 1
             elif info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED:
                 # Migration is all done