From 6b20239a5d293f55889cd1bffa59e4792c75edbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Kap=C5=82o=C5=84ski?= Date: Wed, 31 Aug 2016 20:28:36 +0000 Subject: [PATCH] Fix race condition bug during live_snapshot During live_snapshot creation, when nova starts block_rebase operation in libvirt there is possibility that block_job is not yet started and libvirt blockJobInfo method will return status.end = 0 and status.cur = 0. In such case libvirt driver does not wait to finish block rebase operation and that causes a problem because created snapshot is corrupted. This patch adds check if status.end != 0 to return information that job is already finished. Change-Id: I45ac06eae0b1949f746dae305469718649bfcf23 Closes-Bug: #1530275 --- nova/tests/unit/virt/libvirt/test_driver.py | 5 +++-- nova/tests/unit/virt/libvirt/test_guest.py | 13 ++++++++++++- nova/virt/libvirt/guest.py | 9 +++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index fd57192c2012..1709be407bef 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -14284,7 +14284,7 @@ class LibvirtConnTestCase(test.NoDBTestCase): mock_dom.XMLDesc.return_value = xmldoc mock_dom.isPersistent.return_value = True - mock_dom.blockJobInfo.return_value = {} + mock_dom.blockJobInfo.return_value = {'cur': 100, 'end': 100} drvr._swap_volume(guest, srcfile, dstfile, 1) @@ -17557,7 +17557,8 @@ class LibvirtVolumeSnapshotTestCase(test.NoDBTestCase): flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE) domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1, 'end': 1000}) - domain.blockJobInfo('vda', flags=0).AndReturn({}) + domain.blockJobInfo('vda', flags=0).AndReturn({'cur': 1000, + 'end': 1000}) self.mox.ReplayAll() diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index d039b72687e2..3a42cff2d1e4 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -622,7 +622,16 @@ class GuestBlockTestCase(test.NoDBTestCase): 'vda', "foo", "top", 0, flags=fakelibvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE) - def test_wait_for_job(self): + def test_wait_for_job_cur_end_zeros(self): + self.domain.blockJobInfo.return_value = { + "type": 4, + "bandwidth": 18, + "cur": 0, + "end": 0} + in_progress = self.gblock.wait_for_job() + self.assertTrue(in_progress) + + def test_wait_for_job_current_lower_than_end(self): self.domain.blockJobInfo.return_value = { "type": 4, "bandwidth": 18, @@ -631,6 +640,7 @@ class GuestBlockTestCase(test.NoDBTestCase): in_progress = self.gblock.wait_for_job() self.assertTrue(in_progress) + def test_wait_for_job_finished(self): self.domain.blockJobInfo.return_value = { "type": 4, "bandwidth": 18, @@ -639,6 +649,7 @@ class GuestBlockTestCase(test.NoDBTestCase): in_progress = self.gblock.wait_for_job() self.assertFalse(in_progress) + def test_wait_for_job_clean(self): self.domain.blockJobInfo.return_value = {"type": 0} in_progress = self.gblock.wait_for_job(wait_for_job_clean=True) self.assertFalse(in_progress) diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 5a70376cf227..fc837904a794 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -695,11 +695,13 @@ class BlockDevice(object): Libvirt may return either cur==end or an empty dict when the job is complete, depending on whether the job has been cleaned up by libvirt yet, or not. + It can also return end=0 if qemu has not yet started the block + operation. :param abort_on_error: Whether to stop process and raise NovaException on error (default: False) :param wait_for_job_clean: Whether to force wait to ensure job is - finished (see bug: LP#1119173) + finished (see bug: RH Bugzilla#1119173) :returns: True if still in progress False if completed @@ -714,7 +716,10 @@ class BlockDevice(object): if wait_for_job_clean: job_ended = status.job == 0 else: - job_ended = status.cur == status.end + # NOTE(slaweq): because of bug in libvirt, which is described in + # http://www.redhat.com/archives/libvir-list/2016-September/msg00017.html + # if status.end == 0 job is not started yet so it is not finished + job_ended = status.end != 0 and status.cur == status.end return not job_ended