xenapi: reduce impact of errors during SR.scan

There are some race conditions in SR.scan leading to errors similar to:
* The SR scan failed [opterr=['MAP_DUPLICATE_KEY', 'VDI', 'sm_config',
  '$UUID', 'vhd-blocks']]
* The SR scan failed [opterr=['UUID_INVALID', 'VDI', '$UUID']]

This seems to be happening because an SR.scan is happening when a xenapi
plugin is modifying the SR at the same time.

To reduce the likelihood of this error, I have added a mutex around
calls to SR.scan to ensure nova-compute never makes more than one call
at once, because XenServer will join any extra SR.scan requests to
listen to the result of any current SR.scan that may be running.

In addition, when an error occurs during an SR scan, I have added a
retry loop to attempt to stop these races from impacting users.

Fixes bug 1221293
Change-Id: Ib83e6e0f4f7fab2058fe137e32934606c4fe8f7e
This commit is contained in:
John Garbutt 2013-09-16 15:55:05 +01:00
parent 97da68502a
commit 0632501eba
4 changed files with 104 additions and 21 deletions

View File

@ -18,7 +18,9 @@
import contextlib
import uuid
from eventlet import greenthread
import fixtures
import mock
import mox
from oslo.config import cfg
@ -1280,3 +1282,67 @@ class CreateKernelRamdiskTestCase(test.NoDBTestCase):
result = vm_utils.create_kernel_and_ramdisk(self.context,
self.session, self.instance, self.name_label)
self.assertEqual(("k", None), result)
class ScanSrTestCase(test.NoDBTestCase):
    """Tests for the SR-scan helpers in vm_utils.

    Covers the happy path, the no-retry path for unknown errors, and the
    exponential-backoff retry path for the known transient xenapi error
    SR_BACKEND_FAILURE_40.
    """

    @mock.patch.object(vm_utils, "_scan_sr")
    @mock.patch.object(vm_utils, "safe_find_sr")
    def test_scan_default_sr(self, mock_safe_find_sr, mock_scan_sr):
        # scan_default_sr() locates the default SR, scans it once, and
        # returns the SR reference it found.
        mock_safe_find_sr.return_value = "sr_ref"

        self.assertEqual("sr_ref", vm_utils.scan_default_sr("fake_session"))

        mock_scan_sr.assert_called_once_with("fake_session", "sr_ref")

    def test_scan_sr_works(self):
        # A successful scan issues exactly one SR.scan xenapi call.
        session = mock.Mock()
        vm_utils._scan_sr(session, "sr_ref")
        session.call_xenapi.assert_called_once_with('SR.scan', "sr_ref")

    def test_scan_sr_unknown_error_fails_once(self):
        # Errors other than SR_BACKEND_FAILURE_40 are not retried: the
        # exception propagates after a single SR.scan attempt.
        session = mock.Mock()
        session.call_xenapi.side_effect = test.TestingException
        self.assertRaises(test.TestingException,
                          vm_utils._scan_sr, session, "sr_ref")
        session.call_xenapi.assert_called_once_with('SR.scan', "sr_ref")

    @mock.patch.object(greenthread, 'sleep')
    def test_scan_sr_known_error_retries_then_throws(self, mock_sleep):
        # A scan that keeps failing with SR_BACKEND_FAILURE_40 is retried
        # with exponential backoff (2, 4, 8 seconds) and the failure is
        # re-raised after the default max_attempts (4) tries.
        session = mock.Mock()

        class FakeException(Exception):
            # Mimics session.XenAPI.Failure: details[0] carries the
            # xenapi error code.
            details = ['SR_BACKEND_FAILURE_40', "", "", ""]

        session.XenAPI.Failure = FakeException
        session.call_xenapi.side_effect = FakeException

        self.assertRaises(FakeException,
                          vm_utils._scan_sr, session, "sr_ref")

        session.call_xenapi.assert_called_with('SR.scan', "sr_ref")
        self.assertEqual(4, session.call_xenapi.call_count)
        mock_sleep.assert_has_calls([mock.call(2), mock.call(4), mock.call(8)])

    @mock.patch.object(greenthread, 'sleep')
    def test_scan_sr_known_error_retries_then_succeeds(self, mock_sleep):
        # A scan that fails once with SR_BACKEND_FAILURE_40 and then
        # succeeds sleeps exactly once and raises nothing.
        session = mock.Mock()

        class FakeException(Exception):
            details = ['SR_BACKEND_FAILURE_40', "", "", ""]

        session.XenAPI.Failure = FakeException
        # NOTE(review): unused — the call count is actually tracked on
        # fake_call_xenapi.count below.
        sr_scan_call_count = 0

        def fake_call_xenapi(*args):
            # Fail on every call except the second one.
            fake_call_xenapi.count += 1
            if fake_call_xenapi.count != 2:
                raise FakeException()

        fake_call_xenapi.count = 0
        session.call_xenapi.side_effect = fake_call_xenapi

        vm_utils._scan_sr(session, "sr_ref")

        session.call_xenapi.assert_called_with('SR.scan', "sr_ref")
        self.assertEqual(2, session.call_xenapi.call_count)
        mock_sleep.assert_called_once_with(2)

View File

@ -2163,7 +2163,7 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
def test_update_stats_caches_hostname(self):
self.mox.StubOutWithMock(host, 'call_xenhost')
self.mox.StubOutWithMock(vm_utils, 'safe_find_sr')
self.mox.StubOutWithMock(vm_utils, 'scan_default_sr')
self.mox.StubOutWithMock(self.conn._session, 'call_xenapi')
data = {'disk_total': 0,
'disk_used': 0,
@ -2179,9 +2179,8 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
for i in range(3):
host.call_xenhost(mox.IgnoreArg(), 'host_data', {}).AndReturn(data)
vm_utils.safe_find_sr(self.conn._session).AndReturn(None)
self.conn._session.call_xenapi('SR.scan', None)
self.conn._session.call_xenapi('SR.get_record', None).AndReturn(
vm_utils.scan_default_sr(self.conn._session).AndReturn("ref")
self.conn._session.call_xenapi('SR.get_record', "ref").AndReturn(
sr_rec)
if i == 2:
# On the third call (the second below) change the hostname

View File

@ -167,8 +167,7 @@ class HostState(object):
LOG.debug(_("Updating host stats"))
data = call_xenhost(self._session, "host_data", {})
if data:
sr_ref = vm_utils.safe_find_sr(self._session)
self._session.call_xenapi("SR.scan", sr_ref)
sr_ref = vm_utils.scan_default_sr(self._session)
sr_rec = self._session.call_xenapi("SR.get_record", sr_ref)
total = int(sr_rec["physical_size"])
used = int(sr_rec["physical_utilisation"])

View File

@ -487,7 +487,7 @@ def get_vdi_uuid_for_volume(session, connection_data):
vdi_uuid = None
if 'vdi_uuid' in connection_data:
session.call_xenapi("SR.scan", sr_ref)
_scan_sr(session, sr_ref)
vdi_uuid = connection_data['vdi_uuid']
else:
try:
@ -586,9 +586,7 @@ def _safe_copy_vdi(session, sr_ref, instance, vdi_to_copy_ref):
root_uuid = imported_vhds['root']['uuid']
# TODO(sirp): for safety, we should probably re-scan the SR after every
# call to a dom0 plugin, since there is a possibility that the underlying
# VHDs changed
# rescan to discover new VHDs
scan_default_sr(session)
vdi_ref = session.call_xenapi('VDI.get_by_uuid', root_uuid)
return vdi_ref
@ -1258,8 +1256,8 @@ def _fetch_vhd_image(context, session, instance, image_id):
vdis = default_handler.download_image(
context, session, instance, image_id)
sr_ref = safe_find_sr(session)
_scan_sr(session, sr_ref)
# Ensure we can see the import VHDs as VDIs
scan_default_sr(session)
try:
_check_vdi_size(context, session, instance, vdis['root']['uuid'])
@ -1628,16 +1626,39 @@ def fetch_bandwidth(session):
return bw
def _scan_sr(session, sr_ref=None, max_attempts=4):
    """Scan the SR specified by sr_ref, retrying a known transient error.

    :param session: xenapi session used to issue the SR.scan call
    :param sr_ref: reference of the SR to scan; the call is a no-op when
                   this is None
    :param max_attempts: total number of SR.scan attempts before an
                         SR_BACKEND_FAILURE_40 error is re-raised
    :returns: the result of the SR.scan xenapi call, or None when
              sr_ref is None
    :raises session.XenAPI.Failure: when the scan keeps failing with
            SR_BACKEND_FAILURE_40 after max_attempts tries, or fails
            with any other xenapi error (no retry in that case)
    """
    if sr_ref:
        # NOTE(johngarbutt) xenapi will collapse any duplicate requests
        # for SR.scan if there is already a scan in progress.
        # However, we don't want that, because the scan may have started
        # before we modified the underlying VHDs on disk through a plugin.
        # Using our own mutex will reduce cases where our periodic SR scan
        # in host.update_status starts racing the sr.scan after a plugin call.
        @utils.synchronized('sr-scan-' + sr_ref)
        def do_scan(sr_ref):
            LOG.debug(_("Scanning SR %s"), sr_ref)
            attempt = 1
            while True:
                try:
                    return session.call_xenapi('SR.scan', sr_ref)
                except session.XenAPI.Failure as exc:
                    with excutils.save_and_reraise_exception() as ctxt:
                        if exc.details[0] == 'SR_BACKEND_FAILURE_40':
                            if attempt < max_attempts:
                                # Suppress the re-raise and back off
                                # exponentially (2, 4, 8, ... seconds)
                                # before retrying the scan.
                                ctxt.reraise = False
                                LOG.warn(_("Retry SR scan due to error: %s")
                                         % exc)
                                greenthread.sleep(2 ** attempt)
                                attempt += 1
        # Propagate the SR.scan result instead of silently dropping it;
        # existing callers that ignored the (previously None) return
        # value are unaffected.
        return do_scan(sr_ref)
def scan_default_sr(session):
    """Look for the system default SR, trigger a re-scan and return it.

    :param session: xenapi session
    :returns: the reference of the default SR
    """
    sr_ref = safe_find_sr(session)
    _scan_sr(session, sr_ref)
    return sr_ref
def safe_find_sr(session):
@ -1860,12 +1881,10 @@ def _wait_for_vhd_coalesce(session, instance, sr_ref, vdi_ref,
base_uuid = _get_vhd_parent_uuid(session, parent_ref)
return parent_uuid, base_uuid
# NOTE(sirp): This rescan is necessary to ensure the VM's `sm_config`
# matches the underlying VHDs.
_scan_sr(session, sr_ref)
max_attempts = CONF.xenapi_vhd_coalesce_max_attempts
for i in xrange(max_attempts):
# NOTE(sirp): This rescan is necessary to ensure the VM's `sm_config`
# matches the underlying VHDs.
_scan_sr(session, sr_ref)
parent_uuid = _get_vhd_parent_uuid(session, vdi_ref)
if parent_uuid and (parent_uuid != original_parent_uuid):