xenapi: reduce impact of errors during SR.scan
There are some race conditions in SR.scan leading to errors similar to: * The SR scan failed [opterr=['MAP_DUPLICATE_KEY', 'VDI', 'sm_config', '$UUID', 'vhd-blocks']] * The SR scan failed [opterr=['UUID_INVALID', 'VDI', '$UUID']] This seems to be happening because an SR.scan is happening when a xenapi plugin is modifying the SR at the same time. To reduce the likely-hood of this error, I have added a mutex around calls to SR.scan to ensure nova-compute never makes more than one call at once, because XenServer will join any extra SR.scan requests to listen to the result of any current SR.scan that may be running. In addition, when and error occurs during an SR scan, I have added a retry loop, to attempt to stop these races from impacting users. Fixes bug 1221293 Change-Id: Ib83e6e0f4f7fab2058fe137e32934606c4fe8f7e
This commit is contained in:
parent
97da68502a
commit
0632501eba
@ -18,7 +18,9 @@
|
||||
import contextlib
|
||||
import uuid
|
||||
|
||||
from eventlet import greenthread
|
||||
import fixtures
|
||||
import mock
|
||||
import mox
|
||||
from oslo.config import cfg
|
||||
|
||||
@ -1280,3 +1282,67 @@ class CreateKernelRamdiskTestCase(test.NoDBTestCase):
|
||||
result = vm_utils.create_kernel_and_ramdisk(self.context,
|
||||
self.session, self.instance, self.name_label)
|
||||
self.assertEqual(("k", None), result)
|
||||
|
||||
|
||||
class ScanSrTestCase(test.NoDBTestCase):
|
||||
@mock.patch.object(vm_utils, "_scan_sr")
|
||||
@mock.patch.object(vm_utils, "safe_find_sr")
|
||||
def test_scan_default_sr(self, mock_safe_find_sr, mock_scan_sr):
|
||||
mock_safe_find_sr.return_value = "sr_ref"
|
||||
|
||||
self.assertEqual("sr_ref", vm_utils.scan_default_sr("fake_session"))
|
||||
|
||||
mock_scan_sr.assert_called_once_with("fake_session", "sr_ref")
|
||||
|
||||
def test_scan_sr_works(self):
|
||||
session = mock.Mock()
|
||||
vm_utils._scan_sr(session, "sr_ref")
|
||||
session.call_xenapi.assert_called_once_with('SR.scan', "sr_ref")
|
||||
|
||||
def test_scan_sr_unknown_error_fails_once(self):
|
||||
session = mock.Mock()
|
||||
session.call_xenapi.side_effect = test.TestingException
|
||||
self.assertRaises(test.TestingException,
|
||||
vm_utils._scan_sr, session, "sr_ref")
|
||||
session.call_xenapi.assert_called_once_with('SR.scan', "sr_ref")
|
||||
|
||||
@mock.patch.object(greenthread, 'sleep')
|
||||
def test_scan_sr_known_error_retries_then_throws(self, mock_sleep):
|
||||
session = mock.Mock()
|
||||
|
||||
class FakeException(Exception):
|
||||
details = ['SR_BACKEND_FAILURE_40', "", "", ""]
|
||||
|
||||
session.XenAPI.Failure = FakeException
|
||||
session.call_xenapi.side_effect = FakeException
|
||||
|
||||
self.assertRaises(FakeException,
|
||||
vm_utils._scan_sr, session, "sr_ref")
|
||||
|
||||
session.call_xenapi.assert_called_with('SR.scan', "sr_ref")
|
||||
self.assertEqual(4, session.call_xenapi.call_count)
|
||||
mock_sleep.assert_has_calls([mock.call(2), mock.call(4), mock.call(8)])
|
||||
|
||||
@mock.patch.object(greenthread, 'sleep')
|
||||
def test_scan_sr_known_error_retries_then_succeeds(self, mock_sleep):
|
||||
session = mock.Mock()
|
||||
|
||||
class FakeException(Exception):
|
||||
details = ['SR_BACKEND_FAILURE_40', "", "", ""]
|
||||
|
||||
session.XenAPI.Failure = FakeException
|
||||
sr_scan_call_count = 0
|
||||
|
||||
def fake_call_xenapi(*args):
|
||||
fake_call_xenapi.count += 1
|
||||
if fake_call_xenapi.count != 2:
|
||||
raise FakeException()
|
||||
|
||||
fake_call_xenapi.count = 0
|
||||
session.call_xenapi.side_effect = fake_call_xenapi
|
||||
|
||||
vm_utils._scan_sr(session, "sr_ref")
|
||||
|
||||
session.call_xenapi.assert_called_with('SR.scan', "sr_ref")
|
||||
self.assertEqual(2, session.call_xenapi.call_count)
|
||||
mock_sleep.assert_called_once_with(2)
|
||||
|
@ -2163,7 +2163,7 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
|
||||
|
||||
def test_update_stats_caches_hostname(self):
|
||||
self.mox.StubOutWithMock(host, 'call_xenhost')
|
||||
self.mox.StubOutWithMock(vm_utils, 'safe_find_sr')
|
||||
self.mox.StubOutWithMock(vm_utils, 'scan_default_sr')
|
||||
self.mox.StubOutWithMock(self.conn._session, 'call_xenapi')
|
||||
data = {'disk_total': 0,
|
||||
'disk_used': 0,
|
||||
@ -2179,9 +2179,8 @@ class XenAPIHostTestCase(stubs.XenAPITestBase):
|
||||
|
||||
for i in range(3):
|
||||
host.call_xenhost(mox.IgnoreArg(), 'host_data', {}).AndReturn(data)
|
||||
vm_utils.safe_find_sr(self.conn._session).AndReturn(None)
|
||||
self.conn._session.call_xenapi('SR.scan', None)
|
||||
self.conn._session.call_xenapi('SR.get_record', None).AndReturn(
|
||||
vm_utils.scan_default_sr(self.conn._session).AndReturn("ref")
|
||||
self.conn._session.call_xenapi('SR.get_record', "ref").AndReturn(
|
||||
sr_rec)
|
||||
if i == 2:
|
||||
# On the third call (the second below) change the hostname
|
||||
|
@ -167,8 +167,7 @@ class HostState(object):
|
||||
LOG.debug(_("Updating host stats"))
|
||||
data = call_xenhost(self._session, "host_data", {})
|
||||
if data:
|
||||
sr_ref = vm_utils.safe_find_sr(self._session)
|
||||
self._session.call_xenapi("SR.scan", sr_ref)
|
||||
sr_ref = vm_utils.scan_default_sr(self._session)
|
||||
sr_rec = self._session.call_xenapi("SR.get_record", sr_ref)
|
||||
total = int(sr_rec["physical_size"])
|
||||
used = int(sr_rec["physical_utilisation"])
|
||||
|
@ -487,7 +487,7 @@ def get_vdi_uuid_for_volume(session, connection_data):
|
||||
vdi_uuid = None
|
||||
|
||||
if 'vdi_uuid' in connection_data:
|
||||
session.call_xenapi("SR.scan", sr_ref)
|
||||
_scan_sr(session, sr_ref)
|
||||
vdi_uuid = connection_data['vdi_uuid']
|
||||
else:
|
||||
try:
|
||||
@ -586,9 +586,7 @@ def _safe_copy_vdi(session, sr_ref, instance, vdi_to_copy_ref):
|
||||
|
||||
root_uuid = imported_vhds['root']['uuid']
|
||||
|
||||
# TODO(sirp): for safety, we should probably re-scan the SR after every
|
||||
# call to a dom0 plugin, since there is a possibility that the underlying
|
||||
# VHDs changed
|
||||
# rescan to discover new VHDs
|
||||
scan_default_sr(session)
|
||||
vdi_ref = session.call_xenapi('VDI.get_by_uuid', root_uuid)
|
||||
return vdi_ref
|
||||
@ -1258,8 +1256,8 @@ def _fetch_vhd_image(context, session, instance, image_id):
|
||||
vdis = default_handler.download_image(
|
||||
context, session, instance, image_id)
|
||||
|
||||
sr_ref = safe_find_sr(session)
|
||||
_scan_sr(session, sr_ref)
|
||||
# Ensure we can see the import VHDs as VDIs
|
||||
scan_default_sr(session)
|
||||
|
||||
try:
|
||||
_check_vdi_size(context, session, instance, vdis['root']['uuid'])
|
||||
@ -1628,16 +1626,39 @@ def fetch_bandwidth(session):
|
||||
return bw
|
||||
|
||||
|
||||
def _scan_sr(session, sr_ref=None):
|
||||
"""Scans the SR specified by sr_ref."""
|
||||
def _scan_sr(session, sr_ref=None, max_attempts=4):
|
||||
if sr_ref:
|
||||
LOG.debug(_("Re-scanning SR %s"), sr_ref)
|
||||
session.call_xenapi('SR.scan', sr_ref)
|
||||
# NOTE(johngarbutt) xenapi will collapse any duplicate requests
|
||||
# for SR.scan if there is already a scan in progress.
|
||||
# However, we don't want that, because the scan may have started
|
||||
# before we modified the underlying VHDs on disk through a plugin.
|
||||
# Using our own mutex will reduce cases where our periodic SR scan
|
||||
# in host.update_status starts racing the sr.scan after a plugin call.
|
||||
@utils.synchronized('sr-scan-' + sr_ref)
|
||||
def do_scan(sr_ref):
|
||||
LOG.debug(_("Scanning SR %s"), sr_ref)
|
||||
|
||||
attempt = 1
|
||||
while True:
|
||||
try:
|
||||
return session.call_xenapi('SR.scan', sr_ref)
|
||||
except session.XenAPI.Failure as exc:
|
||||
with excutils.save_and_reraise_exception() as ctxt:
|
||||
if exc.details[0] == 'SR_BACKEND_FAILURE_40':
|
||||
if attempt < max_attempts:
|
||||
ctxt.reraise = False
|
||||
LOG.warn(_("Retry SR scan due to error: %s")
|
||||
% exc)
|
||||
greenthread.sleep(2 ** attempt)
|
||||
attempt += 1
|
||||
do_scan(sr_ref)
|
||||
|
||||
|
||||
def scan_default_sr(session):
|
||||
"""Looks for the system default SR and triggers a re-scan."""
|
||||
_scan_sr(session, _find_sr(session))
|
||||
sr_ref = safe_find_sr(session)
|
||||
_scan_sr(session, sr_ref)
|
||||
return sr_ref
|
||||
|
||||
|
||||
def safe_find_sr(session):
|
||||
@ -1860,12 +1881,10 @@ def _wait_for_vhd_coalesce(session, instance, sr_ref, vdi_ref,
|
||||
base_uuid = _get_vhd_parent_uuid(session, parent_ref)
|
||||
return parent_uuid, base_uuid
|
||||
|
||||
# NOTE(sirp): This rescan is necessary to ensure the VM's `sm_config`
|
||||
# matches the underlying VHDs.
|
||||
_scan_sr(session, sr_ref)
|
||||
|
||||
max_attempts = CONF.xenapi_vhd_coalesce_max_attempts
|
||||
for i in xrange(max_attempts):
|
||||
# NOTE(sirp): This rescan is necessary to ensure the VM's `sm_config`
|
||||
# matches the underlying VHDs.
|
||||
_scan_sr(session, sr_ref)
|
||||
parent_uuid = _get_vhd_parent_uuid(session, vdi_ref)
|
||||
if parent_uuid and (parent_uuid != original_parent_uuid):
|
||||
|
Loading…
Reference in New Issue
Block a user