Make the block device mapping retries configurable
When booting instances passing in block-device and increasing the
volume size, instances can go into an error state if the volume takes
longer to create than the hard-coded value (max_tries(180)/wait_between(1))
set in nova/compute/manager.py
def _await_block_device_map_created(self,
context,
vol_id,
max_tries=180,
wait_between=1):
To fix this, max_tries/wait_between should be made configurable.
Looking through the different releases: Grizzly was 30, Havana was
60, and Icehouse is 180.
This change adds two configuration options:
a) `block_device_allocate_retries` which can be set in nova.conf
by the user to configure the number of block device mapping retries.
It defaults to 60 and replaces the max_tries argument in the above method.
b) `block_device_allocate_retries_interval` which allows the user
to specify the time interval between consecutive retries. It defaults to 3
and replaces wait_between argument in the above method.
DocImpact
Closes-Bug: #1332382
Change-Id: I16e4cd1a572bc5c2cd91fc94be85e72f576a8c26
This commit is contained in:
@@ -117,6 +117,10 @@ compute_opts = [
|
||||
cfg.IntOpt('network_allocate_retries',
|
||||
default=0,
|
||||
help="Number of times to retry network allocation on failures"),
|
||||
cfg.IntOpt('block_device_allocate_retries',
|
||||
default=60,
|
||||
help='Number of times to retry block device'
|
||||
' allocation on failures')
|
||||
]
|
||||
|
||||
interval_opts = [
|
||||
@@ -159,7 +163,11 @@ interval_opts = [
|
||||
cfg.IntOpt('instance_delete_interval',
|
||||
default=300,
|
||||
help=('Interval in seconds for retrying failed instance file '
|
||||
'deletes'))
|
||||
'deletes')),
|
||||
cfg.IntOpt('block_device_allocate_retries_interval',
|
||||
default=3,
|
||||
help='Waiting time interval (seconds) between block'
|
||||
' device allocation retries on failures')
|
||||
]
|
||||
|
||||
timeout_opts = [
|
||||
@@ -1143,24 +1151,21 @@ class ComputeManager(manager.Manager):
|
||||
instance)
|
||||
return network_info
|
||||
|
||||
def _await_block_device_map_created(self, context, vol_id, max_tries=180,
|
||||
wait_between=1):
|
||||
def _await_block_device_map_created(self, context, vol_id):
|
||||
# TODO(yamahata): creating volume simultaneously
|
||||
# reduces creation time?
|
||||
# TODO(yamahata): eliminate dumb polling
|
||||
# TODO(harlowja): make the max_tries configurable or dynamic?
|
||||
attempts = 0
|
||||
start = time.time()
|
||||
while attempts < max_tries:
|
||||
while attempts < CONF.block_device_allocate_retries:
|
||||
volume = self.volume_api.get(context, vol_id)
|
||||
volume_status = volume['status']
|
||||
if volume_status not in ['creating', 'downloading']:
|
||||
if volume_status != 'available':
|
||||
LOG.warn(_("Volume id: %s finished being created but was"
|
||||
" not set as 'available'"), vol_id)
|
||||
# NOTE(harlowja): return how many attempts were tried
|
||||
return attempts + 1
|
||||
greenthread.sleep(wait_between)
|
||||
greenthread.sleep(CONF.block_device_allocate_retries_interval)
|
||||
attempts += 1
|
||||
# NOTE(harlowja): Should only happen if we ran out of attempts
|
||||
raise exception.VolumeNotCreated(volume_id=vol_id,
|
||||
|
||||
@@ -34,6 +34,8 @@ from oslo import messaging
|
||||
import six
|
||||
from testtools import matchers as testtools_matchers
|
||||
|
||||
from eventlet import greenthread
|
||||
|
||||
import nova
|
||||
from nova import availability_zones
|
||||
from nova import block_device
|
||||
@@ -381,6 +383,8 @@ class ComputeVolumeTestCase(BaseTestCase):
|
||||
lambda *a, **kw: None)
|
||||
self.stubs.Set(self.compute.volume_api, 'check_attach',
|
||||
lambda *a, **kw: None)
|
||||
self.stubs.Set(greenthread, 'sleep',
|
||||
lambda *a, **kw: None)
|
||||
|
||||
def store_cinfo(context, *args, **kwargs):
|
||||
self.cinfo = jsonutils.loads(args[-1].get('connection_info'))
|
||||
@@ -461,7 +465,9 @@ class ComputeVolumeTestCase(BaseTestCase):
|
||||
mock_get_by_id.assert_called_once_with(self.context, 'fake')
|
||||
self.assertTrue(mock_attach.called)
|
||||
|
||||
def test_await_block_device_created_to_slow(self):
|
||||
def test_await_block_device_created_too_slow(self):
|
||||
self.flags(block_device_allocate_retries=2)
|
||||
self.flags(block_device_allocate_retries_interval=0.1)
|
||||
|
||||
def never_get(context, vol_id):
|
||||
return {
|
||||
@@ -472,13 +478,15 @@ class ComputeVolumeTestCase(BaseTestCase):
|
||||
self.stubs.Set(self.compute.volume_api, 'get', never_get)
|
||||
self.assertRaises(exception.VolumeNotCreated,
|
||||
self.compute._await_block_device_map_created,
|
||||
self.context, '1', max_tries=2, wait_between=0.1)
|
||||
self.context, '1')
|
||||
|
||||
def test_await_block_device_created_slow(self):
|
||||
c = self.compute
|
||||
self.flags(block_device_allocate_retries=4)
|
||||
self.flags(block_device_allocate_retries_interval=0.1)
|
||||
|
||||
def slow_get(context, vol_id):
|
||||
while self.fetched_attempts < 2:
|
||||
if self.fetched_attempts < 2:
|
||||
self.fetched_attempts += 1
|
||||
return {
|
||||
'status': 'creating',
|
||||
@@ -490,9 +498,7 @@ class ComputeVolumeTestCase(BaseTestCase):
|
||||
}
|
||||
|
||||
self.stubs.Set(c.volume_api, 'get', slow_get)
|
||||
attempts = c._await_block_device_map_created(self.context, '1',
|
||||
max_tries=4,
|
||||
wait_between=0.1)
|
||||
attempts = c._await_block_device_map_created(self.context, '1')
|
||||
self.assertEqual(attempts, 3)
|
||||
|
||||
def test_boot_volume_serial(self):
|
||||
|
||||
Reference in New Issue
Block a user