Make the block device mapping retries configurable

When booting instances passing in block-device and increasing the
volume size, instances can go into an error state if the volume takes
longer to create than the hard-coded value (max_tries(180)/wait_between(1))
set in nova/compute/manager.py

def _await_block_device_map_created(self,
                                    context,
                                    vol_id,
                                    max_tries=180,
                                    wait_between=1):

To fix this, max_tries/wait_between should be made configurable.
Looking through the different releases, Grizzly was 30, Havana was
60, Icehouse is 180.

This change adds two configuration options:
a)  `block_device_allocate_retries` which can be set in nova.conf
by the user to configure the number of block device mapping retries.
It defaults to 60 and replaces the max_tries argument in the above method.
b) `block_device_allocate_retries_interval` which allows the user
to specify the time interval between consecutive retries. It defaults to 3
and replaces wait_between argument in the above method.

DocImpact
Closes-Bug: #1332382
Change-Id: I16e4cd1a572bc5c2cd91fc94be85e72f576a8c26
This commit is contained in:
Akash Gangil
2014-07-10 15:10:33 -07:00
parent 13d01dc16b
commit 66721eb2c0
2 changed files with 24 additions and 13 deletions

View File

@@ -117,6 +117,10 @@ compute_opts = [
cfg.IntOpt('network_allocate_retries',
default=0,
help="Number of times to retry network allocation on failures"),
cfg.IntOpt('block_device_allocate_retries',
default=60,
help='Number of times to retry block device'
' allocation on failures')
]
interval_opts = [
@@ -159,7 +163,11 @@ interval_opts = [
cfg.IntOpt('instance_delete_interval',
default=300,
help=('Interval in seconds for retrying failed instance file '
'deletes'))
'deletes')),
cfg.IntOpt('block_device_allocate_retries_interval',
default=3,
help='Waiting time interval (seconds) between block'
' device allocation retries on failures')
]
timeout_opts = [
@@ -1143,24 +1151,21 @@ class ComputeManager(manager.Manager):
instance)
return network_info
def _await_block_device_map_created(self, context, vol_id, max_tries=180,
wait_between=1):
def _await_block_device_map_created(self, context, vol_id):
# TODO(yamahata): creating volume simultaneously
# reduces creation time?
# TODO(yamahata): eliminate dumb polling
# TODO(harlowja): make the max_tries configurable or dynamic?
attempts = 0
start = time.time()
while attempts < max_tries:
while attempts < CONF.block_device_allocate_retries:
volume = self.volume_api.get(context, vol_id)
volume_status = volume['status']
if volume_status not in ['creating', 'downloading']:
if volume_status != 'available':
LOG.warn(_("Volume id: %s finished being created but was"
" not set as 'available'"), vol_id)
# NOTE(harlowja): return how many attempts were tried
return attempts + 1
greenthread.sleep(wait_between)
greenthread.sleep(CONF.block_device_allocate_retries_interval)
attempts += 1
# NOTE(harlowja): Should only happen if we ran out of attempts
raise exception.VolumeNotCreated(volume_id=vol_id,

View File

@@ -34,6 +34,8 @@ from oslo import messaging
import six
from testtools import matchers as testtools_matchers
from eventlet import greenthread
import nova
from nova import availability_zones
from nova import block_device
@@ -381,6 +383,8 @@ class ComputeVolumeTestCase(BaseTestCase):
lambda *a, **kw: None)
self.stubs.Set(self.compute.volume_api, 'check_attach',
lambda *a, **kw: None)
self.stubs.Set(greenthread, 'sleep',
lambda *a, **kw: None)
def store_cinfo(context, *args, **kwargs):
self.cinfo = jsonutils.loads(args[-1].get('connection_info'))
@@ -461,7 +465,9 @@ class ComputeVolumeTestCase(BaseTestCase):
mock_get_by_id.assert_called_once_with(self.context, 'fake')
self.assertTrue(mock_attach.called)
def test_await_block_device_created_to_slow(self):
def test_await_block_device_created_too_slow(self):
self.flags(block_device_allocate_retries=2)
self.flags(block_device_allocate_retries_interval=0.1)
def never_get(context, vol_id):
return {
@@ -472,13 +478,15 @@ class ComputeVolumeTestCase(BaseTestCase):
self.stubs.Set(self.compute.volume_api, 'get', never_get)
self.assertRaises(exception.VolumeNotCreated,
self.compute._await_block_device_map_created,
self.context, '1', max_tries=2, wait_between=0.1)
self.context, '1')
def test_await_block_device_created_slow(self):
c = self.compute
self.flags(block_device_allocate_retries=4)
self.flags(block_device_allocate_retries_interval=0.1)
def slow_get(context, vol_id):
while self.fetched_attempts < 2:
if self.fetched_attempts < 2:
self.fetched_attempts += 1
return {
'status': 'creating',
@@ -490,9 +498,7 @@ class ComputeVolumeTestCase(BaseTestCase):
}
self.stubs.Set(c.volume_api, 'get', slow_get)
attempts = c._await_block_device_map_created(self.context, '1',
max_tries=4,
wait_between=0.1)
attempts = c._await_block_device_map_created(self.context, '1')
self.assertEqual(attempts, 3)
def test_boot_volume_serial(self):