Make the block device mapping retries configurable

When booting instances passing in block-device and increasing the
volume size, instances can go into an error state if the volume takes
longer to create than the hard-coded value (max_tries(180)/wait_between(1))
set in nova/compute/manager.py

def _await_block_device_map_created(self,
                                    context,
                                    vol_id,
                                    max_tries=180,
                                    wait_between=1):

To fix this, max_tries/wait_between should be made configurable.
Looking through the different releases, Grizzly was 30, Havana was
60, Icehouse is 180.

This change adds two configuration options:
a)  `block_device_allocate_retries` which can be set in nova.conf
by the user to configure the number of block device mapping retries.
It defaults to 60 and replaces the max_tries argument in the above method.
b) `block_device_allocate_retries_interval` which allows the user
to specify the time interval between consecutive retries. It defaults to 3
and replaces wait_between argument in the above method.

DocImpact
Closes-Bug: #1332382
Change-Id: I16e4cd1a572bc5c2cd91fc94be85e72f576a8c26
This commit is contained in:
Akash Gangil
2014-07-10 15:10:33 -07:00
parent 13d01dc16b
commit 66721eb2c0
2 changed files with 24 additions and 13 deletions

View File

@@ -117,6 +117,10 @@ compute_opts = [
cfg.IntOpt('network_allocate_retries',
default=0,
help="Number of times to retry network allocation on failures"),
cfg.IntOpt('block_device_allocate_retries',
default=60,
help='Number of times to retry block device'
' allocation on failures')
]
interval_opts = [
@@ -159,7 +163,11 @@ interval_opts = [
cfg.IntOpt('instance_delete_interval',
default=300,
help=('Interval in seconds for retrying failed instance file '
'deletes'))
'deletes')),
cfg.IntOpt('block_device_allocate_retries_interval',
default=3,
help='Waiting time interval (seconds) between block'
' device allocation retries on failures')
]
timeout_opts = [
@@ -1143,24 +1151,21 @@ class ComputeManager(manager.Manager):
instance)
return network_info
def _await_block_device_map_created(self, context, vol_id, max_tries=180,
wait_between=1):
def _await_block_device_map_created(self, context, vol_id):
# TODO(yamahata): creating volume simultaneously
# reduces creation time?
# TODO(yamahata): eliminate dumb polling
# TODO(harlowja): make the max_tries configurable or dynamic?
attempts = 0
start = time.time()
while attempts < max_tries:
while attempts < CONF.block_device_allocate_retries:
volume = self.volume_api.get(context, vol_id)
volume_status = volume['status']
if volume_status not in ['creating', 'downloading']:
if volume_status != 'available':
LOG.warn(_("Volume id: %s finished being created but was"
" not set as 'available'"), vol_id)
# NOTE(harlowja): return how many attempts were tried
return attempts + 1
greenthread.sleep(wait_between)
greenthread.sleep(CONF.block_device_allocate_retries_interval)
attempts += 1
# NOTE(harlowja): Should only happen if we ran out of attempts
raise exception.VolumeNotCreated(volume_id=vol_id,

View File

@@ -34,6 +34,8 @@ from oslo import messaging
import six
from testtools import matchers as testtools_matchers
from eventlet import greenthread
import nova
from nova import availability_zones
from nova import block_device
@@ -381,6 +383,8 @@ class ComputeVolumeTestCase(BaseTestCase):
lambda *a, **kw: None)
self.stubs.Set(self.compute.volume_api, 'check_attach',
lambda *a, **kw: None)
self.stubs.Set(greenthread, 'sleep',
lambda *a, **kw: None)
def store_cinfo(context, *args, **kwargs):
self.cinfo = jsonutils.loads(args[-1].get('connection_info'))
@@ -461,7 +465,9 @@ class ComputeVolumeTestCase(BaseTestCase):
mock_get_by_id.assert_called_once_with(self.context, 'fake')
self.assertTrue(mock_attach.called)
def test_await_block_device_created_to_slow(self):
def test_await_block_device_created_too_slow(self):
self.flags(block_device_allocate_retries=2)
self.flags(block_device_allocate_retries_interval=0.1)
def never_get(context, vol_id):
return {
@@ -472,13 +478,15 @@ class ComputeVolumeTestCase(BaseTestCase):
self.stubs.Set(self.compute.volume_api, 'get', never_get)
self.assertRaises(exception.VolumeNotCreated,
self.compute._await_block_device_map_created,
self.context, '1', max_tries=2, wait_between=0.1)
self.context, '1')
def test_await_block_device_created_slow(self):
c = self.compute
self.flags(block_device_allocate_retries=4)
self.flags(block_device_allocate_retries_interval=0.1)
def slow_get(context, vol_id):
while self.fetched_attempts < 2:
if self.fetched_attempts < 2:
self.fetched_attempts += 1
return {
'status': 'creating',
@@ -490,9 +498,7 @@ class ComputeVolumeTestCase(BaseTestCase):
}
self.stubs.Set(c.volume_api, 'get', slow_get)
attempts = c._await_block_device_map_created(self.context, '1',
max_tries=4,
wait_between=0.1)
attempts = c._await_block_device_map_created(self.context, '1')
self.assertEqual(attempts, 3)
def test_boot_volume_serial(self):