Adds retry function to HNAS driver
Multiple requests sent to HNAS can cause concurrency problems that end up as 'SSC failed connection' errors. This patch adds a retry decorator to _execute in the HDS HNAS Manila driver to fix this problem.

The current retry functionality in Manila always uses fixed values to define how long to wait before the next attempt. This can make the retries of multiple requests collide on every attempt, since they all wait the same amount of time and then try to use the same resource together again. So, additionally, this patch changes the behavior of manila.utils.retry() to accept a parameter that makes the function use randomly generated wait intervals.

Change-Id: Ib862f62517fcc8816781204b902119e9b20121e0
Closes-bug: 1491550
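For illustration only (not part of this change), a minimal sketch of how a backend method could opt into the new behavior; the class name and the interval/retries/backoff_rate values below are made up, only utils.retry() and exception.HNASConnException come from the patch:

    from manila import exception
    from manila import utils


    class FakeBackend(object):
        """Hypothetical backend, used only to illustrate the decorator."""

        # Retry on HNASConnException. With wait_random=True each attempt
        # sleeps for a randomly chosen interval instead of the fixed
        # exponential value, so concurrent callers are less likely to
        # retry at the same moment.
        @utils.retry(exception=exception.HNASConnException,
                     interval=1, retries=5, backoff_rate=2, wait_random=True)
        def _execute(self, commands):
            # Run the 'ssc' command over SSH and raise HNASConnException on
            # 'Failed to establish SSC connection' so the decorator retries.
            pass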
parent 6977b0196d
commit 8dc3863e10
@@ -659,6 +659,10 @@ class HNASBackendException(ManilaException):
     message = _("HNAS Backend Exception: %(msg)s")
 
 
+class HNASConnException(ManilaException):
+    message = _("HNAS Connection Exception: %(msg)s")
+
+
 # ConsistencyGroup
 class ConsistencyGroupNotFound(NotFound):
     message = _("ConsistencyGroup %(consistency_group_id)s could not be "
@@ -414,6 +414,7 @@ class HNASSSHBackend(object):
             msg = (_("Share %s was not created.") % share['id'])
             raise exception.HNASBackendException(msg=msg)
 
+    @mutils.retry(exception=exception.HNASConnException, wait_random=True)
     def _execute(self, commands):
         command = ['ssc', '127.0.0.1']
         if self.admin_ip0 is not None:
@@ -442,13 +443,19 @@ class HNASSSHBackend(object):
                                'out': out, 'err': err})
                 return out, err
             except processutils.ProcessExecutionError as e:
-                LOG.debug("Command %(cmd)s result: out = %(out)s - err = "
-                          "%(err)s - exit = %(exit)s.", {'cmd': e.cmd,
-                                                         'out': e.stdout,
-                                                         'err': e.stderr,
-                                                         'exit': e.exit_code})
-                LOG.error(_LE("Error running SSH command."))
-                raise
+                if 'Failed to establish SSC connection' in e.stderr:
+                    LOG.debug("SSC connection error!")
+                    msg = _("Failed to establish SSC connection.")
+                    raise exception.HNASConnException(msg=msg)
+                else:
+                    LOG.debug("Command %(cmd)s result: out = %(out)s - err = "
+                              "%(err)s - exit = %(exit)s.", {'cmd': e.cmd,
+                                                             'out': e.stdout,
+                                                             'err': e.stderr,
+                                                             'exit':
+                                                                 e.exit_code})
+                    LOG.error(_LE("Error running SSH command."))
+                    raise
 
     def _check_fs_mounted(self, fs_name):
         self._check_fs()
@@ -23,6 +23,7 @@ import paramiko
 from manila import exception
 from manila.share.drivers.hitachi import ssh
 from manila import test
+from manila import utils
 
 CONF = cfg.CONF
 
@@ -69,7 +70,6 @@ file_system 1055 fake_span Umount 2 4 5
 file_system2 1050 fake_span2 NoEVS - 10 0 1
 fake_fs 1051 fake_span Umount 2 100 1024 """
 
-
 HNAS_RESULT_one_fs = """ \
 Instance name Dev On span State EVS Cap/GiB Confined Flag
 ----------------- ---- ------- ------ --- ------- -------- ----
@@ -1170,6 +1170,31 @@ class HNASSSHTestCase(test.TestCase):
                                           port=self.port)
         self.assertIn('Request submitted successfully.', output)
 
+    def test__execute_retry(self):
+        commands = ['tree-clone-job-submit', '-e', '/src', '/dst']
+        concat_command = ('ssc --smuauth fake console-context --evs 2 '
+                          'tree-clone-job-submit -e /src /dst')
+        msg = 'Failed to establish SSC connection'
+
+        item_mock = mock.Mock()
+        self.mock_object(utils.pools.Pool, 'item',
+                         mock.Mock(return_value=item_mock))
+        setattr(item_mock, '__enter__', mock.Mock())
+        setattr(item_mock, '__exit__', mock.Mock())
+
+        self.mock_object(paramiko.SSHClient, 'connect')
+        # testing retrying 3 times
+        self.mock_object(putils, 'ssh_execute', mock.Mock(
+            side_effect=[putils.ProcessExecutionError(stderr=msg),
+                         putils.ProcessExecutionError(stderr=msg),
+                         putils.ProcessExecutionError(stderr=msg),
+                         (HNAS_RESULT_job, '')]))
+
+        self._driver._execute(commands)
+
+        putils.ssh_execute.assert_called_with(mock.ANY, concat_command,
+                                              check_exit_code=True)
+
     def test__execute_ssh_exception(self):
         key = self.ssh_private_key
         commands = ['tree-clone-job-submit', '-e', '/src', '/dst']
@@ -1177,7 +1202,8 @@
                           'tree-clone-job-submit -e /src /dst')
         self.mock_object(paramiko.SSHClient, 'connect')
         self.mock_object(putils, 'ssh_execute',
-                         mock.Mock(side_effect=putils.ProcessExecutionError))
+                         mock.Mock(side_effect=putils.ProcessExecutionError
+                                   (stderr='Error')))
 
         self.assertRaises(putils.ProcessExecutionError,
                           self._driver._execute, commands)
@@ -694,6 +694,49 @@ class TestRetryDecorator(test.TestCase):
             self.assertEqual('success', ret)
             self.assertEqual(1, self.counter)
 
+    def test_no_retry_required_random(self):
+        self.counter = 0
+
+        with mock.patch.object(time, 'sleep') as mock_sleep:
+            @utils.retry(exception.ManilaException,
+                         interval=2,
+                         retries=3,
+                         backoff_rate=2,
+                         wait_random=True)
+            def succeeds():
+                self.counter += 1
+                return 'success'
+
+            ret = succeeds()
+            self.assertFalse(mock_sleep.called)
+            self.assertEqual('success', ret)
+            self.assertEqual(1, self.counter)
+
+    def test_retries_once_random(self):
+        self.counter = 0
+        interval = 2
+        backoff_rate = 2
+        retries = 3
+
+        with mock.patch.object(time, 'sleep') as mock_sleep:
+            @utils.retry(exception.ManilaException,
+                         interval,
+                         retries,
+                         backoff_rate,
+                         wait_random=True)
+            def fails_once():
+                self.counter += 1
+                if self.counter < 2:
+                    raise exception.ManilaException(data='fake')
+                else:
+                    return 'success'
+
+            ret = fails_once()
+            self.assertEqual('success', ret)
+            self.assertEqual(2, self.counter)
+            self.assertEqual(1, mock_sleep.call_count)
+            self.assertTrue(mock_sleep.called)
+
     def test_retries_once(self):
         self.counter = 0
         interval = 2
@@ -22,6 +22,7 @@ import errno
 import inspect
 import os
 import pyclbr
+import random
 import re
 import shutil
 import socket
@@ -540,23 +541,25 @@ class ComparableMixin(object):
         return self._compare(other, lambda s, o: s != o)
 
 
-def retry(exception, interval=1, retries=10, backoff_rate=2):
+def retry(exception, interval=1, retries=10, backoff_rate=2,
+          wait_random=False):
     """A wrapper around retrying library.
 
-    This decorator allows to log and to check 'retries' input param.
-    Time interval between retries is calculated in the following way:
-    interval * backoff_rate ^ previous_attempt_number
+    This decorator allows to log and to check 'retries' input param.
+    Time interval between retries is calculated in the following way:
+    interval * backoff_rate ^ previous_attempt_number
 
-    :param exception: expected exception type. When wrapped function
-                      raises an exception of this type,the function
-                      execution is retried.
-    :param interval: param 'interval' is used to calculate time interval
-                     between retries:
+    :param exception: expected exception type. When wrapped function
+                      raises an exception of this type, the function
+                      execution is retried.
+    :param interval: param 'interval' is used to calculate time interval
+                     between retries:
                      interval * backoff_rate ^ previous_attempt_number
-    :param retries: number of retries.
-    :param backoff_rate: param 'backoff_rate' is used to calculate time
-                         interval between retries:
-                         interval * backoff_rate ^ previous_attempt_number
+    :param retries: number of retries
+    :param backoff_rate: param 'backoff_rate' is used to calculate time
+                         interval between retries:
+                         interval * backoff_rate ^ previous_attempt_number
+    :param wait_random: boolean value to enable retry with random wait timer.
 
     """
    def _retry_on_exception(e):
@@ -565,8 +568,14 @@ def retry(exception, interval=1, retries=10, backoff_rate=2,
     def _backoff_sleep(previous_attempt_number, delay_since_first_attempt_ms):
         exp = backoff_rate ** previous_attempt_number
         wait_for = max(0, interval * exp)
-        LOG.debug("Sleeping for %s seconds", wait_for)
-        return wait_for * 1000.0
+
+        if wait_random:
+            wait_val = random.randrange(interval * 1000.0, wait_for * 1000.0)
+        else:
+            wait_val = wait_for * 1000.0
+
+        LOG.debug("Sleeping for %s seconds.", (wait_val / 1000.0))
+        return wait_val
 
     def _print_stop(previous_attempt_number, delay_since_first_attempt_ms):
         delay_since_first_attempt = delay_since_first_attempt_ms / 1000.0
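To make the new wait calculation concrete, here is a small standalone sketch of the same arithmetic (the helper name below is made up; only the formula mirrors the patched _backoff_sleep()): with interval=1 and backoff_rate=2, the fixed path sleeps 2, 4, 8, ... seconds on successive attempts, so callers that failed together keep retrying together, while the random path draws each wait between the base interval and that exponential bound.

    import random


    def backoff_ms(previous_attempt_number, interval=1, backoff_rate=2,
                   wait_random=False):
        # Same formula as the patched _backoff_sleep(): exponential upper bound
        # in seconds, returned as milliseconds for the retrying library.
        wait_for = max(0, interval * backoff_rate ** previous_attempt_number)
        if wait_random:
            # Randomized wait: any value between the base interval and the bound.
            return random.randrange(int(interval * 1000), int(wait_for * 1000))
        # Fixed wait: always exactly the exponential bound.
        return wait_for * 1000.0


    # Third failed attempt with the defaults:
    # fixed wait -> 8000.0 ms; randomized wait -> somewhere in [1000, 8000) ms.
    print(backoff_ms(3))
    print(backoff_ms(3, wait_random=True))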