From b6e8d758a0e33547a3fd78c434337e80b0a826fa Mon Sep 17 00:00:00 2001
From: Ethan Lynn
Date: Tue, 13 Dec 2016 16:40:09 +0800
Subject: [PATCH] Remove retry logic from lock_acquire

There is no need to retry inside the lock module; the action just waits
for the engine to pick it up again. The workflow is:

  ActionProc -> action.execute() -> return RES_RETRY ->
  action.set_status -> ignore RES_RETRY and continue

Change-Id: Ic622a79b754131171cb940aa9f31ec5aef11ee47
Closes-Bug: #1648681
---
 senlin/engine/actions/base.py                 |   2 +
 senlin/engine/actions/cluster_action.py       |   3 +-
 senlin/engine/actions/node_action.py          |   2 +-
 senlin/engine/senlin_lock.py                  |  32 +----
 .../engine/actions/test_cluster_action.py     |   2 +-
 .../unit/engine/actions/test_node_action.py   |   2 +-
 senlin/tests/unit/engine/test_senlin_lock.py  | 135 ++++--------------
 7 files changed, 38 insertions(+), 140 deletions(-)

diff --git a/senlin/engine/actions/base.py b/senlin/engine/actions/base.py
index 8c648a085..2391ae80e 100644
--- a/senlin/engine/actions/base.py
+++ b/senlin/engine/actions/base.py
@@ -504,6 +504,8 @@ def ActionProc(context, action_id):
     try:
         # Step 2: execute the action
         result, reason = action.execute()
+        if result == action.RES_RETRY:
+            success = False
     except Exception as ex:
         # We catch exception here to make sure the following logics are
         # executed.
diff --git a/senlin/engine/actions/cluster_action.py b/senlin/engine/actions/cluster_action.py
index b53c0c760..a4dbb750c 100644
--- a/senlin/engine/actions/cluster_action.py
+++ b/senlin/engine/actions/cluster_action.py
@@ -945,8 +945,9 @@ class ClusterAction(base.Action):
                                               self.id, self.owner,
                                               senlin_lock.CLUSTER_SCOPE,
                                               forced)
+        # Failed to acquire lock, return RES_RETRY
         if not res:
-            return self.RES_ERROR, _('Failed in locking cluster.')
+            return self.RES_RETRY, _('Failed in locking cluster.')
 
         try:
             res, reason = self._execute(**kwargs)
diff --git a/senlin/engine/actions/node_action.py b/senlin/engine/actions/node_action.py
index d1443372e..fb9ef162d 100644
--- a/senlin/engine/actions/node_action.py
+++ b/senlin/engine/actions/node_action.py
@@ -235,7 +235,7 @@ class NodeAction(base.Action):
         res = senlin_lock.node_lock_acquire(self.context, self.entity.id,
                                             self.id, self.owner, False)
         if not res:
-            res = self.RES_ERROR
+            res = self.RES_RETRY
             reason = _('Failed in locking node')
         else:
             res, reason = self._execute()
diff --git a/senlin/engine/senlin_lock.py b/senlin/engine/senlin_lock.py
index dd2699b6f..6f9da6961 100644
--- a/senlin/engine/senlin_lock.py
+++ b/senlin/engine/senlin_lock.py
@@ -16,7 +16,6 @@ import time
 
 from senlin.common.i18n import _, _LE, _LI
 from senlin.common import utils
-from senlin.engine import scheduler
 from senlin.objects import action as ao
 from senlin.objects import cluster_lock as cl_obj
 from senlin.objects import node_lock as nl_obj
@@ -55,23 +54,12 @@ def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
     if action_id in owners:
         return True
 
-    # Step 2: retry using global configuration options
-    retries = cfg.CONF.lock_retry_times
-    retry_interval = cfg.CONF.lock_retry_interval
-
-    while retries > 0:
-        scheduler.sleep(retry_interval)
-        LOG.debug('Acquire lock for cluster %s again' % cluster_id)
-        owners = cl_obj.ClusterLock.acquire(cluster_id, action_id, scope)
-        if action_id in owners:
-            return True
-        retries = retries - 1
-
-    # Step 3: Last resort is 'forced locking', only needed when retry failed
+    # Step 2: Last resort is 'forced locking', only needed when retry failed
     if forced:
         owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
         return action_id in owners
 
+    # Step 3: check if the owner is a dead engine; if so, steal the lock.
     # Will reach here only because scope == CLUSTER_SCOPE
     action = ao.Action.get(context, owners[0])
     if (action and action.owner and action.owner != engine and
@@ -83,6 +71,7 @@ def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
         })
         reason = _('Engine died when executing this action.')
         owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
+        # Mark the old action as failed.
         ao.Action.mark_failed(context, action.id, time.time(), reason)
         return action_id in owners
 
@@ -121,23 +110,12 @@ def node_lock_acquire(context, node_id, action_id, engine=None,
     if action_id == owner:
         return True
 
-    # Step 2: retry using global configuration options
-    retries = cfg.CONF.lock_retry_times
-    retry_interval = cfg.CONF.lock_retry_interval
-
-    while retries > 0:
-        scheduler.sleep(retry_interval)
-        LOG.debug('Acquire lock for node %s again' % node_id)
-        owner = nl_obj.NodeLock.acquire(node_id, action_id)
-        if action_id == owner:
-            return True
-        retries = retries - 1
-
-    # Step 3: Last resort is 'forced locking', only needed when retry failed
+    # Step 2: Last resort is 'forced locking', only needed when retry failed
    if forced:
         owner = nl_obj.NodeLock.steal(node_id, action_id)
         return action_id == owner
 
+    # Step 3: Try to steal the lock if its owner is a dead engine.
     # if this node lock by dead engine
     action = ao.Action.get(context, owner)
     if (action and action.owner and action.owner != engine and
diff --git a/senlin/tests/unit/engine/actions/test_cluster_action.py b/senlin/tests/unit/engine/actions/test_cluster_action.py
index 8fbd42d46..115ed8aea 100644
--- a/senlin/tests/unit/engine/actions/test_cluster_action.py
+++ b/senlin/tests/unit/engine/actions/test_cluster_action.py
@@ -2655,7 +2655,7 @@ class ClusterActionTest(base.SenlinTestCase):
 
         res_code, res_msg = action.execute()
 
-        self.assertEqual(action.RES_ERROR, res_code)
+        self.assertEqual(action.RES_RETRY, res_code)
         self.assertEqual('Failed in locking cluster.', res_msg)
         mock_load.assert_called_once_with(action.context, cluster.id)
diff --git a/senlin/tests/unit/engine/actions/test_node_action.py b/senlin/tests/unit/engine/actions/test_node_action.py
index 07a88e854..85f073a37 100644
--- a/senlin/tests/unit/engine/actions/test_node_action.py
+++ b/senlin/tests/unit/engine/actions/test_node_action.py
@@ -651,7 +651,7 @@ class NodeActionTest(base.SenlinTestCase):
         res_code, res_msg = action.execute()
 
         reason = 'Failed in locking node'
-        self.assertEqual(action.RES_ERROR, res_code)
+        self.assertEqual(action.RES_RETRY, res_code)
         self.assertEqual(reason, res_msg)
         mock_load.assert_called_once_with(action.context, node_id='NODE_ID')
         mock_acquire.assert_called_once_with(self.ctx, 'FAKE_CLUSTER',
diff --git a/senlin/tests/unit/engine/test_senlin_lock.py b/senlin/tests/unit/engine/test_senlin_lock.py
index bd921dee1..319116dba 100644
--- a/senlin/tests/unit/engine/test_senlin_lock.py
+++ b/senlin/tests/unit/engine/test_senlin_lock.py
@@ -11,10 +11,8 @@
 # under the License.
 
 import mock
-from oslo_config import cfg
 
 from senlin.common import utils as common_utils
-from senlin.engine import scheduler
 from senlin.engine import senlin_lock as lockm
 from senlin.objects import action as ao
 from senlin.objects import cluster_lock as clo
@@ -44,111 +42,67 @@ class SenlinLockTest(base.SenlinTestCase):
                                              lockm.CLUSTER_SCOPE)
 
     @mock.patch.object(common_utils, 'is_engine_dead')
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(ao.Action, 'mark_failed')
     @mock.patch.object(clo.ClusterLock, "acquire")
     @mock.patch.object(clo.ClusterLock, "steal")
     def test_cluster_lock_acquire_dead_owner(self, mock_steal, mock_acquire,
-                                             mock_action_fail, mock_sleep,
-                                             mock_dead):
+                                             mock_action_fail, mock_dead):
         mock_dead.return_value = True
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC',
-                                    'ACTION_ABC', 'ACTION_ABC']
+        mock_acquire.side_effect = ['ACTION_ABC']
         mock_steal.side_effect = ['ACTION_XYZ']
 
         res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ',
                                          'NEW_ENGINE')
 
         self.assertTrue(res)
-        self.assertEqual(4, mock_acquire.call_count)
-        self.assertEqual(3, mock_sleep.call_count)
+        mock_acquire.assert_called_once_with("CLUSTER_A", "ACTION_XYZ",
+                                             lockm.CLUSTER_SCOPE)
         mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XYZ')
         mock_action_fail.assert_called_once_with(
             self.ctx, 'ACTION_ABC', mock.ANY,
             'Engine died when executing this action.')
 
-    @mock.patch.object(scheduler, 'sleep')
-    @mock.patch.object(clo.ClusterLock, "acquire")
-    def test_cluster_lock_acquire_with_retry(self, mock_acquire, mock_sleep):
-        cfg.CONF.set_override('lock_retry_times', 5, enforce_type=True)
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ']
-
-        res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ')
-
-        self.assertTrue(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        acquire_calls = [
-            mock.call('CLUSTER_A', 'ACTION_XYZ', lockm.CLUSTER_SCOPE)
-        ]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
-
     @mock.patch.object(common_utils, 'is_engine_dead')
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(clo.ClusterLock, "acquire")
-    def test_cluster_lock_acquire_max_retries(self, mock_acquire, mock_sleep,
-                                              mock_dead):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
+    def test_cluster_lock_acquire_failed(self, mock_acquire, mock_dead):
         mock_dead.return_value = False
-        mock_acquire.side_effect = [
-            'ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ'
-        ]
+        mock_acquire.return_value = 'ACTION_ABC'
 
         res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ')
 
         self.assertFalse(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [
-            mock.call('CLUSTER_A', 'ACTION_XYZ', lockm.CLUSTER_SCOPE)
-        ]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XYZ',
+                                             lockm.CLUSTER_SCOPE)
 
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(clo.ClusterLock, "acquire")
     @mock.patch.object(clo.ClusterLock, "steal")
-    def test_cluster_lock_acquire_forced(self, mock_steal, mock_acquire,
-                                         mock_sleep):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
+    def test_cluster_lock_acquire_forced(self, mock_steal, mock_acquire):
+        mock_acquire.side_effect = ['ACTION_ABC']
         mock_steal.return_value = ['ACTION_XY']
 
         res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XY',
                                          forced=True)
 
         self.assertTrue(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [
-            mock.call('CLUSTER_A', 'ACTION_XY', lockm.CLUSTER_SCOPE)
-        ]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XY',
+                                             lockm.CLUSTER_SCOPE)
         mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XY')
 
     @mock.patch.object(common_utils, 'is_engine_dead')
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(clo.ClusterLock, "acquire")
     @mock.patch.object(clo.ClusterLock, "steal")
     def test_cluster_lock_acquire_steal_failed(self, mock_steal, mock_acquire,
-                                               mock_sleep, mock_dead):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
+                                               mock_dead):
         mock_dead.return_value = False
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
+        mock_acquire.side_effect = ['ACTION_ABC']
         mock_steal.return_value = []
 
         res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XY',
                                          forced=True)
 
         self.assertFalse(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [
-            mock.call('CLUSTER_A', 'ACTION_XY', lockm.CLUSTER_SCOPE)
-        ]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XY',
+                                             lockm.CLUSTER_SCOPE)
         mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XY')
 
     @mock.patch.object(clo.ClusterLock, "release")
@@ -174,59 +128,33 @@ class SenlinLockTest(base.SenlinTestCase):
     def test_node_lock_acquire_dead_owner(self, mock_steal, mock_acquire,
                                           mock_action_fail, mock_dead):
         mock_dead.return_value = True
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC',
-                                    'ACTION_ABC', 'ACTION_ABC']
+        mock_acquire.side_effect = ['ACTION_ABC']
         mock_steal.return_value = 'ACTION_XYZ'
 
         res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ',
                                       'NEW_ENGINE')
 
         self.assertTrue(res)
-        self.assertEqual(4, mock_acquire.call_count)
+        mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XYZ')
         mock_steal.assert_called_once_with('NODE_A', 'ACTION_XYZ')
         mock_action_fail.assert_called_once_with(
             self.ctx, 'ACTION_ABC', mock.ANY,
             'Engine died when executing this action.')
 
-    @mock.patch.object(scheduler, 'sleep')
-    @mock.patch.object(nlo.NodeLock, "acquire")
-    def test_node_lock_acquire_with_retry(self, mock_acquire, mock_sleep):
-        cfg.CONF.set_override('lock_retry_times', 5, enforce_type=True)
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ']
-
-        res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ')
-        self.assertTrue(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        acquire_calls = [mock.call('NODE_A', 'ACTION_XYZ')]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
-
     @mock.patch.object(common_utils, 'is_engine_dead')
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(nlo.NodeLock, "acquire")
-    def test_node_lock_acquire_max_retries(self, mock_acquire, mock_sleep,
-                                           mock_dead):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
+    def test_node_lock_acquire_failed(self, mock_acquire, mock_dead):
         mock_dead.return_value = False
-        mock_acquire.side_effect = [
-            'ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ'
-        ]
+        mock_acquire.side_effect = ['ACTION_ABC']
 
         res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ')
 
         self.assertFalse(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [mock.call('NODE_A', 'ACTION_XYZ')]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XYZ')
 
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(nlo.NodeLock, "acquire")
     @mock.patch.object(nlo.NodeLock, "steal")
-    def test_node_lock_acquire_forced(self, mock_steal, mock_acquire,
-                                      mock_sleep):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
+    def test_node_lock_acquire_forced(self, mock_steal, mock_acquire):
         mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
         mock_steal.return_value = 'ACTION_XY'
 
@@ -234,33 +162,22 @@ class SenlinLockTest(base.SenlinTestCase):
                                       'ACTION_XY', forced=True)
 
         self.assertTrue(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [mock.call('NODE_A', 'ACTION_XY')]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XY')
         mock_steal.assert_called_once_with('NODE_A', 'ACTION_XY')
 
     @mock.patch.object(ao.Action, 'get')
-    @mock.patch.object(scheduler, 'sleep')
     @mock.patch.object(nlo.NodeLock, "acquire")
     @mock.patch.object(nlo.NodeLock, "steal")
     def test_node_lock_acquire_steal_failed(self, mock_steal, mock_acquire,
-                                            mock_sleep, mock_get):
-        cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
-        mock_get.return_value = mock.Mock(owner='ENGINE')
-        mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
+                                            mock_get):
+        mock_acquire.side_effect = ['ACTION_ABC']
         mock_steal.return_value = None
 
         res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XY',
                                       forced=True)
 
         self.assertFalse(res)
-        sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
-        mock_sleep.assert_has_calls(sleep_calls * 2)
-        self.assertEqual(2, mock_sleep.call_count)
-        acquire_calls = [mock.call('NODE_A', 'ACTION_XY')]
-        mock_acquire.assert_has_calls(acquire_calls * 3)
+        mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XY')
         mock_steal.assert_called_once_with('NODE_A', 'ACTION_XY')
 
     @mock.patch.object(nlo.NodeLock, "release")
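
Editor's note: to make the behavioral change easier to follow, here is a minimal, self-contained sketch of the control flow this patch moves to. It is not Senlin code; every name below (lock_acquire, execute_action, action_proc, the _locks dict) is illustrative only. The idea it demonstrates is the one in the diff: the lock module tries exactly once, steals the lock only when forced or when the holder's engine is dead, and a busy lock makes the caller return RES_RETRY so the engine can re-dispatch the action later instead of sleeping in a retry loop.

```python
# Illustrative sketch only -- simplified stand-ins for Senlin's lock logic,
# not the project's actual API.
RES_OK, RES_RETRY = 'OK', 'RETRY'

# toy lock table: cluster_id -> owning action_id
_locks = {}


def lock_acquire(cluster_id, action_id, forced=False, owner_is_dead=False):
    """Try to take the lock exactly once; no sleep/retry loop."""
    holder = _locks.setdefault(cluster_id, action_id)
    if holder == action_id:
        return True
    # Last resort: steal the lock when forced or when the holder's engine died.
    if forced or owner_is_dead:
        _locks[cluster_id] = action_id
        return True
    return False


def execute_action(cluster_id, action_id):
    """Caller returns RES_RETRY instead of blocking when the lock is busy."""
    if not lock_acquire(cluster_id, action_id):
        return RES_RETRY, 'Failed in locking cluster.'
    try:
        return RES_OK, 'done'
    finally:
        # release the lock we actually hold
        _locks.pop(cluster_id, None)


def action_proc(cluster_id, action_id):
    """Engine-side handling: RES_RETRY is simply 'not a success'; the action
    is left for the engine to pick up and dispatch again later."""
    result, reason = execute_action(cluster_id, action_id)
    success = result != RES_RETRY
    return success, reason


if __name__ == '__main__':
    _locks['C1'] = 'OTHER_ACTION'          # someone else holds the lock
    print(action_proc('C1', 'ACTION_A'))   # (False, 'Failed in locking cluster.')
    _locks.clear()
    print(action_proc('C1', 'ACTION_A'))   # (True, 'done')
```

The real patch keeps the forced and dead-engine stealing paths in senlin_lock.py unchanged; only the configurable sleep-and-retry loop is dropped, and the new `success = False` branch in ActionProc is what signals the engine to schedule the action again.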