Browse Source

Remove retry logic from lock_acquire

No need to retry, just wait for engine to pick action up again.

The workflow is:
ActionProc -> action.execute() -> return RES_RETRY
-> action.set_status -> ignore RES_RETRY and continue

Change-Id: Ic622a79b754131171cb940aa9f31ec5aef11ee47
Closes-Bug: #1648681
changes/95/410095/3
Ethan Lynn 4 years ago
parent
commit
b6e8d758a0
7 changed files with 39 additions and 141 deletions
  1. +2
    -0
      senlin/engine/actions/base.py
  2. +2
    -1
      senlin/engine/actions/cluster_action.py
  3. +1
    -1
      senlin/engine/actions/node_action.py
  4. +5
    -27
      senlin/engine/senlin_lock.py
  5. +1
    -1
      senlin/tests/unit/engine/actions/test_cluster_action.py
  6. +1
    -1
      senlin/tests/unit/engine/actions/test_node_action.py
  7. +27
    -110
      senlin/tests/unit/engine/test_senlin_lock.py

+ 2
- 0
senlin/engine/actions/base.py View File

@ -504,6 +504,8 @@ def ActionProc(context, action_id):
try:
# Step 2: execute the action
result, reason = action.execute()
if result == action.RES_RETRY:
success = False
except Exception as ex:
# We catch exception here to make sure the following logics are
# executed.


+ 2
- 1
senlin/engine/actions/cluster_action.py View File

@ -945,8 +945,9 @@ class ClusterAction(base.Action):
self.id, self.owner,
senlin_lock.CLUSTER_SCOPE,
forced)
# Failed to acquire lock, return RES_RETRY
if not res:
return self.RES_ERROR, _('Failed in locking cluster.')
return self.RES_RETRY, _('Failed in locking cluster.')
try:
res, reason = self._execute(**kwargs)


+ 1
- 1
senlin/engine/actions/node_action.py View File

@ -235,7 +235,7 @@ class NodeAction(base.Action):
res = senlin_lock.node_lock_acquire(self.context, self.entity.id,
self.id, self.owner, False)
if not res:
res = self.RES_ERROR
res = self.RES_RETRY
reason = _('Failed in locking node')
else:
res, reason = self._execute()


+ 5
- 27
senlin/engine/senlin_lock.py View File

@ -16,7 +16,6 @@ import time
from senlin.common.i18n import _, _LE, _LI
from senlin.common import utils
from senlin.engine import scheduler
from senlin.objects import action as ao
from senlin.objects import cluster_lock as cl_obj
from senlin.objects import node_lock as nl_obj
@ -55,23 +54,12 @@ def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
if action_id in owners:
return True
# Step 2: retry using global configuration options
retries = cfg.CONF.lock_retry_times
retry_interval = cfg.CONF.lock_retry_interval
while retries > 0:
scheduler.sleep(retry_interval)
LOG.debug('Acquire lock for cluster %s again' % cluster_id)
owners = cl_obj.ClusterLock.acquire(cluster_id, action_id, scope)
if action_id in owners:
return True
retries = retries - 1
# Step 3: Last resort is 'forced locking', only needed when retry failed
# Step 2: Last resort is 'forced locking', only needed when retry failed
if forced:
owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
return action_id in owners
# Step 3: check if the owner is a dead engine, if so, steal the lock.
# Will reach here only because scope == CLUSTER_SCOPE
action = ao.Action.get(context, owners[0])
if (action and action.owner and action.owner != engine and
@ -83,6 +71,7 @@ def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
})
reason = _('Engine died when executing this action.')
owners = cl_obj.ClusterLock.steal(cluster_id, action_id)
# Mark the old action to failed.
ao.Action.mark_failed(context, action.id, time.time(), reason)
return action_id in owners
@ -121,23 +110,12 @@ def node_lock_acquire(context, node_id, action_id, engine=None,
if action_id == owner:
return True
# Step 2: retry using global configuration options
retries = cfg.CONF.lock_retry_times
retry_interval = cfg.CONF.lock_retry_interval
while retries > 0:
scheduler.sleep(retry_interval)
LOG.debug('Acquire lock for node %s again' % node_id)
owner = nl_obj.NodeLock.acquire(node_id, action_id)
if action_id == owner:
return True
retries = retries - 1
# Step 3: Last resort is 'forced locking', only needed when retry failed
# Step 2: Last resort is 'forced locking', only needed when retry failed
if forced:
owner = nl_obj.NodeLock.steal(node_id, action_id)
return action_id == owner
# Step 3: Try to steal a lock if it's owner is a dead engine.
# if this node lock by dead engine
action = ao.Action.get(context, owner)
if (action and action.owner and action.owner != engine and


+ 1
- 1
senlin/tests/unit/engine/actions/test_cluster_action.py View File

@ -2655,7 +2655,7 @@ class ClusterActionTest(base.SenlinTestCase):
res_code, res_msg = action.execute()
self.assertEqual(action.RES_ERROR, res_code)
self.assertEqual(action.RES_RETRY, res_code)
self.assertEqual('Failed in locking cluster.', res_msg)
mock_load.assert_called_once_with(action.context, cluster.id)


+ 1
- 1
senlin/tests/unit/engine/actions/test_node_action.py View File

@ -651,7 +651,7 @@ class NodeActionTest(base.SenlinTestCase):
res_code, res_msg = action.execute()
reason = 'Failed in locking node'
self.assertEqual(action.RES_ERROR, res_code)
self.assertEqual(action.RES_RETRY, res_code)
self.assertEqual(reason, res_msg)
mock_load.assert_called_once_with(action.context, node_id='NODE_ID')
mock_acquire.assert_called_once_with(self.ctx, 'FAKE_CLUSTER',


+ 27
- 110
senlin/tests/unit/engine/test_senlin_lock.py View File

@ -11,10 +11,8 @@
# under the License.
import mock
from oslo_config import cfg
from senlin.common import utils as common_utils
from senlin.engine import scheduler
from senlin.engine import senlin_lock as lockm
from senlin.objects import action as ao
from senlin.objects import cluster_lock as clo
@ -44,111 +42,67 @@ class SenlinLockTest(base.SenlinTestCase):
lockm.CLUSTER_SCOPE)
@mock.patch.object(common_utils, 'is_engine_dead')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(ao.Action, 'mark_failed')
@mock.patch.object(clo.ClusterLock, "acquire")
@mock.patch.object(clo.ClusterLock, "steal")
def test_cluster_lock_acquire_dead_owner(self, mock_steal, mock_acquire,
mock_action_fail, mock_sleep,
mock_dead):
mock_action_fail, mock_dead):
mock_dead.return_value = True
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC',
'ACTION_ABC', 'ACTION_ABC']
mock_acquire.side_effect = ['ACTION_ABC']
mock_steal.side_effect = ['ACTION_XYZ']
res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ',
'NEW_ENGINE')
self.assertTrue(res)
self.assertEqual(4, mock_acquire.call_count)
self.assertEqual(3, mock_sleep.call_count)
mock_acquire.assert_called_once_with("CLUSTER_A", "ACTION_XYZ",
lockm.CLUSTER_SCOPE)
mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XYZ')
mock_action_fail.assert_called_once_with(
self.ctx, 'ACTION_ABC', mock.ANY,
'Engine died when executing this action.')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(clo.ClusterLock, "acquire")
def test_cluster_lock_acquire_with_retry(self, mock_acquire, mock_sleep):
cfg.CONF.set_override('lock_retry_times', 5, enforce_type=True)
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ']
res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ')
self.assertTrue(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
acquire_calls = [
mock.call('CLUSTER_A', 'ACTION_XYZ', lockm.CLUSTER_SCOPE)
]
mock_acquire.assert_has_calls(acquire_calls * 3)
@mock.patch.object(common_utils, 'is_engine_dead')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(clo.ClusterLock, "acquire")
def test_cluster_lock_acquire_max_retries(self, mock_acquire, mock_sleep,
mock_dead):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
def test_cluster_lock_acquire_failed(self, mock_acquire, mock_dead):
mock_dead.return_value = False
mock_acquire.side_effect = [
'ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ'
]
mock_acquire.return_value = 'ACTION_ABC'
res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A', 'ACTION_XYZ')
self.assertFalse(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [
mock.call('CLUSTER_A', 'ACTION_XYZ', lockm.CLUSTER_SCOPE)
]
mock_acquire.assert_has_calls(acquire_calls * 3)
@mock.patch.object(scheduler, 'sleep')
mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XYZ',
lockm.CLUSTER_SCOPE)
@mock.patch.object(clo.ClusterLock, "acquire")
@mock.patch.object(clo.ClusterLock, "steal")
def test_cluster_lock_acquire_forced(self, mock_steal, mock_acquire,
mock_sleep):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
def test_cluster_lock_acquire_forced(self, mock_steal, mock_acquire):
mock_acquire.side_effect = ['ACTION_ABC']
mock_steal.return_value = ['ACTION_XY']
res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A',
'ACTION_XY', forced=True)
self.assertTrue(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [
mock.call('CLUSTER_A', 'ACTION_XY', lockm.CLUSTER_SCOPE)
]
mock_acquire.assert_has_calls(acquire_calls * 3)
mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XY',
lockm.CLUSTER_SCOPE)
mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XY')
@mock.patch.object(common_utils, 'is_engine_dead')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(clo.ClusterLock, "acquire")
@mock.patch.object(clo.ClusterLock, "steal")
def test_cluster_lock_acquire_steal_failed(self, mock_steal, mock_acquire,
mock_sleep, mock_dead):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
mock_dead):
mock_dead.return_value = False
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
mock_acquire.side_effect = ['ACTION_ABC']
mock_steal.return_value = []
res = lockm.cluster_lock_acquire(self.ctx, 'CLUSTER_A',
'ACTION_XY', forced=True)
self.assertFalse(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [
mock.call('CLUSTER_A', 'ACTION_XY', lockm.CLUSTER_SCOPE)
]
mock_acquire.assert_has_calls(acquire_calls * 3)
mock_acquire.assert_called_once_with('CLUSTER_A', 'ACTION_XY',
lockm.CLUSTER_SCOPE)
mock_steal.assert_called_once_with('CLUSTER_A', 'ACTION_XY')
@mock.patch.object(clo.ClusterLock, "release")
@ -174,59 +128,33 @@ class SenlinLockTest(base.SenlinTestCase):
def test_node_lock_acquire_dead_owner(self, mock_steal, mock_acquire,
mock_action_fail, mock_dead):
mock_dead.return_value = True
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC',
'ACTION_ABC', 'ACTION_ABC']
mock_acquire.side_effect = ['ACTION_ABC']
mock_steal.return_value = 'ACTION_XYZ'
res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ',
'NEW_ENGINE')
self.assertTrue(res)
self.assertEqual(4, mock_acquire.call_count)
mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XYZ')
mock_steal.assert_called_once_with('NODE_A', 'ACTION_XYZ')
mock_action_fail.assert_called_once_with(
self.ctx, 'ACTION_ABC', mock.ANY,
'Engine died when executing this action.')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(nlo.NodeLock, "acquire")
def test_node_lock_acquire_with_retry(self, mock_acquire, mock_sleep):
cfg.CONF.set_override('lock_retry_times', 5, enforce_type=True)
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ']
res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ')
self.assertTrue(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
acquire_calls = [mock.call('NODE_A', 'ACTION_XYZ')]
mock_acquire.assert_has_calls(acquire_calls * 3)
@mock.patch.object(common_utils, 'is_engine_dead')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(nlo.NodeLock, "acquire")
def test_node_lock_acquire_max_retries(self, mock_acquire, mock_sleep,
mock_dead):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
def test_node_lock_acquire_failed(self, mock_acquire, mock_dead):
mock_dead.return_value = False
mock_acquire.side_effect = [
'ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC', 'ACTION_XYZ'
]
mock_acquire.side_effect = ['ACTION_ABC']
res = lockm.node_lock_acquire(self.ctx, 'NODE_A', 'ACTION_XYZ')
self.assertFalse(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [mock.call('NODE_A', 'ACTION_XYZ')]
mock_acquire.assert_has_calls(acquire_calls * 3)
mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XYZ')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(nlo.NodeLock, "acquire")
@mock.patch.object(nlo.NodeLock, "steal")
def test_node_lock_acquire_forced(self, mock_steal, mock_acquire,
mock_sleep):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
def test_node_lock_acquire_forced(self, mock_steal, mock_acquire):
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
mock_steal.return_value = 'ACTION_XY'
@ -234,33 +162,22 @@ class SenlinLockTest(base.SenlinTestCase):
'ACTION_XY', forced=True)
self.assertTrue(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [mock.call('NODE_A', 'ACTION_XY')]
mock_acquire.assert_has_calls(acquire_calls * 3)
mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XY')
mock_steal.assert_called_once_with('NODE_A', 'ACTION_XY')
@mock.patch.object(ao.Action, 'get')
@mock.patch.object(scheduler, 'sleep')
@mock.patch.object(nlo.NodeLock, "acquire")
@mock.patch.object(nlo.NodeLock, "steal")
def test_node_lock_acquire_steal_failed(self, mock_steal, mock_acquire,
mock_sleep, mock_get):
cfg.CONF.set_override('lock_retry_times', 2, enforce_type=True)
mock_get.return_value = mock.Mock(owner='ENGINE')
mock_acquire.side_effect = ['ACTION_ABC', 'ACTION_ABC', 'ACTION_ABC']
mock_get):
mock_acquire.side_effect = ['ACTION_ABC']
mock_steal.return_value = None
res = lockm.node_lock_acquire(self.ctx, 'NODE_A',
'ACTION_XY', forced=True)
self.assertFalse(res)
sleep_calls = [mock.call(cfg.CONF.lock_retry_interval)]
mock_sleep.assert_has_calls(sleep_calls * 2)
self.assertEqual(2, mock_sleep.call_count)
acquire_calls = [mock.call('NODE_A', 'ACTION_XY')]
mock_acquire.assert_has_calls(acquire_calls * 3)
mock_acquire.assert_called_once_with('NODE_A', 'ACTION_XY')
mock_steal.assert_called_once_with('NODE_A', 'ACTION_XY')
@mock.patch.object(nlo.NodeLock, "release")


Loading…
Cancel
Save