Retry on Conflict exceptions from Ironic

Retried is everything, except get and list calls and port creation
(for which Conflict is an expected exception).

Unit tests are added for the most critical cases.

Change-Id: I92675c3d72a2f598c4e0d1ae8b0a4a4aa0a2ed12
Closes-Bug: #1401222
This commit is contained in:
Dmitry Tantsur 2014-12-11 17:44:16 +01:00
parent 43b0769703
commit 16bbe0e3a7
6 changed files with 88 additions and 14 deletions

View File

@ -74,7 +74,7 @@ def _validate(ironic, node):
(node.uuid, power_state))
if not node.extra.get('ipmi_setup_credentials'):
validation = ironic.node.validate(node.uuid)
validation = utils.retry_on_conflict(ironic.node.validate, node.uuid)
if not validation.power['result']:
LOG.error('Failed validation of power interface for node %s, '
'reason: %s', node.uuid, validation.power['reason'])
@ -101,7 +101,8 @@ def _background_start_discover(ironic, node):
if not node.extra.get('ipmi_setup_credentials'):
try:
ironic.node.set_power_state(node.uuid, 'reboot')
utils.retry_on_conflict(ironic.node.set_power_state,
node.uuid, 'reboot')
except Exception as exc:
LOG.error('Failed to power on node %s, check it\'s power '
'management configuration:\n%s', node.uuid, exc)

View File

@ -93,9 +93,9 @@ def _process_node(ironic, node, node_info, cached_node):
{'mac': mac, 'node': node.uuid})
node_patches, port_patches = _run_post_hooks(node, ports, node_info)
node = ironic.node.update(node.uuid, node_patches)
node = utils.retry_on_conflict(ironic.node.update, node.uuid, node_patches)
for mac, patches in port_patches.items():
ironic.port.update(ports[mac].uuid, patches)
utils.retry_on_conflict(ironic.port.update, ports[mac].uuid, patches)
LOG.info('Node %s was updated with data from discovery process', node.uuid)
@ -116,7 +116,8 @@ def _wait_for_power_management(ironic, cached_node):
deadline = cached_node.started_at + conf.getint('discoverd', 'timeout')
while time.time() < deadline:
eventlet.greenthread.sleep(_POWER_CHECK_PERIOD)
validation = ironic.node.validate(cached_node.uuid)
validation = utils.retry_on_conflict(ironic.node.validate,
cached_node.uuid)
if validation.power['result']:
_finish_discovery(ironic, cached_node)
return
@ -131,7 +132,7 @@ def _wait_for_power_management(ironic, cached_node):
def _force_power_off(ironic, node):
LOG.debug('Forcing power off of node %s', node.uuid)
try:
ironic.node.set_power_state(node.uuid, 'off')
utils.retry_on_conflict(ironic.node.set_power_state, node.uuid, 'off')
except Exception as exc:
LOG.error('Failed to power off node %s, check it\'s power '
'management configuration:\n%s', node.uuid, exc)
@ -143,4 +144,4 @@ def _finish_discovery(ironic, node):
patch = [{'op': 'add', 'path': '/extra/newly_discovered', 'value': 'true'},
{'op': 'remove', 'path': '/extra/on_discovery'}]
ironic.node.update(node.uuid, patch)
utils.retry_on_conflict(ironic.node.update, node.uuid, patch)

View File

@ -24,6 +24,7 @@ from ironic_discoverd.test import base as test_base
from ironic_discoverd import utils
@mock.patch.object(eventlet.greenthread, 'sleep', lambda _: None)
@mock.patch.object(eventlet.greenthread, 'spawn_n',
side_effect=lambda f, *a: f(*a) and None)
@mock.patch.object(firewall, 'update_filters', autospec=True)
@ -70,9 +71,11 @@ class TestDiscover(test_base.BaseTest):
]
cli.node.list_ports.side_effect = ports
# Failure to powering on does not cause total failure
cli.node.set_power_state.side_effect = [None,
exceptions.Conflict(),
None]
cli.node.set_power_state.side_effect = [
None,
exceptions.Conflict(), # this is just retried
exceptions.InternalServerError(),
None]
discover.discover(['uuid1', 'uuid2', 'uuid3'])
@ -93,7 +96,7 @@ class TestDiscover(test_base.BaseTest):
mac=[])
filters_mock.assert_called_with(cli)
self.assertEqual(2, filters_mock.call_count) # 1 node w/o ports
self.assertEqual(3, cli.node.set_power_state.call_count)
self.assertEqual(4, cli.node.set_power_state.call_count)
cli.node.set_power_state.assert_called_with(mock.ANY, 'reboot')
patch = [{'op': 'add', 'path': '/extra/on_discovery', 'value': 'true'},
{'op': 'add', 'path': '/extra/discovery_timestamp',
@ -133,7 +136,7 @@ class TestDiscover(test_base.BaseTest):
discover.discover, ['uuid1', 'uuid2'])
cli.node.get.side_effect = [
exceptions.Conflict(),
exceptions.BadRequest(),
self.node1,
]
self.assertRaisesRegexp(utils.DiscoveryFailed,

View File

@ -14,6 +14,7 @@
import unittest
import eventlet
from ironicclient import exceptions
from keystoneclient import exceptions as keystone_exc
import mock
@ -128,3 +129,24 @@ class TestPlugins(unittest.TestCase):
def test_manager_is_cached(self):
self.assertIs(plugins_base.processing_hooks_manager(),
plugins_base.processing_hooks_manager())
@mock.patch.object(eventlet.greenthread, 'sleep', lambda _: None)
class TestUtils(unittest.TestCase):
def test_retry_on_conflict(self):
call = mock.Mock()
call.side_effect = ([exceptions.Conflict()] * (utils.RETRY_COUNT - 1)
+ [mock.sentinel.result])
res = utils.retry_on_conflict(call, 1, 2, x=3)
self.assertEqual(mock.sentinel.result, res)
call.assert_called_with(1, 2, x=3)
self.assertEqual(utils.RETRY_COUNT, call.call_count)
def test_retry_on_conflict_fail(self):
call = mock.Mock()
call.side_effect = ([exceptions.Conflict()] * (utils.RETRY_COUNT + 1)
+ [mock.sentinel.result])
self.assertRaises(exceptions.Conflict, utils.retry_on_conflict,
call, 1, 2, x=3)
call.assert_called_with(1, 2, x=3)
self.assertEqual(utils.RETRY_COUNT, call.call_count)

View File

@ -247,6 +247,35 @@ class TestProcessNode(BaseTest):
self.assertEqual(self.ports, sorted(post_hook_mock.call_args[0][1],
key=lambda p: p.address))
def test_update_retry_on_conflict(self, filters_mock, post_hook_mock):
self.cli.node.update.side_effect = [exceptions.Conflict, self.node,
exceptions.Conflict, self.node]
self.call()
self.cli.port.create.assert_any_call(node_uuid=self.uuid,
address=self.macs[0])
self.cli.port.create.assert_any_call(node_uuid=self.uuid,
address=self.macs[1])
self.cli.node.update.assert_any_call(self.uuid, self.patch_before)
self.cli.node.update.assert_any_call(self.uuid, self.patch_after)
self.assertEqual(4, self.cli.node.update.call_count)
self.cli.node.set_power_state.assert_called_once_with(self.uuid, 'off')
def test_power_off_retry_on_conflict(self, filters_mock, post_hook_mock):
self.cli.node.set_power_state.side_effect = [exceptions.Conflict, None]
self.call()
self.cli.port.create.assert_any_call(node_uuid=self.uuid,
address=self.macs[0])
self.cli.port.create.assert_any_call(node_uuid=self.uuid,
address=self.macs[1])
self.cli.node.update.assert_any_call(self.uuid, self.patch_before)
self.cli.node.update.assert_any_call(self.uuid, self.patch_after)
self.cli.node.set_power_state.assert_called_with(self.uuid, 'off')
self.assertEqual(2, self.cli.node.set_power_state.call_count)
def test_port_failed(self, filters_mock, post_hook_mock):
self.ports[0] = exceptions.Conflict()
@ -300,7 +329,7 @@ class TestProcessNode(BaseTest):
self.assertFalse(self.cli.node.set_power_state.called)
def test_power_off_failed(self, filters_mock, post_hook_mock):
self.cli.node.set_power_state.side_effect = exceptions.Conflict()
self.cli.node.set_power_state.side_effect = exceptions.BadRequest()
self.assertRaisesRegexp(utils.DiscoveryFailed, 'Failed to power off',
self.call)

View File

@ -14,15 +14,19 @@
import logging
import re
import eventlet
from ironicclient import client
from ironicclient import exceptions
from keystoneclient.v2_0 import client as keystone
import six
from ironic_discoverd import conf
LOG = logging.getLogger('discoverd')
LOG = logging.getLogger('ironic_discoverd.utils')
OS_ARGS = ('os_password', 'os_username', 'os_auth_url', 'os_tenant_name')
RETRY_COUNT = 10
RETRY_DELAY = 2
class DiscoveryFailed(Exception):
@ -45,3 +49,17 @@ def is_valid_mac(address):
m = "[0-9a-f]{2}(:[0-9a-f]{2}){5}$"
return (isinstance(address, six.string_types)
and re.match(m, address.lower()))
def retry_on_conflict(call, *args, **kwargs):
for i in range(RETRY_COUNT):
try:
return call(*args, **kwargs)
except exceptions.Conflict as exc:
LOG.warning('Conflict on calling %s: %s, retry attempt %d',
getattr(call, '__name__', repr(call)), exc, i)
if i == RETRY_COUNT - 1:
raise
eventlet.greenthread.sleep(RETRY_DELAY)
raise RuntimeError('unreachable code') # pragma: no cover