Retry resource create until success
On resource create, if a ResourceInError is raised then repeated attempts are made to delete and recreate the resource until success or a different error state is achieved. Likewise, the prepare-retry deletes will be retried until ResourceInError is not raised. An exponentially increasing delay with jitter is introduced between each create attempt, and attempts continue up to the configured action_retry_limit or stack operation timeout. Likewise, an exponentially increasing delay with jitter is introduced between each prepare-retry delete attempt, and delete attempts continue up to the configured action_retry_limit or stack operation timeout. The delete attempt count is reset to zero whenever a create attempt has been performed. Creates that result from an UpdateReplace will also go through this path, so this also helps some stack update scenarios. This change is aimed at being part of an interim solution to making Heat resilient to transient cloud failures. Convergence is the permanent solution; however, there may be benefits to the convergence implementation from this interim effort. Currently, retry is only attempted on ResourceInError. Eventually, client plugins can indicate whether a given exception should lead to a retry attempt (such as connection errors or some 500s). Partial-Blueprint: retry-failed-api-calls Change-Id: I07c3301349bcd24096f3cafbb6d82c43bccb93de
This commit is contained in:
@@ -47,6 +47,10 @@
|
||||
# one time. (integer value)
|
||||
#max_stacks_per_tenant=100
|
||||
|
||||
# Number of times to retry to bring a resource to a non-error
|
||||
# state. Set to 0 to disable retries. (integer value)
|
||||
#action_retry_limit=5
|
||||
|
||||
# Controls how many events will be pruned whenever a stack's
|
||||
# events exceed max_events_per_stack. Set this lower to keep
|
||||
# more events at the expense of more frequent purges. (integer
|
||||
|
||||
@@ -113,6 +113,11 @@ engine_opts = [
|
||||
default=100,
|
||||
help=_('Maximum number of stacks any one tenant may have'
|
||||
' active at one time.')),
|
||||
cfg.IntOpt('action_retry_limit',
|
||||
default=5,
|
||||
help=_('Number of times to retry to bring a '
|
||||
'resource to a non-error state. Set to 0 to disable '
|
||||
'retries.')),
|
||||
cfg.IntOpt('event_purge_batch_size',
|
||||
default=10,
|
||||
help=_('Controls how many events will be pruned whenever a '
|
||||
|
||||
+48
-1
@@ -14,12 +14,14 @@
|
||||
import base64
|
||||
import contextlib
|
||||
from datetime import datetime
|
||||
from oslo.config import cfg
|
||||
import six
|
||||
import warnings
|
||||
|
||||
from heat.common import exception
|
||||
from heat.common import identifier
|
||||
from heat.common import short_id
|
||||
from heat.common import timeutils
|
||||
from heat.db import api as db_api
|
||||
from heat.engine.attributes import Attributes
|
||||
from heat.engine import environment
|
||||
@@ -34,6 +36,8 @@ from heat.openstack.common import excutils
|
||||
from heat.openstack.common.gettextutils import _
|
||||
from heat.openstack.common import log as logging
|
||||
|
||||
cfg.CONF.import_opt('action_retry_limit', 'heat.common.config')
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -465,6 +469,7 @@ class Resource(object):
|
||||
'''
|
||||
return self
|
||||
|
||||
@scheduler.wrappertask
|
||||
def create(self):
|
||||
'''
|
||||
Create the resource. Subclasses should provide a handle_create() method
|
||||
@@ -483,7 +488,49 @@ class Resource(object):
|
||||
# the parser.Stack is stored (which is after the resources
|
||||
# are __init__'d, but before they are create()'d)
|
||||
self.reparse()
|
||||
return self._do_action(action, self.properties.validate)
|
||||
|
||||
def pause():
|
||||
try:
|
||||
while True:
|
||||
yield
|
||||
except scheduler.Timeout:
|
||||
return
|
||||
|
||||
count = {self.CREATE: 0, self.DELETE: 0}
|
||||
|
||||
retry_limit = max(cfg.CONF.action_retry_limit, 0)
|
||||
first_failure = None
|
||||
|
||||
while (count[self.CREATE] <= retry_limit and
|
||||
count[self.DELETE] <= retry_limit):
|
||||
if count[action]:
|
||||
delay = timeutils.retry_backoff_delay(count[action],
|
||||
jitter_max=2.0)
|
||||
waiter = scheduler.TaskRunner(pause)
|
||||
waiter.start(timeout=delay)
|
||||
while not waiter.step():
|
||||
yield
|
||||
try:
|
||||
yield self._do_action(action, self.properties.validate)
|
||||
if action == self.CREATE:
|
||||
return
|
||||
else:
|
||||
action = self.CREATE
|
||||
except exception.ResourceFailure as failure:
|
||||
if not isinstance(failure.exc, ResourceInError):
|
||||
raise failure
|
||||
|
||||
count[action] += 1
|
||||
if action == self.CREATE:
|
||||
action = self.DELETE
|
||||
count[action] = 0
|
||||
|
||||
if first_failure is None:
|
||||
# Save the first exception
|
||||
first_failure = failure
|
||||
|
||||
if first_failure:
|
||||
raise first_failure
|
||||
|
||||
def prepare_abandon(self):
|
||||
self.abandon_in_progress = True
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
import copy
|
||||
import mox
|
||||
from oslo.config import cfg
|
||||
import six
|
||||
|
||||
from neutronclient.v2_0 import client as neutronclient
|
||||
@@ -474,7 +475,9 @@ class PoolTest(HeatTestCase):
|
||||
self.assertEqual((rsrc.CREATE, rsrc.COMPLETE), rsrc.state)
|
||||
self.m.VerifyAll()
|
||||
|
||||
def test_create_failed_unexpected_status(self):
|
||||
def test_create_failed_error_status(self):
|
||||
cfg.CONF.set_override('action_retry_limit', 0)
|
||||
|
||||
neutron_utils.neutronV20.find_resourceid_by_name_or_id(
|
||||
mox.IsA(neutronclient.Client),
|
||||
'subnet',
|
||||
@@ -532,7 +535,7 @@ class PoolTest(HeatTestCase):
|
||||
neutronclient.Client.show_pool('5678').MultipleTimes().AndReturn(
|
||||
{'pool': {'status': 'ACTIVE'}})
|
||||
neutronclient.Client.show_vip('xyz').AndReturn(
|
||||
{'vip': {'status': 'ERROR', 'name': 'xyz'}})
|
||||
{'vip': {'status': 'SOMETHING', 'name': 'xyz'}})
|
||||
|
||||
snippet = template_format.parse(pool_template)
|
||||
stack = utils.parse_stack(snippet)
|
||||
@@ -543,7 +546,7 @@ class PoolTest(HeatTestCase):
|
||||
error = self.assertRaises(exception.ResourceFailure,
|
||||
scheduler.TaskRunner(rsrc.create))
|
||||
self.assertEqual(
|
||||
'ResourceInError: Went to status ERROR due to "error in vip"',
|
||||
'ResourceUnknownStatus: Unknown status SOMETHING',
|
||||
six.text_type(error))
|
||||
self.assertEqual((rsrc.CREATE, rsrc.FAILED), rsrc.state)
|
||||
self.m.VerifyAll()
|
||||
|
||||
+142
-1
@@ -16,9 +16,11 @@ import json
|
||||
import uuid
|
||||
|
||||
import mock
|
||||
from oslo.config import cfg
|
||||
import six
|
||||
|
||||
from heat.common import exception
|
||||
from heat.common import timeutils
|
||||
from heat.db import api as db_api
|
||||
from heat.engine import attributes
|
||||
from heat.engine.cfn import functions as cfn_funcs
|
||||
@@ -473,12 +475,151 @@ class ResourceTest(HeatTestCase):
|
||||
res = generic_rsrc.ResourceWithProps(rname, tmpl, self.stack)
|
||||
res.id = 'test_res_id'
|
||||
(res.action, res.status) = (res.INIT, res.DELETE)
|
||||
self.assertRaises(exception.ResourceFailure, res.create)
|
||||
create = scheduler.TaskRunner(res.create)
|
||||
self.assertRaises(exception.ResourceFailure, create)
|
||||
scheduler.TaskRunner(res.destroy)()
|
||||
res.state_reset()
|
||||
scheduler.TaskRunner(res.create)()
|
||||
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
|
||||
|
||||
def test_create_fail_retry(self):
|
||||
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
|
||||
{'Foo': 'abc'})
|
||||
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
|
||||
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
|
||||
|
||||
# first attempt to create fails
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='CREATE',
|
||||
status_reason='just because'))
|
||||
# delete error resource from first attempt
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
|
||||
|
||||
# second attempt to create succeeds
|
||||
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
|
||||
self.m.ReplayAll()
|
||||
|
||||
scheduler.TaskRunner(res.create)()
|
||||
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
|
||||
self.m.VerifyAll()
|
||||
|
||||
def test_create_fail_retry_disabled(self):
|
||||
cfg.CONF.set_override('action_retry_limit', 0)
|
||||
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
|
||||
{'Foo': 'abc'})
|
||||
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
|
||||
|
||||
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
|
||||
|
||||
# attempt to create fails
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='CREATE',
|
||||
status_reason='just because'))
|
||||
self.m.ReplayAll()
|
||||
|
||||
estr = ('ResourceInError: Went to status ERROR due to "just because"')
|
||||
create = scheduler.TaskRunner(res.create)
|
||||
err = self.assertRaises(exception.ResourceFailure, create)
|
||||
self.assertEqual(estr, str(err))
|
||||
self.assertEqual((res.CREATE, res.FAILED), res.state)
|
||||
|
||||
self.m.VerifyAll()
|
||||
|
||||
def test_create_deletes_fail_retry(self):
|
||||
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
|
||||
{'Foo': 'abc'})
|
||||
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
|
||||
|
||||
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
|
||||
|
||||
# first attempt to create fails
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='CREATE',
|
||||
status_reason='just because'))
|
||||
# first attempt to delete fails
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='DELETE',
|
||||
status_reason='delete failed'))
|
||||
# second attempt to delete fails
|
||||
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='DELETE',
|
||||
status_reason='delete failed again'))
|
||||
|
||||
# third attempt to delete succeeds
|
||||
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
|
||||
|
||||
# second attempt to create succeeds
|
||||
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
|
||||
self.m.ReplayAll()
|
||||
|
||||
scheduler.TaskRunner(res.create)()
|
||||
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
|
||||
self.m.VerifyAll()
|
||||
|
||||
def test_creates_fail_retry(self):
|
||||
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
|
||||
{'Foo': 'abc'})
|
||||
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
|
||||
|
||||
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
|
||||
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
|
||||
|
||||
# first attempt to create fails
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='CREATE',
|
||||
status_reason='just because'))
|
||||
# delete error resource from first attempt
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
|
||||
|
||||
# second attempt to create fails
|
||||
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
|
||||
resource.ResourceInError(resource_name='test_resource',
|
||||
resource_status='ERROR',
|
||||
resource_type='GenericResourceType',
|
||||
resource_action='CREATE',
|
||||
status_reason='just because'))
|
||||
# delete error resource from second attempt
|
||||
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
|
||||
|
||||
# third attempt to create succeeds
|
||||
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
|
||||
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
|
||||
self.m.ReplayAll()
|
||||
|
||||
scheduler.TaskRunner(res.create)()
|
||||
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
|
||||
self.m.VerifyAll()
|
||||
|
||||
def test_preview(self):
|
||||
tmpl = rsrc_defn.ResourceDefinition('test_resource',
|
||||
'GenericResourceType')
|
||||
|
||||
@@ -17,6 +17,7 @@ import json
|
||||
from cinderclient import exceptions as cinder_exp
|
||||
from cinderclient.v1 import client as cinderclient
|
||||
import mox
|
||||
from oslo.config import cfg
|
||||
import six
|
||||
|
||||
from heat.common import exception
|
||||
@@ -719,6 +720,8 @@ class VolumeTest(HeatTestCase):
|
||||
|
||||
def test_snapshot_no_volume(self):
|
||||
stack_name = 'test_volume_stack'
|
||||
|
||||
cfg.CONF.set_override('action_retry_limit', 0)
|
||||
fv = FakeVolume('creating', 'error')
|
||||
|
||||
self._mock_create_volume(fv, stack_name)
|
||||
|
||||
Reference in New Issue
Block a user