Retry resource create until success

On resource create, if a ResourceInError is raised then
repeated attempts are made to delete and recreate the resource
until success or a different error state is achieved.

Likewise, the prepare-retry deletes will be retried until
ResourceInError is not raised.

An exponentially increasing delay with jitter is introduced
between each create attempt, and attempts continue up to the configured
action_retry_limit or stack operation timeout.

Likewise, an exponentially increasing delay with jitter is introduced
between each prepare-retry delete attempt, and delete attempts
continue up to the configured action_retry_limit or stack operation
timeout. The delete attempt count is reset to zero whenever a create
attempt has been performed.

Creates that result from an UpdateReplace will also go
through this path, so this also helps some stack update scenarios.

This change is aimed at being part of an interim solution to making
Heat resilient to transient cloud failures. Convergence is the
permanent solution; however, there may be benefits to the convergence
implementation from this interim effort.

Currently retry is only attempted on ResourceInError. Eventually
client plugins can indicate whether a given exception should lead
to a retry attempt (such as connection errors, some 500s).

Partial-Blueprint: retry-failed-api-calls
Change-Id: I07c3301349bcd24096f3cafbb6d82c43bccb93de
This commit is contained in:
Steve Baker
2014-07-28 11:48:59 +12:00
parent f0ec53626e
commit d61427fe16
6 changed files with 208 additions and 5 deletions
+4
View File
@@ -47,6 +47,10 @@
# one time. (integer value)
#max_stacks_per_tenant=100
# Number of times to retry to bring a resource to a non-error
# state. Set to 0 to disable retries. (integer value)
#action_retry_limit=5
# Controls how many events will be pruned whenever a stack's
# events exceed max_events_per_stack. Set this lower to keep
# more events at the expense of more frequent purges. (integer
+5
View File
@@ -113,6 +113,11 @@ engine_opts = [
default=100,
help=_('Maximum number of stacks any one tenant may have'
' active at one time.')),
cfg.IntOpt('action_retry_limit',
default=5,
help=_('Number of times to retry to bring a '
'resource to a non-error state. Set to 0 to disable '
'retries.')),
cfg.IntOpt('event_purge_batch_size',
default=10,
help=_('Controls how many events will be pruned whenever a '
+48 -1
View File
@@ -14,12 +14,14 @@
import base64
import contextlib
from datetime import datetime
from oslo.config import cfg
import six
import warnings
from heat.common import exception
from heat.common import identifier
from heat.common import short_id
from heat.common import timeutils
from heat.db import api as db_api
from heat.engine.attributes import Attributes
from heat.engine import environment
@@ -34,6 +36,8 @@ from heat.openstack.common import excutils
from heat.openstack.common.gettextutils import _
from heat.openstack.common import log as logging
cfg.CONF.import_opt('action_retry_limit', 'heat.common.config')
LOG = logging.getLogger(__name__)
@@ -465,6 +469,7 @@ class Resource(object):
'''
return self
@scheduler.wrappertask
def create(self):
'''
Create the resource. Subclasses should provide a handle_create() method
@@ -483,7 +488,49 @@ class Resource(object):
# the parser.Stack is stored (which is after the resources
# are __init__'d, but before they are create()'d)
self.reparse()
return self._do_action(action, self.properties.validate)
def pause():
try:
while True:
yield
except scheduler.Timeout:
return
count = {self.CREATE: 0, self.DELETE: 0}
retry_limit = max(cfg.CONF.action_retry_limit, 0)
first_failure = None
while (count[self.CREATE] <= retry_limit and
count[self.DELETE] <= retry_limit):
if count[action]:
delay = timeutils.retry_backoff_delay(count[action],
jitter_max=2.0)
waiter = scheduler.TaskRunner(pause)
waiter.start(timeout=delay)
while not waiter.step():
yield
try:
yield self._do_action(action, self.properties.validate)
if action == self.CREATE:
return
else:
action = self.CREATE
except exception.ResourceFailure as failure:
if not isinstance(failure.exc, ResourceInError):
raise failure
count[action] += 1
if action == self.CREATE:
action = self.DELETE
count[action] = 0
if first_failure is None:
# Save the first exception
first_failure = failure
if first_failure:
raise first_failure
def prepare_abandon(self):
self.abandon_in_progress = True
+6 -3
View File
@@ -13,6 +13,7 @@
import copy
import mox
from oslo.config import cfg
import six
from neutronclient.v2_0 import client as neutronclient
@@ -474,7 +475,9 @@ class PoolTest(HeatTestCase):
self.assertEqual((rsrc.CREATE, rsrc.COMPLETE), rsrc.state)
self.m.VerifyAll()
def test_create_failed_unexpected_status(self):
def test_create_failed_error_status(self):
cfg.CONF.set_override('action_retry_limit', 0)
neutron_utils.neutronV20.find_resourceid_by_name_or_id(
mox.IsA(neutronclient.Client),
'subnet',
@@ -532,7 +535,7 @@ class PoolTest(HeatTestCase):
neutronclient.Client.show_pool('5678').MultipleTimes().AndReturn(
{'pool': {'status': 'ACTIVE'}})
neutronclient.Client.show_vip('xyz').AndReturn(
{'vip': {'status': 'ERROR', 'name': 'xyz'}})
{'vip': {'status': 'SOMETHING', 'name': 'xyz'}})
snippet = template_format.parse(pool_template)
stack = utils.parse_stack(snippet)
@@ -543,7 +546,7 @@ class PoolTest(HeatTestCase):
error = self.assertRaises(exception.ResourceFailure,
scheduler.TaskRunner(rsrc.create))
self.assertEqual(
'ResourceInError: Went to status ERROR due to "error in vip"',
'ResourceUnknownStatus: Unknown status SOMETHING',
six.text_type(error))
self.assertEqual((rsrc.CREATE, rsrc.FAILED), rsrc.state)
self.m.VerifyAll()
+142 -1
View File
@@ -16,9 +16,11 @@ import json
import uuid
import mock
from oslo.config import cfg
import six
from heat.common import exception
from heat.common import timeutils
from heat.db import api as db_api
from heat.engine import attributes
from heat.engine.cfn import functions as cfn_funcs
@@ -473,12 +475,151 @@ class ResourceTest(HeatTestCase):
res = generic_rsrc.ResourceWithProps(rname, tmpl, self.stack)
res.id = 'test_res_id'
(res.action, res.status) = (res.INIT, res.DELETE)
self.assertRaises(exception.ResourceFailure, res.create)
create = scheduler.TaskRunner(res.create)
self.assertRaises(exception.ResourceFailure, create)
scheduler.TaskRunner(res.destroy)()
res.state_reset()
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
def test_create_fail_retry(self):
# Scenario: the first create raises ResourceInError, the failed resource
# is deleted, and a second create succeeds, leaving the resource in
# (CREATE, COMPLETE).
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
# Stub the backoff helper and both handlers so the exact sequence of
# create/delete/backoff calls recorded below is what gets verified.
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from first attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create succeeds
# (the backoff is requested for retry count 1 and stubbed to 0.01)
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
# Run the create task to completion; no exception is expected.
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_create_fail_retry_disabled(self):
# Scenario: with action_retry_limit overridden to 0, a create that
# raises ResourceInError surfaces as ResourceFailure and the resource
# ends in (CREATE, FAILED).
cfg.CONF.set_override('action_retry_limit', 0)
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
# retry_backoff_delay and handle_delete are stubbed but no expectations
# are recorded for them below; only a single handle_create is expected.
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
self.m.ReplayAll()
# Expected text of the ResourceFailure raised by the create task.
estr = ('ResourceInError: Went to status ERROR due to "just because"')
create = scheduler.TaskRunner(res.create)
err = self.assertRaises(exception.ResourceFailure, create)
self.assertEqual(estr, str(err))
self.assertEqual((res.CREATE, res.FAILED), res.state)
self.m.VerifyAll()
def test_create_deletes_fail_retry(self):
# Scenario: the first create fails, the clean-up delete fails twice
# before succeeding on its third attempt, and the second create then
# succeeds, leaving the resource in (CREATE, COMPLETE).
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
# Stub the backoff helper and both handlers so the recorded call
# sequence (including the retry counts passed to the backoff helper)
# is verified.
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# first attempt to delete fails
# (no backoff call is expected before this first delete attempt)
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='DELETE',
status_reason='delete failed'))
# second attempt to delete fails
# (backoff requested with delete retry count 1)
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='DELETE',
status_reason='delete failed again'))
# third attempt to delete succeeds
# (backoff requested with delete retry count 2)
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create succeeds
# (backoff requested with create retry count 1)
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
# Run the create task to completion; no exception is expected.
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_creates_fail_retry(self):
# Scenario: two consecutive creates fail with ResourceInError, each
# followed by a successful clean-up delete, and the third create
# succeeds, leaving the resource in (CREATE, COMPLETE).
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
# Stub the backoff helper and both handlers so the recorded call
# sequence (including the increasing create retry count) is verified.
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from first attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create fails
# (backoff requested with create retry count 1)
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from second attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# third attempt to create succeeds
# (backoff requested with create retry count 2)
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
# Run the create task to completion; no exception is expected.
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_preview(self):
tmpl = rsrc_defn.ResourceDefinition('test_resource',
'GenericResourceType')
+3
View File
@@ -17,6 +17,7 @@ import json
from cinderclient import exceptions as cinder_exp
from cinderclient.v1 import client as cinderclient
import mox
from oslo.config import cfg
import six
from heat.common import exception
@@ -719,6 +720,8 @@ class VolumeTest(HeatTestCase):
def test_snapshot_no_volume(self):
stack_name = 'test_volume_stack'
cfg.CONF.set_override('action_retry_limit', 0)
fv = FakeVolume('creating', 'error')
self._mock_create_volume(fv, stack_name)