Merge "Retry resource create until success"

This commit is contained in:
Jenkins 2014-08-06 01:53:44 +00:00 committed by Gerrit Code Review
commit 586395b132
6 changed files with 208 additions and 5 deletions

View File

@ -47,6 +47,10 @@
# one time. (integer value)
#max_stacks_per_tenant=100
# Number of times to retry to bring a resource to a non-error
# state. Set to 0 to disable retries. (integer value)
#action_retry_limit=5
# Controls how many events will be pruned whenever a stack's
# events exceed max_events_per_stack. Set this lower to keep
# more events at the expense of more frequent purges. (integer

View File

@ -113,6 +113,11 @@ engine_opts = [
default=100,
help=_('Maximum number of stacks any one tenant may have'
' active at one time.')),
cfg.IntOpt('action_retry_limit',
default=5,
help=_('Number of times to retry to bring a '
'resource to a non-error state. Set to 0 to disable '
'retries.')),
cfg.IntOpt('event_purge_batch_size',
default=10,
help=_('Controls how many events will be pruned whenever a '

View File

@ -14,12 +14,14 @@
import base64
import contextlib
from datetime import datetime
from oslo.config import cfg
import six
import warnings
from heat.common import exception
from heat.common import identifier
from heat.common import short_id
from heat.common import timeutils
from heat.db import api as db_api
from heat.engine import attributes
from heat.engine import environment
@ -34,6 +36,8 @@ from heat.openstack.common import excutils
from heat.openstack.common.gettextutils import _
from heat.openstack.common import log as logging
cfg.CONF.import_opt('action_retry_limit', 'heat.common.config')
LOG = logging.getLogger(__name__)
@ -458,6 +462,7 @@ class Resource(object):
'''
return self
@scheduler.wrappertask
def create(self):
'''
Create the resource. Subclasses should provide a handle_create() method
@ -476,7 +481,49 @@ class Resource(object):
# the parser.Stack is stored (which is after the resources
# are __init__'d, but before they are create()'d)
self.reparse()
return self._do_action(action, self.properties.validate)
def pause():
try:
while True:
yield
except scheduler.Timeout:
return
count = {self.CREATE: 0, self.DELETE: 0}
retry_limit = max(cfg.CONF.action_retry_limit, 0)
first_failure = None
while (count[self.CREATE] <= retry_limit and
count[self.DELETE] <= retry_limit):
if count[action]:
delay = timeutils.retry_backoff_delay(count[action],
jitter_max=2.0)
waiter = scheduler.TaskRunner(pause)
waiter.start(timeout=delay)
while not waiter.step():
yield
try:
yield self._do_action(action, self.properties.validate)
if action == self.CREATE:
return
else:
action = self.CREATE
except exception.ResourceFailure as failure:
if not isinstance(failure.exc, ResourceInError):
raise failure
count[action] += 1
if action == self.CREATE:
action = self.DELETE
count[action] = 0
if first_failure is None:
# Save the first exception
first_failure = failure
if first_failure:
raise first_failure
def prepare_abandon(self):
self.abandon_in_progress = True

View File

@ -13,6 +13,7 @@
import copy
import mox
from oslo.config import cfg
import six
from neutronclient.v2_0 import client as neutronclient
@ -474,7 +475,9 @@ class PoolTest(HeatTestCase):
self.assertEqual((rsrc.CREATE, rsrc.COMPLETE), rsrc.state)
self.m.VerifyAll()
def test_create_failed_unexpected_status(self):
def test_create_failed_error_status(self):
cfg.CONF.set_override('action_retry_limit', 0)
neutron_utils.neutronV20.find_resourceid_by_name_or_id(
mox.IsA(neutronclient.Client),
'subnet',
@ -532,7 +535,7 @@ class PoolTest(HeatTestCase):
neutronclient.Client.show_pool('5678').MultipleTimes().AndReturn(
{'pool': {'status': 'ACTIVE'}})
neutronclient.Client.show_vip('xyz').AndReturn(
{'vip': {'status': 'ERROR', 'name': 'xyz'}})
{'vip': {'status': 'SOMETHING', 'name': 'xyz'}})
snippet = template_format.parse(pool_template)
stack = utils.parse_stack(snippet)
@ -543,7 +546,7 @@ class PoolTest(HeatTestCase):
error = self.assertRaises(exception.ResourceFailure,
scheduler.TaskRunner(rsrc.create))
self.assertEqual(
'ResourceInError: Went to status ERROR due to "error in vip"',
'ResourceUnknownStatus: Unknown status SOMETHING',
six.text_type(error))
self.assertEqual((rsrc.CREATE, rsrc.FAILED), rsrc.state)
self.m.VerifyAll()

View File

@ -16,9 +16,11 @@ import json
import uuid
import mock
from oslo.config import cfg
import six
from heat.common import exception
from heat.common import timeutils
from heat.db import api as db_api
from heat.engine import attributes
from heat.engine.cfn import functions as cfn_funcs
@ -473,12 +475,151 @@ class ResourceTest(HeatTestCase):
res = generic_rsrc.ResourceWithProps(rname, tmpl, self.stack)
res.id = 'test_res_id'
(res.action, res.status) = (res.INIT, res.DELETE)
self.assertRaises(exception.ResourceFailure, res.create)
create = scheduler.TaskRunner(res.create)
self.assertRaises(exception.ResourceFailure, create)
scheduler.TaskRunner(res.destroy)()
res.state_reset()
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
def test_create_fail_retry(self):
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from first attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create succeeds
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_create_fail_retry_disabled(self):
cfg.CONF.set_override('action_retry_limit', 0)
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
self.m.ReplayAll()
estr = ('ResourceInError: Went to status ERROR due to "just because"')
create = scheduler.TaskRunner(res.create)
err = self.assertRaises(exception.ResourceFailure, create)
self.assertEqual(estr, str(err))
self.assertEqual((res.CREATE, res.FAILED), res.state)
self.m.VerifyAll()
def test_create_deletes_fail_retry(self):
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# first attempt to delete fails
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='DELETE',
status_reason='delete failed'))
# second attempt to delete fails
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_delete().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='DELETE',
status_reason='delete failed again'))
# third attempt to delete succeeds
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create succeeds
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_creates_fail_retry(self):
tmpl = rsrc_defn.ResourceDefinition('test_resource', 'Foo',
{'Foo': 'abc'})
res = generic_rsrc.ResourceWithProps('test_resource', tmpl, self.stack)
self.m.StubOutWithMock(timeutils, 'retry_backoff_delay')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_create')
self.m.StubOutWithMock(generic_rsrc.ResourceWithProps, 'handle_delete')
# first attempt to create fails
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from first attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# second attempt to create fails
timeutils.retry_backoff_delay(1, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndRaise(
resource.ResourceInError(resource_name='test_resource',
resource_status='ERROR',
resource_type='GenericResourceType',
resource_action='CREATE',
status_reason='just because'))
# delete error resource from second attempt
generic_rsrc.ResourceWithProps.handle_delete().AndReturn(None)
# third attempt to create succeeds
timeutils.retry_backoff_delay(2, jitter_max=2.0).AndReturn(0.01)
generic_rsrc.ResourceWithProps.handle_create().AndReturn(None)
self.m.ReplayAll()
scheduler.TaskRunner(res.create)()
self.assertEqual((res.CREATE, res.COMPLETE), res.state)
self.m.VerifyAll()
def test_preview(self):
tmpl = rsrc_defn.ResourceDefinition('test_resource',
'GenericResourceType')

View File

@ -17,6 +17,7 @@ import json
from cinderclient import exceptions as cinder_exp
from cinderclient.v1 import client as cinderclient
import mox
from oslo.config import cfg
import six
from heat.common import exception
@ -728,6 +729,8 @@ class VolumeTest(HeatTestCase):
def test_snapshot_no_volume(self):
stack_name = 'test_volume_stack'
cfg.CONF.set_override('action_retry_limit', 0)
fv = FakeVolume('creating', 'error')
self._mock_create_volume(fv, stack_name)