Improve performance and utility of Recovery

This makes a couple small, but significant changes to periodic recovery.

- Change the CUD calls for periodic recovery to use the Pool Manager's RPCAPI.

Periodic recovery is charged with fixing problems that have come about in
the normal course of operation. Sometimes things go very wrong and there
are a lot of resources that have an ERROR state. This change allows the work
generated to be distributed among active `designate-pool-manager` processes.
Hopefully resolving the issues faster, and reducing risk of random failure on
the one process generating the recovery events.

- Include zones that have been PENDING for a long time

After a certain time, a product of timeouts, intervals, and delays, changes
either should have successfully propagated, or gone to an ERROR state.
Unfortunately this doesn't happen on occaison. Certain random failures and
undiscovered bugs leave zones hanging out in PENDING state forever. By treating
these as states as errors, we (hopefully) ensure that Designate will clean up
these zones and return them to the happy path.

Change-Id: Ifb375bc9383e7dd8fd6e51a42a4a50dc5015cb13
This commit is contained in:
Tim Simmons 2016-01-25 17:12:54 -06:00
parent 93af7f0629
commit 04e26c32d2
3 changed files with 101 additions and 93 deletions

View File

@ -28,6 +28,7 @@ from designate import exceptions
from designate import objects
from designate import utils
from designate.central import rpcapi as central_api
from designate.pool_manager import rpcapi as pool_manager_rpcapi
from designate.mdns import rpcapi as mdns_api
from designate import service
from designate.context import DesignateContext
@ -41,6 +42,7 @@ LOG = logging.getLogger(__name__)
CONF = cfg.CONF
SUCCESS_STATUS = 'SUCCESS'
PENDING_STATUS = 'PENDING'
ERROR_STATUS = 'ERROR'
NO_ZONE_STATUS = 'NO_ZONE'
CREATE_ACTION = 'CREATE'
@ -111,6 +113,11 @@ class Service(service.RPCService, coordination.CoordinationMixin,
self._periodic_sync_retry_interval = \
CONF['service:pool_manager'].periodic_sync_retry_interval
# Compute a time (seconds) by which things should have propagated
self.max_prop_time = (self.timeout * self.max_retries +
self.max_retries * self.retry_interval +
self.delay)
# Create the necessary Backend instances for each target
self._setup_target_backends()
@ -183,6 +190,10 @@ class Service(service.RPCService, coordination.CoordinationMixin,
def mdns_api(self):
return mdns_api.MdnsAPI.get_instance()
@property
def pool_manager_api(self):
return pool_manager_rpcapi.PoolManagerAPI.get_instance()
def _get_admin_context_all_tenants(self):
return DesignateContext.get_admin_context(all_tenants=True)
@ -196,7 +207,7 @@ class Service(service.RPCService, coordination.CoordinationMixin,
return
context = self._get_admin_context_all_tenants()
LOG.debug("Starting Periodic Recovery")
LOG.info(_LI("Starting Periodic Recovery"))
try:
# Handle Deletion Failures
@ -204,21 +215,21 @@ class Service(service.RPCService, coordination.CoordinationMixin,
LOG.info(_LI("periodic_recovery:delete_zone needed on %d zones"),
len(zones))
for zone in zones:
self.delete_zone(context, zone)
self.pool_manager_api.delete_zone(context, zone)
# Handle Creation Failures
zones = self._get_failed_zones(context, CREATE_ACTION)
LOG.info(_LI("periodic_recovery:create_zone needed on %d zones"),
len(zones))
for zone in zones:
self.create_zone(context, zone)
self.pool_manager_api.create_zone(context, zone)
# Handle Update Failures
zones = self._get_failed_zones(context, UPDATE_ACTION)
LOG.info(_LI("periodic_recovery:update_zone needed on %d zones"),
len(zones))
for zone in zones:
self.update_zone(context, zone)
self.pool_manager_api.update_zone(context, zone)
except Exception:
LOG.exception(_LE('An unhandled exception in periodic '
@ -232,7 +243,7 @@ class Service(service.RPCService, coordination.CoordinationMixin,
if not self._pool_election.is_leader:
return
LOG.debug("Starting Periodic Synchronization")
LOG.info(_LI("Starting Periodic Synchronization"))
context = self._get_admin_context_all_tenants()
zones = self._fetch_healthy_zones(context)
zones = set(zones)
@ -570,12 +581,46 @@ class Service(service.RPCService, coordination.CoordinationMixin,
# Utility Methods
def _get_failed_zones(self, context, action):
"""
Fetch zones that are in ERROR status or have been PENDING for a long
time. Used by periodic recovery.
After a certain time changes either should have successfully
propagated or gone to an ERROR state.
However, random failures and undiscovered bugs leave zones hanging out
in PENDING state forever. By treating those "stale" zones as failed,
periodic recovery will attempt to restore them.
:return: :class:`ZoneList` zones
"""
criterion = {
'pool_id': CONF['service:pool_manager'].pool_id,
'action': action,
'status': ERROR_STATUS
}
return self.central_api.find_zones(context, criterion)
error_zones = self.central_api.find_zones(context, criterion)
# Include things that have been hanging out in PENDING
# status for longer than they should
# Generate the current serial, will provide a UTC Unix TS.
current = utils.increment_serial()
stale_criterion = {
'pool_id': CONF['service:pool_manager'].pool_id,
'action': action,
'status': PENDING_STATUS,
'serial': "<%s" % (current - self.max_prop_time)
}
LOG.debug('Including zones with action %(action)s and %(status)s '
'older than %(seconds)ds' % {'action': action,
'status': PENDING_STATUS,
'seconds': self.max_prop_time})
stale_zones = self.central_api.find_zones(context, stale_criterion)
if stale_zones:
LOG.warn(_LW('Found %(len)d zones PENDING for more than %(sec)d '
'seconds'), {'len': len(stale_zones),
'sec': self.max_prop_time})
error_zones.extend(stale_zones)
return error_zones
def _fetch_healthy_zones(self, context):
"""Fetch all zones not in error

View File

@ -536,33 +536,6 @@ class PoolManagerServiceNoopTest(PoolManagerTestCase):
# the zones have been put in ERROR status
self.assertEqual(3, mock_cent_update_status.call_count)
# Periodic recovery
@patch.object(pm_module.time, 'sleep')
@patch.object(mdns_rpcapi.MdnsAPI, 'notify_zone_changed')
@patch.object(central_rpcapi.CentralAPI, 'update_status')
def test_periodic_recovery(self, mock_find_zones,
mock_cent_update_status, *mocks):
def mock_get_failed_zones(ctx, action):
if action == pm_module.DELETE_ACTION:
return self._build_zones(3, 'DELETE', 'ERROR')
if action == pm_module.CREATE_ACTION:
return self._build_zones(4, 'CREATE', 'ERROR')
if action == pm_module.UPDATE_ACTION:
return self._build_zones(5, 'UPDATE', 'ERROR')
self.service._get_failed_zones = mock_get_failed_zones
self.service.delete_zone = Mock()
self.service.create_zone = Mock()
self.service.update_zone = Mock()
self.service.periodic_recovery()
self.assertEqual(3, self.service.delete_zone.call_count)
self.assertEqual(4, self.service.create_zone.call_count)
self.assertEqual(5, self.service.update_zone.call_count)
class PoolManagerServiceEndToEndTest(PoolManagerServiceNoopTest):
@ -618,42 +591,3 @@ class PoolManagerServiceEndToEndTest(PoolManagerServiceNoopTest):
LOG.error("Expected %d healthy zones, got %d", n, len(zones))
self._log_all_zones(zones, msg='listing zones')
self.assertEqual(n, len(zones))
@patch.object(mdns_rpcapi.MdnsAPI, 'notify_zone_changed')
def test_periodic_sync_and_recovery(
self, mock_cent_update_status, *a):
# Periodic sync + recovery
self.service._periodic_sync_retry_interval = 0
# Create healthy zones, run a periodic sync that will fail
self.create_zone(name='created.example.com.')
self._assert_num_healthy_zones(pm_module.CREATE_ACTION, 1)
z = self.create_zone(name='updated.example.net.')
z.email = 'info@example.net'
self.service.central_api.update_zone(self.admin_context, z)
self._assert_num_healthy_zones(pm_module.UPDATE_ACTION, 1)
with patch.object(self.service, '_update_zone_on_target',
return_value=False):
self.service.periodic_sync()
zones = self.service._fetch_healthy_zones(self.admin_context)
self.assertEqual(0, len(zones))
self._assert_num_failed_zones(pm_module.CREATE_ACTION, 1)
self._assert_num_failed_zones(pm_module.UPDATE_ACTION, 1)
# Now run a periodic_recovery that will fix the zones
backends = self.service.target_backends
for tid in self.service.target_backends:
backends[tid].create_zone = Mock()
backends[tid].update_zone = Mock()
backends[tid].delete_zone = Mock()
self.service.periodic_recovery()
# There are 2 pool targets in use
for backend in self.service.target_backends.itervalues():
self.assertEqual(1, backend.create_zone.call_count)
self.assertEqual(1, backend.update_zone.call_count)

View File

@ -19,6 +19,7 @@ Unit tests
"""
import unittest
from mock import call
from mock import Mock
from mock import MagicMock
from mock import patch
@ -94,11 +95,22 @@ class PoolManagerTest(test.BaseTestCase):
def test_get_failed_zones(self, *mocks):
with patch.object(self.pm.central_api, 'find_zones') as \
mock_find_zones:
self.pm._get_failed_zones('ctx', pm_module.DELETE_ACTION)
mock_find_zones.assert_called_once_with(
'ctx', {'action': 'DELETE', 'status': 'ERROR', 'pool_id':
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
with patch.object(pm_module.utils, 'increment_serial',
return_value=1453758656):
self.pm._get_failed_zones('ctx', pm_module.DELETE_ACTION)
call_one = call('ctx', {'action': 'DELETE', 'status': 'ERROR',
'pool_id':
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
call_two = call('ctx', {'action': 'DELETE', 'status': 'PENDING',
'serial': '<1453758201', # 1453758656-455
'pool_id':
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
# any_order because Mock adds some random calls in
mock_find_zones.assert_has_calls([call_one, call_two],
any_order=True)
@patch.object(pm_module.DesignateContext, 'get_admin_context')
def test_periodic_recover(self, mock_get_ctx, *mocks):
@ -113,19 +125,19 @@ class PoolManagerTest(test.BaseTestCase):
return [z] * 5
self.pm._get_failed_zones = mock_get_failed_zones
self.pm.delete_zone = Mock()
self.pm.create_zone = Mock()
self.pm.update_zone = Mock()
self.pm.pool_manager_api.delete_zone = Mock()
self.pm.pool_manager_api.create_zone = Mock()
self.pm.pool_manager_api.update_zone = Mock()
mock_ctx = mock_get_ctx.return_value
self.pm.periodic_recovery()
self.pm.delete_zone.assert_called_with(mock_ctx, z)
self.assertEqual(3, self.pm.delete_zone.call_count)
self.pm.create_zone.assert_called_with(mock_ctx, z)
self.assertEqual(4, self.pm.create_zone.call_count)
self.pm.update_zone.assert_called_with(mock_ctx, z)
self.assertEqual(5, self.pm.update_zone.call_count)
self.pm.pool_manager_api.delete_zone.assert_called_with(mock_ctx, z)
self.assertEqual(3, self.pm.pool_manager_api.delete_zone.call_count)
self.pm.pool_manager_api.create_zone.assert_called_with(mock_ctx, z)
self.assertEqual(4, self.pm.pool_manager_api.create_zone.call_count)
self.pm.pool_manager_api.update_zone.assert_called_with(mock_ctx, z)
self.assertEqual(5, self.pm.pool_manager_api.update_zone.call_count)
@patch.object(pm_module.DesignateContext, 'get_admin_context')
def test_periodic_recover_exception(self, mock_get_ctx, *mocks):
@ -139,15 +151,32 @@ class PoolManagerTest(test.BaseTestCase):
return [z] * 4
self.pm._get_failed_zones = mock_get_failed_zones
self.pm.delete_zone = Mock()
self.pm.create_zone = Mock(side_effect=Exception('oops'))
self.pm.update_zone = Mock()
self.pm.pool_manager_api.delete_zone = Mock()
self.pm.pool_manager_api.create_zone = Mock(
side_effect=Exception('oops'))
self.pm.pool_manager_api.update_zone = Mock()
mock_ctx = mock_get_ctx.return_value
self.pm.periodic_recovery()
self.pm.delete_zone.assert_called_with(mock_ctx, z)
self.assertEqual(3, self.pm.delete_zone.call_count)
self.pm.create_zone.assert_called_with(mock_ctx, z)
self.assertEqual(1, self.pm.create_zone.call_count)
self.assertEqual(0, self.pm.update_zone.call_count)
self.pm.pool_manager_api.delete_zone.assert_called_with(mock_ctx, z)
self.assertEqual(3, self.pm.pool_manager_api.delete_zone.call_count)
self.pm.pool_manager_api.create_zone.assert_called_with(mock_ctx, z)
self.assertEqual(1, self.pm.pool_manager_api.create_zone.call_count)
self.assertEqual(0, self.pm.pool_manager_api.update_zone.call_count)
def test_periodic_sync(self, *mocks):
def mock_fetch_healthy_zones(ctx):
return [
RoObject(name='a_zone'),
RoObject(name='b_zone'),
RoObject(name='c_zone'),
]
self.pm._fetch_healthy_zones = mock_fetch_healthy_zones
self.pm.update_zone = Mock()
self.pm._exceed_or_meet_threshold = Mock(return_value=True)
self.pm.periodic_sync()
self.assertEqual(3, self.pm.update_zone.call_count)