Improve performance and utility of Recovery
This makes a couple small, but significant changes to periodic recovery. - Change the CUD calls for periodic recovery to use the Pool Manager's RPCAPI. Periodic recovery is charged with fixing problems that have come about in the normal course of operation. Sometimes things go very wrong and there are a lot of resources that have an ERROR state. This change allows the work generated to be distributed among active `designate-pool-manager` processes. Hopefully resolving the issues faster, and reducing risk of random failure on the one process generating the recovery events. - Include zones that have been PENDING for a long time After a certain time, a product of timeouts, intervals, and delays, changes either should have successfully propagated, or gone to an ERROR state. Unfortunately this doesn't happen on occaison. Certain random failures and undiscovered bugs leave zones hanging out in PENDING state forever. By treating these as states as errors, we (hopefully) ensure that Designate will clean up these zones and return them to the happy path. Change-Id: Ifb375bc9383e7dd8fd6e51a42a4a50dc5015cb13
This commit is contained in:
parent
93af7f0629
commit
04e26c32d2
|
@ -28,6 +28,7 @@ from designate import exceptions
|
|||
from designate import objects
|
||||
from designate import utils
|
||||
from designate.central import rpcapi as central_api
|
||||
from designate.pool_manager import rpcapi as pool_manager_rpcapi
|
||||
from designate.mdns import rpcapi as mdns_api
|
||||
from designate import service
|
||||
from designate.context import DesignateContext
|
||||
|
@ -41,6 +42,7 @@ LOG = logging.getLogger(__name__)
|
|||
CONF = cfg.CONF
|
||||
|
||||
SUCCESS_STATUS = 'SUCCESS'
|
||||
PENDING_STATUS = 'PENDING'
|
||||
ERROR_STATUS = 'ERROR'
|
||||
NO_ZONE_STATUS = 'NO_ZONE'
|
||||
CREATE_ACTION = 'CREATE'
|
||||
|
@ -111,6 +113,11 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
self._periodic_sync_retry_interval = \
|
||||
CONF['service:pool_manager'].periodic_sync_retry_interval
|
||||
|
||||
# Compute a time (seconds) by which things should have propagated
|
||||
self.max_prop_time = (self.timeout * self.max_retries +
|
||||
self.max_retries * self.retry_interval +
|
||||
self.delay)
|
||||
|
||||
# Create the necessary Backend instances for each target
|
||||
self._setup_target_backends()
|
||||
|
||||
|
@ -183,6 +190,10 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
def mdns_api(self):
|
||||
return mdns_api.MdnsAPI.get_instance()
|
||||
|
||||
@property
|
||||
def pool_manager_api(self):
|
||||
return pool_manager_rpcapi.PoolManagerAPI.get_instance()
|
||||
|
||||
def _get_admin_context_all_tenants(self):
|
||||
return DesignateContext.get_admin_context(all_tenants=True)
|
||||
|
||||
|
@ -196,7 +207,7 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
return
|
||||
|
||||
context = self._get_admin_context_all_tenants()
|
||||
LOG.debug("Starting Periodic Recovery")
|
||||
LOG.info(_LI("Starting Periodic Recovery"))
|
||||
|
||||
try:
|
||||
# Handle Deletion Failures
|
||||
|
@ -204,21 +215,21 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
LOG.info(_LI("periodic_recovery:delete_zone needed on %d zones"),
|
||||
len(zones))
|
||||
for zone in zones:
|
||||
self.delete_zone(context, zone)
|
||||
self.pool_manager_api.delete_zone(context, zone)
|
||||
|
||||
# Handle Creation Failures
|
||||
zones = self._get_failed_zones(context, CREATE_ACTION)
|
||||
LOG.info(_LI("periodic_recovery:create_zone needed on %d zones"),
|
||||
len(zones))
|
||||
for zone in zones:
|
||||
self.create_zone(context, zone)
|
||||
self.pool_manager_api.create_zone(context, zone)
|
||||
|
||||
# Handle Update Failures
|
||||
zones = self._get_failed_zones(context, UPDATE_ACTION)
|
||||
LOG.info(_LI("periodic_recovery:update_zone needed on %d zones"),
|
||||
len(zones))
|
||||
for zone in zones:
|
||||
self.update_zone(context, zone)
|
||||
self.pool_manager_api.update_zone(context, zone)
|
||||
|
||||
except Exception:
|
||||
LOG.exception(_LE('An unhandled exception in periodic '
|
||||
|
@ -232,7 +243,7 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
if not self._pool_election.is_leader:
|
||||
return
|
||||
|
||||
LOG.debug("Starting Periodic Synchronization")
|
||||
LOG.info(_LI("Starting Periodic Synchronization"))
|
||||
context = self._get_admin_context_all_tenants()
|
||||
zones = self._fetch_healthy_zones(context)
|
||||
zones = set(zones)
|
||||
|
@ -570,12 +581,46 @@ class Service(service.RPCService, coordination.CoordinationMixin,
|
|||
|
||||
# Utility Methods
|
||||
def _get_failed_zones(self, context, action):
|
||||
"""
|
||||
Fetch zones that are in ERROR status or have been PENDING for a long
|
||||
time. Used by periodic recovery.
|
||||
After a certain time changes either should have successfully
|
||||
propagated or gone to an ERROR state.
|
||||
However, random failures and undiscovered bugs leave zones hanging out
|
||||
in PENDING state forever. By treating those "stale" zones as failed,
|
||||
periodic recovery will attempt to restore them.
|
||||
:return: :class:`ZoneList` zones
|
||||
"""
|
||||
criterion = {
|
||||
'pool_id': CONF['service:pool_manager'].pool_id,
|
||||
'action': action,
|
||||
'status': ERROR_STATUS
|
||||
}
|
||||
return self.central_api.find_zones(context, criterion)
|
||||
error_zones = self.central_api.find_zones(context, criterion)
|
||||
|
||||
# Include things that have been hanging out in PENDING
|
||||
# status for longer than they should
|
||||
# Generate the current serial, will provide a UTC Unix TS.
|
||||
current = utils.increment_serial()
|
||||
stale_criterion = {
|
||||
'pool_id': CONF['service:pool_manager'].pool_id,
|
||||
'action': action,
|
||||
'status': PENDING_STATUS,
|
||||
'serial': "<%s" % (current - self.max_prop_time)
|
||||
}
|
||||
LOG.debug('Including zones with action %(action)s and %(status)s '
|
||||
'older than %(seconds)ds' % {'action': action,
|
||||
'status': PENDING_STATUS,
|
||||
'seconds': self.max_prop_time})
|
||||
|
||||
stale_zones = self.central_api.find_zones(context, stale_criterion)
|
||||
if stale_zones:
|
||||
LOG.warn(_LW('Found %(len)d zones PENDING for more than %(sec)d '
|
||||
'seconds'), {'len': len(stale_zones),
|
||||
'sec': self.max_prop_time})
|
||||
error_zones.extend(stale_zones)
|
||||
|
||||
return error_zones
|
||||
|
||||
def _fetch_healthy_zones(self, context):
|
||||
"""Fetch all zones not in error
|
||||
|
|
|
@ -536,33 +536,6 @@ class PoolManagerServiceNoopTest(PoolManagerTestCase):
|
|||
# the zones have been put in ERROR status
|
||||
self.assertEqual(3, mock_cent_update_status.call_count)
|
||||
|
||||
# Periodic recovery
|
||||
|
||||
@patch.object(pm_module.time, 'sleep')
|
||||
@patch.object(mdns_rpcapi.MdnsAPI, 'notify_zone_changed')
|
||||
@patch.object(central_rpcapi.CentralAPI, 'update_status')
|
||||
def test_periodic_recovery(self, mock_find_zones,
|
||||
mock_cent_update_status, *mocks):
|
||||
|
||||
def mock_get_failed_zones(ctx, action):
|
||||
if action == pm_module.DELETE_ACTION:
|
||||
return self._build_zones(3, 'DELETE', 'ERROR')
|
||||
if action == pm_module.CREATE_ACTION:
|
||||
return self._build_zones(4, 'CREATE', 'ERROR')
|
||||
if action == pm_module.UPDATE_ACTION:
|
||||
return self._build_zones(5, 'UPDATE', 'ERROR')
|
||||
|
||||
self.service._get_failed_zones = mock_get_failed_zones
|
||||
self.service.delete_zone = Mock()
|
||||
self.service.create_zone = Mock()
|
||||
self.service.update_zone = Mock()
|
||||
|
||||
self.service.periodic_recovery()
|
||||
|
||||
self.assertEqual(3, self.service.delete_zone.call_count)
|
||||
self.assertEqual(4, self.service.create_zone.call_count)
|
||||
self.assertEqual(5, self.service.update_zone.call_count)
|
||||
|
||||
|
||||
class PoolManagerServiceEndToEndTest(PoolManagerServiceNoopTest):
|
||||
|
||||
|
@ -618,42 +591,3 @@ class PoolManagerServiceEndToEndTest(PoolManagerServiceNoopTest):
|
|||
LOG.error("Expected %d healthy zones, got %d", n, len(zones))
|
||||
self._log_all_zones(zones, msg='listing zones')
|
||||
self.assertEqual(n, len(zones))
|
||||
|
||||
@patch.object(mdns_rpcapi.MdnsAPI, 'notify_zone_changed')
|
||||
def test_periodic_sync_and_recovery(
|
||||
self, mock_cent_update_status, *a):
|
||||
# Periodic sync + recovery
|
||||
self.service._periodic_sync_retry_interval = 0
|
||||
|
||||
# Create healthy zones, run a periodic sync that will fail
|
||||
self.create_zone(name='created.example.com.')
|
||||
self._assert_num_healthy_zones(pm_module.CREATE_ACTION, 1)
|
||||
|
||||
z = self.create_zone(name='updated.example.net.')
|
||||
z.email = 'info@example.net'
|
||||
self.service.central_api.update_zone(self.admin_context, z)
|
||||
self._assert_num_healthy_zones(pm_module.UPDATE_ACTION, 1)
|
||||
|
||||
with patch.object(self.service, '_update_zone_on_target',
|
||||
return_value=False):
|
||||
self.service.periodic_sync()
|
||||
|
||||
zones = self.service._fetch_healthy_zones(self.admin_context)
|
||||
self.assertEqual(0, len(zones))
|
||||
self._assert_num_failed_zones(pm_module.CREATE_ACTION, 1)
|
||||
self._assert_num_failed_zones(pm_module.UPDATE_ACTION, 1)
|
||||
|
||||
# Now run a periodic_recovery that will fix the zones
|
||||
|
||||
backends = self.service.target_backends
|
||||
for tid in self.service.target_backends:
|
||||
backends[tid].create_zone = Mock()
|
||||
backends[tid].update_zone = Mock()
|
||||
backends[tid].delete_zone = Mock()
|
||||
|
||||
self.service.periodic_recovery()
|
||||
|
||||
# There are 2 pool targets in use
|
||||
for backend in self.service.target_backends.itervalues():
|
||||
self.assertEqual(1, backend.create_zone.call_count)
|
||||
self.assertEqual(1, backend.update_zone.call_count)
|
||||
|
|
|
@ -19,6 +19,7 @@ Unit tests
|
|||
"""
|
||||
import unittest
|
||||
|
||||
from mock import call
|
||||
from mock import Mock
|
||||
from mock import MagicMock
|
||||
from mock import patch
|
||||
|
@ -94,11 +95,22 @@ class PoolManagerTest(test.BaseTestCase):
|
|||
def test_get_failed_zones(self, *mocks):
|
||||
with patch.object(self.pm.central_api, 'find_zones') as \
|
||||
mock_find_zones:
|
||||
self.pm._get_failed_zones('ctx', pm_module.DELETE_ACTION)
|
||||
|
||||
mock_find_zones.assert_called_once_with(
|
||||
'ctx', {'action': 'DELETE', 'status': 'ERROR', 'pool_id':
|
||||
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
|
||||
with patch.object(pm_module.utils, 'increment_serial',
|
||||
return_value=1453758656):
|
||||
self.pm._get_failed_zones('ctx', pm_module.DELETE_ACTION)
|
||||
|
||||
call_one = call('ctx', {'action': 'DELETE', 'status': 'ERROR',
|
||||
'pool_id':
|
||||
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
|
||||
call_two = call('ctx', {'action': 'DELETE', 'status': 'PENDING',
|
||||
'serial': '<1453758201', # 1453758656-455
|
||||
'pool_id':
|
||||
'794ccc2c-d751-44fe-b57f-8894c9f5c842'})
|
||||
|
||||
# any_order because Mock adds some random calls in
|
||||
mock_find_zones.assert_has_calls([call_one, call_two],
|
||||
any_order=True)
|
||||
|
||||
@patch.object(pm_module.DesignateContext, 'get_admin_context')
|
||||
def test_periodic_recover(self, mock_get_ctx, *mocks):
|
||||
|
@ -113,19 +125,19 @@ class PoolManagerTest(test.BaseTestCase):
|
|||
return [z] * 5
|
||||
|
||||
self.pm._get_failed_zones = mock_get_failed_zones
|
||||
self.pm.delete_zone = Mock()
|
||||
self.pm.create_zone = Mock()
|
||||
self.pm.update_zone = Mock()
|
||||
self.pm.pool_manager_api.delete_zone = Mock()
|
||||
self.pm.pool_manager_api.create_zone = Mock()
|
||||
self.pm.pool_manager_api.update_zone = Mock()
|
||||
mock_ctx = mock_get_ctx.return_value
|
||||
|
||||
self.pm.periodic_recovery()
|
||||
|
||||
self.pm.delete_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(3, self.pm.delete_zone.call_count)
|
||||
self.pm.create_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(4, self.pm.create_zone.call_count)
|
||||
self.pm.update_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(5, self.pm.update_zone.call_count)
|
||||
self.pm.pool_manager_api.delete_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(3, self.pm.pool_manager_api.delete_zone.call_count)
|
||||
self.pm.pool_manager_api.create_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(4, self.pm.pool_manager_api.create_zone.call_count)
|
||||
self.pm.pool_manager_api.update_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(5, self.pm.pool_manager_api.update_zone.call_count)
|
||||
|
||||
@patch.object(pm_module.DesignateContext, 'get_admin_context')
|
||||
def test_periodic_recover_exception(self, mock_get_ctx, *mocks):
|
||||
|
@ -139,15 +151,32 @@ class PoolManagerTest(test.BaseTestCase):
|
|||
return [z] * 4
|
||||
|
||||
self.pm._get_failed_zones = mock_get_failed_zones
|
||||
self.pm.delete_zone = Mock()
|
||||
self.pm.create_zone = Mock(side_effect=Exception('oops'))
|
||||
self.pm.update_zone = Mock()
|
||||
self.pm.pool_manager_api.delete_zone = Mock()
|
||||
self.pm.pool_manager_api.create_zone = Mock(
|
||||
side_effect=Exception('oops'))
|
||||
self.pm.pool_manager_api.update_zone = Mock()
|
||||
mock_ctx = mock_get_ctx.return_value
|
||||
|
||||
self.pm.periodic_recovery()
|
||||
|
||||
self.pm.delete_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(3, self.pm.delete_zone.call_count)
|
||||
self.pm.create_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(1, self.pm.create_zone.call_count)
|
||||
self.assertEqual(0, self.pm.update_zone.call_count)
|
||||
self.pm.pool_manager_api.delete_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(3, self.pm.pool_manager_api.delete_zone.call_count)
|
||||
self.pm.pool_manager_api.create_zone.assert_called_with(mock_ctx, z)
|
||||
self.assertEqual(1, self.pm.pool_manager_api.create_zone.call_count)
|
||||
self.assertEqual(0, self.pm.pool_manager_api.update_zone.call_count)
|
||||
|
||||
def test_periodic_sync(self, *mocks):
|
||||
def mock_fetch_healthy_zones(ctx):
|
||||
return [
|
||||
RoObject(name='a_zone'),
|
||||
RoObject(name='b_zone'),
|
||||
RoObject(name='c_zone'),
|
||||
]
|
||||
|
||||
self.pm._fetch_healthy_zones = mock_fetch_healthy_zones
|
||||
self.pm.update_zone = Mock()
|
||||
self.pm._exceed_or_meet_threshold = Mock(return_value=True)
|
||||
|
||||
self.pm.periodic_sync()
|
||||
|
||||
self.assertEqual(3, self.pm.update_zone.call_count)
|
||||
|
|
Loading…
Reference in New Issue