Periodically fetch full list of watched resources
Kuryr-Kubernetes relies on watching K8s API resources over an HTTP stream served by kube-apiserver. In a distributed system this is sometimes unstable, and e.g. etcd issues can cause some events to be omitted. To protect the controller from such situations, this patch makes sure that a full list of watched resources is periodically fetched and injected as events into the handlers. We should probably do the same for the kuryr-daemon watcher, but that case is less problematic, as the daemon gets restarted when ADD requests time out.

Change-Id: I67874d086043071de072420df9ea5e86b3f2582e
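The idea in code, as a minimal sketch condensed from the Watcher._reconcile() hunk below (the `client` and `handler` arguments are illustrative stand-ins for Kuryr's K8s client and handler pipeline, not the real signatures):

def reconcile(client, handler, path):
    """Fetch the full resource list for `path` and replay it as events."""
    response = client.get(path)          # e.g. GET /api/v1/pods
    resources = response['items']

    # Strip 'List' from e.g. 'PodList'; items of a list response lack `kind`.
    kind = response['kind']
    if kind.endswith('List'):
        kind = kind[:-len('List')]

    for resource in resources:
        resource['kind'] = kind
        # Injected events are flagged so that queued "real" events take
        # priority over this possibly stale snapshot.
        handler({'type': 'MODIFIED', 'object': resource}, injected=True)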
parent 2e3fb3cd54
commit 9f722e6200
@@ -168,6 +168,14 @@ k8s_opts = [
                       'too high, Kuryr will take longer to reconnect when K8s '
                       'API stream was being silently broken.'),
                default=60),
+    cfg.IntOpt('watch_reconcile_period',
+               help=_('Period (in seconds) between iterations of fetching '
+                      'full list of watched K8s API resources and putting '
+                      'them into the enabled handlers. Setting 0 disables the '
+                      'periodic reconciling. The reconciliation is done to '
+                      'prevent Kuryr from missing events due to K8s API or '
+                      'etcd issues.'),
+               default=120),
     cfg.ListOpt('enabled_handlers',
                 help=_("The comma-separated handlers that should be "
                        "registered for watching in the pipeline."),
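For completeness, the new option can also be overridden programmatically through the standard oslo.config API; a small sketch, assuming the [kubernetes] group referenced elsewhere in this patch (the register_opts call is only there to keep the snippet self-contained, in Kuryr the option is registered via the k8s_opts list above):

from oslo_config import cfg

CONF = cfg.CONF
# Registered here only so the snippet runs standalone; Kuryr registers this
# option itself when loading its configuration.
CONF.register_opts([cfg.IntOpt('watch_reconcile_period', default=120)],
                   group='kubernetes')

# Reconcile every 5 minutes; 0 disables periodic reconciliation entirely.
CONF.set_override('watch_reconcile_period', 300, group='kubernetes')
print(CONF.kubernetes.watch_reconcile_period)  # -> 300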
@@ -17,6 +17,7 @@ import itertools
 import queue as py_queue
 import time
 
+from oslo_concurrency import lockutils
 from oslo_log import log as logging
@@ -49,16 +50,21 @@ class Async(base.EventHandler):
         self._grace_period = grace_period
         self._queues = {}
 
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         group = self._group_by(event)
-        try:
-            queue = self._queues[group]
-        except KeyError:
-            queue = py_queue.Queue(self._queue_depth)
-            self._queues[group] = queue
-            thread = self._thread_group.add_thread(self._run, group, queue)
-            thread.link(self._done, group)
-        queue.put(event)
+        with lockutils.lock(group):
+            try:
+                queue = self._queues[group]
+                # NOTE(dulek): We don't want to risk injecting an outdated
+                # state if events for that resource are in queue.
+                if kwargs.get('injected', False):
+                    return
+            except KeyError:
+                queue = py_queue.Queue(self._queue_depth)
+                self._queues[group] = queue
+                thread = self._thread_group.add_thread(self._run, group, queue)
+                thread.link(self._done, group)
+            queue.put((event, args, kwargs))
 
     def _run(self, group, queue):
         LOG.debug("Asynchronous handler started processing %s", group)
@@ -67,7 +73,7 @@ class Async(base.EventHandler):
             # to allow more controlled environment for unit-tests (e.g. to
             # avoid tests getting stuck in infinite loops)
             try:
-                event = queue.get(timeout=self._grace_period)
+                event, args, kwargs = queue.get(timeout=self._grace_period)
             except py_queue.Empty:
                 break
             # FIXME(ivc): temporary workaround to skip stale events
@@ -93,10 +99,10 @@ class Async(base.EventHandler):
             # also introduce cache for long operations
             time.sleep(STALE_PERIOD)
             while not queue.empty():
-                event = queue.get()
+                event, args, kwargs = queue.get()
                 if queue.empty():
                     time.sleep(STALE_PERIOD)
-            self._handler(event)
+            self._handler(event, *args, **kwargs)
 
     def _done(self, thread, group):
         LOG.debug("Asynchronous handler stopped processing group %s", group)
@@ -20,7 +20,7 @@ class EventHandler(object, metaclass=abc.ABCMeta):
     """Base class for event handlers."""
 
     @abc.abstractmethod
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         """Handle the event."""
         raise NotImplementedError()
 
@@ -51,7 +51,7 @@ class Dispatcher(h_base.EventHandler):
         handlers = key_group.setdefault(key, [])
         handlers.append(handler)
 
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         handlers = set()
 
         for key_fn, key_group in self._registry.items():
@@ -67,7 +67,7 @@ class Dispatcher(h_base.EventHandler):
                   obj_meta.get('uid'))
 
         for handler in handlers:
-            handler(event)
+            handler(event, *args, **kwargs)
 
 
 class EventConsumer(h_base.EventHandler, metaclass=abc.ABCMeta):
@@ -113,8 +113,8 @@ class EventPipeline(h_base.EventHandler, metaclass=abc.ABCMeta):
         for key_fn, key in consumer.consumes.items():
             self._dispatcher.register(key_fn, key, handler)
 
-    def __call__(self, event):
-        self._handler(event)
+    def __call__(self, event, *args, **kwargs):
+        self._handler(event, *args, **kwargs)
 
     @abc.abstractmethod
     def _wrap_dispatcher(self, dispatcher):
@@ -73,7 +73,7 @@ class ResourceEventHandler(dispatch.EventConsumer, health.HealthHandler):
 
         return deletion_timestamp
 
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         event_type = event.get('type')
         obj = event.get('object')
         if 'MODIFIED' == event_type:
@@ -32,8 +32,8 @@ class LogExceptions(base.EventHandler):
         self._handler = handler
         self._exceptions = exceptions
 
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         try:
-            self._handler(event)
+            self._handler(event, *args, **kwargs)
         except self._exceptions:
             LOG.exception("Failed to handle event %s", event)
@@ -51,7 +51,7 @@ class Retry(base.EventHandler):
         self._interval = interval
         self._k8s = clients.get_kubernetes_client()
 
-    def __call__(self, event):
+    def __call__(self, event, *args, **kwargs):
         deadline = time.time() + self._timeout
         for attempt in itertools.count(1):
             if event.get('type') in ['MODIFIED', 'ADDED']:
|
||||
"object. Continuing with handler "
|
||||
"execution.")
|
||||
try:
|
||||
self._handler(event)
|
||||
self._handler(event, *args, **kwargs)
|
||||
break
|
||||
except os_exc.ConflictException as ex:
|
||||
if ex.details.startswith('Quota exceeded for resources'):
|
||||
|
@@ -35,7 +35,7 @@ class TestAsyncHandler(test_base.TestCase):
 
         m_handler.assert_not_called()
         self.assertEqual({group: m_queue}, async_handler._queues)
-        m_queue.put.assert_called_once_with(event)
+        m_queue.put.assert_called_once_with((event, (), {}))
 
     @mock.patch('queue.Queue')
     def test_call_new(self, m_queue_type):
@@ -60,7 +60,22 @@ class TestAsyncHandler(test_base.TestCase):
         m_tg.add_thread.assert_called_once_with(async_handler._run, group,
                                                 m_queue)
         m_th.link.assert_called_once_with(async_handler._done, group)
-        m_queue.put.assert_called_once_with(event)
+        m_queue.put.assert_called_once_with((event, (), {}))
+
+    def test_call_injected(self):
+        event = mock.sentinel.event
+        group = mock.sentinel.group
+        m_queue = mock.Mock()
+        m_handler = mock.Mock()
+        m_group_by = mock.Mock(return_value=group)
+        async_handler = h_async.Async(m_handler, mock.Mock(), m_group_by)
+        async_handler._queues[group] = m_queue
+
+        async_handler(event, injected=True)
+
+        m_handler.assert_not_called()
+        self.assertEqual({group: m_queue}, async_handler._queues)
+        m_queue.put.assert_not_called()
 
     @mock.patch('itertools.count')
     def test_run(self, m_count):
@@ -68,7 +83,7 @@ class TestAsyncHandler(test_base.TestCase):
         group = mock.sentinel.group
         m_queue = mock.Mock()
         m_queue.empty.return_value = True
-        m_queue.get.return_value = event
+        m_queue.get.return_value = (event, (), {})
         m_handler = mock.Mock()
         m_count.return_value = [1]
         async_handler = h_async.Async(m_handler, mock.Mock(), mock.Mock(),
@@ -81,7 +96,8 @@ class TestAsyncHandler(test_base.TestCase):
 
     @mock.patch('itertools.count')
     def test_run_empty(self, m_count):
-        events = [mock.sentinel.event1, mock.sentinel.event2]
+        events = [(x, (), {}) for x in (mock.sentinel.event1,
+                                        mock.sentinel.event2)]
         group = mock.sentinel.group
         m_queue = mock.Mock()
         m_queue.empty.return_value = True
@@ -93,12 +109,13 @@ class TestAsyncHandler(test_base.TestCase):
         with mock.patch('time.sleep'):
             async_handler._run(group, m_queue)
 
-        m_handler.assert_has_calls([mock.call(event) for event in events])
+        m_handler.assert_has_calls([mock.call(event[0]) for event in events])
         self.assertEqual(len(events), m_handler.call_count)
 
     @mock.patch('itertools.count')
     def test_run_stale(self, m_count):
-        events = [mock.sentinel.event1, mock.sentinel.event2]
+        events = [(x, (), {}) for x in (mock.sentinel.event1,
+                                        mock.sentinel.event2)]
         group = mock.sentinel.group
         m_queue = mock.Mock()
         m_queue.empty.side_effect = [False, True, True]
@@ -179,13 +179,16 @@ class TestWatcher(test_base.TestCase):
         path = '/test'
         m_tg = mock.Mock()
         m_th = mock.Mock()
+        m_tt = mock.Mock()
         m_handler = mock.Mock()
         watcher_obj = watcher.Watcher(m_handler, m_tg)
         watcher_obj._idle[path] = True
         watcher_obj._watching[path] = m_th
+        watcher_obj._timers[path] = m_tt
 
         watcher_obj._stop_watch(path)
 
+        m_tt.stop.assert_called()
         m_th.stop.assert_called()
 
     def test_stop_watch_idle(self):
@@ -17,6 +17,7 @@ import sys
 import time
 
 from kuryr_kubernetes import clients
+from kuryr_kubernetes import exceptions
 from kuryr_kubernetes.handlers import health
 from kuryr_kubernetes import utils
 from oslo_config import cfg
@@ -78,6 +79,7 @@ class Watcher(health.HealthHandler):
         self._running = False
         self._resources = set()
         self._watching = {}
+        self._timers = {}
         self._idle = {}
 
         if timeout is None:
@@ -131,11 +133,41 @@ class Watcher(health.HealthHandler):
         for path in list(self._watching):
             self._stop_watch(path)
 
+    def _reconcile(self, path):
+        LOG.debug(f'Getting {path} for reconciliation.')
+        try:
+            response = self._client.get(path)
+            resources = response['items']
+        except exceptions.K8sClientException:
+            LOG.exception(f'Error getting path when reconciling.')
+            return
+
+        for resource in resources:
+            kind = response['kind']
+            # Strip List from e.g. PodList. For some reason `.items` of a list
+            # returned from API doesn't have `kind` set.
+            if kind.endswith('List'):
+                kind = kind[:-4]
+            resource['kind'] = kind
+
+            event = {
+                'type': 'MODIFIED',
+                'object': resource,
+            }
+            self._handler(event, injected=True)
+
     def _start_watch(self, path):
         tg = self._thread_group
         self._idle[path] = True
         if tg:
             self._watching[path] = tg.add_thread(self._watch, path)
+            period = CONF.kubernetes.watch_reconcile_period
+            if period > 0:
+                # Let's make sure handlers won't reconcile at the same time.
+                initial_delay = period + 5 * len(self._timers)
+                self._timers[path] = tg.add_timer_args(
+                    period, self._reconcile, args=(path,),
+                    initial_delay=initial_delay, stop_on_exception=False)
         else:
             self._watching[path] = None
             self._watch(path)
@@ -143,15 +175,21 @@ class Watcher(health.HealthHandler):
     def _stop_watch(self, path):
         if self._idle.get(path):
             if self._thread_group and path in self._watching:
+                if CONF.kubernetes.watch_reconcile_period:
+                    self._timers[path].stop()
                 self._watching[path].stop()
                 # NOTE(dulek): Thread gets killed immediately, so we need to
                 # take care of this ourselves.
+                if CONF.kubernetes.watch_reconcile_period:
+                    self._timers.pop(path, None)
                 self._watching.pop(path, None)
                 self._idle.pop(path, None)
 
     def _graceful_watch_exit(self, path):
         try:
             self._watching.pop(path, None)
+            if CONF.kubernetes.watch_reconcile_period:
+                self._timers.pop(path, None)
             self._idle.pop(path, None)
             LOG.info("Stopped watching '%s'", path)
         except KeyError:
@@ -73,8 +73,8 @@ oslo.policy==1.34.0
 oslo.privsep==1.28.0
 oslo.reports==1.18.0
 oslo.serialization==2.18.0
-oslo.service==1.24.0
-oslo.utils==3.33.0
+oslo.service==1.40.2
+oslo.utils==3.40.2
 oslo.versionedobjects==1.32.0
 oslotest==3.2.0
 packaging==17.1
@@ -15,8 +15,8 @@ oslo.config>=6.1.0 # Apache-2.0
 oslo.log>=3.36.0 # Apache-2.0
 oslo.reports>=1.18.0 # Apache-2.0
 oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
-oslo.service!=1.28.1,>=1.24.0 # Apache-2.0
-oslo.utils>=3.33.0 # Apache-2.0
+oslo.service>=1.40.2 # Apache-2.0
+oslo.utils>=3.40.2 # Apache-2.0
 os-vif>=1.12.0 # Apache-2.0
 PrettyTable<0.8,>=0.7.2 # BSD
 pyroute2>=0.5.7;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)