Avoid race between Retries and Deletion actions

This patch set increases the timeout used when waiting for resources to
be created or deleted. This is needed to better absorb spikes without
restarting the kuryr-controller. This patch also ensures that future
retry events no longer affect the kuryr-controller when they fire after
the related resources have already been deleted, i.e., when the
on_delete event was processed before one of the retries.

Closes-Bug: 1847753
Change-Id: I725ba22f0babf496af219a37e42e1c33b247308a
Author: Luis Tomas Bolivar
Date:   2019-10-11 13:47:23 +02:00
Commit: 998be3bbda (parent: f363077401)
9 changed files with 69 additions and 13 deletions
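
For reference, the staleness check that the Retry handler gains in this
change can be summarized by the following sketch (condensed from the
Retry handler hunk later in this diff; the helper name is illustrative
and not part of the patch):

    from kuryr_kubernetes import clients
    from kuryr_kubernetes import exceptions

    def _event_object_is_gone(event):
        # Only MODIFIED/ADDED events point at an object that may have been
        # deleted while the event was waiting to be retried.
        if event.get('type') not in ('MODIFIED', 'ADDED'):
            return False
        obj = event.get('object')
        if not obj or 'selfLink' not in obj.get('metadata', {}):
            return False
        try:
            clients.get_kubernetes_client().get(obj['metadata']['selfLink'])
        except exceptions.K8sResourceNotFound:
            return True   # object already gone; drop the retry
        except exceptions.K8sClientException:
            return False  # cannot tell; let the handler run anyway
        return False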


@@ -56,6 +56,9 @@ class NamespacePodSubnetDriver(default_subnet.DefaultPodSubnetDriver):
         try:
             ns = kubernetes.get('%s/namespaces/%s' % (constants.K8S_API_BASE,
                                                       namespace))
+        except exceptions.K8sResourceNotFound:
+            LOG.warning("Namespace %s not found", namespace)
+            raise
         except exceptions.K8sClientException:
             LOG.exception("Kubernetes Client Exception.")
             raise exceptions.ResourceNotReady(namespace)


@@ -532,12 +532,9 @@ class NetworkPolicySecurityGroupsDriver(base.PodSecurityGroupsDriver):
         return crd_selectors

     def create_namespace_sg_rules(self, namespace):
-        kubernetes = clients.get_kubernetes_client()
         ns_name = namespace['metadata']['name']
         LOG.debug("Creating sg rule for namespace: %s", ns_name)
         crd_selectors = []
-        namespace = kubernetes.get(
-            '{}/namespaces/{}'.format(constants.K8S_API_BASE, ns_name))
         knp_crds = driver_utils.get_kuryrnetpolicy_crds()
         for crd in knp_crds.get('items'):
             crd_selector = crd['spec'].get('podSelector')


@@ -306,7 +306,11 @@ class LoadBalancerHandler(k8s_base.ResourceEventHandler):
     def _add_new_members(self, endpoints, lbaas_state, lbaas_spec):
         changed = False
-        self._sync_lbaas_sgs(endpoints, lbaas_state, lbaas_spec)
+        try:
+            self._sync_lbaas_sgs(endpoints, lbaas_state, lbaas_spec)
+        except k_exc.K8sResourceNotFound:
+            LOG.debug("The svc has been deleted while processing the endpoints"
+                      " update. No need to add new members.")

         lsnr_by_id = {l.id: l for l in lbaas_state.listeners}
         pool_by_lsnr_port = {(lsnr_by_id[p.listener_id].protocol,


@@ -120,8 +120,9 @@ class NamespaceHandler(k8s_base.ResourceEventHandler):
             net_crd = self._add_kuryrnet_crd(ns_name, net_crd_spec)
             self._set_net_crd(namespace, net_crd)
             self._drv_sg.create_namespace_sg_rules(namespace)
-        except exceptions.K8sClientException:
-            LOG.exception("Kubernetes client exception. Rolling back "
+        except (exceptions.K8sClientException,
+                exceptions.K8sResourceNotFound):
+            LOG.exception("Kuryrnet CRD creation failed. Rolling back "
                           "resources created for the namespace.")
             self._drv_subnets.rollback_network_resources(net_crd_spec, ns_name)
             if net_crd_sg.get('sgId'):


@@ -20,6 +20,7 @@ from kuryr_kubernetes import clients
 from kuryr_kubernetes import constants
 from kuryr_kubernetes.controller.drivers import base as drivers
 from kuryr_kubernetes.controller.drivers import utils as driver_utils
+from kuryr_kubernetes import exceptions as k_exc
 from kuryr_kubernetes.handlers import k8s_base

 LOG = logging.getLogger(__name__)
@@ -64,7 +65,11 @@ class PodLabelHandler(k8s_base.ResourceEventHandler):
         project_id = self._drv_project.get_project(pod)
         security_groups = self._drv_sg.get_security_groups(pod, project_id)
         self._drv_vif_pool.update_vif_sgs(pod, security_groups)
-        self._set_pod_labels(pod, current_pod_labels)
+        try:
+            self._set_pod_labels(pod, current_pod_labels)
+        except k_exc.K8sResourceNotFound:
+            LOG.debug("Pod already deleted, no need to retry.")
+            return
         if oslo_cfg.CONF.octavia_defaults.enforce_sg_rules:
             services = driver_utils.get_services()


@@ -94,7 +94,7 @@ class VIFHandler(k8s_base.ResourceEventHandler):
         if not state:
             try:
                 subnets = self._drv_subnets.get_subnets(pod, project_id)
-            except n_exc.NotFound:
+            except (n_exc.NotFound, k_exc.K8sResourceNotFound):
                 LOG.warning("Subnet does not exists. If namespace driver is "
                             "used, probably the namespace for the pod is "
                             "already deleted. So this pod does not need to "


@@ -20,6 +20,7 @@ from neutronclient.common import exceptions as n_exc
 from oslo_log import log as logging
 from oslo_utils import excutils

 from kuryr_kubernetes import clients
+from kuryr_kubernetes import exceptions
 from kuryr_kubernetes.handlers import base
 from kuryr_kubernetes import utils
@@ -48,10 +49,31 @@ class Retry(base.EventHandler):
         self._exceptions = exceptions
         self._timeout = timeout
         self._interval = interval
+        self._k8s = clients.get_kubernetes_client()

     def __call__(self, event):
         deadline = time.time() + self._timeout
         for attempt in itertools.count(1):
+            if event.get('type') in ['MODIFIED', 'ADDED']:
+                obj = event.get('object')
+                if obj:
+                    try:
+                        obj_link = obj['metadata']['selfLink']
+                    except KeyError:
+                        LOG.debug("Skipping object check as it does not have "
+                                  "selfLink: %s", obj)
+                    else:
+                        try:
+                            self._k8s.get(obj_link)
+                        except exceptions.K8sResourceNotFound:
+                            LOG.debug("There is no need to process the "
+                                      "retry as the object %s has already "
+                                      "been deleted.", obj_link)
+                            return
+                        except exceptions.K8sClientException:
+                            LOG.debug("Kubernetes client error getting the "
+                                      "object. Continuing with handler "
+                                      "execution.")
             try:
                 self._handler(event)
                 break


@@ -17,8 +17,10 @@ import fixtures
 import mock
 import time

+from kuryr_kubernetes import exceptions
 from kuryr_kubernetes.handlers import retry as h_retry
 from kuryr_kubernetes.tests import base as test_base
+from kuryr_kubernetes.tests.unit import kuryr_fixtures as k_fix


 class _EX1(Exception):
@@ -42,6 +44,11 @@ class TestRetryHandler(test_base.TestCase):
         f_time = self.useFixture(fixtures.MockPatch('time.time'))
         f_time.mock.return_value = self.now

+        self.k8s = self.useFixture(k_fix.MockK8sClient()).client
+        f_k8s = self.useFixture(fixtures.MockPatch(
+            'kuryr_kubernetes.clients.get_kubernetes_client'))
+        f_k8s.mock.return_value = self.k8s
+
     @mock.patch('random.randint')
     @mock.patch('time.sleep')
     def test_should_not_sleep(self, m_sleep, m_randint):
@@ -87,10 +94,27 @@ class TestRetryHandler(test_base.TestCase):
         m_handler = mock.Mock()
         m_count.return_value = list(range(1, 5))
         retry = h_retry.Retry(m_handler)
+        event = {'type': 'DELETED'}

-        retry(mock.sentinel.event)
+        retry(event)

-        m_handler.assert_called_once_with(mock.sentinel.event)
+        m_handler.assert_called_once_with(event)
         m_sleep.assert_not_called()

+    @mock.patch('itertools.count')
+    @mock.patch.object(h_retry.Retry, '_sleep')
+    def test_call_outdated_event(self, m_sleep, m_count):
+        m_handler = mock.Mock()
+        m_count.return_value = list(range(1, 5))
+        obj = {'metadata': {'selfLink': mock.sentinel.selflink}}
+        event = {'type': 'MODIFIED', 'object': obj}
+        self.k8s.get.side_effect = exceptions.K8sResourceNotFound(obj)
+
+        retry = h_retry.Retry(m_handler)
+        retry(event)
+
+        self.k8s.get.assert_called_once_with(obj['metadata']['selfLink'])
+        m_handler.assert_not_called()
+        m_sleep.assert_not_called()
+
     @mock.patch('itertools.count')
@@ -100,7 +124,7 @@ class TestRetryHandler(test_base.TestCase):
         timeout = 10
         deadline = self.now + timeout
         failures = [_EX1()] * (attempts - 1)
-        event = mock.sentinel.event
+        event = {'type': 'DELETED'}
         m_handler = mock.Mock()
         m_handler.side_effect = failures + [None]
         m_sleep.return_value = 1
@@ -121,7 +145,7 @@ class TestRetryHandler(test_base.TestCase):
         timeout = 10
         deadline = self.now + timeout
         failures = [_EX1(), _EX1(), _EX11()]
-        event = mock.sentinel.event
+        event = {'type': 'DELETED'}
         m_handler = mock.Mock()
         m_handler.side_effect = failures
         m_sleep.side_effect = [1] * (attempts - 1) + [0]


@@ -40,7 +40,7 @@ VALID_MULTI_POD_POOLS_OPTS = {'noop': ['neutron-vif',
                               'neutron': ['neutron-vif'],
                               'nested': ['nested-vlan'],
                               }
-DEFAULT_TIMEOUT = 180
+DEFAULT_TIMEOUT = 500
 DEFAULT_INTERVAL = 3

 subnet_caching_opts = [
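
The DEFAULT_TIMEOUT bump above extends how long the controller keeps
retrying an event before giving up, which is what lets it absorb
creation/deletion spikes per the commit message. Assuming these defaults
feed a deadline-bounded retry loop like the one in the Retry handler, the
effect can be sketched roughly as follows (simplified: the real handler
only retries its configured exceptions and applies a randomized backoff,
both omitted here):

    import itertools
    import time

    def retry_call(handler, event, timeout=500, interval=3):
        # With the new default a transiently failing event can be retried
        # for up to ~500 seconds (previously 180) before the error is raised.
        deadline = time.time() + timeout
        for attempt in itertools.count(1):
            try:
                handler(event)
                return
            except Exception:
                if time.time() + interval >= deadline:
                    raise
                time.sleep(interval)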