From e416b2492aa5fd2710f38a8ea3d3d2cae04ea9d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Dulko?=
Date: Mon, 23 Apr 2018 12:49:37 +0200
Subject: [PATCH] kuryr-controller A/P HA

This commit implements the initial version of high availability support
in kuryr-controller - the Active/Passive mode. In this mode only a
single controller instance is processing resources, while the remaining
instances wait in standby mode. If the current leader dies, one of the
standby instances takes over the leader role and starts processing
resources.

Please note that as leader election is based on Kubernetes mechanisms,
this is only supported when kuryr-controller is run as a Pod on a
Kubernetes cluster.

Implements: bp high-availability
Change-Id: I2c6c9315612d64158fb9f8284e0abb065aca7208
---
 devstack/lib/kuryr_kubernetes                 |  21 ++-
 devstack/local.conf.worker.sample             |   1 +
 devstack/plugin.sh                            |  10 +-
 devstack/settings                             |   4 +
 doc/source/devref/high_availability.rst       | 131 ++++++++++++++++++
 doc/source/devref/index.rst                   |   1 +
 kuryr_kubernetes/config.py                    |   8 ++
 kuryr_kubernetes/controller/service.py        |  66 ++++++++-
 kuryr_kubernetes/tests/unit/test_utils.py     |  70 ++++++++++
 kuryr_kubernetes/utils.py                     |  24 ++++
 kuryr_kubernetes/watcher.py                   |   3 +
 .../active-passive-ha-cfbda8e6b527b48e.yaml   |   9 ++
 12 files changed, 344 insertions(+), 4 deletions(-)
 create mode 100644 doc/source/devref/high_availability.rst
 create mode 100644 kuryr_kubernetes/tests/unit/test_utils.py
 create mode 100644 releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml

diff --git a/devstack/lib/kuryr_kubernetes b/devstack/lib/kuryr_kubernetes
index 9c4f2d754..d50cbcc4a 100644
--- a/devstack/lib/kuryr_kubernetes
+++ b/devstack/lib/kuryr_kubernetes
@@ -461,6 +461,7 @@ EOF
 function generate_controller_deployment() {
     output_dir=$1
     health_server_port=$2
+    controller_ha=$3
     mkdir -p "$output_dir"
     rm -f ${output_dir}/controller_deployment.yml
     cat >> "${output_dir}/controller_deployment.yml" << EOF
@@ -472,7 +473,7 @@ metadata:
   name: kuryr-controller
   namespace: kube-system
 spec:
-  replicas: 1
+  replicas: ${KURYR_CONTROLLER_REPLICAS:-1}
   template:
     metadata:
       labels:
@@ -483,6 +484,24 @@ spec:
       automountServiceAccountToken: true
       hostNetwork: true
       containers:
+EOF
+
+    if [ "$controller_ha" == "True" ]; then
+        cat >> "${output_dir}/controller_deployment.yml" << EOF
+      - image: gcr.io/google_containers/leader-elector:0.5
+        name: leader-elector
+        args:
+        - "--election=kuryr-controller"
+        - "--http=0.0.0.0:${KURYR_CONTROLLER_HA_PORT:-16401}"
+        - "--election-namespace=kube-system"
+        - "--ttl=5s"
+        ports:
+        - containerPort: ${KURYR_CONTROLLER_HA_PORT:-16401}
+          protocol: TCP
+EOF
+    fi
+
+    cat >> "${output_dir}/controller_deployment.yml" << EOF
       - image: kuryr/controller:latest
         imagePullPolicy: Never
         name: controller
diff --git a/devstack/local.conf.worker.sample b/devstack/local.conf.worker.sample
index bb787449c..329375774 100644
--- a/devstack/local.conf.worker.sample
+++ b/devstack/local.conf.worker.sample
@@ -28,6 +28,7 @@ MULTI_HOST=1
 KEYSTONE_SERVICE_HOST=$SERVICE_HOST
 MYSQL_HOST=$SERVICE_HOST
 RABBIT_HOST=$SERVICE_HOST
+KURYR_K8S_API_URL="http://${SERVICE_HOST}:8080"
 
 # For Baremetal deployment, enable SDN agent that should run on worker node
 # enable_service q-agt
diff --git a/devstack/plugin.sh b/devstack/plugin.sh
index b8d77ec5c..86762b150 100644
--- a/devstack/plugin.sh
+++ b/devstack/plugin.sh
@@ -130,11 +130,19 @@ function configure_kuryr {
 function generate_containerized_kuryr_resources {
     local cni_daemon
    cni_daemon=$1
+    if [[ "$KURYR_CONTROLLER_REPLICAS" -eq 1 ]]; then
+        KURYR_CONTROLLER_HA="False"
+    else
+        KURYR_CONTROLLER_HA="True"
+    fi
 
     # Containerized deployment will use tokens provided by k8s itself.
     inicomment "$KURYR_CONFIG" kubernetes ssl_client_crt_file
     inicomment "$KURYR_CONFIG" kubernetes ssl_client_key_file
 
+    iniset "$KURYR_CONFIG" kubernetes controller_ha ${KURYR_CONTROLLER_HA}
+    iniset "$KURYR_CONFIG" kubernetes controller_ha_elector_port ${KURYR_CONTROLLER_HA_PORT}
+
     # NOTE(dulek): In the container the CA bundle will be mounted in a standard
     # directory, so we need to modify that.
     iniset "$KURYR_CONFIG" neutron cafile /etc/ssl/certs/kuryr-ca-bundle.crt
@@ -146,7 +154,7 @@ function generate_containerized_kuryr_resources {
     generate_kuryr_configmap $output_dir $KURYR_CONFIG $KURYR_CONFIG
     generate_kuryr_certificates_secret $output_dir $SSL_BUNDLE_FILE
     generate_kuryr_service_account $output_dir
-    generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT
+    generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT $KURYR_CONTROLLER_HA
     generate_cni_daemon_set $output_dir $KURYR_CNI_HEALTH_SERVER_PORT $cni_daemon $CNI_BIN_DIR $CNI_CONF_DIR
 }
diff --git a/devstack/settings b/devstack/settings
index e1879b8b0..2aa939b4a 100644
--- a/devstack/settings
+++ b/devstack/settings
@@ -88,3 +88,7 @@ OVS_HOST_PATH=${OVS_HOST_PATH:-/var/run/openvswitch}
 
 # Health Server
 KURYR_CNI_HEALTH_SERVER_PORT=${KURYR_CNI_HEALTH_SERVER_PORT:-8090}
+
+# High availability of controller
+KURYR_CONTROLLER_HA_PORT=${KURYR_CONTROLLER_HA_PORT:-16401}
+KURYR_CONTROLLER_REPLICAS=${KURYR_CONTROLLER_REPLICAS:-1}
diff --git a/doc/source/devref/high_availability.rst b/doc/source/devref/high_availability.rst
new file mode 100644
index 000000000..d4adb2c61
--- /dev/null
+++ b/doc/source/devref/high_availability.rst
@@ -0,0 +1,131 @@
+..
+    This work is licensed under a Creative Commons Attribution 3.0 Unported
+    License.
+
+    http://creativecommons.org/licenses/by/3.0/legalcode
+
+    Convention for heading levels in Neutron devref:
+    =======  Heading 0 (reserved for the title in a document)
+    -------  Heading 1
+    ~~~~~~~  Heading 2
+    +++++++  Heading 3
+    '''''''  Heading 4
+    (Avoid deeper levels because they do not render well.)
+
+================================
+Active/Passive High Availability
+================================
+
+
+Overview
+--------
+Initially it was assumed that there would only be a single kuryr-controller
+instance in a Kuryr-Kubernetes deployment. While this simplified a lot of the
+controller code, it is obviously not a perfect situation. Having redundant
+controllers helps to achieve higher availability and scalability of the
+deployment.
+
+Now that it is possible to run Kuryr in Pods on a Kubernetes cluster, HA is
+much easier to implement. The purpose of this document is to explain how it
+works in practice.
+
+Proposed Solution
+-----------------
+There are two types of HA - Active/Passive and Active/Active. In this document
+we will focus on the former. In A/P mode one of the instances is the leader
+(doing all the exclusive tasks), while the other instances wait in *standby*
+mode, ready to take over the leader role in case the leader *dies*. As you can
+see, a *leader election* mechanism is required to make this work.
+
+Leader election
++++++++++++++++
+The idea here is to use a leader election mechanism based on Kubernetes
+endpoints, which is neatly `explained on the Kubernetes blog `_.
+Election is based on Endpoint resources that hold an annotation describing the
+current leader and its leadership lease time. If the leader dies, other
+instances of the service are free to take over the record. The Kubernetes API
+provides update exclusion mechanisms to prevent race conditions.
+
+This can be implemented by adding another *leader-elector* container to each
+of the kuryr-controller pods:
+
+.. code:: yaml
+
+  - image: gcr.io/google_containers/leader-elector:0.5
+    name: leader-elector
+    args:
+    - "--election=kuryr-controller"
+    - "--http=0.0.0.0:${KURYR_CONTROLLER_HA_PORT:-16401}"
+    - "--election-namespace=kube-system"
+    - "--ttl=5s"
+    ports:
+    - containerPort: ${KURYR_CONTROLLER_HA_PORT:-16401}
+      protocol: TCP
+
+This adds a new container to the pod. This container performs the leader
+election and exposes a simple JSON API on port 16401 by default. This API is
+available to the kuryr-controller container.
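+
+For illustration, the current leader can be checked from inside the
+kuryr-controller container with a single HTTP call. The exchange below is
+only a sketch (the pod name in the reply is a made-up example), but the reply
+format is the one expected by ``kuryr_kubernetes.utils.get_leader_name()``
+introduced by this patch:
+
+.. code:: python
+
+  >>> import requests
+  >>> requests.get('http://localhost:16401').json()
+  {'name': 'kuryr-controller-6d98c6b455-xk2fs'}
+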
+Kuryr Controller Implementation
++++++++++++++++++++++++++++++++
+The main issue with having multiple controllers is task division. All of the
+controllers are watching the same endpoints and getting the same
+notifications, but those notifications cannot be processed by multiple
+controllers at once, because we would end up with a huge race condition,
+where each controller creates Neutron resources but only one succeeds in
+putting the annotation on the Kubernetes resource it is processing.
+
+This is obviously unacceptable, so as a first step we are implementing A/P HA,
+where only the leader works on the resources and the rest waits in standby.
+This is implemented by periodically calling the leader-elector API to check
+the current leader. On a leader change:
+
+* The pod losing the leadership will stop its Watcher. Please note that it
+  will be stopped gracefully, so all the ongoing operations will be completed.
+* The pod gaining the leadership will start its Watcher. Please note that it
+  will get notified about all the previously created Kubernetes resources, but
+  will ignore them as they already have the annotations.
+* Pods not affected by the leadership change will remain in standby mode with
+  their Watchers stopped.
+
+Please note that this means that in HA mode the Watcher is not started on
+controller startup, but only once the periodic task notices that this instance
+is the leader. A condensed sketch of that periodic check is shown below, after
+the known issues are discussed.
+
+Issues
+++++++
+There are certain issues related to orphaned OpenStack resources that we may
+hit. Those can happen in two cases:
+
+* A controller instance dies instantly during request processing. Some of the
+  OpenStack resources were already created, but information about them was not
+  yet annotated onto the Kubernetes resource. That information is therefore
+  lost and we end up with orphaned OpenStack resources. The new leader will
+  process the Kubernetes resource by creating the resources again.
+* During a leader transition (the short period after a leader died, but before
+  its lease expired and the periodic task on the other controllers noticed
+  that; this should not exceed 10 s) some K8s resources get deleted. The new
+  leader will not get the notification about the deletion, so those deletions
+  will go unnoticed.
+
+Both of these issues can be tackled by a garbage collector mechanism that will
+periodically look over the Kubernetes resources and delete OpenStack resources
+that have no representation in annotations.
+
+The latter issue can also be tackled by saving the last seen
+``resourceVersion`` of the watched resources list when stopping the Watcher
+and restarting the watch from that point.
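+
+The following is the condensed sketch of the periodic check mentioned in the
+*Kuryr Controller Implementation* section. It is only an approximation of the
+actual code added by this patch to ``kuryr_kubernetes/controller/service.py``
+(which uses an ``oslo.service`` periodic task with ``spacing=5`` instead of a
+plain loop); the ``watcher`` object is a placeholder, while
+``get_node_name()`` and ``get_leader_name()`` mirror the helpers added to
+``kuryr_kubernetes/utils.py``:
+
+.. code:: python
+
+  import os
+  import socket
+  import time
+
+  import requests
+
+  ELECTOR_URL = 'http://localhost:16401'  # leader-elector sidecar default port
+
+
+  def get_node_name():
+      # Prefer the KUBERNETES_NODE_NAME environment variable and fall back to
+      # the hostname, which inside a pod equals the pod name.
+      return os.environ.get('KUBERNETES_NODE_NAME', socket.gethostname())
+
+
+  def get_leader_name():
+      # The sidecar replies with JSON like {"name": "<leader pod name>"}.
+      try:
+          return requests.get(ELECTOR_URL).json()['name']
+      except Exception:
+          # Assume there is no leader if the sidecar cannot be reached.
+          return None
+
+
+  def leader_loop(watcher, period=5):
+      node_name = get_node_name()
+      current_leader = None
+      while True:
+          leader = get_leader_name()
+          if leader == node_name and current_leader != node_name:
+              # This instance has just become the leader.
+              watcher.start()
+          elif leader != node_name and current_leader == node_name:
+              # Leadership lost (or leader unknown) - stop the Watcher
+              # gracefully so that ongoing operations can finish.
+              watcher.stop()
+          current_leader = leader
+          time.sleep(period)
+
+Note that stepping down whenever the leader cannot be determined errs on the
+safe side: it guarantees that two controllers never process events at the
+same time, at the cost of a short processing gap.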
+
+Future enhancements
++++++++++++++++++++
+It would be useful to implement the garbage collector and the
+``resourceVersion``-based protection mechanism described in the section above.
+
+Besides that, to further improve scalability, we should work on an
+Active/Active HA model, where work is divided evenly between all of the
+kuryr-controller instances. This can be achieved e.g. by using a consistent
+hash ring to decide which instance will process which resource.
+
+Potentially this can be extended with support for non-containerized
+deployments by using Tooz backed by some other tool providing leader
+election - like Consul or ZooKeeper.
diff --git a/doc/source/devref/index.rst b/doc/source/devref/index.rst
index 6f5562ca3..e1ceb8c8f 100644
--- a/doc/source/devref/index.rst
+++ b/doc/source/devref/index.rst
@@ -42,6 +42,7 @@ Design documents
    health_manager
    kuryr_kubernetes_ingress_design
    kuryr_kubernetes_ocp_route_design
+   high_availability
    kuryr_kubernetes_versions
 
 Indices and tables
diff --git a/kuryr_kubernetes/config.py b/kuryr_kubernetes/config.py
index e60785955..594c4d39d 100644
--- a/kuryr_kubernetes/config.py
+++ b/kuryr_kubernetes/config.py
@@ -153,6 +153,14 @@ k8s_opts = [
                 help=_("The comma-separated handlers that should be "
                        "registered for watching in the pipeline."),
                 default=['vif', 'lb', 'lbaasspec']),
+    cfg.BoolOpt('controller_ha',
+                help=_('Enable kuryr-controller active/passive HA. Only '
+                       'supported in containerized deployments on Kubernetes '
+                       'or OpenShift.'),
+                default=False),
+    cfg.PortOpt('controller_ha_elector_port',
+                help=_('Port on which the leader-elector pod is listening.'),
+                default=16401),
 ]
 
 neutron_defaults = [
diff --git a/kuryr_kubernetes/controller/service.py b/kuryr_kubernetes/controller/service.py
index f905e3336..745f2faa6 100644
--- a/kuryr_kubernetes/controller/service.py
+++ b/kuryr_kubernetes/controller/service.py
@@ -13,11 +13,14 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import functools
+import six
 import sys
 
 import os_vif
 from oslo_config import cfg
 from oslo_log import log as logging
+from oslo_service import periodic_task
 from oslo_service import service
 from stevedore.named import NamedExtensionManager
 
@@ -26,6 +29,7 @@ from kuryr_kubernetes import config
 from kuryr_kubernetes.controller.handlers import pipeline as h_pipeline
 from kuryr_kubernetes.controller.managers import health
 from kuryr_kubernetes import objects
+from kuryr_kubernetes import utils
 from kuryr_kubernetes import watcher
 
 
@@ -62,29 +66,87 @@ def _load_kuryr_ctrlr_handlers():
     return ctrlr_handlers
 
 
-class KuryrK8sService(service.Service):
+class KuryrK8sServiceMeta(type(service.Service),
+                          type(periodic_task.PeriodicTasks)):
+    pass
+
+
+class KuryrK8sService(six.with_metaclass(KuryrK8sServiceMeta,
+                                         service.Service,
+                                         periodic_task.PeriodicTasks)):
     """Kuryr-Kubernetes controller Service."""
 
     def __init__(self):
         super(KuryrK8sService, self).__init__()
+        periodic_task.PeriodicTasks.__init__(self, CONF)
 
         objects.register_locally_defined_vifs()
         pipeline = h_pipeline.ControllerPipeline(self.tg)
         self.watcher = watcher.Watcher(pipeline, self.tg)
         self.health_manager = health.HealthServer()
+        self.current_leader = None
+        self.node_name = utils.get_node_name()
 
         handlers = _load_kuryr_ctrlr_handlers()
         for handler in handlers:
             self.watcher.add(handler.get_watch_path())
             pipeline.register(handler)
 
+    def is_leader(self):
+        return self.current_leader == self.node_name
+
     def start(self):
         LOG.info("Service '%s' starting", self.__class__.__name__)
         super(KuryrK8sService, self).start()
-        self.watcher.start()
+
+        if not CONF.kubernetes.controller_ha:
+            LOG.info('Running in non-HA mode, starting watcher immediately.')
+            self.watcher.start()
+        else:
+            LOG.info('Running in HA mode, watcher will be started later.')
+            f = functools.partial(self.run_periodic_tasks, None)
+            self.tg.add_timer(1, f)
+
         self.health_manager.run()
         LOG.info("Service '%s' started", self.__class__.__name__)
 
+    @periodic_task.periodic_task(spacing=5, run_immediately=True)
+    def monitor_leader(self, context):
+        leader = utils.get_leader_name()
+        if leader is None:
+            # Error when fetching current leader. We're paranoid, so just to
+            # make sure we won't break anything we'll try to step down.
+            self.on_revoke_leader()
+        elif leader != self.current_leader and leader == self.node_name:
+            # I'm becoming the leader.
+            self.on_become_leader()
+        elif leader != self.current_leader and self.is_leader():
+            # I'm revoked from being the leader.
+            self.on_revoke_leader()
+        elif leader == self.current_leader and self.is_leader():
+            # I continue to be the leader.
+            self.on_continue_leader()
+
+        self.current_leader = leader
+
+    def on_become_leader(self):
+        LOG.info('Controller %s becomes the leader, starting watcher.',
+                 self.node_name)
+        self.watcher.start()
+
+    def on_revoke_leader(self):
+        LOG.info('Controller %s stops being the leader, stopping watcher.',
+                 self.node_name)
+        if self.watcher.is_running():
+            self.watcher.stop()
+
+    def on_continue_leader(self):
+        # Just make sure my watcher is running.
+        if not self.watcher.is_running():
+            LOG.warning('Controller %s is the leader, but has watcher '
+                        'stopped. Restarting it.', self.node_name)
+            self.watcher.start()
+
     def wait(self):
         super(KuryrK8sService, self).wait()
         LOG.info("Service '%s' stopped", self.__class__.__name__)
diff --git a/kuryr_kubernetes/tests/unit/test_utils.py b/kuryr_kubernetes/tests/unit/test_utils.py
new file mode 100644
index 000000000..677974c02
--- /dev/null
+++ b/kuryr_kubernetes/tests/unit/test_utils.py
@@ -0,0 +1,70 @@
+# Copyright 2018 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import mock
+import os
+
+from oslo_config import cfg
+
+from kuryr_kubernetes.tests import base as test_base
+from kuryr_kubernetes import utils
+
+CONF = cfg.CONF
+
+
+class TestUtils(test_base.TestCase):
+    @mock.patch('socket.gethostname')
+    def test_get_node_name_socket(self, m_gethostname):
+        try:
+            del os.environ['KUBERNETES_NODE_NAME']
+        except KeyError:
+            pass
+
+        m_gethostname.return_value = 'foo'
+        res = utils.get_node_name()
+        self.assertEqual('foo', res)
+        m_gethostname.assert_called_once_with()
+
+    @mock.patch('socket.gethostname')
+    def test_get_node_name_envvar(self, m_gethostname):
+        os.environ['KUBERNETES_NODE_NAME'] = 'bar'
+        m_gethostname.return_value = 'foo'
+        res = utils.get_node_name()
+        self.assertEqual('bar', res)
+        m_gethostname.assert_not_called()
+
+    @mock.patch('requests.get')
+    def test_get_leader_name(self, m_get):
+        m_get.return_value = mock.Mock(json=mock.Mock(
+            return_value={'name': 'foo'}))
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertEqual('foo', res)
+
+    @mock.patch('requests.get')
+    def test_get_leader_name_malformed(self, m_get):
+        m_get.return_value = mock.Mock(json=mock.Mock(
+            return_value={'name2': 'foo'}))
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertIsNone(res)
+
+    @mock.patch('requests.get')
+    def test_get_leader_name_exc(self, m_get):
+        m_get.side_effect = Exception
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertIsNone(res)
diff --git a/kuryr_kubernetes/utils.py b/kuryr_kubernetes/utils.py
index 68d3e367d..53e3095d5 100644
--- a/kuryr_kubernetes/utils.py
+++ b/kuryr_kubernetes/utils.py
@@ -10,13 +10,19 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import os
 import random
+import socket
 import time
+
+import requests
 
 from oslo_config import cfg
+from oslo_log import log
 from oslo_serialization import jsonutils
 
 CONF = cfg.CONF
+LOG = log.getLogger(__name__)
 
 VALID_MULTI_POD_POOLS_OPTS = {'noop': ['neutron-vif',
                                        'nested-vlan',
@@ -98,3 +104,21 @@ def exponential_sleep(deadline, attempt, interval=DEFAULT_INTERVAL):
     time.sleep(interval)
 
     return interval
+
+
+def get_node_name():
+    try:
+        return os.environ['KUBERNETES_NODE_NAME']
+    except KeyError:
+        return socket.gethostname()
+
+
+def get_leader_name():
+    url = 'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port
+    try:
+        return requests.get(url).json()['name']
+    except Exception:
+        LOG.exception('Error when fetching current leader pod name.')
+        # NOTE(dulek): Assuming there's no leader when we can't contact leader
+        # elector container.
+        return None
diff --git a/kuryr_kubernetes/watcher.py b/kuryr_kubernetes/watcher.py
index d5314fa12..b47ba1bc2 100644
--- a/kuryr_kubernetes/watcher.py
+++ b/kuryr_kubernetes/watcher.py
@@ -108,6 +108,9 @@ class Watcher(health.HealthHandler):
         if path in self._watching:
             self._stop_watch(path)
 
+    def is_running(self):
+        return self._running
+
     def start(self):
         """Starts the Watcher.
diff --git a/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml b/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml
new file mode 100644
index 000000000..266769496
--- /dev/null
+++ b/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Kuryr-Kubernetes now supports running the kuryr-controller service in
+    **Active/Passive HA mode**. This is only possible when the services are run
+    as Pods on a Kubernetes cluster, as Kubernetes is used for leader election.
+    It is also required to add a leader-elector container to the
+    kuryr-controller Pods. HA is controlled by the ``[kubernetes]controller_ha``
+    option, which defaults to ``False``.