diff --git a/devstack/lib/kuryr_kubernetes b/devstack/lib/kuryr_kubernetes
index 9c4f2d754..d50cbcc4a 100644
--- a/devstack/lib/kuryr_kubernetes
+++ b/devstack/lib/kuryr_kubernetes
@@ -461,6 +461,7 @@ EOF
 function generate_controller_deployment() {
     output_dir=$1
     health_server_port=$2
+    controller_ha=$3
     mkdir -p "$output_dir"
     rm -f ${output_dir}/controller_deployment.yml
     cat >> "${output_dir}/controller_deployment.yml" << EOF
@@ -472,7 +473,7 @@ metadata:
   name: kuryr-controller
   namespace: kube-system
 spec:
-  replicas: 1
+  replicas: ${KURYR_CONTROLLER_REPLICAS:-1}
   template:
     metadata:
       labels:
@@ -483,6 +484,24 @@ spec:
       automountServiceAccountToken: true
       hostNetwork: true
       containers:
+EOF
+
+    if [ "$controller_ha" == "True" ]; then
+        cat >> "${output_dir}/controller_deployment.yml" << EOF
+      - image: gcr.io/google_containers/leader-elector:0.5
+        name: leader-elector
+        args:
+        - "--election=kuryr-controller"
+        - "--http=0.0.0.0:${KURYR_CONTROLLER_HA_PORT:-16401}"
+        - "--election-namespace=kube-system"
+        - "--ttl=5s"
+        ports:
+        - containerPort: ${KURYR_CONTROLLER_HA_PORT:-16401}
+          protocol: TCP
+EOF
+    fi
+
+    cat >> "${output_dir}/controller_deployment.yml" << EOF
       - image: kuryr/controller:latest
         imagePullPolicy: Never
         name: controller
diff --git a/devstack/local.conf.worker.sample b/devstack/local.conf.worker.sample
index bb787449c..329375774 100644
--- a/devstack/local.conf.worker.sample
+++ b/devstack/local.conf.worker.sample
@@ -28,6 +28,7 @@ MULTI_HOST=1
 KEYSTONE_SERVICE_HOST=$SERVICE_HOST
 MYSQL_HOST=$SERVICE_HOST
 RABBIT_HOST=$SERVICE_HOST
+KURYR_K8S_API_URL="http://${SERVICE_HOST}:8080"
 
 # For Baremetal deployment, enable SDN agent that should run on worker node
 # enable_service q-agt
diff --git a/devstack/plugin.sh b/devstack/plugin.sh
index b8d77ec5c..86762b150 100644
--- a/devstack/plugin.sh
+++ b/devstack/plugin.sh
@@ -130,11 +130,19 @@ function configure_kuryr {
 function generate_containerized_kuryr_resources {
     local cni_daemon
    cni_daemon=$1
+    if [[ "$KURYR_CONTROLLER_REPLICAS" -eq 1 ]]; then
+        KURYR_CONTROLLER_HA="False"
+    else
+        KURYR_CONTROLLER_HA="True"
+    fi
 
     # Containerized deployment will use tokens provided by k8s itself.
     inicomment "$KURYR_CONFIG" kubernetes ssl_client_crt_file
     inicomment "$KURYR_CONFIG" kubernetes ssl_client_key_file
 
+    iniset "$KURYR_CONFIG" kubernetes controller_ha ${KURYR_CONTROLLER_HA}
+    iniset "$KURYR_CONFIG" kubernetes controller_ha_elector_port ${KURYR_CONTROLLER_HA_PORT}
+
     # NOTE(dulek): In the container the CA bundle will be mounted in a standard
     # directory, so we need to modify that.
     iniset "$KURYR_CONFIG" neutron cafile /etc/ssl/certs/kuryr-ca-bundle.crt
@@ -146,7 +154,7 @@ function generate_containerized_kuryr_resources {
     generate_kuryr_configmap $output_dir $KURYR_CONFIG $KURYR_CONFIG
     generate_kuryr_certificates_secret $output_dir $SSL_BUNDLE_FILE
     generate_kuryr_service_account $output_dir
-    generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT
+    generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT $KURYR_CONTROLLER_HA
     generate_cni_daemon_set $output_dir $KURYR_CNI_HEALTH_SERVER_PORT $cni_daemon $CNI_BIN_DIR $CNI_CONF_DIR
 }
diff --git a/devstack/settings b/devstack/settings
index e1879b8b0..2aa939b4a 100644
--- a/devstack/settings
+++ b/devstack/settings
@@ -88,3 +88,7 @@ OVS_HOST_PATH=${OVS_HOST_PATH:-/var/run/openvswitch}
 
 # Health Server
 KURYR_CNI_HEALTH_SERVER_PORT=${KURYR_CNI_HEALTH_SERVER_PORT:-8090}
+
+# High availability of controller
+KURYR_CONTROLLER_HA_PORT=${KURYR_CONTROLLER_HA_PORT:-16401}
+KURYR_CONTROLLER_REPLICAS=${KURYR_CONTROLLER_REPLICAS:-1}
diff --git a/doc/source/devref/high_availability.rst b/doc/source/devref/high_availability.rst
new file mode 100644
index 000000000..d4adb2c61
--- /dev/null
+++ b/doc/source/devref/high_availability.rst
@@ -0,0 +1,131 @@
+..
+      This work is licensed under a Creative Commons Attribution 3.0 Unported
+    License.
+
+    http://creativecommons.org/licenses/by/3.0/legalcode
+
+    Convention for heading levels in Neutron devref:
+    =======  Heading 0 (reserved for the title in a document)
+    -------  Heading 1
+    ~~~~~~~  Heading 2
+    +++++++  Heading 3
+    '''''''  Heading 4
+    (Avoid deeper levels because they do not render well.)
+
+
+================================
+Active/Passive High Availability
+================================
+
+
+Overview
+--------
+Initially it was assumed that there would only be a single kuryr-controller
+instance in a Kuryr-Kubernetes deployment. While this simplified a lot of the
+controller code, it is obviously not a perfect situation. Having redundant
+controllers helps to achieve both higher availability and better scalability
+of the deployment.
+
+Now that Kuryr can run in Pods on the Kubernetes cluster itself, HA is much
+easier to implement. The purpose of this document is to explain how it works
+in practice.
+
+Proposed Solution
+-----------------
+There are two types of HA - Active/Passive and Active/Active. This document
+focuses on the former. In A/P mode one of the instances is the leader (doing
+all the exclusive tasks) while the other instances wait in *standby* mode,
+ready to take over the leader role in case the leader *dies*. Obviously, a
+*leader election* mechanism is required to make this work.
+
+Leader election
++++++++++++++++
+The idea is to use a leader election mechanism based on Kubernetes Endpoints,
+which is neatly `explained on the Kubernetes blog
+<http://blog.kubernetes.io/2016/01/simple-leader-election-with-Kubernetes.html>`_.
+Election is based on an Endpoints resource that holds an annotation describing
+the current leader and its leadership lease time. If the leader dies, other
+instances of the service are free to take over the record. The Kubernetes API
+rejects conflicting updates (through resource versions), which prevents race
+conditions between the candidates.
+
+This can be implemented by adding another *leader-elector* container to each
+of the kuryr-controller Pods:
+
+.. code:: yaml
+
+      - image: gcr.io/google_containers/leader-elector:0.5
+        name: leader-elector
+        args:
+        - "--election=kuryr-controller"
+        - "--http=0.0.0.0:${KURYR_CONTROLLER_HA_PORT:-16401}"
+        - "--election-namespace=kube-system"
+        - "--ttl=5s"
+        ports:
+        - containerPort: ${KURYR_CONTROLLER_HA_PORT:-16401}
+          protocol: TCP
+
+This sidecar container performs the leader election and exposes a simple JSON
+API on port 16401 by default. The kuryr-controller container queries this API
+to learn which instance is the current leader, as illustrated below.
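+
+For illustration, fetching the current leader from the sidecar boils down to
+a single HTTP GET against localhost. The sketch below mirrors the
+``get_leader_name()`` helper this patch adds to ``kuryr_kubernetes/utils.py``
+(the port number is just the default used throughout this document):
+
+.. code:: python
+
+    import requests
+
+    def get_leader_name(port=16401):
+        # The leader-elector sidecar serves {"name": "<leader-pod-name>"}.
+        try:
+            return requests.get('http://localhost:%d' % port).json()['name']
+        except Exception:
+            # Assume there is no leader when the sidecar can't be reached.
+            return None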
+
+Kuryr Controller Implementation
++++++++++++++++++++++++++++++++
+The main issue with having multiple controllers is task division. All of the
+controllers watch the same resources and get the same notifications, but a
+notification cannot be processed by multiple controllers at once, because we
+would end up with a huge race condition, where each controller creates Neutron
+resources but only one succeeds in putting the annotation on the Kubernetes
+resource it is processing.
+
+This is obviously unacceptable, so as a first step we're implementing A/P HA,
+where only the leader works on the resources and the others wait in standby.
+This is implemented by periodically calling the leader-elector API to check
+the current leader. On leader change:
+
+* The Pod losing the leadership will stop its Watcher. Please note that it
+  will be stopped gracefully, so all the ongoing operations will be completed.
+* The Pod gaining the leadership will start its Watcher. Please note that it
+  will get notified about all the previously created Kubernetes resources, but
+  will ignore them as they already have the annotations.
+* Pods not affected by the leadership change will remain in standby mode with
+  their Watchers stopped.
+
+Please note that this means that in HA mode the Watcher is not started on
+controller startup, but only once the periodic task notices that the
+controller is the leader; see the sketch below.
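+
+As a condensed sketch of that logic (``stop_watcher_if_running`` is an
+illustrative helper; the full version, which also restarts a Watcher that
+died while this instance was the leader, lives in
+``kuryr_kubernetes/controller/service.py``):
+
+.. code:: python
+
+    def monitor_leader(self):
+        leader = get_leader_name()  # None means the leader is unknown.
+        was_leader = self.current_leader == self.node_name
+        is_leader = leader == self.node_name
+        if leader is None or (was_leader and not is_leader):
+            # Unknown leader or leadership lost: step down. The Watcher is
+            # stopped gracefully, so ongoing operations complete first.
+            self.stop_watcher_if_running()
+        elif is_leader and not was_leader:
+            # Leadership gained: start watching resources.
+            self.watcher.start()
+        self.current_leader = leader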
+
+Issues
+++++++
+There are certain issues related to orphaned OpenStack resources that we may
+hit. These can happen in two cases:
+
+* A controller instance dies instantly during request processing. Some of the
+  OpenStack resources were already created, but information about them was not
+  yet annotated onto the Kubernetes resource. That information is lost and we
+  end up with orphaned OpenStack resources, as the new leader will process the
+  Kubernetes resource by creating the resources again.
+* Some K8s resources are deleted during the leader transition (the short
+  period after a leader died, but before its lease expired and the periodic
+  task on the other controllers noticed that; this shouldn't exceed 10 s). The
+  new leader will not get the notifications about the deletions, so those will
+  go unnoticed.
+
+Both of these issues can be tackled by a garbage-collector mechanism that will
+periodically look over the Kubernetes resources and delete OpenStack resources
+that have no representation in the annotations.
+
+The latter issue can also be tackled by saving the last seen
+``resourceVersion`` of the watched resources when stopping the Watcher and
+restarting the watch from that point, as sketched below.
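+
+For illustration, resuming a watch from a saved ``resourceVersion`` against
+the raw Kubernetes HTTP API could look like the sketch below. This is a
+proposal only - the current Watcher does not implement it:
+
+.. code:: python
+
+    import json
+
+    import requests
+
+    def resume_watch(api_base, path, resource_version):
+        # Watching from a saved resourceVersion replays every event that
+        # happened after it, including the DELETED events a new leader
+        # would otherwise miss.
+        params = {'watch': 'true', 'resourceVersion': resource_version}
+        response = requests.get(api_base + path, params=params, stream=True)
+        for line in response.iter_lines():
+            if line:
+                # Each line is one event: {"type": ..., "object": {...}}.
+                yield json.loads(line)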
+
+Future enhancements
++++++++++++++++++++
+It would be useful to implement the garbage collector and the
+``resourceVersion``-based protection mechanism described in the section above.
+
+Besides that, to further improve scalability, we should work on an
+Active/Active HA model, where work is divided evenly between all of the
+kuryr-controller instances. This can be achieved e.g. by using a consistent
+hash ring to decide which instance will process which resource; a toy example
+follows.
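+
+A toy example of such a ring (illustrative only, not part of this patch):
+
+.. code:: python
+
+    import bisect
+    import hashlib
+
+    class HashRing(object):
+        def __init__(self, controllers, points=100):
+            # Place every controller at multiple points on the ring so
+            # that resources get divided roughly evenly.
+            self._ring = sorted(
+                (self._hash('%s-%d' % (c, i)), c)
+                for c in controllers for i in range(points))
+            self._hashes = [h for h, _ in self._ring]
+
+        @staticmethod
+        def _hash(value):
+            return int(hashlib.md5(value.encode()).hexdigest(), 16)
+
+        def assigned_to(self, resource_uid):
+            # A resource belongs to the first controller clockwise from
+            # its own position on the ring.
+            idx = bisect.bisect(self._hashes, self._hash(resource_uid))
+            return self._ring[idx % len(self._ring)][1]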
+
+Potentially this can be extended with support for non-containerized
+deployments by using Tooz with some other tool providing leader election -
+like Consul or ZooKeeper.
diff --git a/doc/source/devref/index.rst b/doc/source/devref/index.rst
index 6f5562ca3..e1ceb8c8f 100644
--- a/doc/source/devref/index.rst
+++ b/doc/source/devref/index.rst
@@ -42,6 +42,7 @@ Design documents
    health_manager
    kuryr_kubernetes_ingress_design
    kuryr_kubernetes_ocp_route_design
+   high_availability
    kuryr_kubernetes_versions
 
 Indices and tables
diff --git a/kuryr_kubernetes/config.py b/kuryr_kubernetes/config.py
index e60785955..594c4d39d 100644
--- a/kuryr_kubernetes/config.py
+++ b/kuryr_kubernetes/config.py
@@ -153,6 +153,14 @@ k8s_opts = [
                 help=_("The comma-separated handlers that should be "
                        "registered for watching in the pipeline."),
                 default=['vif', 'lb', 'lbaasspec']),
+    cfg.BoolOpt('controller_ha',
+                help=_('Enable kuryr-controller active/passive HA. Only '
+                       'supported in containerized deployments on Kubernetes '
+                       'or OpenShift.'),
+                default=False),
+    cfg.PortOpt('controller_ha_elector_port',
+                help=_('Port on which the leader-elector sidecar listens.'),
+                default=16401),
 ]
 
 neutron_defaults = [
diff --git a/kuryr_kubernetes/controller/service.py b/kuryr_kubernetes/controller/service.py
index f905e3336..745f2faa6 100644
--- a/kuryr_kubernetes/controller/service.py
+++ b/kuryr_kubernetes/controller/service.py
@@ -13,11 +13,14 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import functools
+import six
 import sys
 
 import os_vif
 from oslo_config import cfg
 from oslo_log import log as logging
+from oslo_service import periodic_task
 from oslo_service import service
 from stevedore.named import NamedExtensionManager
 
@@ -26,6 +29,7 @@
 from kuryr_kubernetes import config
 from kuryr_kubernetes.controller.handlers import pipeline as h_pipeline
 from kuryr_kubernetes.controller.managers import health
 from kuryr_kubernetes import objects
+from kuryr_kubernetes import utils
 from kuryr_kubernetes import watcher
 
 
@@ -62,29 +66,87 @@ def _load_kuryr_ctrlr_handlers():
     return ctrlr_handlers
 
 
-class KuryrK8sService(service.Service):
+class KuryrK8sServiceMeta(type(service.Service),
+                          type(periodic_task.PeriodicTasks)):
+    pass
+
+
+class KuryrK8sService(six.with_metaclass(KuryrK8sServiceMeta,
+                                         service.Service,
+                                         periodic_task.PeriodicTasks)):
     """Kuryr-Kubernetes controller Service."""
 
     def __init__(self):
         super(KuryrK8sService, self).__init__()
+        periodic_task.PeriodicTasks.__init__(self, CONF)
 
         objects.register_locally_defined_vifs()
         pipeline = h_pipeline.ControllerPipeline(self.tg)
         self.watcher = watcher.Watcher(pipeline, self.tg)
         self.health_manager = health.HealthServer()
+        self.current_leader = None
+        self.node_name = utils.get_node_name()
 
         handlers = _load_kuryr_ctrlr_handlers()
         for handler in handlers:
             self.watcher.add(handler.get_watch_path())
             pipeline.register(handler)
 
+    def is_leader(self):
+        return self.current_leader == self.node_name
+
     def start(self):
         LOG.info("Service '%s' starting", self.__class__.__name__)
         super(KuryrK8sService, self).start()
-        self.watcher.start()
+
+        if not CONF.kubernetes.controller_ha:
+            LOG.info('Running in non-HA mode, starting watcher immediately.')
+            self.watcher.start()
+        else:
+            LOG.info('Running in HA mode, watcher will be started later.')
+            f = functools.partial(self.run_periodic_tasks, None)
+            self.tg.add_timer(1, f)
+
         self.health_manager.run()
         LOG.info("Service '%s' started", self.__class__.__name__)
 
+    @periodic_task.periodic_task(spacing=5, run_immediately=True)
+    def monitor_leader(self, context):
+        leader = utils.get_leader_name()
+        if leader is None:
+            # Error when fetching current leader. We're paranoid, so just to
+            # make sure we won't break anything we'll try to step down.
+            self.on_revoke_leader()
+        elif leader != self.current_leader and leader == self.node_name:
+            # I'm becoming the leader.
+            self.on_become_leader()
+        elif leader != self.current_leader and self.is_leader():
+            # I'm revoked from being the leader.
+            self.on_revoke_leader()
+        elif leader == self.current_leader and self.is_leader():
+            # I continue to be the leader.
+            self.on_continue_leader()
+
+        self.current_leader = leader
+
+    def on_become_leader(self):
+        LOG.info('Controller %s becomes the leader, starting watcher.',
+                 self.node_name)
+        self.watcher.start()
+
+    def on_revoke_leader(self):
+        LOG.info('Controller %s stops being the leader, stopping watcher.',
+                 self.node_name)
+        if self.watcher.is_running():
+            self.watcher.stop()
+
+    def on_continue_leader(self):
+        # Just make sure my watcher is running.
+        if not self.watcher.is_running():
+            LOG.warning('Controller %s is the leader, but has watcher '
+                        'stopped. Restarting it.', self.node_name)
+            self.watcher.start()
+
     def wait(self):
         super(KuryrK8sService, self).wait()
         LOG.info("Service '%s' stopped", self.__class__.__name__)
diff --git a/kuryr_kubernetes/tests/unit/test_utils.py b/kuryr_kubernetes/tests/unit/test_utils.py
new file mode 100644
index 000000000..677974c02
--- /dev/null
+++ b/kuryr_kubernetes/tests/unit/test_utils.py
@@ -0,0 +1,70 @@
+# Copyright 2018 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import mock
+import os
+
+from oslo_config import cfg
+
+from kuryr_kubernetes.tests import base as test_base
+from kuryr_kubernetes import utils
+
+CONF = cfg.CONF
+
+
+class TestUtils(test_base.TestCase):
+    @mock.patch('socket.gethostname')
+    def test_get_node_name_socket(self, m_gethostname):
+        try:
+            del os.environ['KUBERNETES_NODE_NAME']
+        except KeyError:
+            pass
+
+        m_gethostname.return_value = 'foo'
+        res = utils.get_node_name()
+        self.assertEqual('foo', res)
+        m_gethostname.assert_called_once_with()
+
+    @mock.patch('socket.gethostname')
+    def test_get_node_name_envvar(self, m_gethostname):
+        os.environ['KUBERNETES_NODE_NAME'] = 'bar'
+        m_gethostname.return_value = 'foo'
+        res = utils.get_node_name()
+        self.assertEqual('bar', res)
+        m_gethostname.assert_not_called()
+
+    @mock.patch('requests.get')
+    def test_get_leader_name(self, m_get):
+        m_get.return_value = mock.Mock(json=mock.Mock(
+            return_value={'name': 'foo'}))
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertEqual('foo', res)
+
+    @mock.patch('requests.get')
+    def test_get_leader_name_malformed(self, m_get):
+        m_get.return_value = mock.Mock(json=mock.Mock(
+            return_value={'name2': 'foo'}))
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertIsNone(res)
+
+    @mock.patch('requests.get')
+    def test_get_leader_name_exc(self, m_get):
+        m_get.side_effect = Exception
+        res = utils.get_leader_name()
+        m_get.assert_called_once_with(
+            'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port)
+        self.assertIsNone(res)
diff --git a/kuryr_kubernetes/utils.py b/kuryr_kubernetes/utils.py
index 68d3e367d..53e3095d5 100644
--- a/kuryr_kubernetes/utils.py
+++ b/kuryr_kubernetes/utils.py
@@ -10,13 +10,19 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import os
 import random
+import socket
 import time
 
+import requests
+
 from oslo_config import cfg
+from oslo_log import log
 from oslo_serialization import jsonutils
 
 CONF = cfg.CONF
+LOG = log.getLogger(__name__)
 
 VALID_MULTI_POD_POOLS_OPTS = {'noop': ['neutron-vif',
                                        'nested-vlan',
@@ -98,3 +104,21 @@ def exponential_sleep(deadline, attempt, interval=DEFAULT_INTERVAL):
     time.sleep(interval)
 
     return interval
+
+
+def get_node_name():
+    try:
+        return os.environ['KUBERNETES_NODE_NAME']
+    except KeyError:
+        return socket.gethostname()
+
+
+def get_leader_name():
+    url = 'http://localhost:%d' % CONF.kubernetes.controller_ha_elector_port
+    try:
+        return requests.get(url).json()['name']
+    except Exception:
+        LOG.exception('Error when fetching current leader pod name.')
+        # NOTE(dulek): Assuming there's no leader when we can't contact leader
+        # elector container.
+        return None
diff --git a/kuryr_kubernetes/watcher.py b/kuryr_kubernetes/watcher.py
index d5314fa12..b47ba1bc2 100644
--- a/kuryr_kubernetes/watcher.py
+++ b/kuryr_kubernetes/watcher.py
@@ -108,6 +108,9 @@ class Watcher(health.HealthHandler):
         if path in self._watching:
             self._stop_watch(path)
 
+    def is_running(self):
+        return self._running
+
     def start(self):
         """Starts the Watcher.
diff --git a/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml b/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml
new file mode 100644
index 000000000..266769496
--- /dev/null
+++ b/releasenotes/notes/active-passive-ha-cfbda8e6b527b48e.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Kuryr-Kubernetes now supports running the kuryr-controller service in
+    **Active/Passive HA mode**. This is only possible when running the
+    services as Pods on a Kubernetes cluster, as Kubernetes is used for the
+    leader election. It is also required to add a leader-elector container
+    to the kuryr-controller Pods. HA is controlled by the
+    ``[kubernetes]controller_ha`` option, which defaults to ``False``.