From 63e80c31088206e75c2df8a2ffc9360d1a46acf2 Mon Sep 17 00:00:00 2001 From: Feilong Wang Date: Fri, 28 Feb 2020 17:34:28 +1300 Subject: [PATCH] [k8s] Support updating k8s cluster health status The original design of k8s cluster health status is allowing the health status being updated by Magnum control plane. However, it doesn't work when the cluster is private. This patch supports updating the k8s cluster health status via the Magnum cluster update API by a 3rd party service so that a controller (e.g. magnum-auto-healer) running inside the k8s cluster can call the Magnum update API to update the cluster health status. Task: 38583 Story: 2007242 Change-Id: Ie7189d328c4038403576b0324e7b0e8a9b305a5e --- doc/source/user/index.rst | 8 +++++- doc/source/user/k8s-health-monitoring.rst | 26 +++++++++++++++++++ magnum/api/controllers/v1/cluster.py | 19 +++++++++++--- magnum/api/utils.py | 3 ++- magnum/api/validation.py | 3 ++- magnum/common/policies/cluster.py | 11 ++++++++ magnum/conductor/api.py | 16 +++++++++--- .../conductor/handlers/cluster_conductor.py | 17 ++++++++++-- magnum/drivers/common/k8s_monitor.py | 9 +++++++ .../unit/api/controllers/v1/test_cluster.py | 12 ++++++--- .../handlers/test_cluster_conductor.py | 10 ++++--- magnum/tests/unit/conductor/test_monitors.py | 21 +++++++++++++++ magnum/tests/unit/conductor/test_rpcapi.py | 5 +++- magnum/tests/unit/db/utils.py | 2 +- ...uster-health-via-api-b8a3cac3031c50a5.yaml | 10 +++++++ 15 files changed, 150 insertions(+), 22 deletions(-) create mode 100644 doc/source/user/k8s-health-monitoring.rst create mode 100644 releasenotes/notes/support-updating-k8s-cluster-health-via-api-b8a3cac3031c50a5.yaml diff --git a/doc/source/user/index.rst b/doc/source/user/index.rst index 354a69e3d4..ae6d0e08d4 100644 --- a/doc/source/user/index.rst +++ b/doc/source/user/index.rst @@ -37,6 +37,7 @@ created and managed by Magnum to support the COE's. #. `Rolling Upgrade`_ #. 
`Keystone Authentication and Authorization for Kubernetes`_ #. `Node Groups`_ +#. `Kubernetes Health Monitoring`_ Overview ======== @@ -3479,7 +3480,7 @@ Rolling Upgrade =============== .. include:: rolling-upgrade.rst -======= + Keystone Authentication and Authorization for Kubernetes ======================================================== @@ -3490,3 +3491,8 @@ Node Groups =========== .. include:: node-groups.rst + +Kubernetes Health Monitoring +============================ + +.. include:: k8s-health-monitoring.rst diff --git a/doc/source/user/k8s-health-monitoring.rst b/doc/source/user/k8s-health-monitoring.rst new file mode 100644 index 0000000000..379be27110 --- /dev/null +++ b/doc/source/user/k8s-health-monitoring.rst @@ -0,0 +1,26 @@ +Currently Magnum can support health monitoring for Kubernetes clusters. There +are two scenarios supported now: internal and external. + +Internal Health Monitoring +-------------------------- + +Magnum has a periodic job to poll the k8s cluster if it is a reachable cluster. +If the floating IP is enabled, or the master loadbalancer is enabled and the +master loadbalancer has floating IP associated, then Magnum will take this +cluster as reachable. Then Magnum will call the k8s API every 10 seconds to poll +the health status of the cluster and then update the two attributes: +`health_status` and `health_status_reason`. + +External Health Monitoring +-------------------------- + +Currently, only `magnum-auto-healer +`_ +is able to update cluster's `health_status` and `health_status_reason` +attributes. Both the label `auto_healing_enabled=True` and +`auto_healing_controller=magnum-auto-healer` must be set, otherwise, the two +attributes' value will be overwritten with 'UNKNOWN' and 'The cluster is not +accessible'. The health_status attribute can be one of `HEALTHY`, +`UNHEALTHY` or `UNKNOWN` and the health_status_reason is a dictionary +of the hostnames and their current health statuses and the API health status. 
+ diff --git a/magnum/api/controllers/v1/cluster.py b/magnum/api/controllers/v1/cluster.py index c29e34842e..8383181581 100755 --- a/magnum/api/controllers/v1/cluster.py +++ b/magnum/api/controllers/v1/cluster.py @@ -524,8 +524,12 @@ class ClustersController(base.Controller): :param cluster_ident: UUID or logical name of a cluster. :param patch: a json PATCH document to apply to this cluster. """ - cluster, node_count = self._patch(cluster_ident, patch) - pecan.request.rpcapi.cluster_update_async(cluster, node_count) + (cluster, node_count, + health_status, + health_status_reason) = self._patch(cluster_ident, patch) + pecan.request.rpcapi.cluster_update_async(cluster, node_count, + health_status, + health_status_reason) return ClusterID(cluster.uuid) @base.Controller.api_version("1.3") # noqa @@ -539,8 +543,12 @@ class ClustersController(base.Controller): :param rollback: whether to rollback cluster on update failure. :param patch: a json PATCH document to apply to this cluster. """ - cluster, node_count = self._patch(cluster_ident, patch) + (cluster, node_count, + health_status, + health_status_reason) = self._patch(cluster_ident, patch) pecan.request.rpcapi.cluster_update_async(cluster, node_count, + health_status, + health_status_reason, rollback) return ClusterID(cluster.uuid) @@ -554,6 +562,8 @@ class ClustersController(base.Controller): cluster = api_utils.get_resource('Cluster', cluster_ident) policy.enforce(context, 'cluster:update', cluster.as_dict(), action='cluster:update') + policy.enforce(context, "cluster:update_health_status", + action="cluster:update_health_status") try: cluster_dict = cluster.as_dict() new_cluster = Cluster(**api_utils.apply_jsonpatch(cluster_dict, @@ -571,7 +581,8 @@ class ClustersController(base.Controller): delta.add(field) validation.validate_cluster_properties(delta) - return cluster, new_cluster.node_count + return (cluster, new_cluster.node_count, + new_cluster.health_status, new_cluster.health_status_reason) 
@expose.expose(None, types.uuid_or_name, status_code=204) def delete(self, cluster_ident): diff --git a/magnum/api/utils.py b/magnum/api/utils.py index a3cc0443a1..c0ad3cd0d7 100644 --- a/magnum/api/utils.py +++ b/magnum/api/utils.py @@ -82,7 +82,8 @@ def apply_jsonpatch(doc, patch): "'replace' operation instead.") % p['path'] raise wsme.exc.ClientSideError(msg) - if p['op'] == 'replace' and p['path'] == '/labels': + if (p['op'] == 'replace' and (p['path'] == '/labels' or + p['path'] == '/health_status_reason')): try: val = p['value'] dict_val = val if type(val) == dict else ast.literal_eval(val) diff --git a/magnum/api/validation.py b/magnum/api/validation.py index 5c028e796d..e04a2236d3 100644 --- a/magnum/api/validation.py +++ b/magnum/api/validation.py @@ -29,7 +29,8 @@ from magnum import objects CONF = magnum.conf.CONF -cluster_update_allowed_properties = set(['node_count']) +cluster_update_allowed_properties = set(['node_count', 'health_status', + 'health_status_reason']) federation_update_allowed_properties = set(['member_ids', 'properties']) diff --git a/magnum/common/policies/cluster.py b/magnum/common/policies/cluster.py index c2617e3317..bc20e44e47 100644 --- a/magnum/common/policies/cluster.py +++ b/magnum/common/policies/cluster.py @@ -129,6 +129,17 @@ rules = [ } ] ), + policy.DocumentedRuleDefault( + name=CLUSTER % 'update_health_status', + check_str=base.RULE_ADMIN_OR_USER + " or " + base.RULE_CLUSTER_USER, + description='Update the health status of an existing cluster.', + operations=[ + { + 'path': '/v1/clusters/{cluster_ident}', + 'method': 'PATCH' + } + ] + ), policy.DocumentedRuleDefault( name=CLUSTER % 'update_all_projects', check_str=base.RULE_ADMIN_API, diff --git a/magnum/conductor/api.py b/magnum/conductor/api.py index e9733842b5..34a6cf97bd 100644 --- a/magnum/conductor/api.py +++ b/magnum/conductor/api.py @@ -49,13 +49,21 @@ class API(rpc_service.API): def cluster_delete_async(self, uuid): self._cast('cluster_delete', uuid=uuid) - def 
cluster_update(self, cluster, node_count): + def cluster_update(self, cluster, node_count, + health_status, health_status_reason): return self._call( - 'cluster_update', cluster=cluster, node_count=node_count) + 'cluster_update', cluster=cluster, node_count=node_count, + health_status=health_status, + health_status_reason=health_status_reason) - def cluster_update_async(self, cluster, node_count, rollback=False): + def cluster_update_async(self, cluster, node_count, + health_status, health_status_reason, + rollback=False): self._cast('cluster_update', cluster=cluster, - node_count=node_count, rollback=rollback) + node_count=node_count, + health_status=health_status, + health_status_reason=health_status_reason, + rollback=rollback) def cluster_resize(self, cluster, node_count, nodes_to_remove, nodegroup, rollback=False): diff --git a/magnum/conductor/handlers/cluster_conductor.py b/magnum/conductor/handlers/cluster_conductor.py index aa5122ac90..64476acbae 100755 --- a/magnum/conductor/handlers/cluster_conductor.py +++ b/magnum/conductor/handlers/cluster_conductor.py @@ -97,7 +97,8 @@ class Handler(object): return cluster - def cluster_update(self, context, cluster, node_count, rollback=False): + def cluster_update(self, context, cluster, node_count, + health_status, health_status_reason, rollback=False): LOG.debug('cluster_heat cluster_update') osc = clients.OpenStackClients(context) @@ -122,8 +123,20 @@ class Handler(object): # Updates will be only reflected to the default worker # nodegroup. 
worker_ng = cluster.default_ng_worker - if worker_ng.node_count == node_count: + if (worker_ng.node_count == node_count and + cluster.health_status == health_status and + cluster.health_status_reason == health_status_reason): return + + cluster.health_status = health_status + cluster.health_status_reason = health_status_reason + + # It's not necessary to trigger driver's cluster update if it's + # only health status update + if worker_ng.node_count == node_count: + cluster.save() + return cluster + # Backup the old node count so that we can restore it # in case of an exception. old_node_count = worker_ng.node_count diff --git a/magnum/drivers/common/k8s_monitor.py b/magnum/drivers/common/k8s_monitor.py index 27433ab9b4..615d94db3d 100644 --- a/magnum/drivers/common/k8s_monitor.py +++ b/magnum/drivers/common/k8s_monitor.py @@ -49,6 +49,9 @@ class K8sMonitor(monitors.MonitorBase): self.data['pods'] = self._parse_pod_info(pods) def poll_health_status(self): + if self._is_magnum_auto_healer_running(): + return + k8s_api = k8s.create_k8s_api(self.context, self.cluster) if self._is_cluster_accessible(): status, reason = self._poll_health_status(k8s_api) @@ -60,6 +63,12 @@ class K8sMonitor(monitors.MonitorBase): self.data['health_status'] = status self.data['health_status_reason'] = reason + def _is_magnum_auto_healer_running(self): + auto_healing = self.cluster.labels.get("auto_healing_enabled") + auto_healing_enabled = strutils.bool_from_string(auto_healing) + controller = self.cluster.labels.get("auto_healing_controller") + return (auto_healing_enabled and controller == "magnum-auto-healer") + def _is_cluster_accessible(self): if self.cluster.cluster_template.master_lb_enabled: lb_fip = self.cluster.labels.get("master_lb_floating_ip_enabled", diff --git a/magnum/tests/unit/api/controllers/v1/test_cluster.py b/magnum/tests/unit/api/controllers/v1/test_cluster.py index eb3851a433..1fd6266af5 100644 --- a/magnum/tests/unit/api/controllers/v1/test_cluster.py +++ 
b/magnum/tests/unit/api/controllers/v1/test_cluster.py @@ -259,13 +259,15 @@ class TestPatch(api_base.FunctionalTest): self.cluster_template_obj = obj_utils.create_test_cluster_template( self.context) self.cluster_obj = obj_utils.create_test_cluster( - self.context, name='cluster_example_A', node_count=3) + self.context, name='cluster_example_A', node_count=3, + health_status='UNKNOWN', health_status_reason={}) p = mock.patch.object(rpcapi.API, 'cluster_update_async') self.mock_cluster_update = p.start() self.mock_cluster_update.side_effect = self._sim_rpc_cluster_update self.addCleanup(p.stop) - def _sim_rpc_cluster_update(self, cluster, node_count, rollback=False): + def _sim_rpc_cluster_update(self, cluster, node_count, health_status, + health_status_reason, rollback=False): cluster.status = 'UPDATE_IN_PROGRESS' default_ng_worker = cluster.default_ng_worker default_ng_worker.node_count = node_count @@ -434,7 +436,8 @@ class TestPatch(api_base.FunctionalTest): headers={'OpenStack-API-Version': 'container-infra 1.3'}) self.mock_cluster_update.assert_called_once_with( - mock.ANY, node_count, True) + mock.ANY, node_count, self.cluster_obj.health_status, + self.cluster_obj.health_status_reason, True) self.assertEqual(202, response.status_code) def test_update_cluster_with_rollback_disabled(self): @@ -446,7 +449,8 @@ class TestPatch(api_base.FunctionalTest): headers={'OpenStack-API-Version': 'container-infra 1.3'}) self.mock_cluster_update.assert_called_once_with( - mock.ANY, node_count, False) + mock.ANY, node_count, self.cluster_obj.health_status, + self.cluster_obj.health_status_reason, False) self.assertEqual(202, response.status_code) def test_remove_ok(self): diff --git a/magnum/tests/unit/conductor/handlers/test_cluster_conductor.py b/magnum/tests/unit/conductor/handlers/test_cluster_conductor.py index c986ca9903..b2046c6714 100644 --- a/magnum/tests/unit/conductor/handlers/test_cluster_conductor.py +++ 
b/magnum/tests/unit/conductor/handlers/test_cluster_conductor.py @@ -27,6 +27,7 @@ from magnum.conductor.handlers import cluster_conductor import magnum.conf from magnum.drivers.k8s_fedora_atomic_v1 import driver as k8s_atomic_dr from magnum import objects +from magnum.objects.fields import ClusterHealthStatus from magnum.objects.fields import ClusterStatus as cluster_status from magnum.tests import fake_notifier from magnum.tests.unit.db import base as db_base @@ -79,7 +80,8 @@ class TestHandler(db_base.DbTestCase): self.master.create() self.worker.create() self.cluster.status = cluster_status.CREATE_COMPLETE - self.handler.cluster_update(self.context, self.cluster, node_count) + self.handler.cluster_update(self.context, self.cluster, node_count, + ClusterHealthStatus.UNKNOWN, {}) notifications = fake_notifier.NOTIFICATIONS self.assertEqual(1, len(notifications)) @@ -111,7 +113,8 @@ class TestHandler(db_base.DbTestCase): self.worker.create() self.cluster.status = cluster_status.CREATE_FAILED self.assertRaises(exception.NotSupported, self.handler.cluster_update, - self.context, self.cluster, node_count) + self.context, self.cluster, node_count, + ClusterHealthStatus.UNKNOWN, {}) notifications = fake_notifier.NOTIFICATIONS self.assertEqual(1, len(notifications)) @@ -144,7 +147,8 @@ class TestHandler(db_base.DbTestCase): self.cluster.status = cluster_status.CREATE_COMPLETE self.master.create() self.worker.create() - self.handler.cluster_update(self.context, self.cluster, node_count) + self.handler.cluster_update(self.context, self.cluster, node_count, + ClusterHealthStatus.UNKNOWN, {}) notifications = fake_notifier.NOTIFICATIONS self.assertEqual(1, len(notifications)) diff --git a/magnum/tests/unit/conductor/test_monitors.py b/magnum/tests/unit/conductor/test_monitors.py index 519fbd6a8a..f1a0ca8126 100644 --- a/magnum/tests/unit/conductor/test_monitors.py +++ b/magnum/tests/unit/conductor/test_monitors.py @@ -542,3 +542,24 @@ class MonitorsTestCase(base.TestCase): 
self.k8s_monitor.poll_health_status() self.assertEqual(self.k8s_monitor.data['health_status'], m_fields.ClusterHealthStatus.UNKNOWN) + + def test_is_magnum_auto_healer_running(self): + cluster = self.k8s_monitor.cluster + cluster.labels['auto_healing_enabled'] = True + cluster.labels['auto_healing_controller'] = 'magnum-auto-healer' + self.k8s_monitor._is_magnum_auto_healer_running() + self.assertTrue(self.k8s_monitor._is_magnum_auto_healer_running()) + + cluster.labels['auto_healing_enabled'] = False + cluster.labels['auto_healing_controller'] = 'magnum-auto-healer' + self.k8s_monitor._is_magnum_auto_healer_running() + self.assertFalse(self.k8s_monitor._is_magnum_auto_healer_running()) + + cluster.labels['auto_healing_enabled'] = True + cluster.labels['auto_healing_controller'] = 'draino' + self.k8s_monitor._is_magnum_auto_healer_running() + self.assertFalse(self.k8s_monitor._is_magnum_auto_healer_running()) + + cluster.labels = {} + self.k8s_monitor._is_magnum_auto_healer_running() + self.assertFalse(self.k8s_monitor._is_magnum_auto_healer_running()) diff --git a/magnum/tests/unit/conductor/test_rpcapi.py b/magnum/tests/unit/conductor/test_rpcapi.py index 80cb09170d..5af75576ba 100644 --- a/magnum/tests/unit/conductor/test_rpcapi.py +++ b/magnum/tests/unit/conductor/test_rpcapi.py @@ -19,6 +19,7 @@ import mock from magnum.conductor import api as conductor_rpcapi from magnum import objects +from magnum.objects.fields import ClusterHealthStatus from magnum.tests.unit.db import base from magnum.tests.unit.db import utils as dbutils @@ -99,7 +100,9 @@ class RPCAPITestCase(base.DbTestCase): 'call', version='1.1', cluster=self.fake_cluster['name'], - node_count=2) + node_count=2, + health_status=ClusterHealthStatus.UNKNOWN, + health_status_reason={}) def test_ping_conductor(self): self._test_rpcapi('ping_conductor', diff --git a/magnum/tests/unit/db/utils.py b/magnum/tests/unit/db/utils.py index 837ebe2084..fc8cd3fa59 100644 --- a/magnum/tests/unit/db/utils.py +++ 
b/magnum/tests/unit/db/utils.py @@ -116,7 +116,7 @@ def get_test_cluster(**kw): if attr in kw: attrs[attr] = kw[attr] # Required only in PeriodicTestCase, may break other tests - for attr in ['keypair', 'health_status']: + for attr in ['keypair', 'health_status', 'health_status_reason']: if attr in kw: attrs[attr] = kw[attr] diff --git a/releasenotes/notes/support-updating-k8s-cluster-health-via-api-b8a3cac3031c50a5.yaml b/releasenotes/notes/support-updating-k8s-cluster-health-via-api-b8a3cac3031c50a5.yaml new file mode 100644 index 0000000000..cef330d17d --- /dev/null +++ b/releasenotes/notes/support-updating-k8s-cluster-health-via-api-b8a3cac3031c50a5.yaml @@ -0,0 +1,10 @@ +--- +features: + - | + The original design of k8s cluster health status allowed + the health status to be updated by the Magnum control plane. However, + it doesn't work when the cluster is private. Now Magnum supports + updating the k8s cluster health status via the Magnum cluster + update API so that a controller (e.g. magnum-auto-healer) running + inside the k8s cluster can call the Magnum update API to update + the cluster health status.