From f6247569ced94360efb8d800942d6acfcd0fa44a Mon Sep 17 00:00:00 2001 From: Victor Romano Date: Mon, 13 Nov 2023 12:54:07 -0300 Subject: [PATCH] Add pod health status to kube rootca check As part of the kube rootca certificate update, it's recommended to have all pods in ready state to avoid problems during the update. This commit adds an optional flag to the 'system health-query-kube-upgrade' command that checks pod health and returns a list of pods that are not ready or completed. Usage: system health-query-kube-upgrade --rootca Test Cases: 1) PASS: Run the following commands and verify their output remains unchanged. - system health-query - system health-query-upgrade - system health-query-kube-upgrade (without --rootca) 2) PASS: Run "system health-query-kube-upgrade --rootca" without any pod in failure state and verify that the correct success message was included in the command output. 3) PASS: Repeat test 2 but adding pods in unhealthy state (Error, Evicted and CrashLoopBackOff) and verify that the output contains the correct error message and a list of the unhealthy pods. 4) PASS: Repeat test 2 but adding pods with completed and pending status and verify the completed pod was not added to the failed pod list and the correct success message was shown. 5) PASS: Repeat test 3 but adding pods with completed and pending status and verify these pods weren't added to the failed pod list and the correct failure message was shown. 6) PASS: Run 'system kube-rootca-update-start' with pods in unhealthy state and verify the update did not start and the correct error message was displayed. 7) PASS: Run 'system kube-rootca-update-start' with all pods in healthy state and verify the update process started successfully. 8) PASS: Create and apply a new sw-manager kube-rootca-update-strategy with pods in unhealthy state and verify the apply was aborted and the correct error message was displayed. 
9) PASS: Create and apply a new sw-manager kube-rootca-update-strategy with all pods in healthy state and verify the update was applied successfully. Story: 2010852 Task: 49085 Change-Id: I463ecc8a1107375e4e0997e07581b10ec8d129e2 Signed-off-by: Victor Romano --- .../cgts-client/cgtsclient/v1/health.py | 7 ++-- .../cgts-client/cgtsclient/v1/health_shell.py | 13 +++++-- .../sysinv/api/controllers/v1/health.py | 9 ++--- .../api/controllers/v1/kube_rootca_update.py | 5 +-- sysinv/sysinv/sysinv/sysinv/common/health.py | 32 +++++++++++++++-- .../sysinv/sysinv/sysinv/conductor/manager.py | 3 +- .../tests/api/test_kube_rootca_update.py | 34 ++++++++++++++++--- 7 files changed, 86 insertions(+), 17 deletions(-) diff --git a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py index 8d31da5b11..557f371696 100644 --- a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py +++ b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- # -# Copyright (c) 2015-2016 Wind River Systems, Inc. +# Copyright (c) 2015-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -23,9 +23,12 @@ class HealthManager(base.Manager): resp, body = self.api.json_request('GET', path) return body - def get_kube_upgrade(self, relaxed=None): + def get_kube_upgrade(self, args: dict, relaxed=None): path = '/v1/health/kube-upgrade' if relaxed: path += '/relaxed' + rootca = args.get('rootca') + if rootca: + path += f'?rootca={rootca}' resp, body = self.api.json_request('GET', path) return body diff --git a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py index f13e283cec..472302dc6e 100644 --- a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py +++ b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2016 Wind River Systems, Inc. 
+# Copyright (c) 2016-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -8,6 +8,8 @@ # All Rights Reserved. # +from cgtsclient.common import utils + def do_health_query(cc, args): """Run the Health Check.""" @@ -19,6 +21,13 @@ def do_health_query_upgrade(cc, args): print(cc.health.get_upgrade()) +@utils.arg('--rootca', + action='store_true', + default=False, + help='Whether additional RootCA verifications should be executed') def do_health_query_kube_upgrade(cc, args): """Run the Health Check for a Kubernetes Upgrade.""" - print(cc.health.get_kube_upgrade()) + attributes = {} + if args.rootca is not None: + attributes.update({'rootca': args.rootca}) + print(cc.health.get_kube_upgrade(attributes)) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py index 7274769624..838a4b95e5 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016 Wind River Systems, Inc. +# Copyright (c) 2016-2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -33,8 +33,8 @@ class HealthController(rest.RestController): "Unable to perform health query.")) return output - @wsme_pecan.wsexpose(wtypes.text, wtypes.text, wtypes.text) - def get_one(self, upgrade, relaxed=None): + @wsme_pecan.wsexpose(wtypes.text, wtypes.text, wtypes.text, wtypes.text) + def get_one(self, upgrade, relaxed=None, rootca=None): """Validates the health of the system for an upgrade""" force = False if relaxed: @@ -51,7 +51,8 @@ class HealthController(rest.RestController): elif upgrade == 'kube-upgrade': try: success, output = pecan.request.rpcapi.get_system_health( - pecan.request.context, kube_upgrade=True, force=force) + pecan.request.context, kube_upgrade=True, force=force, + kube_rootca_update=rootca) except Exception as e: LOG.exception(e) raise wsme.exc.ClientSideError(_( diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_rootca_update.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_rootca_update.py index 9b5c45aaaa..9d3b9becd4 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_rootca_update.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_rootca_update.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Wind River Systems, Inc. +# Copyright (c) 2021-2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -415,7 +415,8 @@ class KubeRootCAUpdateController(rest.RestController): LOG.info("Running in lab, ignoring health errors.") else: raise wsme.exc.ClientSideError(_( - "System is not healthy. Run system health-query for more details.")) + "System is not healthy. 
Run 'system health-query-kube-upgrade " + "--rootca' for more details.")) def _clear_kubernetes_resources(self, hostnames): """Clears secrets and issuers created during the update process diff --git a/sysinv/sysinv/sysinv/sysinv/common/health.py b/sysinv/sysinv/sysinv/sysinv/common/health.py index 5f34e56264..14b6e3f056 100755 --- a/sysinv/sysinv/sysinv/sysinv/common/health.py +++ b/sysinv/sysinv/sysinv/sysinv/common/health.py @@ -252,6 +252,21 @@ class Health(object): success = not fail_pod_list return success, fail_pod_list + def _check_kube_all_pods_are_healthy(self): + """Checks that all kubernetes pod are healthy + + A healthy pod is in ready or completed status. + """ + fail_pod_list = [] + pod_list = self._kube_operator.kube_get_all_pods() + + for pod in pod_list: + if pod.status.phase not in ['Pending', 'Running', 'Succeeded']: + # Add it to the failed list as it's not ready/completed/pending + fail_pod_list.append((pod.metadata.name, pod.metadata.namespace)) + success = not fail_pod_list + return success, fail_pod_list + def _check_kube_applications(self): """Checks that each kubernetes application is in a valid state""" @@ -677,7 +692,8 @@ class Health(object): def get_system_health_kube_upgrade(self, context, force=False, - alarm_ignore_list=None): + alarm_ignore_list=None, + kube_rootca_update=False): """ Ensures the system is in a valid state for a kubernetes upgrade @@ -707,6 +723,18 @@ class Health(object): output += _('Kubernetes applications not in a valid state: %s\n') \ % ', '.join(apps_not_valid) - health_ok = health_ok and success + if kube_rootca_update: + pods_healthy, fail_pod_list = self._check_kube_all_pods_are_healthy() + output += _( + 'All kubernetes pods are in a valid state: [%s]\n') \ + % (Health.SUCCESS_MSG if pods_healthy else Health.FAIL_MSG) + if not pods_healthy: + formatted_fail_pod_list = ['{} (namespace: {})'.format(name, namespace) + for name, namespace in fail_pod_list] + output += _('Kubernetes pods not in a valid state: 
%s\n') \ + % ', '.join(formatted_fail_pod_list) + + health_ok = health_ok and success and \ + (pods_healthy if kube_rootca_update else True) return health_ok, output diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index b4e495efda..dae7b3c8ce 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -13679,7 +13679,8 @@ class ConductorManager(service.PeriodicService): return health_util.get_system_health_kube_upgrade( context=context, force=force, - alarm_ignore_list=alarm_ignore_list) + alarm_ignore_list=alarm_ignore_list, + kube_rootca_update=kube_rootca_update) else: return health_util.get_system_health( context=context, diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_rootca_update.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_rootca_update.py index 1ac5104193..bcf01ae2b7 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_rootca_update.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_rootca_update.py @@ -186,7 +186,9 @@ class TestPostKubeRootCAUpdate(TestKubeRootCAUpdate, dbbase.ProvisionedControllerHostTestCase): @mock.patch('sysinv.common.health.Health._check_trident_compatibility', lambda x: True) - def test_create(self): + @mock.patch('sysinv.common.health.Health._check_kube_all_pods_are_healthy') + def test_create(self, mock_pods_healthy): + mock_pods_healthy.return_value = True, [] # Test creation of kubernetes rootca update create_dict = dbutils.get_test_kube_rootca_update() result = self.post_json('/kube_rootca_update?force=False', create_dict, @@ -206,13 +208,16 @@ class TestPostKubeRootCAUpdate(TestKubeRootCAUpdate, self.assertEqual(host_updates[0]['effective_rootca_cert'], 'current_cert_serial') @mock.patch('sysinv.common.health.Health._check_trident_compatibility', lambda x: True) - def test_create_rootca_update_unhealthy_from_alarms(self): + 
@mock.patch('sysinv.common.health.Health._check_kube_all_pods_are_healthy') + def test_create_rootca_update_unhealthy_from_alarms(self, mock_pods_healthy): """ Test creation of kube rootca update while there are alarms""" # Test creation of kubernetes rootca update when system health check fails # 1 alarm will return False self.fake_fm_client.alarm.list.return_value = \ [FAKE_MGMT_ALARM, ] + mock_pods_healthy.return_value = True, [] + # Test creation of kubernetes rootca update create_dict = dbutils.get_test_kube_rootca_update() result = self.post_json('/kube_rootca_update?force=False', create_dict, @@ -222,8 +227,29 @@ class TestPostKubeRootCAUpdate(TestKubeRootCAUpdate, # Verify that the rootca update has the expected attributes self.assertEqual(result.content_type, 'application/json') self.assertEqual(http_client.BAD_REQUEST, result.status_int) - self.assertIn("System is not healthy. Run system health-query for more details.", - result.json['error_message']) + self.assertIn("System is not healthy. 
Run 'system health-query-kube-upgrade " + "--rootca' for more details.", result.json['error_message']) + + @mock.patch('sysinv.common.health.Health._check_trident_compatibility', lambda x: True) + @mock.patch('sysinv.common.health.Health._check_kube_all_pods_are_healthy') + def test_create_rootca_update_unhealthy_from_pods(self, mock_pods_healthy): + """ Test creation of kube rootca update while there are unhealthy pods""" + + # Unhealthy pods + mock_pods_healthy.return_value = False, \ + [('Unhealthy-pod-name', 'Unhealthy-pod-namespace')] + + # Test creation of kubernetes rootca update + create_dict = dbutils.get_test_kube_rootca_update() + result = self.post_json('/kube_rootca_update?force=False', create_dict, + headers=self.headers, + expect_errors=True) + + # Verify that the rootca update has the expected attributes + self.assertEqual(result.content_type, 'application/json') + self.assertEqual(http_client.BAD_REQUEST, result.status_int) + self.assertIn("System is not healthy. Run 'system health-query-kube-upgrade " + "--rootca' for more details.", result.json['error_message']) def test_create_rootca_update_exists(self): # Test creation of rootca update when a kubernetes rootca update already exists