diff --git a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py index 47c889fef8..18c4a10a8a 100644 --- a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py +++ b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health.py @@ -20,3 +20,8 @@ class HealthManager(base.Manager): path = '/v1/health/upgrade' resp, body = self.api.json_request('GET', path) return body + + def get_kube_upgrade(self): + path = '/v1/health/kube-upgrade' + resp, body = self.api.json_request('GET', path) + return body diff --git a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py index 5ece9c4807..f13e283cec 100644 --- a/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py +++ b/sysinv/cgts-client/cgts-client/cgtsclient/v1/health_shell.py @@ -17,3 +17,8 @@ def do_health_query(cc, args): def do_health_query_upgrade(cc, args): """Run the Health Check for an Upgrade.""" print(cc.health.get_upgrade()) + + +def do_health_query_kube_upgrade(cc, args): + """Run the Health Check for a Kubernetes Upgrade.""" + print(cc.health.get_kube_upgrade()) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py index 14d6fd6d7c..1ca188cc77 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/health.py @@ -36,11 +36,24 @@ class HealthController(rest.RestController): @wsme_pecan.wsexpose(wtypes.text, wtypes.text) def get_one(self, upgrade): """Validates the health of the system for an upgrade""" - try: - success, output = pecan.request.rpcapi.get_system_health( - pecan.request.context, upgrade=True) - except Exception as e: - LOG.exception(e) + if upgrade == 'upgrade': + try: + success, output = pecan.request.rpcapi.get_system_health( + pecan.request.context, upgrade=True) + except Exception as e: + LOG.exception(e) + raise wsme.exc.ClientSideError(_( + "Unable to perform health upgrade query.")) + return output + elif upgrade == 'kube-upgrade': + try: + success, output = pecan.request.rpcapi.get_system_health( + pecan.request.context, kube_upgrade=True) + except Exception as e: + LOG.exception(e) + raise wsme.exc.ClientSideError(_( + "Unable to perform kubernetes health upgrade query.")) + return output + else: raise wsme.exc.ClientSideError(_( - "Unable to perform health upgrade query.")) - return output + "Unsupported upgrade type %s." 
% upgrade)) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py index 8d16a5d31b..968f25e636 100755 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py @@ -170,6 +170,16 @@ class KubeUpgradeController(rest.RestController): force = body.get('force', False) is True + # There must not be a platform upgrade in progress + try: + pecan.request.dbapi.software_upgrade_get_one() + except exception.NotFound: + pass + else: + raise wsme.exc.ClientSideError(_( + "A kubernetes upgrade cannot be done while a platform upgrade " + "is in progress")) + # There must not already be a kubernetes upgrade in progress try: pecan.request.dbapi.kube_upgrade_get_one() @@ -214,9 +224,9 @@ class KubeUpgradeController(rest.RestController): # TODO: check that all installed applications support new k8s version # TODO: check that tiller/armada support new k8s version - # The system must be healthy from the platform perspective + # The system must be healthy success, output = pecan.request.rpcapi.get_system_health( - pecan.request.context, force=force) + pecan.request.context, force=force, kube_upgrade=True) if not success: LOG.info("Health query failure during kubernetes upgrade start: %s" % output) @@ -225,9 +235,7 @@ class KubeUpgradeController(rest.RestController): else: raise wsme.exc.ClientSideError(_( "System is not in a valid state for kubernetes upgrade. " - "Run system health-query-upgrade for more details.")) - - # TODO: kubernetes related health checks... + "Run system health-query for more details.")) # Create upgrade record. create_values = {'from_version': current_kube_version, @@ -327,6 +335,15 @@ class KubeUpgradeController(rest.RestController): "Kubernetes upgrade must be in %s state to complete" % kubernetes.KUBE_UPGRADING_KUBELETS)) + # Make sure no hosts are in a transitory or failed state + kube_host_upgrades = \ + pecan.request.dbapi.kube_host_upgrade_get_list() + for kube_host_upgrade in kube_host_upgrades: + if kube_host_upgrade.status is not None: + raise wsme.exc.ClientSideError(_( + "At least one host has not completed the kubernetes " + "upgrade")) + # Make sure the target version is active version_states = self._kube_operator.kube_get_version_states() if version_states.get(kube_upgrade_obj.to_version, None) != \ diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/upgrade.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/upgrade.py index bafb92aebc..afa2152b9f 100755 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/upgrade.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/upgrade.py @@ -198,6 +198,16 @@ class UpgradeController(rest.RestController): "upgrade-start rejected: An upgrade can only be started " "when %s is active." 
% constants.CONTROLLER_0_HOSTNAME)) + # There must not be a kubernetes upgrade in progress + try: + pecan.request.dbapi.kube_upgrade_get_one() + except exception.NotFound: + pass + else: + raise wsme.exc.ClientSideError(_( + "upgrade-start rejected: A platform upgrade cannot be done " + "while a kubernetes upgrade is in progress.")) + # There must not already be an upgrade in progress try: pecan.request.dbapi.software_upgrade_get_one() diff --git a/sysinv/sysinv/sysinv/sysinv/common/health.py b/sysinv/sysinv/sysinv/sysinv/common/health.py index 98fb2c5d66..f02789d4b5 100755 --- a/sysinv/sysinv/sysinv/sysinv/common/health.py +++ b/sysinv/sysinv/sysinv/sysinv/common/health.py @@ -14,6 +14,7 @@ from oslo_log import log from sysinv._i18n import _ from sysinv.common import ceph from sysinv.common import constants +from sysinv.common import kubernetes from sysinv.common import utils from sysinv.common.fm import fmclient from sysinv.common.storage_backend_conf import StorageBackendConfig @@ -33,6 +34,7 @@ class Health(object): def __init__(self, dbapi): self._dbapi = dbapi self._ceph = ceph.CephApiOperator() + self._kube_operator = kubernetes.KubeOperator() def _check_hosts_provisioned(self, hosts): """Checks that each host is provisioned""" @@ -213,15 +215,64 @@ class Health(object): return True + def _check_kube_nodes_ready(self): + """Checks that each kubernetes node is ready""" + fail_node_list = [] + + nodes = self._kube_operator.kube_get_nodes() + for node in nodes: + for condition in node.status.conditions: + if condition.type == "Ready" and condition.status != "True": + # This node is not ready + fail_node_list.append(node.metadata.name) + + success = not fail_node_list + return success, fail_node_list + + def _check_kube_control_plane_pods(self): + """Checks that each kubernetes control plane pod is ready""" + fail_pod_list = [] + + pod_ready_status = self._kube_operator.\ + kube_get_control_plane_pod_ready_status() + + for pod_name, ready_status in pod_ready_status.items(): + if ready_status != "True": + # This pod is not ready + fail_pod_list.append(pod_name) + + success = not fail_pod_list + return success, fail_pod_list + + def _check_kube_applications(self): + """Checks that each kubernetes application is in a valid state""" + + fail_app_list = [] + apps = self._dbapi.kube_app_get_all() + + for app in apps: + # The following states are valid during kubernetes upgrade + if app.status not in [constants.APP_UPLOAD_SUCCESS, + constants.APP_APPLY_SUCCESS, + constants.APP_INACTIVE_STATE]: + fail_app_list.append(app.name) + + success = not fail_app_list + return success, fail_app_list + def get_system_health(self, context, force=False): - """Returns the general health of the system""" - # Checks the following: - # All hosts are provisioned - # All hosts are patch current - # All hosts are unlocked/enabled - # All hosts having matching configs - # No management affecting alarms - # For ceph systems: The storage cluster is healthy + """Returns the general health of the system + + Checks the following: + - All hosts are provisioned + - All hosts are patch current + - All hosts are unlocked/enabled + - All hosts having matching configs + - No management affecting alarms + - For ceph systems: The storage cluster is healthy + - All kubernetes nodes are ready + - All kubernetes control plane pods are ready + """ hosts = self._dbapi.ihost_get_list() output = _('System Health:\n') @@ -289,6 +340,24 @@ class Health(object): health_ok = health_ok and success + success, error_nodes = 
self._check_kube_nodes_ready() + output += _('All kubernetes nodes are ready: [%s]\n') \ + % (Health.SUCCESS_MSG if success else Health.FAIL_MSG) + if not success: + output += _('Kubernetes nodes not ready: %s\n') \ + % ', '.join(error_nodes) + + health_ok = health_ok and success + + success, error_nodes = self._check_kube_control_plane_pods() + output += _('All kubernetes control plane pods are ready: [%s]\n') \ + % (Health.SUCCESS_MSG if success else Health.FAIL_MSG) + if not success: + output += _('Kubernetes control plane pods not ready: %s\n') \ + % ', '.join(error_nodes) + + health_ok = health_ok and success + return health_ok, output def get_system_health_upgrade(self, context, force=False): @@ -358,3 +427,24 @@ class Health(object): health_ok = health_ok and success return health_ok, output + + def get_system_health_kube_upgrade(self, context, force=False): + """Ensures the system is in a valid state for a kubernetes upgrade + + Does a general health check then does the following: + - All kubernetes applications are in a stable state + """ + + health_ok, output = self.get_system_health(context, force) + + success, apps_not_valid = self._check_kube_applications() + output += _( + 'All kubernetes applications are in a valid state: [%s]\n') \ + % (Health.SUCCESS_MSG if success else Health.FAIL_MSG) + if not success: + output += _('Kubernetes applications not in a valid state: %s\n') \ + % ', '.join(apps_not_valid) + + health_ok = health_ok and success + + return health_ok, output diff --git a/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py b/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py index ad12f4e4ff..eed5b859a5 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py +++ b/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py @@ -387,6 +387,42 @@ class KubeOperator(object): % (namespace, e)) raise + def kube_get_control_plane_pod_ready_status(self): + """Returns the ready status of the control plane pods.""" + c = self._get_kubernetesclient_core() + + # First get a list of master nodes + master_nodes = list() + api_response = c.list_node( + label_selector="node-role.kubernetes.io/master") + for node in api_response.items: + master_nodes.append(node.metadata.name) + + # Populate status dictionary + ready_status = dict() + for node_name in master_nodes: + for component in [KUBE_APISERVER, + KUBE_CONTROLLER_MANAGER, + KUBE_SCHEDULER]: + # Control plane pods are named by component and node. + # E.g. 
kube-apiserver-controller-0 + pod_name = component + '-' + node_name + ready_status[pod_name] = None + + # Retrieve the control plane pods + api_response = c.list_pod_for_all_namespaces( + label_selector="component in (%s,%s,%s)" % ( + KUBE_APISERVER, KUBE_CONTROLLER_MANAGER, KUBE_SCHEDULER) + ) + pods = api_response.items + for pod in pods: + if pod.status.conditions is not None: + for condition in pod.status.conditions: + if condition.type == "Ready": + ready_status[pod.metadata.name] = condition.status + + return ready_status + def kube_get_control_plane_versions(self): """Returns the lowest control plane component version on each master node.""" diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index c13f7e91c4..5f73dc1bd1 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -9166,19 +9166,25 @@ class ConductorManager(service.PeriodicService): return - def get_system_health(self, context, force=False, upgrade=False): + def get_system_health(self, context, force=False, upgrade=False, + kube_upgrade=False): """ Performs a system health check. :param context: request context. :param force: set to true to ignore minor and warning alarms :param upgrade: set to true to perform an upgrade health check + :param kube_upgrade: set to true to perform a kubernetes upgrade health + check """ health_util = health.Health(self.dbapi) if upgrade is True: return health_util.get_system_health_upgrade(context=context, force=force) + elif kube_upgrade is True: + return health_util.get_system_health_kube_upgrade(context=context, + force=force) else: return health_util.get_system_health(context=context, force=force) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py index 5c4854ca5f..205a8b1523 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py @@ -1285,17 +1285,21 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy): return self.cast(context, self.make_msg('complete_simplex_backup', success=success)) - def get_system_health(self, context, force=False, upgrade=False): + def get_system_health(self, context, force=False, upgrade=False, + kube_upgrade=False): """ Performs a system health check. :param context: request context. :param force: set to true to ignore minor and warning alarms :param upgrade: set to true to perform an upgrade health check + :param kube_upgrade: set to true to perform a kubernetes upgrade health + check """ return self.call(context, self.make_msg('get_system_health', - force=force, upgrade=upgrade)) + force=force, upgrade=upgrade, + kube_upgrade=kube_upgrade)) def reserve_ip_for_first_storage_node(self, context): """ diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py index f035e7b3f7..aeaddbf1e2 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py @@ -11,6 +11,7 @@ Tests for the API /kube_upgrade/ methods. 
import mock from six.moves import http_client +from sysinv.common import constants from sysinv.common import kubernetes from sysinv.tests.api import base @@ -56,16 +57,17 @@ class FakeConductorAPI(object): def __init__(self): self.kube_download_images = mock.MagicMock() self.kube_upgrade_networking = mock.MagicMock() - self.get_system_health_return = (True, "System is super healthy") + self.get_system_health_return = ( + True, "System is super healthy") - def get_system_health(self, context, force=False): + def get_system_health(self, context, force=False, kube_upgrade=False): if force: return True, "System is healthy because I was forced to say that" else: return self.get_system_health_return -class TestKubeUpgrade(base.FunctionalTest, dbbase.BaseSystemTestCase): +class TestKubeUpgrade(base.FunctionalTest, dbbase.BaseHostTestCase): def setUp(self): super(TestKubeUpgrade, self).setUp() @@ -132,6 +134,14 @@ class TestKubeUpgrade(base.FunctionalTest, dbbase.BaseSystemTestCase): self.mocked_kube_get_version_states.start() self.addCleanup(self.mocked_kube_get_version_states.stop) + def _create_controller_0(self, subfunction=None, numa_nodes=1, **kw): + return self._create_test_host( + personality=constants.CONTROLLER, + subfunction=subfunction, + numa_nodes=numa_nodes, + unit=0, + **kw) + class TestListKubeUpgrade(TestKubeUpgrade): @@ -183,6 +193,24 @@ class TestPostKubeUpgrade(TestKubeUpgrade, dbbase.ControllerHostTestCase): self.host.id) self.assertEqual('v1.43.1', kube_host_upgrade.target_version) + def test_create_platform_upgrade_exists(self): + # Test creation of upgrade when platform upgrade in progress + dbutils.create_test_load(software_version=dbutils.SW_VERSION_NEW, + compatible_version=dbutils.SW_VERSION, + state=constants.IMPORTED_LOAD_STATE) + dbutils.create_test_upgrade() + + create_dict = dbutils.post_get_test_kube_upgrade(to_version='v1.43.2') + result = self.post_json('/kube_upgrade', create_dict, + headers={'User-Agent': 'sysinv-test'}, + expect_errors=True) + + # Verify the failure + self.assertEqual(result.content_type, 'application/json') + self.assertEqual(http_client.BAD_REQUEST, result.status_int) + self.assertIn("upgrade cannot be done while a platform upgrade", + result.json['error_message']) + def test_create_upgrade_exists(self): # Test creation of upgrade when upgrade already exists dbutils.create_test_kube_upgrade( @@ -546,6 +574,43 @@ class TestPatch(TestKubeUpgrade): self.assertEqual(result['to_version'], 'v1.43.2') self.assertEqual(result['state'], new_state) + def test_update_state_complete_incomplete_host(self): + # Test updating the state of an upgrade to complete when a host has + # not completed its upgrade + self.kube_get_version_states_result = {'v1.42.1': 'available', + 'v1.42.2': 'available', + 'v1.43.1': 'available', + 'v1.43.2': 'active', + 'v1.43.3': 'available'} + + # Create host + self._create_controller_0() + + # Create the upgrade + dbutils.create_test_kube_upgrade( + from_version='v1.43.1', + to_version='v1.43.2', + state=kubernetes.KUBE_UPGRADING_KUBELETS) + + # Mark the kube host upgrade as failed + values = {'status': kubernetes.KUBE_HOST_UPGRADING_CONTROL_PLANE_FAILED} + self.dbapi.kube_host_upgrade_update(1, values) + + # Update state + new_state = kubernetes.KUBE_UPGRADE_COMPLETE + result = self.patch_json('/kube_upgrade', + [{'path': '/state', + 'value': new_state, + 'op': 'replace'}], + headers={'User-Agent': 'sysinv-test'}, + expect_errors=True) + + # Verify the failure + self.assertEqual(result.content_type, 'application/json') + 
self.assertEqual(http_client.BAD_REQUEST, result.status_int) + self.assertIn("At least one host has not completed", + result.json['error_message']) + def test_update_state_no_upgrade(self): # Test updating the state when an upgrade doesn't exist diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_upgrade.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_upgrade.py new file mode 100644 index 0000000000..b4539a7ecb --- /dev/null +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_upgrade.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +""" +Tests for the API /upgrade/ methods. +""" + +import mock +from six.moves import http_client + +from sysinv.common import constants +from sysinv.common import kubernetes + +from sysinv.tests.api import base +from sysinv.tests.db import base as dbbase +from sysinv.tests.db import utils as dbutils + + +class FakeConductorAPI(object): + + def __init__(self): + self.start_upgrade = mock.MagicMock() + self.get_system_health_return = (True, "System is super healthy") + + def get_system_health(self, context, force=False, upgrade=False): + if force: + return True, "System is healthy because I was forced to say that" + else: + return self.get_system_health_return + + +class TestUpgrade(base.FunctionalTest, dbbase.BaseSystemTestCase): + + def setUp(self): + super(TestUpgrade, self).setUp() + + # Mock the Conductor API + self.fake_conductor_api = FakeConductorAPI() + p = mock.patch('sysinv.conductor.rpcapi.ConductorAPI') + self.mock_conductor_api = p.start() + self.mock_conductor_api.return_value = self.fake_conductor_api + self.addCleanup(p.stop) + + # Behave as if the API is running on controller-0 + p = mock.patch('socket.gethostname') + self.mock_socket_gethostname = p.start() + self.mock_socket_gethostname.return_value = 'controller-0' + self.addCleanup(p.stop) + + +class TestPostUpgrade(TestUpgrade, dbbase.ControllerHostTestCase): + + def test_create(self): + # Create the to load + dbutils.create_test_load(software_version=dbutils.SW_VERSION_NEW, + compatible_version=dbutils.SW_VERSION, + state=constants.IMPORTED_LOAD_STATE) + + # Test creation of upgrade + create_dict = dbutils.get_test_upgrade() + result = self.post_json('/upgrade', create_dict, + headers={'User-Agent': 'sysinv-test'}) + + # Verify that the upgrade was started + self.fake_conductor_api.start_upgrade.assert_called_once() + + # Verify that the upgrade has the expected attributes + self.assertEqual(result.json['from_release'], dbutils.SW_VERSION) + self.assertEqual(result.json['to_release'], dbutils.SW_VERSION_NEW) + self.assertEqual(result.json['state'], constants.UPGRADE_STARTING) + + def test_create_kube_upgrade_exists(self): + # Test creation of upgrade when a kubernetes upgrade exists + dbutils.create_test_kube_upgrade( + from_version='v1.42.1', + to_version='v1.42.2', + state=kubernetes.KUBE_UPGRADING_FIRST_MASTER, + ) + + # Test creation of upgrade + create_dict = dbutils.get_test_upgrade() + result = self.post_json('/upgrade', create_dict, + headers={'User-Agent': 'sysinv-test'}, + expect_errors=True) + + # Verify the failure + self.assertEqual(result.content_type, 'application/json') + self.assertEqual(http_client.BAD_REQUEST, result.status_int) + self.assertIn("cannot be done while a kubernetes upgrade", + result.json['error_message']) diff --git a/sysinv/sysinv/sysinv/sysinv/tests/common/test_health.py b/sysinv/sysinv/sysinv/sysinv/tests/common/test_health.py new file mode 100644 index 0000000000..10f478fada --- 
/dev/null +++ b/sysinv/sysinv/sysinv/sysinv/tests/common/test_health.py @@ -0,0 +1,377 @@ +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +""" +Tests for the health utilities. +""" + +import kubernetes +import mock +import uuid + +from sysinv.common import constants +from sysinv.common import health +from sysinv.openstack.common import context + +from sysinv.tests.db import base as dbbase +from sysinv.tests.db import utils as dbutils + + +class TestHealth(dbbase.BaseHostTestCase): + + def setup_result(self): + + self.patch_current_result = { + 'data': [ + {'hostname': 'controller-0', + 'patch_current': True, + }, + {'hostname': 'controller-1', + 'patch_current': True, + } + ] + } + + self.multi_node_result = [ + kubernetes.client.V1Node( + api_version="v1", + kind="Node", + metadata=kubernetes.client.V1ObjectMeta( + name="controller-0", + namespace="test-namespace-1"), + status=kubernetes.client.V1NodeStatus( + conditions=[ + kubernetes.client.V1NodeCondition( + status="False", + type="NetworkUnavailable"), + kubernetes.client.V1NodeCondition( + status="False", + type="MemoryPressure"), + kubernetes.client.V1NodeCondition( + status="False", + type="DiskPressure"), + kubernetes.client.V1NodeCondition( + status="False", + type="PIDPressure"), + kubernetes.client.V1NodeCondition( + status="True", + type="Ready"), + ], + node_info=kubernetes.client.V1NodeSystemInfo( + architecture="fake-architecture", + boot_id="fake-boot-id", + container_runtime_version="fake-cr-version", + kernel_version="fake-kernel-version", + kube_proxy_version="fake-proxy-version", + kubelet_version="v1.42.4", + machine_id="fake-machine-id", + operating_system="fake-os", + os_image="fake-os-image", + system_uuid="fake-system-uuid")) + ), + kubernetes.client.V1Node( + api_version="v1", + kind="Node", + metadata=kubernetes.client.V1ObjectMeta( + name="controller-1", + namespace="test-namespace-1"), + status=kubernetes.client.V1NodeStatus( + conditions=[ + kubernetes.client.V1NodeCondition( + status="False", + type="NetworkUnavailable"), + kubernetes.client.V1NodeCondition( + status="False", + type="MemoryPressure"), + kubernetes.client.V1NodeCondition( + status="False", + type="DiskPressure"), + kubernetes.client.V1NodeCondition( + status="False", + type="PIDPressure"), + kubernetes.client.V1NodeCondition( + status="True", + type="Ready"), + ], + node_info=kubernetes.client.V1NodeSystemInfo( + architecture="fake-architecture", + boot_id="fake-boot-id", + container_runtime_version="fake-cr-version", + kernel_version="fake-kernel-version", + kube_proxy_version="fake-proxy-version", + kubelet_version="v1.42.3", + machine_id="fake-machine-id", + operating_system="fake-os", + os_image="fake-os-image", + system_uuid="fake-system-uuid")) + ), + ] + + self.cp_pod_ready_status_result = { + 'kube-apiserver-controller-0': 'True', + 'kube-controller-manager-controller-0': 'True', + 'kube-scheduler-controller-0': 'True', + 'kube-apiserver-controller-1': 'True', + 'kube-controller-manager-controller-1': 'True', + 'kube-scheduler-controller-1': 'True', + } + + def setUp(self): + super(TestHealth, self).setUp() + + # Mock the patching API + self.mock_patch_query_hosts_result = None + + def mock_patch_query_hosts(token, timeout, region_name): + return self.mock_patch_query_hosts_result + self.mocked_patch_query_hosts = mock.patch( + 'sysinv.api.controllers.v1.patch_api.patch_query_hosts', + mock_patch_query_hosts) + self.mocked_patch_query_hosts.start() + 
self.addCleanup(self.mocked_patch_query_hosts.stop) + + # Mock the KubeOperator + self.kube_get_nodes_result = None + + def mock_kube_get_nodes(obj): + return self.kube_get_nodes_result + self.mocked_kube_get_nodes = mock.patch( + 'sysinv.common.kubernetes.KubeOperator.kube_get_nodes', + mock_kube_get_nodes) + self.mocked_kube_get_nodes.start() + self.addCleanup(self.mocked_kube_get_nodes.stop) + + self.kube_get_control_plane_pod_ready_status_result = None + + def mock_kube_get_control_plane_pod_ready_status(obj): + return self.kube_get_control_plane_pod_ready_status_result + self.mocked_kube_get_control_plane_pod_ready_status = mock.patch( + 'sysinv.common.kubernetes.KubeOperator.' + 'kube_get_control_plane_pod_ready_status', + mock_kube_get_control_plane_pod_ready_status) + self.mocked_kube_get_control_plane_pod_ready_status.start() + self.addCleanup( + self.mocked_kube_get_control_plane_pod_ready_status.stop) + + # Mock the fm API + p = mock.patch('sysinv.common.health.fmclient') + self.mock_fm_client_alarm_list = p.start() + self.addCleanup(p.stop) + + # Set up objects for testing + self.context = context.get_admin_context() + self.health = health.Health(self.dbapi) + + # Set up results + self.setup_result() + + def tearDown(self): + super(TestHealth, self).tearDown() + + pass + + def test_get_system_health(self): + # Create controller-0 + config_uuid = str(uuid.uuid4()) + self._create_test_host(personality=constants.CONTROLLER, + unit=0, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create controller-1 + self._create_test_host(personality=constants.CONTROLLER, + unit=1, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Set up the mocked results + self.mock_patch_query_hosts_result = self.patch_current_result + self.kube_get_nodes_result = self.multi_node_result + self.kube_get_control_plane_pod_ready_status_result = \ + self.cp_pod_ready_status_result + + # Check system health + health_ok, output = self.health.get_system_health(self.context) + assert health_ok is True, "output: %s" % output + + def test_get_system_health_k8s_node_not_ready(self): + # Create controller-0 + config_uuid = str(uuid.uuid4()) + self._create_test_host(personality=constants.CONTROLLER, + unit=0, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create controller-1 + self._create_test_host(personality=constants.CONTROLLER, + unit=1, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Set up the mocked results + self.mock_patch_query_hosts_result = self.patch_current_result + self.kube_get_nodes_result = self.multi_node_result + # Mark controller-0 as not ready + self.kube_get_nodes_result[0].status.conditions[4].status = "False" + self.kube_get_control_plane_pod_ready_status_result = \ + 
self.cp_pod_ready_status_result + + # Check system health + health_ok, output = self.health.get_system_health(self.context) + assert health_ok is False, "output: %s" % output + assert "Kubernetes nodes not ready: controller-0" in output, \ + "get_system_health output: %s" % output + + def test_get_system_health_k8s_cp_pod_not_ready(self): + # Create controller-0 + config_uuid = str(uuid.uuid4()) + self._create_test_host(personality=constants.CONTROLLER, + unit=0, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create controller-1 + self._create_test_host(personality=constants.CONTROLLER, + unit=1, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Set up the mocked results + self.mock_patch_query_hosts_result = self.patch_current_result + self.kube_get_nodes_result = self.multi_node_result + self.kube_get_control_plane_pod_ready_status_result = \ + self.cp_pod_ready_status_result + # Mark a cp pod as not ready + self.kube_get_control_plane_pod_ready_status_result[ + 'kube-controller-manager-controller-1'] = 'False' + + # Check system health + health_ok, output = self.health.get_system_health(self.context) + assert health_ok is False, "get_system_health output: %s" % output + assert "kubernetes control plane pods are ready: [Fail]" in output, \ + "output: %s" % output + assert "not ready: kube-controller-manager-controller-1" in output, \ + "output: %s" % output + + def test_get_system_health_kube_upgrade(self): + # Create controller-0 + config_uuid = str(uuid.uuid4()) + self._create_test_host(personality=constants.CONTROLLER, + unit=0, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create controller-1 + self._create_test_host(personality=constants.CONTROLLER, + unit=1, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create kubernetes apps + dbutils.create_test_app(name='test-app-1', + status=constants.APP_APPLY_SUCCESS) + dbutils.create_test_app(name='test-app-2', + status=constants.APP_APPLY_SUCCESS) + dbutils.create_test_app(name='test-app-3', + status=constants.APP_UPLOAD_SUCCESS) + + # Set up the mocked results + self.mock_patch_query_hosts_result = self.patch_current_result + self.kube_get_nodes_result = self.multi_node_result + self.kube_get_control_plane_pod_ready_status_result = \ + self.cp_pod_ready_status_result + + # Check system health + health_ok, output = self.health.get_system_health_kube_upgrade( + self.context) + assert health_ok is True, "output: %s" % output + + def test_get_system_health_kube_upgrade_k8s_app_invalid_state(self): + # Create controller-0 + config_uuid = str(uuid.uuid4()) + self._create_test_host(personality=constants.CONTROLLER, + unit=0, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + 
invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create controller-1 + self._create_test_host(personality=constants.CONTROLLER, + unit=1, + config_status=None, + config_applied=config_uuid, + config_target=config_uuid, + invprovision=constants.PROVISIONED, + administrative=constants.ADMIN_UNLOCKED, + operational=constants.OPERATIONAL_ENABLED, + availability=constants.AVAILABILITY_ONLINE) + + # Create kubernetes apps + dbutils.create_test_app(name='test-app-1', + status=constants.APP_APPLY_SUCCESS) + dbutils.create_test_app(name='test-app-2', + status=constants.APP_APPLY_IN_PROGRESS) + dbutils.create_test_app(name='test-app-3', + status=constants.APP_UPLOAD_SUCCESS) + + # Set up the mocked results + self.mock_patch_query_hosts_result = self.patch_current_result + self.kube_get_nodes_result = self.multi_node_result + self.kube_get_control_plane_pod_ready_status_result = \ + self.cp_pod_ready_status_result + + # Check system health + health_ok, output = self.health.get_system_health_kube_upgrade( + self.context) + assert health_ok is False, "output: %s" % output + assert "applications are in a valid state: [Fail]" in output, \ + "output: %s" % output + assert "applications not in a valid state: test-app-2" in output, \ + "output: %s" % output diff --git a/sysinv/sysinv/sysinv/sysinv/tests/common/test_kubernetes.py b/sysinv/sysinv/sysinv/sysinv/tests/common/test_kubernetes.py index 9d763b22d2..2614295c5c 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/common/test_kubernetes.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/common/test_kubernetes.py @@ -54,6 +54,23 @@ FAKE_KUBE_VERSIONS = [ }, ] +FAKE_POD_STATUS = kubernetes.client.V1PodStatus( + conditions=[ + kubernetes.client.V1PodCondition( + status="True", + type="Initialized"), + kubernetes.client.V1PodCondition( + status="True", + type="Ready"), + kubernetes.client.V1PodCondition( + status="True", + type="ContainersReady"), + kubernetes.client.V1PodCondition( + status="True", + type="PodScheduled"), + ], +) + def mock_get_kube_versions(): return FAKE_KUBE_VERSIONS @@ -193,6 +210,7 @@ class TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-apiserver-test-node-1", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -213,6 +231,7 @@ class TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-controller-manager-test-node-1", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -233,6 +252,7 @@ class TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-scheduler-test-node-1", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -253,6 +273,7 @@ class TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-apiserver-test-node-2", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -273,6 +294,7 @@ class TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-controller-manager-test-node-2", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -293,6 +315,7 @@ class 
TestKubeOperator(base.TestCase): metadata=kubernetes.client.V1ObjectMeta( name="kube-scheduler-test-node-2", namespace="kube-system"), + status=FAKE_POD_STATUS, spec=kubernetes.client.V1PodSpec( containers=[ kubernetes.client.V1Container( @@ -398,10 +421,106 @@ class TestKubeOperator(base.TestCase): ), } - self.single_node_result = kubernetes.client.V1NodeList( + self.cp_pods_list_result = kubernetes.client.V1PodList( api_version="v1", items=[ kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-apiserver-test-node-1", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-apiserver", + image="test-image-1:v1.42.1"), + ], + ), + ), + kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-controller-manager-test-node-1", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-controller-manager", + image="test-image-2:v1.42.1"), + ], + ), + ), + kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-scheduler-test-node-1", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-scheduler", + image="test-image-3:v1.42.1"), + ], + ), + ), + kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-apiserver-test-node-2", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-apiserver", + image="test-image-1:v1.42.1"), + ], + ), + ), + kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-controller-manager-test-node-2", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-controller-manager", + image="test-image-2:v1.42.1"), + ], + ), + ), + kubernetes.client.V1Pod( + api_version="v1", + kind="Pod", + metadata=kubernetes.client.V1ObjectMeta( + name="kube-scheduler-test-node-2", + namespace="kube-system"), + status=FAKE_POD_STATUS, + spec=kubernetes.client.V1PodSpec( + containers=[ + kubernetes.client.V1Container( + name="kube-scheduler", + image="test-image-3:v1.42.1"), + ], + ), + ), + ], + ) + + self.single_node_result = kubernetes.client.V1NodeList( + api_version="v1", + items=[ + kubernetes.client.V1Node( api_version="v1", kind="Node", metadata=kubernetes.client.V1ObjectMeta( @@ -426,7 +545,7 @@ class TestKubeOperator(base.TestCase): self.multi_node_result = kubernetes.client.V1NodeList( api_version="v1", items=[ - kubernetes.client.V1Pod( + kubernetes.client.V1Node( api_version="v1", kind="Node", metadata=kubernetes.client.V1ObjectMeta( @@ -445,7 +564,7 @@ class TestKubeOperator(base.TestCase): os_image="fake-os-image", system_uuid="fake-system-uuid")) ), - kubernetes.client.V1Pod( + kubernetes.client.V1Node( api_version="v1", kind="Node", metadata=kubernetes.client.V1ObjectMeta( @@ -470,13 +589,13 @@ class TestKubeOperator(base.TestCase): self.config_map_result = kubernetes.client.V1ConfigMap( api_version="v1", data={"ClusterConfiguration": - "apiServer:\n" - " certSANs:\n" - " - 127.0.0.1\n" - " - 192.168.206.2\n" - 
"apiVersion: kubeadm.k8s.io/v1beta2\n" - "kubernetesVersion: v1.42.4\n" - "kind: ClusterStatus\n" + "apiServer:\n" + " certSANs:\n" + " - 127.0.0.1\n" + " - 192.168.206.2\n" + "apiVersion: kubeadm.k8s.io/v1beta2\n" + "kubernetesVersion: v1.42.4\n" + "kind: ClusterStatus\n" }, metadata=kubernetes.client.V1ObjectMeta( name="kubeadm-config", @@ -486,12 +605,12 @@ class TestKubeOperator(base.TestCase): self.config_map_result_no_version = kubernetes.client.V1ConfigMap( api_version="v1", data={"ClusterConfiguration": - "apiServer:\n" - " certSANs:\n" - " - 127.0.0.1\n" - " - 192.168.206.2\n" - "apiVersion: kubeadm.k8s.io/v1beta2\n" - "kind: ClusterStatus\n" + "apiServer:\n" + " certSANs:\n" + " - 127.0.0.1\n" + " - 192.168.206.2\n" + "apiVersion: kubeadm.k8s.io/v1beta2\n" + "kind: ClusterStatus\n" }, metadata=kubernetes.client.V1ObjectMeta( name="kubeadm-config", @@ -513,6 +632,15 @@ class TestKubeOperator(base.TestCase): mock_list_namespaced_pod) self.mocked_list_namespaced_pod.start() + self.list_pod_for_all_namespaces_result = None + + def mock_list_pod_for_all_namespaces(obj, label_selector=""): + return self.list_pod_for_all_namespaces_result + self.mocked_list_pod_for_all_namespaces = mock.patch( + 'kubernetes.client.CoreV1Api.list_pod_for_all_namespaces', + mock_list_pod_for_all_namespaces) + self.mocked_list_pod_for_all_namespaces.start() + self.list_node_result = None def mock_list_node(obj, label_selector=""): @@ -537,7 +665,9 @@ class TestKubeOperator(base.TestCase): super(TestKubeOperator, self).tearDown() self.mocked_list_namespaced_pod.stop() + self.mocked_list_pod_for_all_namespaces.stop() self.mocked_list_node.stop() + self.mocked_read_namespaced_config_map.stop() def test_kube_get_image_by_pod_name(self): @@ -563,6 +693,47 @@ class TestKubeOperator(base.TestCase): 'test-pod-1', 'test-namespace-1', 'test-container-1') assert result == "test-image-1:imageversion-1" + def test_kube_get_control_plane_pod_ready_status(self): + + self.list_pod_for_all_namespaces_result = self.cp_pods_list_result + self.list_node_result = self.multi_node_result + + result = self.kube_operator.kube_get_control_plane_pod_ready_status() + assert result == {'kube-apiserver-test-node-1': 'True', + 'kube-controller-manager-test-node-1': 'True', + 'kube-scheduler-test-node-1': 'True', + 'kube-apiserver-test-node-2': 'True', + 'kube-controller-manager-test-node-2': 'True', + 'kube-scheduler-test-node-2': 'True'} + + def test_kube_get_control_plane_pod_ready_status_single_node(self): + + self.list_pod_for_all_namespaces_result = self.cp_pods_list_result + del self.cp_pods_list_result.items[5] + del self.cp_pods_list_result.items[4] + del self.cp_pods_list_result.items[3] + self.list_node_result = self.single_node_result + + result = self.kube_operator.kube_get_control_plane_pod_ready_status() + assert result == {'kube-apiserver-test-node-1': 'True', + 'kube-controller-manager-test-node-1': 'True', + 'kube-scheduler-test-node-1': 'True'} + + def test_kube_get_control_plane_pod_ready_status_missing_pods(self): + + self.list_pod_for_all_namespaces_result = self.cp_pods_list_result + del self.cp_pods_list_result.items[5] + del self.cp_pods_list_result.items[1] + self.list_node_result = self.multi_node_result + + result = self.kube_operator.kube_get_control_plane_pod_ready_status() + assert result == {'kube-apiserver-test-node-1': 'True', + 'kube-controller-manager-test-node-1': None, + 'kube-scheduler-test-node-1': 'True', + 'kube-apiserver-test-node-2': 'True', + 'kube-controller-manager-test-node-2': 'True', + 
                          'kube-scheduler-test-node-2': None}
+
     def test_kube_get_control_plane_versions(self):
 
         self.list_namespaced_pod_result = self.cp_pods_result
diff --git a/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py b/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py
index 50ee69d583..f1fce35218 100644
--- a/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py
+++ b/sysinv/sysinv/sysinv/sysinv/tests/db/utils.py
@@ -77,6 +77,7 @@ properties = {
 int_uninitialized = 999
 
 SW_VERSION = '0.0'
+SW_VERSION_NEW = '1.0'
 
 
 def get_test_node(**kw):
@@ -222,9 +223,10 @@ def create_test_isystem(**kw):
 
 def get_test_load(**kw):
     load = {
-        "software_version": SW_VERSION,
-        "compatible_version": "N/A",
+        "software_version": kw.get("software_version", SW_VERSION),
+        "compatible_version": kw.get("compatible_version", "N/A"),
         "required_patches": "N/A",
+        "state": kw.get("state", constants.ACTIVE_LOAD_STATE),
     }
     return load
 
@@ -235,6 +237,19 @@ def create_test_load(**kw):
     return dbapi.load_create(load)
 
 
+def get_test_upgrade(**kw):
+    upgrade = {'from_load': kw.get('from_load', 1),
+               'to_load': kw.get('to_load', 2),
+               'state': kw.get('state', constants.UPGRADE_STARTING)}
+    return upgrade
+
+
+def create_test_upgrade(**kw):
+    upgrade = get_test_upgrade(**kw)
+    dbapi = db_api.get_instance()
+    return dbapi.software_upgrade_create(upgrade)
+
+
 def post_get_test_kube_upgrade(**kw):
     upgrade = get_test_kube_upgrade(**kw)
     del upgrade['id']
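
Reviewer note (not part of the patch): the new health checks hinge on two pieces that are easy to read past in the hunks above. kube_get_control_plane_pod_ready_status() pre-seeds an entry of None for every expected <component>-<node-name> pod on the master nodes, and Health._check_kube_control_plane_pods() then treats anything other than "True" (including that leftover None for a pod that never reported in) as a failure. The standalone sketch below mirrors that aggregation with plain dictionaries so it can be run without a cluster or the sysinv tree; the function names and sample data are illustrative assumptions, not code from this change.

# Minimal sketch of the control-plane readiness logic in this change.
# Component and node names below are sample data, not from a live system.

KUBE_APISERVER = "kube-apiserver"
KUBE_CONTROLLER_MANAGER = "kube-controller-manager"
KUBE_SCHEDULER = "kube-scheduler"
CONTROL_PLANE_COMPONENTS = [KUBE_APISERVER,
                            KUBE_CONTROLLER_MANAGER,
                            KUBE_SCHEDULER]


def control_plane_pod_ready_status(master_nodes, observed_ready_conditions):
    """Mirror kube_get_control_plane_pod_ready_status().

    Every expected control plane pod (component + '-' + node name) starts
    out as None; pods that actually reported conditions overwrite that
    entry with the status of their "Ready" condition.
    """
    ready_status = {}
    for node_name in master_nodes:
        for component in CONTROL_PLANE_COMPONENTS:
            ready_status[component + '-' + node_name] = None
    for pod_name, conditions in observed_ready_conditions.items():
        for condition_type, condition_status in conditions:
            if condition_type == "Ready":
                ready_status[pod_name] = condition_status
    return ready_status


def check_control_plane_pods(ready_status):
    """Mirror Health._check_kube_control_plane_pods().

    Anything other than "True" counts as not ready, so a pod that never
    reported in (still None) is flagged as well.
    """
    fail_pod_list = [name for name, status in ready_status.items()
                     if status != "True"]
    return not fail_pod_list, fail_pod_list


if __name__ == '__main__':
    # kube-scheduler-controller-1 never reported a Ready condition, so it
    # stays None and is listed alongside the pod that reported Ready=False.
    observed = {
        'kube-apiserver-controller-0': [("Ready", "True")],
        'kube-controller-manager-controller-0': [("Ready", "True")],
        'kube-scheduler-controller-0': [("Ready", "True")],
        'kube-apiserver-controller-1': [("Ready", "True")],
        'kube-controller-manager-controller-1': [("Ready", "False")],
    }
    status = control_plane_pod_ready_status(
        ['controller-0', 'controller-1'], observed)
    success, failed = check_control_plane_pods(status)
    print(success)         # False
    print(sorted(failed))  # ['kube-controller-manager-controller-1',
                           #  'kube-scheduler-controller-1']

With the cgtsclient additions above, the same check should surface on the CLI as "system health-query-kube-upgrade", which calls HealthManager.get_kube_upgrade() against the new /v1/health/kube-upgrade endpoint.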