shipyard/src/bin/shipyard_airflow/shipyard_airflow/plugins/check_k8s_node_status.py
Bryan Strassner 1aa814491b Change k8s node status to be a positive check
When checking for deployed nodes, the kubernetes join check was
only performing a negative check - it would wait for up to the
timeout on nodes that were not part of the current processing
before proceeding. This had the drawback of being likely to add
wait time in any complex deployment scenario, as well as (and more
importantly) missing the case where a node never started to try
to join, and assuming that was a success.

This patchset flips the logic to positively look for an expected set of
nodes instead, and will not wait upon nodes that are not currently being
checked. The end result should remedy both of the drawbacks listed
above.

Change-Id: Ib07e4e2677ec4f773d695d57893fdfa5e4b7ff76
2018-06-30 00:14:54 -05:00
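
The shape of the change can be illustrated with a minimal sketch (an
editorial illustration, not part of the module; the cluster_nodes
mapping of node name to ready flag is a hypothetical stand-in for the
Kubernetes node-list response):

    def negative_check(cluster_nodes):
        # Old behavior: wait while ANY cluster node is not ready, even
        # nodes that are not part of the current deployment.
        return [n for n, ready in cluster_nodes.items() if not ready]

    def positive_check(cluster_nodes, expected_nodes):
        # New behavior: wait only on the expected set; a node that never
        # appears in the cluster stays on the not-ready list instead of
        # being silently treated as a success.
        return [n for n in expected_nodes
                if not cluster_nodes.get(n, False)]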


# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time

from kubernetes import client
from kubernetes import config


def check_node_status(time_out, interval, expected_nodes):
"""This function retrieves the current state of the nodes in the
Kubernetes cluster. We can use it to check the state of the
cluster join process (drydock/promenade) and determine if all
the bare metal nodes have successfully joined the Kubernetes
cluster.
:param time_out: Node should be in Ready state before Time Out
:param interval: Time interval in which we query node state
:param expected_nodes: The list of nodes that are expected to be
present in the check for status
Example::
import time
from check_k8s_node_status import check_node_status
# Wait for a while before checking the cluster-join process as
# it takes time for process to be triggered across all nodes
# We will wait for 120 seconds in this example
time.sleep(120)
# Calls function to check that all nodes are in Ready State
# Time out in this case is set to 15 mins, the time interval
# has been set to 60 seconds
# The expected nodes are the nodes to be compared against,
# as there could be nodes that never show up as ready, and those
# need to be represented in the response
check_node_status(900, 60, expected_nodes=['a','b','c'])
"""
    # Initialize Variables - the nodes we are watching for
    if not expected_nodes:
        # if you're not looking for any, don't expect me to look either
        return []
    not_ready_node_list = list(expected_nodes)

    # Calculate the number of times to execute the 'for' loop.
    # Ensure that 'time_out' and 'interval' are passed in as integers;
    # the result of the division will be a floating point number,
    # which we round off to the nearest whole number.
    # Guard against div/0 and negative intervals.
    if interval < 1:
        interval = 1
    if time_out < 1:
        time_out = 1
    end_range = round(int(time_out) / int(interval))

    # end_range + 1 since the first check doesn't have a sleep ahead of it
    for i in range(0, end_range + 1):
        logging.info("Remaining expected nodes to join cluster: [%s]",
                     ", ".join(not_ready_node_list))
        # Get an updated snapshot view of the cluster for each iteration
        ret = _get_all_k8s_node_status()
        # cautiously prevent crashing out of this code to ensure continued
        # processing.
        if ret is not None and hasattr(ret, 'items'):
            # Check the state of nodes against the remaining expected nodes
            for j in ret.items:
                # resolve response item fields without letting them break
                # the processing loop.
                try:
                    node_name = j.metadata.name
                    summary_status = j.status.conditions[-1].status
                    summary_message = j.status.conditions[-1].message
                except (AttributeError, IndexError):
                    # any issue with the response object; move on to the
                    # next item
                    logging.warning("Malformed node status response object. "
                                    "Processing continues with the next item",
                                    exc_info=True)
                    continue
                # only check nodes that we're currently waiting for
                if node_name in not_ready_node_list:
                    if summary_status != 'True':
                        # Node not ready; log the current state of the node
                        logging.info("Node %s is not ready. Status is: %s",
                                     node_name, summary_message)
                    else:
                        # Remove this node from the list; it is ready
                        not_ready_node_list.remove(node_name)
                        logging.info("Node %s is in ready state", node_name)

        # determine what to do based on the not_ready_node_list
        if not_ready_node_list and i == end_range:
            # There are remaining items and the timeout has elapsed
            logging.info("Timed Out! Nodes [%s] did not reach ready state",
                         ", ".join(not_ready_node_list))
            break
        elif not not_ready_node_list:
            # Exit the loop when there are no more nodes to wait for
            logging.info("All expected nodes are in ready state")
            break
        else:
            # There are nodes remaining and time remaining;
            # back off and check again in the next iteration
            logging.info("Waiting %d seconds for next check of cluster "
                         "status", int(interval))
            time.sleep(int(interval))
    # Return the nodes that are not ready.
    return not_ready_node_list


def _get_all_k8s_node_status():
    """Invoke Kubernetes and return the status response object"""
    # Note that we are using 'in_cluster_config'
    try:
        config.load_incluster_config()
        v1 = client.CoreV1Api()
        return v1.list_node(watch=False)
    except Exception:
        # Log some diagnostics and return None.
        logging.warning("There was an error retrieving the cluster status",
                        exc_info=True)
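

# An illustrative addition, not part of the original module: a hedged
# sketch of invoking check_node_status directly for debugging. It assumes
# the script runs inside a pod where in-cluster credentials are available
# to the kubernetes client; node names are passed as command-line args.
if __name__ == '__main__':
    import sys

    logging.basicConfig(level=logging.INFO)
    remaining = check_node_status(time_out=900, interval=60,
                                  expected_nodes=sys.argv[1:])
    if remaining:
        sys.exit("Nodes not ready: {}".format(", ".join(remaining)))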