1aa814491b
When checking for deployed nodes, the Kubernetes join check was only performing a negative check, and would wait for up to the timeout on nodes that were not part of the current processing before proceeding. This had the drawback of being likely to add wait time in any complex deployment scenario, and (more importantly) of missing the case where a node never even started to try to join, treating that as a success. This patchset flips the logic to positively look for an expected set of nodes instead, and will not wait on nodes that are not currently being checked. The end result should remedy both of the drawbacks listed above.

Change-Id: Ib07e4e2677ec4f773d695d57893fdfa5e4b7ff76
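As a sketch of the flipped logic (illustrative only: the node names and wait times here are made up, and the module is assumed to be importable as check_k8s_node_status):

    from check_k8s_node_status import check_node_status

    # Positively check for the three nodes expected to join. Nodes not in
    # this list are ignored, and a node that never attempts to join can no
    # longer be mistaken for a success: it simply remains in the returned
    # list when the timeout elapses.
    not_ready = check_node_status(900, 60,
                                  expected_nodes=['node1', 'node2', 'node3'])
    if not_ready:
        raise Exception("Nodes failed to join: " + ", ".join(not_ready))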
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time

from kubernetes import client
from kubernetes import config

def check_node_status(time_out, interval, expected_nodes):
    """This function retrieves the current state of the nodes in the
    Kubernetes cluster. We can use it to check the state of the
    cluster join process (drydock/promenade) and determine if all
    the bare metal nodes have successfully joined the Kubernetes
    cluster.

    :param time_out: Maximum time in seconds to wait for the nodes to
        reach the Ready state
    :param interval: Time interval in seconds at which we query the
        node state
    :param expected_nodes: The list of nodes that are expected to be
        present in the check for status

    Example::

        import time
        from check_k8s_node_status import check_node_status

        # Wait for a while before checking the cluster-join process, as
        # it takes time for the process to be triggered across all nodes.
        # We will wait for 120 seconds in this example.
        time.sleep(120)

        # Call the function to check that all nodes are in the Ready
        # state. The time out in this case is set to 15 mins; the time
        # interval has been set to 60 seconds.

        # The expected nodes are the nodes to be compared against,
        # as there could be nodes that never show up as ready, and those
        # need to be represented in the response.
        check_node_status(900, 60, expected_nodes=['a', 'b', 'c'])
    """
    # Initialize variables - the nodes we are watching for
    if not expected_nodes:
        # if you're not looking for any, don't expect me to look either
        return []

    not_ready_node_list = list(expected_nodes)

    # Calculate the number of times to execute the 'for' loop.
    # Ensure that 'time_out' and 'interval' are passed in as integers;
    # the result of the division is a floating point number, which we
    # round off to the nearest whole number.

    # no div/0 or negative intervals
    if interval < 1:
        interval = 1
    if time_out < 1:
        time_out = 1

    end_range = round(int(time_out) / int(interval))

    # end_range + 1 since the first check doesn't have a sleep ahead of it
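    # For example, time_out=900 and interval=60 give end_range=15, and
    # therefore 16 checks in total: one immediately, then one after each
    # 60-second sleep.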
    for i in range(0, end_range + 1):
        logging.info("Remaining expected nodes to join cluster: [%s]",
                     ", ".join(not_ready_node_list))
        # Get an updated snapshot view of the cluster for each iteration
        ret = _get_all_k8s_node_status()

        # cautiously prevent crashing out of this code to ensure continued
        # processing.
        if ret is not None and hasattr(ret, 'items'):
            # Check the state of nodes against the remaining expected nodes
            for j in ret.items:
                # resolve response item fields without letting them break
                # the processing loop.
                try:
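                    # Assumes the Ready condition is the last entry in
                    # the node's conditions list, which matches the
                    # ordering typically reported by the kubelet.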
                    node_name = j.metadata.name
                    summary_status = j.status.conditions[-1].status
                    summary_message = j.status.conditions[-1].message
                except (AttributeError, IndexError):
                    # any issue with the response object, move on to the
                    # next item
                    logging.warning("Malformed node status response object. "
                                    "Processing continues with the next item",
                                    exc_info=True)
                    continue

                # only check nodes that we're currently waiting for
                if node_name in not_ready_node_list:
                    if summary_status != 'True':
                        # Node not ready, print the current state of the node
                        logging.info("Node %s is not ready. Status is: %s",
                                     node_name, summary_message)
                    else:
                        # Remove this node from the list, it is ready
                        not_ready_node_list.remove(node_name)
                        logging.info("Node %s is in ready state", node_name)

        # determine what to do based on the not_ready_node_list
        if not_ready_node_list and i == end_range:
            # There are remaining items, and the timeout has elapsed
            logging.info("Timed Out! Nodes [%s] did not reach ready state",
                         ", ".join(not_ready_node_list))
            break
        elif not not_ready_node_list:
            # Exit the loop when there are no more nodes to wait for
            # (all ready)
            logging.info("All expected nodes are in ready state")
            break
        else:
            # There are nodes remaining, and time remaining.
            # Back off and check again in the next iteration
            logging.info("Waiting %d seconds for next check of cluster "
                         "status", int(interval))
            time.sleep(int(interval))

    # Return the nodes that are not ready.
    return not_ready_node_list

def _get_all_k8s_node_status():
    """Invoke Kubernetes and return the status response object"""
    # Note that we are using 'in_cluster_config'
    try:
        config.load_incluster_config()
        v1 = client.CoreV1Api()
        return v1.list_node(watch=False)
    except Exception:
        # Log some diagnostics and return None.
        logging.warning("There was an error retrieving the cluster status",
                        exc_info=True)
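For local experimentation outside a pod, a similar retrieval could be sketched with the standard kubeconfig loader instead of the in-cluster configuration (a hypothetical variant for illustration, not part of this change):

    def _get_all_k8s_node_status_local():
        """Hypothetical variant: the same node listing, but configured
        from the developer's kubeconfig file rather than the in-cluster
        service account."""
        try:
            config.load_kube_config()
            v1 = client.CoreV1Api()
            return v1.list_node(watch=False)
        except Exception:
            logging.warning("There was an error retrieving the cluster "
                            "status", exc_info=True)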