Add Operator to Check Node Status

Had a discussion with Scott and I agree that checking
node status (instead of pod readiness) should be the way
to determine if the cluster-join process is completed.

This P.S. adds the operator to address that in the short
run. The long term solution will be to add a promenade API
endpoint that checks cluster-join and Airflow will query
that endpoint instead.

This P.S. also updates some of the comments/wordings used
in the 'check_k8s_pod_status' operator.

Change-Id: I2e5390f1f64c7d8ce374b2537e6d14e65dffecd6
changes/87/569187/1
Anthony Lin 5 years ago committed by Scott Hussey
parent a61e324f10
commit 7225cc76cb
  1. 104
      shipyard_airflow/plugins/check_k8s_node_status.py
  2. 15
      shipyard_airflow/plugins/check_k8s_pod_status.py

@ -0,0 +1,104 @@
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
from airflow.exceptions import AirflowException
from kubernetes import client, config
def check_node_status(time_out, interval):
"""This function retrieves the current state of the nodes in the
Kubernetes cluster. We can use it to check the state of the
cluster join process (drydock/promenade) and determine if all
the bare metal nodes have successfully joined the Kubernetes
cluster.
:param time_out: Node should be in Ready state before Time Out
:param interval: Time interval in which we query node state
Example::
from check_k8s_node_status import check_node_status
# Calls function to check that all nodes are in Ready State
# Time out in this case is set to 15 mins, the time interval
# has been set to 60 seconds
check_node_status(900, 60)
"""
# Initialize Variable
not_ready_node_list = []
# Note that we are using 'in_cluster_config'
config.load_incluster_config()
v1 = client.CoreV1Api()
# Logs initial state of all nodes in the cluster
ret_init = v1.list_node(watch=False)
for i in ret_init.items:
logging.info("Current state of nodes in Cluster is")
logging.info("%s\t%s\t%s", i.metadata.name,
i.status.conditions[-1].status,
i.status.conditions[-1].type)
# Populates the list of nodes in the Cluster
not_ready_node_list.append(i.metadata.name)
# Calculate number of times to execute the 'for' loop
# Ensure that 'time_out' and 'interval' is passed in as integer
# The result from the division will be a floating number which
# We will round off to nearest whole number
end_range = round(int(time_out) / int(interval))
for i in range(0, end_range + 1):
# Reset node_ready to True for each iteration
cluster_ready = True
# Get updated snapshot view of Cluster for each iteration
ret = v1.list_node(watch=False)
# Check the current state of nodes that are not in Ready state
# from the previous iteration
for j in ret.items:
if j.metadata.name in not_ready_node_list:
if j.status.conditions[-1].status != 'True':
# Set cluster_ready to False
cluster_ready = False
# Print current state of node
logging.info("Node %s is not Ready", j.metadata.name)
logging.debug("Current status of %s is %s",
j.metadata.name,
j.status.conditions[-1].message)
else:
# Remove 'Ready' node from list
not_ready_node_list.remove(j.metadata.name)
logging.info("Node %s is in Ready state", j.metadata.name)
# Raise Time Out Exception
if not cluster_ready and i == end_range:
raise AirflowException("Timed Out! One or more Nodes fail to "
"get into Ready State!")
# Exit loop if Cluster is in Ready state
if cluster_ready:
logging.info("All nodes are in Ready state")
break
else:
# Back off and check again in next iteration
logging.info("Wait for %d seconds...", int(interval))
time.sleep(int(interval))

@ -21,10 +21,9 @@ from kubernetes import client, config
def check_pods_status(required_pods):
"""This function retrieves the current state of pods in the
Kubernetes cluster. We can use it to check the state of the
cluster join process (drydock/promenade) and determine if all
the bare metal nodes have successfully joined the Kubernetes
cluster.
Kubernetes cluster. We can use it to check that all the pods
that are of interest to us are up in the cluster. Function
returns boolean True/False and queries every 30 seconds.
:param required_pods: List of pods that are of interest to us
@ -32,12 +31,12 @@ def check_pods_status(required_pods):
from check_k8s_pod_status import check_pods_status
join_complete = False
pods_ready = False
required_pods = ['ceph-', 'calico-', 'coredns-', 'kubernetes-']
# Time out can be set as required
while not join_complete:
join_complete = check_pods_status(required_pods)
while not pods_ready:
pods_ready = check_pods_status(required_pods)
"""
# Note that we are using 'in_cluster_config'
config.load_incluster_config()
@ -52,7 +51,7 @@ def check_pods_status(required_pods):
# Return Boolean 'False' if required pods are not in
# 'Succeeded' or 'Running' state
if i.status.phase not in ['Succeeded', 'Running']:
logging.info("Cluster is not in ready state...")
logging.info("Pods are not in ready state...")
logging.info("%s is in %s state", i.metadata.name,
i.status.phase)
logging.info("Wait for 30 seconds...")

Loading…
Cancel
Save