Add Backoff time before checking cluster join

The current logic checks for nodes that started the join process
(based on the snapshot of the environment that was taken by the
operator at that point in time). It will not check the state of
nodes that it is not aware of, i.e., those that it did not capture
initially will not be checked. Hence there is a need to introduce a
backoff time, as it takes a while before all the nodes start to join
the cluster.

This is a short-term stopgap approach until the Promenade API is ready
for consumption.

Change-Id: I2bdf9c970ecb509fe833fd353e6648a97118d79b
This commit is contained in:
Anthony Lin 2017-12-08 01:56:38 +00:00
parent 7225cc76cb
commit ed8107baad
6 changed files with 29 additions and 2 deletions

View File

@ -328,6 +328,7 @@ conf:
prepare_node_task_timeout: 1800
deploy_node_query_interval: 30
deploy_node_task_timeout: 3600
cluster_join_check_backoff_time: 120
healthcheck:
schema: http
endpoint: /api/v1.0/health

View File

@ -82,6 +82,8 @@
# Time out (in seconds) for deploy_node task (integer value)
#deploy_node_task_timeout = 3600
# Backoff time (in seconds) before checking cluster join (integer value)
#cluster_join_check_backoff_time = 120
[healthcheck]

View File

@ -170,6 +170,11 @@ SECTIONS = [
default=3600,
help='Time out (in seconds) for deploy_node task'
),
cfg.IntOpt(
'cluster_join_check_backoff_time',
default=120,
help='Backoff time (in seconds) before checking cluster join'
),
]
),
ConfigSection(

View File

@ -31,8 +31,14 @@ def check_node_status(time_out, interval):
Example::
import time
from check_k8s_node_status import check_node_status
# Wait for a while before checking the cluster-join process as
# it takes time for process to be triggered across all nodes
# We will wait for 120 seconds in this example
time.sleep(120)
# Calls function to check that all nodes are in Ready State
# Time out in this case is set to 15 mins, the time interval
# has been set to 60 seconds
@ -48,8 +54,9 @@ def check_node_status(time_out, interval):
# Logs initial state of all nodes in the cluster
ret_init = v1.list_node(watch=False)
logging.info("Current state of nodes in Cluster is")
for i in ret_init.items:
logging.info("Current state of nodes in Cluster is")
logging.info("%s\t%s\t%s", i.metadata.name,
i.status.conditions[-1].status,
i.status.conditions[-1].type)

View File

@ -165,6 +165,16 @@ class DryDockOperator(BaseOperator):
self.drydock_action(drydock_client, context, self.action,
query_interval, task_timeout)
# Wait for 120 seconds (default value) before checking the cluster
# join process as it takes time for process to be triggered across
# all nodes
cluster_join_check_backoff_time = config.get(
'drydock', 'cluster_join_check_backoff_time')
logging.info("All nodes deployed in MAAS")
logging.info("Wait for %d seconds before checking node state...",
int(cluster_join_check_backoff_time))
time.sleep(cluster_join_check_backoff_time)
# Check that cluster join process is completed before declaring
# deploy_node as 'completed'. Set time out to 30 minutes and set
# polling interval to 30 seconds.

View File

@ -8,6 +8,7 @@ web_server = http://airflow-web-int.ucp.svc.cluster.local:8080/
[deckhand]
service_type = deckhand
[drydock]
cluster_join_check_backoff_time = 120
deploy_node_query_interval = 30
deploy_node_task_timeout = 3600
prepare_node_query_interval = 30
@ -36,4 +37,5 @@ project_name = service
user_domain_name = default
username = shipyard
[shipyard]
service_type = shipyard
service_type = shipyard