Add backoff time before checking cluster join
The current logic checks for nodes that started the join process, based on the snapshot of the environment that was taken by the operator at that point in time. It will not check the state of nodes that it is not aware of; i.e., those that it did not capture initially will not be checked. Hence there is a need to introduce a backoff time, as it takes a while before all the nodes start to join the cluster. This is a short-term stopgap approach until the Promenade API is ready for consumption.

Change-Id: I2bdf9c970ecb509fe833fd353e6648a97118d79b
This commit is contained in:
parent
7225cc76cb
commit
ed8107baad
@ -328,6 +328,7 @@ conf:
|
||||
prepare_node_task_timeout: 1800
|
||||
deploy_node_query_interval: 30
|
||||
deploy_node_task_timeout: 3600
|
||||
cluster_join_check_backoff_time: 120
|
||||
healthcheck:
|
||||
schema: http
|
||||
endpoint: /api/v1.0/health
|
||||
|
@ -82,6 +82,8 @@
|
||||
# Time out (in seconds) for deploy_node task (integer value)
|
||||
#deploy_node_task_timeout = 3600
|
||||
|
||||
# Backoff time (in seconds) before checking cluster join (integer value)
|
||||
#cluster_join_check_backoff_time = 120
|
||||
|
||||
[healthcheck]
|
||||
|
||||
|
@ -170,6 +170,11 @@ SECTIONS = [
|
||||
default=3600,
|
||||
help='Time out (in seconds) for deploy_node task'
|
||||
),
|
||||
cfg.IntOpt(
|
||||
'cluster_join_check_backoff_time',
|
||||
default=120,
|
||||
help='Backoff time (in seconds) before checking cluster join'
|
||||
),
|
||||
]
|
||||
),
|
||||
ConfigSection(
|
||||
|
@ -31,8 +31,14 @@ def check_node_status(time_out, interval):
|
||||
|
||||
Example::
|
||||
|
||||
import time
|
||||
from check_k8s_node_status import check_node_status
|
||||
|
||||
# Wait for a while before checking the cluster-join process as
|
||||
# it takes time for process to be triggered across all nodes
|
||||
# We will wait for 120 seconds in this example
|
||||
time.sleep(120)
|
||||
|
||||
# Calls function to check that all nodes are in Ready State
|
||||
# Time out in this case is set to 15 mins, the time interval
|
||||
# has been set to 60 seconds
|
||||
@ -48,8 +54,9 @@ def check_node_status(time_out, interval):
|
||||
# Logs initial state of all nodes in the cluster
|
||||
ret_init = v1.list_node(watch=False)
|
||||
|
||||
logging.info("Current state of nodes in Cluster is")
|
||||
|
||||
for i in ret_init.items:
|
||||
logging.info("Current state of nodes in Cluster is")
|
||||
logging.info("%s\t%s\t%s", i.metadata.name,
|
||||
i.status.conditions[-1].status,
|
||||
i.status.conditions[-1].type)
|
||||
|
@ -165,6 +165,16 @@ class DryDockOperator(BaseOperator):
|
||||
self.drydock_action(drydock_client, context, self.action,
|
||||
query_interval, task_timeout)
|
||||
|
||||
# Wait for 120 seconds (default value) before checking the cluster
|
||||
# join process as it takes time for process to be triggered across
|
||||
# all nodes
|
||||
cluster_join_check_backoff_time = config.get(
|
||||
'drydock', 'cluster_join_check_backoff_time')
|
||||
logging.info("All nodes deployed in MAAS")
|
||||
logging.info("Wait for %d seconds before checking node state...",
|
||||
int(cluster_join_check_backoff_time))
|
||||
time.sleep(cluster_join_check_backoff_time)
|
||||
|
||||
# Check that cluster join process is completed before declaring
|
||||
# deploy_node as 'completed'. Set time out to 30 minutes and set
|
||||
# polling interval to 30 seconds.
|
||||
|
@ -8,6 +8,7 @@ web_server = http://airflow-web-int.ucp.svc.cluster.local:8080/
|
||||
[deckhand]
|
||||
service_type = deckhand
|
||||
[drydock]
|
||||
cluster_join_check_backoff_time = 120
|
||||
deploy_node_query_interval = 30
|
||||
deploy_node_task_timeout = 3600
|
||||
prepare_node_query_interval = 30
|
||||
@ -36,4 +37,5 @@ project_name = service
|
||||
user_domain_name = default
|
||||
username = shipyard
|
||||
[shipyard]
|
||||
service_type = shipyard
|
||||
service_type = shipyard
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user