Add Backoff time before checking cluster join

The current logic checks only the nodes that had started the join process, based on the snapshot of the environment taken by the operator at that point in time. It does not check the state of nodes it is not aware of, i.e. nodes that were not captured in that initial snapshot are never checked. Because it takes a while before all the nodes start to join the cluster, a backoff time is needed before the check begins. This is a short-term stopgap approach until the Promenade API is ready for consumption. Change-Id: I2bdf9c970ecb509fe833fd353e6648a97118d79b
2017-12-08 01:56:38 +00:00 · 2017-12-08 01:56:38 +00:00 · ed8107baad
commit ed8107baad
parent 7225cc76cb
6 changed files with 29 additions and 2 deletions
--- a/charts/shipyard/values.yaml
+++ b/charts/shipyard/values.yaml
@ -328,6 +328,7 @@ conf:
      prepare_node_task_timeout: 1800
      deploy_node_query_interval: 30
      deploy_node_task_timeout: 3600
+      cluster_join_check_backoff_time: 120
    healthcheck:
      schema: http
      endpoint: /api/v1.0/health
--- a/etc/shipyard/shipyard.conf.sample
+++ b/etc/shipyard/shipyard.conf.sample
@ -82,6 +82,8 @@
 # Time out (in seconds) for deploy_node task (integer value)
 #deploy_node_task_timeout = 3600

+# Backoff time (in seconds) before checking cluster join (integer value)
+#cluster_join_check_backoff_time = 120

 [healthcheck]

--- a/shipyard_airflow/conf/config.py
+++ b/shipyard_airflow/conf/config.py
@ -170,6 +170,11 @@ SECTIONS = [
                default=3600,
                help='Time out (in seconds) for deploy_node task'
            ),
+            cfg.IntOpt(
+                'cluster_join_check_backoff_time',
+                default=120,
+                help='Backoff time (in seconds) before checking cluster join'
+            ),
        ]
    ),
    ConfigSection(
--- a/shipyard_airflow/plugins/check_k8s_node_status.py
+++ b/shipyard_airflow/plugins/check_k8s_node_status.py
@ -31,8 +31,14 @@ def check_node_status(time_out, interval):

        Example::

+        import time
        from check_k8s_node_status import check_node_status

+        # Wait for a while before checking the cluster-join process, as
+        # it takes time for the process to be triggered across all nodes.
+        # We will wait for 120 seconds in this example.
+        time.sleep(120)
+
        # Calls function to check that all nodes are in Ready State
        # Time out in this case is set to 15 mins, the time interval
        # has been set to 60 seconds
@ -48,8 +54,9 @@ def check_node_status(time_out, interval):
    # Logs initial state of all nodes in the cluster
    ret_init = v1.list_node(watch=False)

+    logging.info("Current state of nodes in Cluster is")
+
    for i in ret_init.items:
-        logging.info("Current state of nodes in Cluster is")
        logging.info("%s\t%s\t%s", i.metadata.name,
                     i.status.conditions[-1].status,
                     i.status.conditions[-1].type)
--- a/shipyard_airflow/plugins/drydock_operators.py
+++ b/shipyard_airflow/plugins/drydock_operators.py
@ -165,6 +165,16 @@ class DryDockOperator(BaseOperator):
            self.drydock_action(drydock_client, context, self.action,
                                query_interval, task_timeout)

+            # Wait for the configured backoff time (120 seconds by default)
+            # before checking the cluster join process, as it takes time for
+            # the process to be triggered across all nodes.
+            cluster_join_check_backoff_time = config.get(
+                'drydock', 'cluster_join_check_backoff_time')
+            logging.info("All nodes deployed in MAAS")
+            logging.info("Wait for %d seconds before checking node state...",
+                         int(cluster_join_check_backoff_time))
+            time.sleep(cluster_join_check_backoff_time)
+
            # Check that cluster join process is completed before declaring
            # deploy_node as 'completed'. Set time out to 30 minutes and set
            # polling interval to 30 seconds.
--- a/tests/unit/control/test.conf
+++ b/tests/unit/control/test.conf
@ -8,6 +8,7 @@ web_server = http://airflow-web-int.ucp.svc.cluster.local:8080/
 [deckhand]
 service_type = deckhand
 [drydock]
+cluster_join_check_backoff_time = 120
 deploy_node_query_interval = 30
 deploy_node_task_timeout = 3600
 prepare_node_query_interval = 30
@ -36,4 +37,5 @@ project_name = service
 user_domain_name = default
 username = shipyard
 [shipyard]
-service_type = shipyard
+service_type = shipyard
+