From 5190189a60cce637ada2446b786d5c00a4f2a4f6 Mon Sep 17 00:00:00 2001 From: Anthony Lin Date: Fri, 5 Jan 2018 07:47:11 +0000 Subject: [PATCH] Update DryDock Operator The following errors [0] were encountered during our end-to-end testing. They are a result of extended execution of the workflow that led to expiration of the keystone token. The operator now sets up a new DryDock session with a new keystone token when the DryDock client reports an unauthorized error while querying a task. It is also possible for the 'prepare_site' task to take more than 120 seconds to complete. Hence we are increasing the timeout for the 'prepare_site_task_timeout' variable to 300 seconds. This patch set addresses the above two observations. [0] Logs from DryDock Authorization failed for token Identity response: {"error": {"message": "Failed to validate token", "code": 404, "title": "Not Found"}} Authorization failed for token Change-Id: I4760e390822e6e8c9540216035e263d054fde400 --- charts/shipyard/values.yaml | 2 +- etc/shipyard/shipyard.conf.sample | 2 +- shipyard_airflow/conf/config.py | 2 +- shipyard_airflow/plugins/drydock_operators.py | 51 +++++++++++++++++-- tests/unit/control/test.conf | 2 +- tools/resources/shipyard.conf | 2 +- 6 files changed, 51 insertions(+), 10 deletions(-) diff --git a/charts/shipyard/values.yaml b/charts/shipyard/values.yaml index fedb48c4..15a7698f 100644 --- a/charts/shipyard/values.yaml +++ b/charts/shipyard/values.yaml @@ -323,7 +323,7 @@ conf: verify_site_query_interval: 10 verify_site_task_timeout: 60 prepare_site_query_interval: 10 - prepare_site_task_timeout: 120 + prepare_site_task_timeout: 300 prepare_node_query_interval: 30 prepare_node_task_timeout: 1800 deploy_node_query_interval: 30 diff --git a/etc/shipyard/shipyard.conf.sample b/etc/shipyard/shipyard.conf.sample index 140e16fc..dc17fe6f 100644 --- a/etc/shipyard/shipyard.conf.sample +++ b/etc/shipyard/shipyard.conf.sample @@ -68,7 +68,7 @@ #prepare_site_query_interval = 10 # Time out (in seconds) for prepare_site task (integer value) -#prepare_site_task_timeout = 120 +#prepare_site_task_timeout = 300 # Query interval (in seconds) for 
prepare_node task (integer value) #prepare_node_query_interval = 30 diff --git a/shipyard_airflow/conf/config.py b/shipyard_airflow/conf/config.py index 1b6106b0..afc42375 100644 --- a/shipyard_airflow/conf/config.py +++ b/shipyard_airflow/conf/config.py @@ -147,7 +147,7 @@ SECTIONS = [ ), cfg.IntOpt( 'prepare_site_task_timeout', - default=120, + default=300, help='Time out (in seconds) for prepare_site task' ), cfg.IntOpt( diff --git a/shipyard_airflow/plugins/drydock_operators.py b/shipyard_airflow/plugins/drydock_operators.py index a7429cda..d380fad2 100644 --- a/shipyard_airflow/plugins/drydock_operators.py +++ b/shipyard_airflow/plugins/drydock_operators.py @@ -28,6 +28,7 @@ from airflow.utils.decorators import apply_defaults import drydock_provisioner.drydock_client.client as client import drydock_provisioner.drydock_client.session as session from check_k8s_node_status import check_node_status +from drydock_provisioner import error as errors from service_endpoint import ucp_service_endpoint from service_token import shipyard_service_token @@ -152,7 +153,7 @@ class DryDockOperator(BaseOperator): # Create Task for prepare_site elif self.action == 'prepare_site': # Default settings for 'prepare_site' execution is to query - # the task every 10 seconds and to time out after 120 seconds + # the task every 10 seconds and to time out after 300 seconds query_interval = config.get('drydock', 'prepare_site_query_interval') task_timeout = config.get('drydock', 'prepare_site_task_timeout') @@ -249,8 +250,8 @@ class DryDockOperator(BaseOperator): logging.info('Task ID is %s', task_id) # Query Task - self.drydock_query_task(drydock_client, interval, time_out, - task_id) + self.drydock_query_task(drydock_client, context, interval, + time_out, task_id) def drydock_perform_task(self, drydock_client, context, perform_task, nodes_filter): @@ -279,7 +280,13 @@ class DryDockOperator(BaseOperator): else: raise AirflowException("Unable to create task!") - def 
drydock_query_task(self, drydock_client, interval, time_out, task_id): + def drydock_query_task(self, drydock_client, context, interval, + time_out, task_id): + + # Initialize Variables + keystone_token_expired = False + new_dd_client = None + dd_client = drydock_client # Calculate number of times to execute the 'for' loop # Convert 'time_out' and 'interval' from string into integer @@ -290,15 +297,49 @@ class DryDockOperator(BaseOperator): # Query task status for i in range(0, end_range + 1): + if keystone_token_expired: + logging.info("Established new drydock session") + dd_client = new_dd_client + try: # Retrieve current task state - task_state = drydock_client.get_task(task_id=task_id) + task_state = dd_client.get_task(task_id=task_id) task_status = task_state.get('status') task_result = task_state.get('result')['status'] logging.info("Current status of task id %s is %s", task_id, task_status) + + keystone_token_expired = False + + except errors.ClientUnauthorizedError as unauthorized_error: + + # TODO: This is a temporary workaround. Drydock will be + # updated with the appropriate fix in the drydock api + # client by having the session detect a 401/403 response + # and refresh the token appropriately. + # Logs drydock client unauthorized error + keystone_token_expired = True + logging.error(unauthorized_error) + + # Set up new drydock client with new keystone token + logging.info("Setting up new drydock session...") + + context['svc_endpoint'] = ucp_service_endpoint( + self, svc_type='physicalprovisioner') + + new_dd_client = self.drydock_session_client(context) + + except errors.ClientForbiddenError as forbidden_error: + raise AirflowException(forbidden_error) + + except errors.ClientError as client_error: + raise AirflowException(client_error) + except: + # There can be instances where there are intermittent network + # issues that prevents us from retrieving the task state. We + # will want to retry in such situations. 
logging.info("Unable to retrieve task state. Retrying...") # Raise Time Out Exception diff --git a/tests/unit/control/test.conf b/tests/unit/control/test.conf index 5930e7d8..fd1eb072 100644 --- a/tests/unit/control/test.conf +++ b/tests/unit/control/test.conf @@ -14,7 +14,7 @@ deploy_node_task_timeout = 3600 prepare_node_query_interval = 30 prepare_node_task_timeout = 1800 prepare_site_query_interval = 10 -prepare_site_task_timeout = 120 +prepare_site_task_timeout = 300 service_type = physicalprovisioner verify_site_query_interval = 10 verify_site_task_timeout = 60 diff --git a/tools/resources/shipyard.conf b/tools/resources/shipyard.conf index 86c4b328..29121f9d 100644 --- a/tools/resources/shipyard.conf +++ b/tools/resources/shipyard.conf @@ -16,7 +16,7 @@ deploy_node_task_timeout = 3600 prepare_node_query_interval = 30 prepare_node_task_timeout = 1800 prepare_site_query_interval = 10 -prepare_site_task_timeout = 120 +prepare_site_task_timeout = 300 service_type = physicalprovisioner verify_site_query_interval = 10 verify_site_task_timeout = 60