Update DryDock Operator

The errors shown in [0] were encountered during our end-to-end
testing. They occur when the workflow runs long enough for the
keystone token to expire while a DryDock task is still being queried.

It is also possible for the 'prepare_site' task to take more than
120 seconds to complete. Hence we are increasing the default value of
'prepare_site_task_timeout' from 120 seconds to 300 seconds.

This patch set addresses both of the above observations.

[0] Logs from DryDock

Authorization failed for token
Identity response: {"error": {"message": "Failed to validate token", "code": 404, "title": "Not Found"}}
Authorization failed for token

Change-Id: I4760e390822e6e8c9540216035e263d054fde400
This commit is contained in:
Anthony Lin 2018-01-05 07:47:11 +00:00
parent 609bc0a624
commit 5190189a60
6 changed files with 51 additions and 10 deletions

View File

@ -323,7 +323,7 @@ conf:
verify_site_query_interval: 10
verify_site_task_timeout: 60
prepare_site_query_interval: 10
prepare_site_task_timeout: 120
prepare_site_task_timeout: 300
prepare_node_query_interval: 30
prepare_node_task_timeout: 1800
deploy_node_query_interval: 30

View File

@ -68,7 +68,7 @@
#prepare_site_query_interval = 10
# Time out (in seconds) for prepare_site task (integer value)
#prepare_site_task_timeout = 120
#prepare_site_task_timeout = 300
# Query interval (in seconds) for prepare_node task (integer value)
#prepare_node_query_interval = 30

View File

@ -147,7 +147,7 @@ SECTIONS = [
),
cfg.IntOpt(
'prepare_site_task_timeout',
default=120,
default=300,
help='Time out (in seconds) for prepare_site task'
),
cfg.IntOpt(

View File

@ -28,6 +28,7 @@ from airflow.utils.decorators import apply_defaults
import drydock_provisioner.drydock_client.client as client
import drydock_provisioner.drydock_client.session as session
from check_k8s_node_status import check_node_status
from drydock_provisioner import error as errors
from service_endpoint import ucp_service_endpoint
from service_token import shipyard_service_token
@ -152,7 +153,7 @@ class DryDockOperator(BaseOperator):
# Create Task for prepare_site
elif self.action == 'prepare_site':
# Default settings for 'prepare_site' execution is to query
# the task every 10 seconds and to time out after 120 seconds
# the task every 10 seconds and to time out after 300 seconds
query_interval = config.get('drydock',
'prepare_site_query_interval')
task_timeout = config.get('drydock', 'prepare_site_task_timeout')
@ -249,8 +250,8 @@ class DryDockOperator(BaseOperator):
logging.info('Task ID is %s', task_id)
# Query Task
self.drydock_query_task(drydock_client, interval, time_out,
task_id)
self.drydock_query_task(drydock_client, context, interval,
time_out, task_id)
def drydock_perform_task(self, drydock_client, context,
perform_task, nodes_filter):
@ -279,7 +280,13 @@ class DryDockOperator(BaseOperator):
else:
raise AirflowException("Unable to create task!")
def drydock_query_task(self, drydock_client, interval, time_out, task_id):
def drydock_query_task(self, drydock_client, context, interval,
time_out, task_id):
# Initialize Variables
keystone_token_expired = False
new_dd_client = None
dd_client = drydock_client
# Calculate number of times to execute the 'for' loop
# Convert 'time_out' and 'interval' from string into integer
@ -290,15 +297,49 @@ class DryDockOperator(BaseOperator):
# Query task status
for i in range(0, end_range + 1):
if keystone_token_expired:
logging.info("Established new drydock session")
dd_client = new_dd_client
try:
# Retrieve current task state
task_state = drydock_client.get_task(task_id=task_id)
task_state = dd_client.get_task(task_id=task_id)
task_status = task_state.get('status')
task_result = task_state.get('result')['status']
logging.info("Current status of task id %s is %s",
task_id, task_status)
keystone_token_expired = False
except errors.ClientUnauthorizedError as unauthorized_error:
# TODO: This is a temporary workaround. Drydock will be
# updated with the appropriate fix in the drydock api
# client by having the session detect a 401/403 response
# and refresh the token appropriately.
# Logs drydock client unauthorized error
keystone_token_expired = True
logging.error(unauthorized_error)
# Set up new drydock client with new keystone token
logging.info("Setting up new drydock session...")
context['svc_endpoint'] = ucp_service_endpoint(
self, svc_type='physicalprovisioner')
new_dd_client = self.drydock_session_client(context)
except errors.ClientForbiddenError as forbidden_error:
raise AirflowException(forbidden_error)
except errors.ClientError as client_error:
raise AirflowException(client_error)
except:
# There can be instances where there are intermittent network
# issues that prevents us from retrieving the task state. We
# will want to retry in such situations.
logging.info("Unable to retrieve task state. Retrying...")
# Raise Time Out Exception

View File

@ -14,7 +14,7 @@ deploy_node_task_timeout = 3600
prepare_node_query_interval = 30
prepare_node_task_timeout = 1800
prepare_site_query_interval = 10
prepare_site_task_timeout = 120
prepare_site_task_timeout = 300
service_type = physicalprovisioner
verify_site_query_interval = 10
verify_site_task_timeout = 60

View File

@ -16,7 +16,7 @@ deploy_node_task_timeout = 3600
prepare_node_query_interval = 30
prepare_node_task_timeout = 1800
prepare_site_query_interval = 10
prepare_site_task_timeout = 120
prepare_site_task_timeout = 300
service_type = physicalprovisioner
verify_site_query_interval = 10
verify_site_task_timeout = 60