Handle REST API timeouts gracefully in the VIM

The VIM is leaking file descriptors (FDs). The problem happens as
follows:
- The VIM has worker processes that are used to communicate with
  other processes through their REST APIs (e.g. sysinv, nova,
  cinder). The VIM does not specify a timeout when sending REST API
  requests.
- The VIM does have a timeout for how long a worker process takes to
  process a request, which can vary depending on the request.
- If the worker process sends a REST API request and does not get a
  response in time (e.g. because a message is lost or the target
  process is down), the VIM terminates the worker process. This is
  done with a call to Process.terminate() in the Python
  multiprocessing library. The docs for this library clearly indicate
  that Process.terminate() should not be used for a process that uses
  any shared resources (e.g. pipes). In this case, the worker
  processes are using shared resources (pipes, for one) and these
  resources are not freed, leading to the FD leak.

The solution is to ensure that a timeout is set when sending REST API
requests. This timeout must be less than the worker timeout to ensure
that the workers do not time out (and leak FDs) except in the rarest
of cases.

Change-Id: Iccff914e86224be96689738cdcc536a4d5acb861
Closes-Bug: 1862049
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
This commit is contained in:
Bart Wensley 2020-06-03 11:37:56 -05:00
parent 6817c1cc15
commit ccd59a0711
8 changed files with 114 additions and 38 deletions

View File

@ -1 +1 @@
TIS_PATCH_VER=79
TIS_PATCH_VER=80

View File

@ -55,11 +55,26 @@ class TaskFuture(object):
del kwargs['timeout_in_secs']
if timeout_in_secs is None:
# WARNING: Any change to the default timeout must be reflected in
# the timeouts used for any work being done.
timeout_in_secs = 20
elif 0 >= timeout_in_secs:
timeout_in_secs = None # No timeout wanted, wait forever
# Note about timeouts. When the timeout expires, the VIM will terminate
# the worker process doing the work. Unfortunately, the python
# multiprocessing library used to manage these processes results in
# leaked file descriptors each time a process is terminated. That
# means this timeout should be a last resort - the work being done
# (e.g. sending a REST API request) must have its own timeout
# mechanism to ensure it completes before the worker process times
# out. Adding 5 seconds to the configured (or default) timeout to
# ensure the underlying timeout mechanism has the opportunity to
# abort the work being done.
if timeout_in_secs is not None:
timeout_in_secs += 5
if self._scheduler.running_task is not None:
task_work = TaskWork(timeout_in_secs, target, *args, **kwargs)
self._scheduler.running_task.add_task_work(task_work)

View File

@ -115,6 +115,8 @@ max_request_wait_in_secs=45
host=127.0.0.1
port=30004
# WARNING: Any changes to these timeouts must be reflected in the timeouts
# used for the associated REST API calls.
[nfvi-timeouts]
openstack.get_token=10
neutron.disable_host_services=40

View File

@ -204,8 +204,11 @@ def upload_image_data_by_url(token, image_id, image_data_url):
operations.append(operation)
api_cmd_payload = operations
# WARNING: Any change to the timeout must be reflected in the config.ini
# file for the nfvi plugins.
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=180)
return response
@ -227,8 +230,10 @@ def upload_image_data_by_file(token, image_id, image_file):
file = open(image_file, "rb")
api_cmd_payload = file
try:
# WARNING: Any change to the timeout must be reflected in the config.ini
# file for the nfvi plugins.
response = rest_api_request(token, "PUT", api_cmd, api_cmd_headers,
api_cmd_payload)
api_cmd_payload, timeout_in_secs=180)
finally:
file.close()

View File

@ -569,7 +569,10 @@ def delete_host_services(token, host_uuid):
api_cmd_headers = dict()
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers)
# WARNING: Any change to the timeout must be reflected in the config.ini
# file for the nfvi plugins.
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers,
timeout_in_secs=40)
return response
@ -724,8 +727,11 @@ def disable_host_services(token, host_uuid):
api_cmd_payload = dict()
api_cmd_payload['host'] = payload
# WARNING: Any change to the timeout must be reflected in the config.ini
# file for the nfvi plugins.
response = rest_api_request(token, "PUT", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=40)
return response

View File

@ -60,7 +60,9 @@ def get_token(directory):
}}}})
request_info.add_data(payload)
request = urllib.request.urlopen(request_info)
# WARNING: Any change to the timeout must be reflected in the config.ini
# file for the nfvi plugins.
request = urllib.request.urlopen(request_info, timeout=10)
# Identity API v3 returns token id in X-Subject-Token
# response header.
token_id = request.info().getheader('X-Subject-Token')

View File

@ -287,8 +287,8 @@ def rest_api_get_server(host, port):
return RestAPIServer(host, port)
def _rest_api_request(token_id, method, api_cmd, api_cmd_headers=None,
api_cmd_payload=None):
def _rest_api_request(token_id, method, api_cmd, api_cmd_headers,
api_cmd_payload, timeout_in_secs):
"""
Internal: make a rest-api request
"""
@ -320,7 +320,7 @@ def _rest_api_request(token_id, method, api_cmd, api_cmd_headers=None,
# opener = urllib.request.build_opener(handler)
# urllib.request.install_opener(opener)
request = urllib.request.urlopen(request_info)
request = urllib.request.urlopen(request_info, timeout=timeout_in_secs)
headers = list() # list of tuples
for key, value in request.info().items():
@ -424,15 +424,29 @@ def _rest_api_request(token_id, method, api_cmd, api_cmd_headers=None,
raise OpenStackException(method, api_cmd, api_cmd_headers,
api_cmd_payload, str(e), str(e))
except Exception as e:
now_ms = timers.get_monotonic_timestamp_in_ms()
elapsed_ms = now_ms - start_ms
log_error("Rest-API failure, %s, %s, hdrs=%s, payload=%s, elapsed_ms=%s"
% (method, api_cmd, api_cmd_headers, api_cmd_payload,
int(elapsed_ms)))
raise OpenStackException(method, api_cmd, api_cmd_headers,
api_cmd_payload, str(e), str(e))
def rest_api_request(token, method, api_cmd, api_cmd_headers=None,
api_cmd_payload=None):
api_cmd_payload=None, timeout_in_secs=20):
"""
Make a rest-api request using the given token
WARNING: Any change to the default timeout must be reflected in the timeout
calculations done in the TaskFuture class.
"""
try:
return _rest_api_request(token.get_id(), method, api_cmd,
api_cmd_headers, api_cmd_payload)
api_cmd_headers, api_cmd_payload,
timeout_in_secs)
except OpenStackRestAPIException as e:
if httplib.UNAUTHORIZED == e.http_status_code:
@ -441,9 +455,12 @@ def rest_api_request(token, method, api_cmd, api_cmd_headers=None,
def rest_api_request_with_context(context, method, api_cmd,
api_cmd_headers=None, api_cmd_payload=None):
api_cmd_headers=None, api_cmd_payload=None,
timeout_in_secs=20):
"""
Make a rest-api request using the given context
WARNING: Any change to the default timeout must be reflected in the timeout
calculations done in the TaskFuture class.
"""
return _rest_api_request(context.token_id, method, api_cmd, api_cmd_headers,
api_cmd_payload)
api_cmd_payload, timeout_in_secs)

View File

@ -11,6 +11,10 @@ from nfv_plugins.nfvi_plugins.openstack.rest_api import rest_api_request
DLOG = debug.debug_get_logger('nfv_plugins.nfvi_plugins.openstack.sysinv')
# WARNING: Any change to this timeout must be reflected in the config.ini
# file for the nfvi plugins.
REST_API_REQUEST_TIMEOUT = 45
def get_datanetworks(token, host_uuid):
"""
@ -25,7 +29,8 @@ def get_datanetworks(token, host_uuid):
api_cmd_headers['Content-Type'] = "application/json"
api_cmd_headers['User-Agent'] = "vim/1.0"
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers)
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
result_data = response.result_data['interface_datanetworks']
return result_data
@ -42,7 +47,8 @@ def get_system_info(token):
api_cmd = url + "/isystems"
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -56,7 +62,8 @@ def get_hosts(token):
api_cmd = url + "/ihosts"
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -70,7 +77,8 @@ def get_host(token, host_uuid):
api_cmd = url + "/ihosts/%s" % host_uuid
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -84,7 +92,8 @@ def get_host_labels(token, host_uuid):
api_cmd = url + "/ihosts/%s/labels" % host_uuid
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -98,7 +107,8 @@ def get_upgrade(token):
api_cmd = url + "/upgrade"
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -120,7 +130,8 @@ def upgrade_start(token):
api_cmd_payload['force'] = "false"
response = rest_api_request(token, "POST", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -147,7 +158,8 @@ def upgrade_activate(token):
api_cmd_payload.append(host_data)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -165,7 +177,8 @@ def upgrade_complete(token):
api_cmd_headers['Content-Type'] = "application/json"
api_cmd_headers['User-Agent'] = "vim/1.0"
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers)
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -179,7 +192,8 @@ def get_host_lvgs(token, host_uuid):
api_cmd = url + "/ihosts/%s/ilvgs" % host_uuid
response = rest_api_request(token, "GET", api_cmd)
response = rest_api_request(token, "GET", api_cmd,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -206,7 +220,8 @@ def notify_host_services_enabled(token, host_uuid):
api_cmd_list.append(api_cmd_payload)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_list))
json.dumps(api_cmd_list),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -233,7 +248,8 @@ def notify_host_services_disabled(token, host_uuid):
api_cmd_list.append(api_cmd_payload)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_list))
json.dumps(api_cmd_list),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -260,7 +276,8 @@ def notify_host_services_disable_extend(token, host_uuid):
api_cmd_list.append(api_cmd_payload_action)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_list))
json.dumps(api_cmd_list),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -293,7 +310,8 @@ def notify_host_services_disable_failed(token, host_uuid, reason):
api_cmd_list.append(api_cmd_payload_reason)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_list))
json.dumps(api_cmd_list),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -311,7 +329,8 @@ def notify_host_services_deleted(token, host_uuid):
api_cmd_headers['Content-Type'] = "application/json"
api_cmd_headers['User-Agent'] = "vim/1.0"
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers)
response = rest_api_request(token, "DELETE", api_cmd, api_cmd_headers,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -344,7 +363,8 @@ def notify_host_services_delete_failed(token, host_uuid, reason):
api_cmd_list.append(api_cmd_payload_reason)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_list))
json.dumps(api_cmd_list),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -371,7 +391,8 @@ def lock_host(token, host_uuid):
api_cmd_payload.append(host_data)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -398,7 +419,8 @@ def unlock_host(token, host_uuid):
api_cmd_payload.append(host_data)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -425,7 +447,8 @@ def reboot_host(token, host_uuid):
api_cmd_payload.append(host_data)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -447,7 +470,8 @@ def upgrade_host(token, host_uuid):
api_cmd_payload['force'] = "false"
response = rest_api_request(token, "POST", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -474,7 +498,8 @@ def swact_from_host(token, host_uuid):
api_cmd_payload.append(host_data)
response = rest_api_request(token, "PATCH", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -492,7 +517,8 @@ def get_host_devices(token, host_uuid):
api_cmd_headers['Content-Type'] = "application/json"
api_cmd_headers['User-Agent'] = "vim/1.0"
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers)
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -510,7 +536,8 @@ def get_host_device(token, device_uuid):
api_cmd_headers['Content-Type'] = "application/json"
api_cmd_headers['User-Agent'] = "vim/1.0"
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers)
response = rest_api_request(token, "GET", api_cmd, api_cmd_headers,
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -531,7 +558,8 @@ def host_device_image_update(token, host_uuid):
api_cmd_payload = dict()
response = rest_api_request(token, "POST", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response
@ -552,5 +580,6 @@ def host_device_image_update_abort(token, host_uuid):
api_cmd_payload = dict()
response = rest_api_request(token, "POST", api_cmd, api_cmd_headers,
json.dumps(api_cmd_payload))
json.dumps(api_cmd_payload),
timeout_in_secs=REST_API_REQUEST_TIMEOUT)
return response