Improve remote install robustness

Adding retries to handle the following types of failure:
1. Create communication session failed - Failed to create session.
2. Unable to establish Redfish client connections to BMC at <ip address>
(Server not reachable, return code: 503).
3. Failure to set System Power State to On/Off.

Test Plan:
PASS: Retries work properly when session creation fails.
PASS: Retries work properly when unable to establish a Redfish client
      connection to BMC.
PASS: Retries work properly when a 500 error is returned in the "Power Off
      Host" stage.
PASS: rvmc script executed successfully without the above errors.

Story: 2010144
Task: 46761

Signed-off-by: Li Zhu <li.zhu@windriver.com>
Change-Id: I6bb2e0822a51770b181181b49a86fb51d6dca18b
This commit is contained in:
Li Zhu 2022-10-25 17:39:27 -04:00
parent 6c85ea114b
commit eaf07202a9

View File

@ -1,7 +1,7 @@
#!/usr/bin/python3 #!/usr/bin/python3
############################################################################### ###############################################################################
# #
# Copyright (c) 2019-2020 Wind River Systems, Inc. # Copyright (c) 2019-2022 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
@ -126,6 +126,7 @@ import yaml
# Import Redfish Python Library # Import Redfish Python Library
# Module: https://pypi.org/project/redfish/ # Module: https://pypi.org/project/redfish/
import redfish import redfish
from redfish.rest.v1 import InvalidCredentialsError
FEATURE_NAME = 'Redfish Virtual Media Controller' FEATURE_NAME = 'Redfish Virtual Media Controller'
@ -177,6 +178,14 @@ def ilog(string):
sys.stdout.write("\n%s Info : %s" % (t(), string)) sys.stdout.write("\n%s Info : %s" % (t(), string))
def wlog(string):
    """
    Warning Log Utility

    Write a 'Warn' record for the given text to stdout, prefixed with a
    newline and the value returned by t().

    :param string: the text to log
    """
    record = "\n%s Warn : %s" % (t(), string)
    sys.stdout.write(record)
def elog(string): def elog(string):
""" """
Error Log Utility Error Log Utility
@ -274,6 +283,21 @@ RETRY_DELAY_SECS = 10
# 2 second delay constant # 2 second delay constant
DELAY_2_SECS = 2 DELAY_2_SECS = 2
# maximum number of attempts to establish the BMC connection
MAX_CONNECTION_ATTEMPTS = 3
# interval, in seconds, between BMC connection attempts
CONNECTION_RETRY_INTERVAL = 15
# maximum number of attempts to create a session
MAX_SESSION_CREATION_ATTEMPTS = 3
# interval, in seconds, between session creation attempts
SESSION_CREATION_RETRY_INTERVAL = 15
# maximum number of retries after a transient http error
# (e.g. response status: 500)
MAX_HTTP_TRANSIENT_ERROR_RETRIES = 5
# interval, in seconds, between http request retries
HTTP_REQUEST_RETRY_INTERVAL = 10
def is_ipv6_address(address): def is_ipv6_address(address):
""" """
@ -454,7 +478,7 @@ class VmcObject(object):
dlog1("Password : %s" % self.pw_encoded) dlog1("Password : %s" % self.pw_encoded)
dlog1("Image : %s" % self.img) dlog1("Image : %s" % self.img)
def make_request(self, operation=None, path=None, payload=None): def make_request(self, operation=None, path=None, payload=None, retry=-1):
""" """
Issue a Redfish http request, Issue a Redfish http request,
Check response, Check response,
@ -467,6 +491,10 @@ class VmcObject(object):
:type path: str :type path: str
:param payload: POST or PATCH payload data :param payload: POST or PATCH payload data
:type payload: dictionary :type payload: dictionary
:param retry: The current retry count. The default value -1 disables
retry. If the value is in [0 .. MAX_HTTP_TRANSIENT_ERROR_RETRIES),
a request that gets an error response is retried.
:type retry: int
:returns True if request succeeded (200,202(accepted),204(no content) :returns True if request succeeded (200,202(accepted),204(no content)
""" """
@ -477,28 +505,34 @@ class VmcObject(object):
url = self.url url = self.url
before_request_time = datetime.datetime.now().replace(microsecond=0) before_request_time = datetime.datetime.now().replace(microsecond=0)
request_log = "Request : %s %s" % (operation, url)
try: try:
dlog3("Request : %s %s" % (operation, url))
if operation == GET: if operation == GET:
dlog3("Headers : %s : %s" % (operation, GET_HEADERS)) request_log += "\nHeaders : %s : %s" % \
(operation, GET_HEADERS)
self.response = self.redfish_obj.get(url, headers=GET_HEADERS) self.response = self.redfish_obj.get(url, headers=GET_HEADERS)
elif operation == POST: elif operation == POST:
dlog3("Headers : %s : %s" % (operation, POST_HEADERS)) request_log += "\nHeaders : %s : %s" % \
dlog3("Payload : %s" % payload) (operation, POST_HEADERS)
request_log += "\nPayload : %s" % payload
self.response = self.redfish_obj.post(url, self.response = self.redfish_obj.post(url,
body=payload, body=payload,
headers=POST_HEADERS) headers=POST_HEADERS)
elif operation == PATCH: elif operation == PATCH:
dlog3("Headers : %s : %s" % (operation, PATCH_HEADERS)) request_log += "\nHeaders : %s : %s" % \
dlog3("Payload : %s" % payload) (operation, PATCH_HEADERS)
request_log += "\nPayload : %s" % payload
self.response = self.redfish_obj.patch(url, self.response = self.redfish_obj.patch(url,
body=payload, body=payload,
headers=PATCH_HEADERS) headers=PATCH_HEADERS)
else: else:
dlog3(request_log)
elog("Unsupported operation: %s" % operation) elog("Unsupported operation: %s" % operation)
return False return False
dlog3(request_log)
except Exception as ex: except Exception as ex:
elog("Failed operation on '%s' (%s)" % (url, ex)) elog("Failed operation on '%s' (%s)" % (url, ex))
@ -507,7 +541,20 @@ class VmcObject(object):
delta = after_request_time - before_request_time delta = after_request_time - before_request_time
# if we got a response, check its status # if we got a response, check its status
if self.check_ok_status(url, operation, delta.seconds) is False: if self.check_ok_status(url, operation, delta.seconds) is False:
self._exit(1) if retry < 0 or retry >= MAX_HTTP_TRANSIENT_ERROR_RETRIES:
elog("Failed in an error response:\n%s" % self.response)
self._exit(1)
else:
retry += 1
wlog("Got an error response for: \n%s" % request_log)
ilog("Make request: retry (%i of %i) in %i secs." %
(retry, MAX_HTTP_TRANSIENT_ERROR_RETRIES,
HTTP_REQUEST_RETRY_INTERVAL))
time.sleep(HTTP_REQUEST_RETRY_INTERVAL)
self.make_request(operation=operation,
path=path,
payload=payload,
retry=retry)
# handle 204 success with no content ; clear last response # handle 204 success with no content ; clear last response
if self.response.status == 204: if self.response.status == 204:
@ -725,26 +772,34 @@ class VmcObject(object):
ilog("BMC Ping Ok : %s (%i)" % (self.ip, ping_count)) ilog("BMC Ping Ok : %s (%i)" % (self.ip, ping_count))
# try to connect # try to connect
connect_error = False fail_counter = 0
try: err_msg = "Unable to establish %s to BMC at %s." % (stage, self.uri)
# One time Redfish Client Object Create while fail_counter < MAX_CONNECTION_ATTEMPTS:
self.redfish_obj = \ ex_log = ""
redfish.redfish_client(base_url=self.uri, try:
username=self.un, # One time Redfish Client Object Create
password=self.pw, self.redfish_obj = \
default_prefix=REDFISH_ROOT_PATH) redfish.redfish_client(base_url=self.uri,
if self.redfish_obj is None: username=self.un,
connect_error = True password=self.pw,
elog("Unable to establish %s to BMC at %s" % default_prefix=REDFISH_ROOT_PATH)
(stage, self.uri)) if self.redfish_obj is None:
except Exception as ex: fail_counter += 1
connect_error = True else:
elog("Unable to establish %s to BMC at %s (%s)" % return
(stage, self.uri, ex)) except Exception as ex:
fail_counter += 1
ex_log = " (%s)" % str(ex)
if connect_error is True: if fail_counter < MAX_CONNECTION_ATTEMPTS:
alog("Check BMC ip address is pingable and supports Redfish") wlog(err_msg + " Retry (%i/%i) in %i secs." %
self._exit(1) (fail_counter, MAX_CONNECTION_ATTEMPTS - 1,
CONNECTION_RETRY_INTERVAL) + ex_log)
time.sleep(CONNECTION_RETRY_INTERVAL)
elog(err_msg)
alog("Check BMC ip address is pingable and supports Redfish")
self._exit(1)
########################################################################### ###########################################################################
# Redfish Root Query # Redfish Root Query
@ -784,14 +839,27 @@ class VmcObject(object):
stage = 'Create Communication Session' stage = 'Create Communication Session'
slog(stage) slog(stage)
try: fail_counter = 0
self.redfish_obj.login(auth="session") while fail_counter < MAX_SESSION_CREATION_ATTEMPTS:
dlog1("Session : Open") try:
self.session = True self.redfish_obj.login(auth="session")
dlog1("Session : Open")
except Exception as ex: self.session = True
elog("Failed to Create session ; %s" % ex) return
self._exit(1) except InvalidCredentialsError:
elog("Failed to Create session due to invalid credentials.")
alog("Check BMC username and password in config file")
self._exit(1)
except Exception as ex:
err_msg = "Failed to Create session ; %s." % str(ex)
fail_counter += 1
if fail_counter >= MAX_SESSION_CREATION_ATTEMPTS:
elog(err_msg)
self._exit(1)
wlog(err_msg + " Retry (%i/%i) in %i secs."
% (fail_counter, MAX_SESSION_CREATION_ATTEMPTS - 1,
CONNECTION_RETRY_INTERVAL))
time.sleep(SESSION_CREATION_RETRY_INTERVAL)
########################################################################### ###########################################################################
# Query Redfish Managers # Query Redfish Managers
@ -911,7 +979,8 @@ class VmcObject(object):
self._exit(1) self._exit(1)
if self.make_request(operation=GET, if self.make_request(operation=GET,
path=self.systems_member_url) is False: path=self.systems_member_url,
retry=0) is False:
elog("Unable to get %s from %s" % elog("Unable to get %s from %s" %
(info, self.systems_member_url)) (info, self.systems_member_url))
self._exit(1) self._exit(1)
@ -1048,7 +1117,7 @@ class VmcObject(object):
poll_count = 0 poll_count = 0
MAX_STATE_POLL_COUNT = 60 # some servers take longer than 10 seconds MAX_STATE_POLL_COUNT = 60 # some servers take longer than 10 seconds
while poll_count < MAX_STATE_POLL_COUNT and self.power_state != state: while poll_count < MAX_STATE_POLL_COUNT and self.power_state != state:
time.sleep(1) time.sleep(3)
poll_count = poll_count + 1 poll_count = poll_count + 1
# get systems info # get systems info
@ -1307,7 +1376,6 @@ class VmcObject(object):
while poll_count < MAX_POLL_COUNT and ejecting: while poll_count < MAX_POLL_COUNT and ejecting:
# verify the image is not in inserted # verify the image is not in inserted
poll_count = poll_count + 1 poll_count = poll_count + 1
vm_eject = self.vm_actions.get(eject_media_label)
if self.make_request(operation=GET, if self.make_request(operation=GET,
path=self.vm_url) is True: path=self.vm_url) is True:
if self.get_key_value('Inserted') is False: if self.get_key_value('Inserted') is False: