Remove redfish cache entry upon errors

Some transient errors can ultimately cause the client to need to be
completely restarted due to cached connection data.

Ironic now explicitly removes the cache entry when a sushy
AccessError or python AttributeError is detected originating
from the library. This will now result in the prior cached connection
object to be discarded, and upon the next attempt to interact with
the same node, a new connection will be launched.

This will result in new sessions being created, but in all likelihood
the prior session had already timed out or had been administratively
removed. Sushy's code, as of
https://review.opendev.org/c/openstack/sushy/+/820076
will raise SessionService lookup access errors as AccessErrors.
Prior to that change, they should have been raised as AttributeError
as the previous call sould have returned None to be used as an object.

*Also* Includes follow-up change Ia59f774c9340e3a6fa63418afedf12098c709052
squashed into this change, and necessary backport friendly mock of
AccessError which had changed since this release of ironic.

Change-Id: Icc6e5dd74d9f15e679a7e764fe49238ed6b8dc1e
Story: 2009719
Task: 44107
(cherry picked from commit 1439af27ba)
(cherry picked from commit 01997c8418)
(cherry picked from commit e3e7deaf48)
(cherry picked from commit ab5495eecb)
(cherry picked from commit e57aa7598b)
(cherry picked from commit 3e3afc3254)
(cherry picked from commit 4b6b63ac66)
This commit is contained in:
Julia Kreger 2021-12-01 14:20:35 -08:00
parent edd6737c68
commit 36f3105016
4 changed files with 115 additions and 1 deletions

View File

@ -215,6 +215,16 @@ class SessionCache(object):
# NOTE(etingof): perhaps this session token is no good
if isinstance(exc_val, sushy.exceptions.ConnectionError):
self.__class__._sessions.pop(self._session_key, None)
# NOTE(TheJulia): A hard access error has surfaced, we
# likely need to eliminate the session.
if isinstance(exc_val, sushy.exceptions.AccessError):
self.__class__._sessions.pop(self._session_key, None)
# NOTE(TheJulia): Something very bad has happened, such
# as the session is out of date, and refresh of the SessionService
# failed resulting in an AttributeError surfacing.
# https://storyboard.openstack.org/#!/story/2009719
if isinstance(exc_val, AttributeError):
self.__class__._sessions.pop(self._session_key, None)
@classmethod
def _expire_oldest_session(cls):
@ -265,6 +275,23 @@ def get_system(node):
'auth_type': driver_info['auth_type'],
'node': node.uuid, 'error': e})
raise exception.RedfishConnectionError(node=node.uuid, error=e)
except sushy.exceptions.AccessError as e:
LOG.warning('For node %(node)s, we received an authentication '
'access error from address %(address)s with auth_type '
'%(auth_type)s. The client will not be re-used upon '
'the next re-attempt. Please ensure your using the '
'correct credentials. Error: %(error)s',
{'address': driver_info['address'],
'auth_type': driver_info['auth_type'],
'node': node.uuid, 'error': e})
raise exception.RedfishError(node=node.uuid, error=e)
except AttributeError as e:
LOG.warning('For node %(node)s, we received at AttributeError '
'when attempting to utilize the client. A new '
'client session shall be used upon the next attempt.'
'Attribute Error: %(error)s',
{'node': node.uuid, 'error': e})
raise exception.RedfishError(node=node.uuid, error=e)
try:
return _get_system()

View File

@ -16,6 +16,7 @@
import collections
import copy
import os
import time
import mock
from oslo_config import cfg
@ -191,8 +192,9 @@ class RedfishUtilsTestCase(db_base.DbTestCase):
# Redfish specific configurations
self.config(connection_attempts=3, group='redfish')
fake_conn = mock_sushy.return_value
fake_conn = mock.Mock()
fake_conn.get_system.side_effect = sushy.exceptions.ConnectionError()
mock_sushy.return_value = fake_conn
self.assertRaises(exception.RedfishConnectionError,
redfish_utils.get_system, self.node)
@ -206,6 +208,76 @@ class RedfishUtilsTestCase(db_base.DbTestCase):
mock_sleep.assert_called_with(
redfish_utils.CONF.redfish.connection_retry_interval)
@mock.patch.object(time, 'sleep', lambda seconds: None)
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch('ironic.drivers.modules.redfish.utils.'
'SessionCache._sessions', {})
def test_get_system_resource_access_error_retry(self, mock_sushy):
# Sushy access errors HTTP Errors
class fake_response(object):
status_code = 401
body = None
@classmethod
def json(cls):
return {}
fake_conn = mock_sushy.return_value
fake_system = mock.Mock()
fake_conn.get_system.side_effect = iter(
[
sushy.exceptions.AccessError(
method='GET',
url='http://path/to/url',
response=fake_response),
fake_system,
])
self.assertRaises(exception.RedfishError,
redfish_utils.get_system, self.node)
# Retry, as in next power sync perhaps
client = redfish_utils.get_system(self.node)
client('foo')
expected_get_system_calls = [
mock.call(self.parsed_driver_info['system_id']),
mock.call(self.parsed_driver_info['system_id']),
]
fake_conn.get_system.assert_has_calls(expected_get_system_calls)
fake_system.assert_called_with('foo')
self.assertEqual(fake_conn.get_system.call_count, 2)
@mock.patch.object(time, 'sleep', lambda seconds: None)
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch('ironic.drivers.modules.redfish.utils.'
'SessionCache._sessions', {})
def test_get_system_resource_attribute_error(self, mock_sushy):
fake_conn = mock_sushy.return_value
fake_system = mock.Mock()
fake_conn.get_system.side_effect = iter(
[
AttributeError,
fake_system,
])
# We need to check for AttributeError explicitly as
# otherwise we break existing tests if we try to catch
# it explicitly.
self.assertRaises(exception.RedfishError,
redfish_utils.get_system, self.node)
# Retry, as in next power sync perhaps
client = redfish_utils.get_system(self.node)
client('bar')
expected_get_system_calls = [
mock.call(self.parsed_driver_info['system_id']),
mock.call(self.parsed_driver_info['system_id']),
]
fake_conn.get_system.assert_has_calls(expected_get_system_calls)
fake_system.assert_called_once_with('bar')
self.assertEqual(fake_conn.get_system.call_count, 2)
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch('ironic.drivers.modules.redfish.utils.'
'SessionCache._sessions', {})

View File

@ -224,6 +224,8 @@ if not sushy:
type('SushyError', (MockKwargsException,), {}))
sushy.exceptions.ConnectionError = (
type('ConnectionError', (sushy.exceptions.SushyError,), {}))
sushy.exceptions.AccessError = (
type('AccessError', (sushy.exceptions.SushyError,), {}))
sushy.exceptions.ResourceNotFoundError = (
type('ResourceNotFoundError', (sushy.exceptions.SushyError,), {}))
sushy.exceptions.MissingAttributeError = (

View File

@ -0,0 +1,13 @@
---
fixes:
- |
Fixes connection caching issues with Redfish BMCs where AccessErrors were
previously not disqualifying the cached connection from being re-used.
Ironic will now explicitly open a new connection instead of using the
previous connection in the cache. Under normal circumstances, the
``sushy`` redfish library would detect and refresh sessions,
however a prior case exists where it may not detect a failure and contain
cached session credential data which is ultimately invalid, blocking
future access to the BMC via Redfish until the cache entry expired or
the ``ironic-conductor`` service was restarted. For more information
please see `story 2009719 <https://storyboard.openstack.org/#!/story/2009719>`_.