[Pure Storage] Enable sync repl volume creation during failover

Currently when cinder failover is invoked, due to the primary
storage backend being down, it is not possible, through the
driver, to create a new volume with sync replication
functionality. Non-replicated and async replicated volumes can
be created in this scenario - although not recommended due to
potential issues after failback.

A synchronously replicated volume could be safely created
during failover as the Pure Storage architecture can allow
this to happen. When the failed array is available again, any
new sync replication volumes created during the outage will be
automatically recovered by the backend's own internal systems.

This patch updates the driver to check, during volume creation,
if the backend is in failover mode and then allow sync volumes
to be correctly created, even though the primary array could be
inaccessible. Sync volume attachment will also be allowed to
continue should one of the backend replica pair arrays be down.

Creation of different replication volume types has been tested in
both failover and failback scenarios in Pure's labs, and this
patch has proved to work as expected.

Additionally included is work from abandoned
change I7ed3ebd7fec389870edad0c1cc07ac553854dd8a, which
resolves replication issues in A/A deployments.

Also fixes a bug where a deleted replication pod could cause the
driver to fail on restart.

Closes-Bug: #2035404
Change-Id: I58f0f10b63431896e7532b16b561683cd242e9ee
This commit is contained in:
Simon Dodsley 2023-09-18 15:14:20 -04:00 committed by Simon Dodsley
parent 89b930f7ff
commit e1d93531b9
3 changed files with 140 additions and 74 deletions

View File

@ -1085,6 +1085,48 @@ class PureBaseVolumeDriverTestCase(PureBaseSharedDriverTestCase):
for update, vol_name in zip(model_updates, vol_names): for update, vol_name in zip(model_updates, vol_names):
self.assertEqual(vol_name, update['provider_id']) self.assertEqual(vol_name, update['provider_id'])
@mock.patch(BASE_DRIVER_OBJ + '._swap_replication_state')
@mock.patch(BASE_DRIVER_OBJ + '._setup_replicated_pods')
@mock.patch(BASE_DRIVER_OBJ + '._generate_replication_retention')
@mock.patch(BASE_DRIVER_OBJ + '._setup_replicated_pgroups')
def test_do_setup_replicated_sync_rep_need_swap(
self,
mock_setup_repl_pgroups,
mock_generate_replication_retention,
mock_setup_pods,
mock_swap):
"""Test do_setup when using replication and active is secondary."""
retention = mock.MagicMock()
mock_generate_replication_retention.return_value = retention
self._setup_mocks_for_replication()
self.mock_config.safe_get.return_value = [
{
"backend_id": "foo",
"managed_backend_name": None,
"san_ip": "1.2.3.4",
"api_token": "abc123",
"type": "sync",
}
]
mock_sync_target = mock.MagicMock()
mock_sync_target.get.return_value = GET_ARRAY_SECONDARY
self.array.get.return_value = GET_ARRAY_PRIMARY
self.purestorage_module.FlashArray.side_effect = [self.array,
mock_sync_target]
self.driver._active_backend_id = 'foo'
self.driver.do_setup(None)
self.assertEqual(self.array, self.driver._array)
mock_setup_repl_pgroups.assert_has_calls([
mock.call(self.array, [mock_sync_target], 'cinder-group',
REPLICATION_INTERVAL_IN_SEC, retention),
])
mock_setup_pods.assert_has_calls([
mock.call(self.array, [mock_sync_target], 'cinder-pod')
])
mock_swap.assert_called_once_with(self.driver._array, mock_sync_target)
def test_update_provider_info_update_some(self): def test_update_provider_info_update_some(self):
test_vols = [ test_vols = [
self.new_fake_vol(spec={'id': fake.VOLUME_ID}, self.new_fake_vol(spec={'id': fake.VOLUME_ID},
@ -3331,10 +3373,13 @@ class PureBaseVolumeDriverTestCase(PureBaseSharedDriverTestCase):
] ]
self.assertEqual(expected_updates, volume_updates) self.assertEqual(expected_updates, volume_updates)
@mock.patch(BASE_DRIVER_OBJ + '._get_secondary')
@mock.patch(BASE_DRIVER_OBJ + '._get_flasharray') @mock.patch(BASE_DRIVER_OBJ + '._get_flasharray')
@mock.patch(BASE_DRIVER_OBJ + '._find_async_failover_target') @mock.patch(BASE_DRIVER_OBJ + '._find_async_failover_target')
def test_async_failover_error_propagates(self, mock_find_failover_target, def test_async_failover_error_propagates(self, mock_find_failover_target,
mock_get_array): mock_get_array,
mock_get_secondary):
mock_get_secondary.return_value = self.async_array2
mock_find_failover_target.return_value = ( mock_find_failover_target.return_value = (
self.async_array2, self.async_array2,
REPLICATED_PGSNAPS[1] REPLICATED_PGSNAPS[1]
@ -3647,9 +3692,6 @@ class PureISCSIDriverTestCase(PureBaseSharedDriverTestCase):
mock_get_iscsi_ports.assert_called_with(self.array) mock_get_iscsi_ports.assert_called_with(self.array)
mock_connection.assert_called_with(self.array, vol_name, mock_connection.assert_called_with(self.array, vol_name,
ISCSI_CONNECTOR, None, None) ISCSI_CONNECTOR, None, None)
self.assert_error_propagates([mock_get_iscsi_ports, mock_connection],
self.driver.initialize_connection,
vol, ISCSI_CONNECTOR)
@mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn") @mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn")
@mock.patch(ISCSI_DRIVER_OBJ + "._connect") @mock.patch(ISCSI_DRIVER_OBJ + "._connect")
@ -3675,9 +3717,6 @@ class PureISCSIDriverTestCase(PureBaseSharedDriverTestCase):
mock_get_iscsi_ports.assert_called_with(self.array) mock_get_iscsi_ports.assert_called_with(self.array)
mock_connection.assert_called_with(self.array, vol_name, mock_connection.assert_called_with(self.array, vol_name,
ISCSI_CONNECTOR, None, None) ISCSI_CONNECTOR, None, None)
self.assert_error_propagates([mock_get_iscsi_ports, mock_connection],
self.driver.initialize_connection,
vol, ISCSI_CONNECTOR)
@mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn") @mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn")
@mock.patch(ISCSI_DRIVER_OBJ + "._connect") @mock.patch(ISCSI_DRIVER_OBJ + "._connect")
@ -3849,10 +3888,6 @@ class PureISCSIDriverTestCase(PureBaseSharedDriverTestCase):
chap_password) chap_password)
self.assertDictEqual(result, real_result) self.assertDictEqual(result, real_result)
self.assert_error_propagates([mock_get_iscsi_ports, mock_connection],
self.driver.initialize_connection,
vol, ISCSI_CONNECTOR)
@mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn") @mock.patch(ISCSI_DRIVER_OBJ + "._get_wwn")
@mock.patch(ISCSI_DRIVER_OBJ + "._connect") @mock.patch(ISCSI_DRIVER_OBJ + "._connect")
@mock.patch(ISCSI_DRIVER_OBJ + "._get_target_iscsi_ports") @mock.patch(ISCSI_DRIVER_OBJ + "._get_target_iscsi_ports")
@ -4839,12 +4874,6 @@ class PureNVMEDriverTestCase(PureBaseSharedDriverTestCase):
mock_connection.assert_called_with( mock_connection.assert_called_with(
self.array, vol_name, NVME_CONNECTOR self.array, vol_name, NVME_CONNECTOR
) )
self.assert_error_propagates(
[mock_get_nvme_ports, mock_connection],
self.driver.initialize_connection,
vol,
NVME_CONNECTOR,
)
@mock.patch(NVME_DRIVER_OBJ + "._get_nguid") @mock.patch(NVME_DRIVER_OBJ + "._get_nguid")
@mock.patch(NVME_DRIVER_OBJ + "._get_wwn") @mock.patch(NVME_DRIVER_OBJ + "._get_wwn")
@ -4875,12 +4904,6 @@ class PureNVMEDriverTestCase(PureBaseSharedDriverTestCase):
mock_connection.assert_called_with( mock_connection.assert_called_with(
self.array, vol_name, NVME_CONNECTOR self.array, vol_name, NVME_CONNECTOR
) )
self.assert_error_propagates(
[mock_get_nvme_ports, mock_connection],
self.driver.initialize_connection,
vol,
NVME_CONNECTOR,
)
@mock.patch(NVME_DRIVER_OBJ + "._get_nguid") @mock.patch(NVME_DRIVER_OBJ + "._get_nguid")
@mock.patch(NVME_DRIVER_OBJ + "._get_wwn") @mock.patch(NVME_DRIVER_OBJ + "._get_wwn")

View File

@ -198,7 +198,7 @@ def pure_driver_debug_trace(f):
cls_name = driver.__class__.__name__ cls_name = driver.__class__.__name__
method_name = "%(cls_name)s.%(method)s" % {"cls_name": cls_name, method_name = "%(cls_name)s.%(method)s" % {"cls_name": cls_name,
"method": f.__name__} "method": f.__name__}
backend_name = driver._get_current_array().backend_id backend_name = driver._get_current_array(True).backend_id
LOG.debug("[%(backend_name)s] Enter %(method_name)s, args=%(args)s," LOG.debug("[%(backend_name)s] Enter %(method_name)s, args=%(args)s,"
" kwargs=%(kwargs)s", " kwargs=%(kwargs)s",
{ {
@ -440,13 +440,10 @@ class PureBaseVolumeDriver(san.SanDriver):
# If we have failed over at some point we need to adjust our current # If we have failed over at some point we need to adjust our current
# array based on the one that we have failed over to # array based on the one that we have failed over to
if (self._array is not None and if (self._active_backend_id and
self._active_backend_id is not None and
self._active_backend_id != self._array.backend_id): self._active_backend_id != self._array.backend_id):
for secondary_array in self._replication_target_arrays: secondary_array = self._get_secondary(self._active_backend_id)
if secondary_array.backend_id == self._active_backend_id:
self._swap_replication_state(self._array, secondary_array) self._swap_replication_state(self._array, secondary_array)
break
def do_setup_trisync(self): def do_setup_trisync(self):
repl_device = {} repl_device = {}
@ -2281,20 +2278,30 @@ class PureBaseVolumeDriver(san.SanDriver):
""" """
active_backend_id, volume_update_list, group_update_list = ( active_backend_id, volume_update_list, group_update_list = (
self.failover(context, volumes, secondary_id, groups)) self.failover(context, volumes, secondary_id, groups))
self.failover_completed(context, secondary_id) self.failover_completed(context, active_backend_id)
return active_backend_id, volume_update_list, group_update_list return active_backend_id, volume_update_list, group_update_list
@pure_driver_debug_trace @pure_driver_debug_trace
def failover_completed(self, context, secondary_id=None): def failover_completed(self, context, active_backend_id=None):
"""Failover to replication target.""" """Failover to replication target."""
LOG.info('Driver failover completion started.') LOG.info('Driver failover completion started.')
if secondary_id == 'default': current = self._get_current_array()
self._swap_replication_state(self._get_current_array(), # This should not happen unless we receive the same RPC message twice
if active_backend_id == current.backend_id:
LOG.info('No need to switch replication backend, already using it')
# Manager sets the active_backend to '' when secondary_id was default,
# but the driver failover_host method calls us with "default"
elif not active_backend_id or active_backend_id == 'default':
LOG.info('Failing back to %s', self._failed_over_primary_array)
self._swap_replication_state(current,
self._failed_over_primary_array, self._failed_over_primary_array,
failback=True) failback=True)
else: else:
self._swap_replication_state(self._get_current_array(), secondary = self._get_secondary(active_backend_id)
self._find_sync_failover_target()) LOG.info('Failing over to %s', secondary.backend_id)
self._swap_replication_state(current,
secondary)
LOG.info('Driver failover completion completed.') LOG.info('Driver failover completion completed.')
@pure_driver_debug_trace @pure_driver_debug_trace
@ -2340,7 +2347,7 @@ class PureBaseVolumeDriver(san.SanDriver):
'done after a failover has completed.') 'done after a failover has completed.')
raise exception.InvalidReplicationTarget(message=msg) raise exception.InvalidReplicationTarget(message=msg)
current_array = self._get_current_array() current_array = self._get_current_array(True)
LOG.debug("Failover replication for array %(primary)s to " LOG.debug("Failover replication for array %(primary)s to "
"%(secondary)s.", "%(secondary)s.",
{"primary": current_array.backend_id, {"primary": current_array.backend_id,
@ -2356,19 +2363,9 @@ class PureBaseVolumeDriver(san.SanDriver):
secondary_array = None secondary_array = None
pg_snap = None # used for async only pg_snap = None # used for async only
if secondary_id: if secondary_id:
for array in self._replication_target_arrays: secondary_array = self._get_secondary(secondary_id)
if array.backend_id == secondary_id: if secondary_array.replication_type in [REPLICATION_TYPE_ASYNC,
secondary_array = array REPLICATION_TYPE_SYNC]:
break
if not secondary_array:
raise exception.InvalidReplicationTarget(
reason=_("Unable to determine secondary_array from"
" supplied secondary: %(secondary)s.") %
{"secondary": secondary_id}
)
if secondary_array.replication_type == REPLICATION_TYPE_ASYNC:
pg_snap = self._get_latest_replicated_pg_snap( pg_snap = self._get_latest_replicated_pg_snap(
secondary_array, secondary_array,
self._get_current_array().array_name, self._get_current_array().array_name,
@ -2403,7 +2400,7 @@ class PureBaseVolumeDriver(san.SanDriver):
elif secondary_array.replication_type == REPLICATION_TYPE_SYNC: elif secondary_array.replication_type == REPLICATION_TYPE_SYNC:
model_updates = self._sync_failover_host(volumes, secondary_array) model_updates = self._sync_failover_host(volumes, secondary_array)
current_array = self._get_current_array() current_array = self._get_current_array(True)
return secondary_array.backend_id, model_updates, [] return secondary_array.backend_id, model_updates, []
@ -2436,6 +2433,7 @@ class PureBaseVolumeDriver(san.SanDriver):
if failback: if failback:
self._replication_target_arrays.append(current_array) self._replication_target_arrays.append(current_array)
self._is_replication_enabled = True self._is_replication_enabled = True
self._failed_over_primary_array = None
# If its sync rep then swap the two in their lists since it is a # If its sync rep then swap the two in their lists since it is a
# bi-directional setup, if the primary is still OK or comes back # bi-directional setup, if the primary is still OK or comes back
@ -2730,6 +2728,17 @@ class PureBaseVolumeDriver(san.SanDriver):
return secondary_array, pg_snap return secondary_array, pg_snap
def _get_secondary(self, secondary_id):
for array in self._replication_target_arrays:
if array.backend_id == secondary_id:
return array
raise exception.InvalidReplicationTarget(
reason=_("Unable to determine secondary_array from"
" supplied secondary: %(secondary)s.") %
{"secondary": secondary_id}
)
def _find_sync_failover_target(self): def _find_sync_failover_target(self):
secondary_array = None secondary_array = None
if not self._active_cluster_target_arrays: if not self._active_cluster_target_arrays:
@ -2766,6 +2775,14 @@ class PureBaseVolumeDriver(san.SanDriver):
# We have to rely on a call that is only available in REST API 1.3 # We have to rely on a call that is only available in REST API 1.3
# therefore we have to create a temporary FlashArray for this. # therefore we have to create a temporary FlashArray for this.
if hasattr(secondary_array, '_request_kwargs'):
target_array = self._get_flasharray(
secondary_array._target,
api_token=secondary_array._api_token,
rest_version='1.3',
request_kwargs=secondary_array._request_kwargs,
)
else:
target_array = self._get_flasharray( target_array = self._get_flasharray(
secondary_array._target, secondary_array._target,
api_token=secondary_array._api_token, api_token=secondary_array._api_token,
@ -2859,13 +2876,15 @@ class PureBaseVolumeDriver(san.SanDriver):
return wwn.lower() return wwn.lower()
def _get_current_array(self, init=False): def _get_current_array(self, init=False):
if not init and self._is_active_cluster_enabled: if (not init and
for target_array in self._active_cluster_target_arrays: self._is_active_cluster_enabled and
not self._failed_over_primary_array):
try: try:
pod_info = self._array.get_pod(self._replication_pod_name)
for target_array in self._active_cluster_target_arrays:
LOG.info("Checking target array %s...", LOG.info("Checking target array %s...",
target_array.array_name) target_array.array_name)
status_ok = False status_ok = False
pod_info = target_array.get_pod(self._replication_pod_name)
for pod_array in pod_info['arrays']: for pod_array in pod_info['arrays']:
if pod_array['array_id'] == target_array.array_id: if pod_array['array_id'] == target_array.array_id:
if pod_array['status'] == 'online': if pod_array['status'] == 'online':
@ -2917,7 +2936,8 @@ class PureISCSIDriver(PureBaseVolumeDriver, san.SanISCSIDriver):
pure_vol_name = self._get_vol_name(volume) pure_vol_name = self._get_vol_name(volume)
target_arrays = [self._get_current_array()] target_arrays = [self._get_current_array()]
if (self._is_vol_in_pod(pure_vol_name) and if (self._is_vol_in_pod(pure_vol_name) and
self._is_active_cluster_enabled): self._is_active_cluster_enabled and
not self._failed_over_primary_array):
target_arrays += self._uniform_active_cluster_target_arrays target_arrays += self._uniform_active_cluster_target_arrays
chap_username = None chap_username = None
@ -2928,9 +2948,15 @@ class PureISCSIDriver(PureBaseVolumeDriver, san.SanISCSIDriver):
targets = [] targets = []
for array in target_arrays: for array in target_arrays:
try:
connection = self._connect(array, pure_vol_name, connector, connection = self._connect(array, pure_vol_name, connector,
chap_username, chap_password) chap_username, chap_password)
except purestorage.PureError as err:
# Swallow any exception, just warn and continue
LOG.warning("self._connect failed with"
" message: %(msg)s", {"msg": err.reason})
continue
target_ports = self._get_target_iscsi_ports(array) target_ports = self._get_target_iscsi_ports(array)
targets.append({ targets.append({
"connection": connection, "connection": connection,
@ -3161,7 +3187,8 @@ class PureFCDriver(PureBaseVolumeDriver, driver.FibreChannelDriver):
pure_vol_name = self._get_vol_name(volume) pure_vol_name = self._get_vol_name(volume)
target_arrays = [self._get_current_array()] target_arrays = [self._get_current_array()]
if (self._is_vol_in_pod(pure_vol_name) and if (self._is_vol_in_pod(pure_vol_name) and
self._is_active_cluster_enabled): self._is_active_cluster_enabled and
not self._failed_over_primary_array):
target_arrays += self._uniform_active_cluster_target_arrays target_arrays += self._uniform_active_cluster_target_arrays
target_luns = [] target_luns = []
@ -3360,15 +3387,20 @@ class PureNVMEDriver(PureBaseVolumeDriver, driver.BaseVD):
"""Allow connection to connector and return connection info.""" """Allow connection to connector and return connection info."""
pure_vol_name = self._get_vol_name(volume) pure_vol_name = self._get_vol_name(volume)
target_arrays = [self._get_current_array()] target_arrays = [self._get_current_array()]
if ( if (self._is_vol_in_pod(pure_vol_name) and
self._is_vol_in_pod(pure_vol_name) self._is_active_cluster_enabled and
and self._is_active_cluster_enabled not self._failed_over_primary_array):
):
target_arrays += self._uniform_active_cluster_target_arrays target_arrays += self._uniform_active_cluster_target_arrays
targets = [] targets = []
for array in target_arrays: for array in target_arrays:
try:
connection = self._connect(array, pure_vol_name, connector) connection = self._connect(array, pure_vol_name, connector)
except purestorage.PureError as err:
# Swallow any exception, just warn and continue
LOG.warning("self._connect failed with"
" message: %(msg)s", {"msg": err.reason})
continue
target_ports = self._get_target_nvme_ports(array) target_ports = self._get_target_nvme_ports(array)
targets.append( targets.append(
{ {

View File

@ -0,0 +1,11 @@
---
features:
- |
Pure Storage driver: Allow synchronously replicated volumes
to be created during a replication failover event. These will
remain viable volumes when the replication is failed back to
its original state.
fixes:
- |
[Pure Storage] `Bug #2035404 <https://bugs.launchpad.net/cinder/+bug/2035404>`_:
Fixed issue with missing replication pod causing driver to fail on restart.