Add action to force resync of images in all pools on local endpoint

There exist failure scenarios where abrupt shutdown and/or
interruptions to communication may lead to a split-brain situation
where the RBD mirroring processes in both Ceph clusters claim to be
the primary.

In such a situation the operator must decide which cluster has the
most recent data and should be elected primary, by using the
``demote`` and ``promote`` (optionally with the ``force`` parameter)
actions.

After making this decision, the secondary cluster must be resynced
to track the promoted master; this is done by running the
``resync-pools`` action on the non-master cluster.

Change-Id: I4f57c9202ed4d055066286f808369ec0ddddb7ea
Frode Nordahl 2019-04-01 12:08:55 +02:00
parent 0770640158
commit 79bc4e1379
5 changed files with 97 additions and 1 deletions

src/README.md

@@ -6,7 +6,32 @@ Ceph 12.2 Luminous or later is required.
# Usage

TBC

## Recovering from abrupt shutdown

There exist failure scenarios where abrupt shutdown and/or interruptions to
communication may lead to a split-brain situation where the RBD mirroring
processes in both Ceph clusters claim to be the primary.

In such a situation the operator must decide which cluster has the most
recent data and should be elected primary, by using the ``demote`` and
``promote`` (optionally with the ``force`` parameter) actions.

After making this decision, the secondary cluster must be resynced to track
the promoted master; this is done by running the ``resync-pools`` action on
the non-master cluster.

    juju run-action -m site-b ceph-rbd-mirror/leader --wait demote
    juju run-action -m site-a ceph-rbd-mirror/leader --wait promote force=True

    juju run-action -m site-a ceph-rbd-mirror/leader --wait status verbose=True
    juju run-action -m site-b ceph-rbd-mirror/leader --wait status verbose=True

    juju run-action -m site-b ceph-rbd-mirror/leader --wait resync-pools i-really-mean-it=True

__NOTE__ When using Ceph Luminous, the mirror state information will not be
accurate after recovering from an unclean shutdown. Regardless of the output
of the status information, you will be able to write to images after a forced
promote.
# Bugs

src/actions.yaml

@@ -16,6 +16,18 @@ refresh-pools:
    Refresh list of pools from local and remote Ceph endpoint.
    As a side effect, mirroring will be configured for any manually created
    pools that the charm currently does not know about.
resync-pools:
  description: |
    \
    USE WITH CAUTION - Force image resync for all images in pools on local
    Ceph endpoint.
  params:
    i-really-mean-it:
      type: boolean
      description: |
        This must be set to true to perform the action
  required:
    - i-really-mean-it
status:
  description: |
    Get mirror pool status

src/actions/actions.py

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os
import subprocess
@@ -95,10 +96,41 @@ def refresh_pools(args):
    return reactive.main()


def resync_pools(args):
    """Force image resync on pools in local Ceph endpoint."""
    if not ch_core.hookenv.action_get('i-really-mean-it'):
        ch_core.hookenv.action_fail('Required parameter not set')
        return
    with charms_openstack.charm.provide_charm_instance() as charm:
        ceph_local = reactive.endpoint_from_name('ceph-local')
        pools = charm.eligible_pools(ceph_local.pools)
        result = collections.defaultdict(dict)
        for pool in pools:
            # list images in pool
            output = subprocess.check_output(
                ['rbd', '--id', charm.ceph_id, '--format', 'json',
                 '-p', pool, 'ls'], universal_newlines=True)
            images = json.loads(output)
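            # request a resync for each image and capture whatever the rbd
            # CLI prints so it can be included in the action output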
            for image in images:
                output = subprocess.check_output(
                    ['rbd', '--id', charm.ceph_id, 'mirror', 'image', 'resync',
                     '{}/{}'.format(pool, image)], universal_newlines=True)
                result[pool][image] = output.rstrip()
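        # assemble a human-readable summary, one line per pool/image pair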
        output_str = ''
        for pool in result:
            for image in result[pool]:
                if output_str:
                    output_str += '\n'
                output_str += '{}/{}: {}'.format(pool, image,
                                                 result[pool][image])
        ch_core.hookenv.action_set({'output': output_str})


ACTIONS = {
    'demote': rbd_mirror_action,
    'promote': rbd_mirror_action,
    'refresh-pools': refresh_pools,
    'resync-pools': resync_pools,
    'status': rbd_mirror_action,
}

src/actions/resync-pools Symbolic link

@@ -0,0 +1 @@
actions.py
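
The symlink is how Juju wires the new action to the shared handler: the charm
ships one executable, ``actions.py``, and each action name is a symlink to it.
A minimal sketch of that dispatch pattern follows; the actual ``main`` in
``actions.py`` is outside this diff, so treat the exact error handling here as
illustrative.

    import os
    import sys


    def main(args):
        # Juju invokes the script through the "resync-pools" symlink, so the
        # basename of argv[0] selects the handler from the ACTIONS table that
        # actions.py defines (see the hunk above).
        action_name = os.path.basename(args[0])
        try:
            action = ACTIONS[action_name]
        except KeyError:
            return 'Action "{}" is undefined'.format(action_name)
        return action(args)


    if __name__ == '__main__':
        sys.exit(main(sys.argv))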

unit_tests/test_actions.py

@@ -13,6 +13,7 @@
# limitations under the License.
import collections
import json
import mock
import sys
@@ -111,6 +112,31 @@ class TestCephRBDMirrorActions(test_utils.PatchHelper):
        self._KV.flush.assert_called_once_with()
        self.main.assert_called_once_with()

    def test_resync_pools(self):
        self.patch_object(actions.reactive, 'endpoint_from_name')
        self.patch_object(actions.ch_core.hookenv, 'action_get')
        self.patch_object(actions.subprocess, 'check_output')
        self.patch_object(actions.ch_core.hookenv, 'action_set')
        endpoint = mock.MagicMock()
        endpoint.pools = collections.OrderedDict(
            {'apool': {'applications': {'rbd': {}}}})
        self.endpoint_from_name.return_value = endpoint
        self.crm_charm.eligible_pools.return_value = endpoint.pools
        self.crm_charm.ceph_id = 'acephid'
        self.action_get.return_value = False
        actions.resync_pools([])
        self.assertFalse(self.check_output.called)
        self.assertFalse(self.action_set.called)
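        # with the guard parameter set, check_output is expected to return
        # the JSON image listing first and the resync confirmation second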
        self.action_get.return_value = True
        self.check_output.side_effect = [
            json.dumps(['imagea']),
            'resync flagged for imagea\n',
        ]
        actions.resync_pools([])
        self.assertEquals(
            sorted(self.action_set.call_args[0][0]['output'].split('\n')),
            ['apool/imagea: resync flagged for imagea'])

    def test_main(self):
        self.patch_object(actions, 'ACTIONS')
        self.patch_object(actions.ch_core.hookenv, 'action_fail')