diff --git a/actions.yaml b/actions.yaml
index f4233aed..56a22c6c 100644
--- a/actions.yaml
+++ b/actions.yaml
@@ -149,3 +149,39 @@ get-availability-zone:
     show-all:
       type: boolean
       description: Option to view information for all units. Default is 'false'.
+remove-disk:
+  description: |
+    Remove disks from Ceph, producing a report afterwards that indicates
+    to the user how to replace them in the closest way possible.
+  params:
+    osd-devices:
+      type: string
+      description: A space-separated list of devices to remove
+    osd-ids:
+      type: string
+      description: |
+        A space-separated list of OSD ids to remove. This parameter is mutually
+        exclusive with the parameter 'osd-devices'.
+    purge:
+      type: boolean
+      description: |
+        Whether to fully purge the OSD or let the id be available for reuse.
+      default: false
+    timeout:
+      type: integer
+      description: |
+        The time in minutes to wait for the OSD to be safe to remove.
+      default: 5
+    force:
+      type: boolean
+      description: |
+        Whether to forcefully remove the OSD even if it's determined to be
+        unsafe to destroy.
+      default: false
+    format:
+      type: string
+      enum:
+      - text
+      - json
+      default: text
+      description: The output format returned for the command.
diff --git a/actions/add_disk.py b/actions/add_disk.py
index 6f2f9819..57d49fcf 100755
--- a/actions/add_disk.py
+++ b/actions/add_disk.py
@@ -61,6 +61,9 @@ def add_device(request, device_path, bucket=None,
     else:
         effective_dev = device_path
 
+    if osd_id is not None and osd_id.startswith('osd.'):
+        osd_id = osd_id[4:]
+
     charms_ceph.utils.osdize(effective_dev, hookenv.config('osd-format'),
                              ceph_hooks.get_journal_devices(),
                              hookenv.config('ignore-device-errors'),
@@ -91,6 +94,14 @@ def add_device(request, device_path, bucket=None,
             }
         )
 
+    if part_iter is not None:
+        # Update the alias map so we can refer to an OSD via the original
+        # device instead of the newly created cache name.
+        aliases = db.get('osd-aliases', {})
+        aliases[device_path] = effective_dev
+        db.set('osd-aliases', aliases)
+        db.flush()
+
     return request
 
 
@@ -183,5 +194,5 @@ if __name__ == "__main__":
         for error in errors:
             part_iter.cleanup(error)
-        function_fail('Failed to add devices: {}', ','.join(errors))
+        function_fail('Failed to add devices: {}'.format(','.join(errors)))
         sys.exit(1)
diff --git a/actions/remove-disk b/actions/remove-disk
new file mode 120000
index 00000000..29934df0
--- /dev/null
+++ b/actions/remove-disk
@@ -0,0 +1 @@
+./remove_disk.py
\ No newline at end of file
diff --git a/actions/remove_disk.py b/actions/remove_disk.py
new file mode 100755
index 00000000..7a48cba1
--- /dev/null
+++ b/actions/remove_disk.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021 Canonical Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
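The 'osd-aliases' map stored by add_disk.py above is what later lets remove-disk accept the original device path even after the OSD was re-created on top of a bcache device. A minimal sketch of the lookup, assuming the charm's unitdata KV store (the fallback behaviour mirrors how remove_disk.py consumes the map below):

    # Sketch: resolve a user-supplied device through the 'osd-aliases' map
    # that add_disk.py persists in the charm's unitdata KV store.
    from charmhelpers.core.unitdata import kv

    def resolve_device(dev):
        # add_disk.py maps the original disk to the bcache name it created,
        # e.g. {'/dev/sdb': '/dev/bcache0'}; fall back to the path as given.
        aliases = kv().get('osd-aliases', {})
        return aliases.get(dev, dev)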
+
+import datetime
+import errno
+import json
+from math import ceil
+import subprocess
+import sys
+import time
+
+sys.path.append('lib')
+sys.path.append('hooks')
+
+import charmhelpers.core.hookenv as hookenv
+from charmhelpers.core.hookenv import function_fail
+
+import charms_ceph.utils
+from charmhelpers.core.unitdata import kv
+from utils import (get_bcache_names, bcache_remove, device_size,
+                   get_parent_device, remove_lvm, wipefs_safely)
+
+
+def normalize_osd_id(osd_id):
+    """Make sure an OSD id has the form 'osd.<number>'.
+
+    :param osd_id: The OSD id, either a string or the integer ID.
+    :type osd_id: Option[int, str]
+
+    :returns: A string of the form 'osd.<number>'.
+    :rtype: str
+    """
+    if not isinstance(osd_id, str) or not osd_id.startswith('osd.'):
+        osd_id = 'osd.' + str(osd_id)
+    return osd_id
+
+
+def get_device_map():
+    """Get a list of (osd-id, device-path) pairs for every device that
+    is being used by a local OSD.
+
+    :returns: A list of OSD ids and devices.
+    :rtype: list[dict['id', 'path']]
+    """
+    ret = []
+    vlist = subprocess.check_output(['ceph-volume', 'lvm', 'list',
+                                     '--format=json'])
+    for osd_id, data in json.loads(vlist.decode('utf8')).items():
+        osd_id = normalize_osd_id(osd_id)
+        for elem in data:
+            for device in elem['devices']:
+                ret.append({'id': osd_id, 'path': device})
+    return ret
+
+
+def map_device_to_id(dev_map, device):
+    """Get the OSD id for a device or bcache name.
+
+    :param dev_map: A map with the same form as that returned by
+        the function 'get_device_map'.
+    :type dev_map: list[dict['id', 'path']]
+
+    :param device: The path to the device.
+    :type device: str
+
+    :returns: The OSD id in use by the device, if any.
+    :rtype: Option[None, str]
+    """
+    for elem in dev_map:
+        if device == elem['path']:
+            return elem['id']
+
+
+def map_id_to_device(dev_map, osd_id):
+    """Get the device path for an OSD id.
+
+    :param dev_map: A map with the same form as that returned by
+        the function 'get_device_map'.
+    :type dev_map: list[dict['id', 'path']]
+
+    :param osd_id: The OSD id to check against.
+    :type osd_id: str
+
+    :returns: The device path being used by the OSD id, if any.
+    :rtype: Option[None, str]
+    """
+    for elem in dev_map:
+        if elem['id'] == osd_id:
+            return elem['path']
+
+
+def safe_to_destroy(osd_id):
+    """Test whether an OSD id is safe to destroy per the Ceph cluster."""
+    ret = subprocess.call(['ceph', '--id', 'osd-removal',
+                           'osd', 'safe-to-destroy', osd_id])
+    return ret == 0
+
+
+def safe_to_stop(osd_id):
+    """Test whether an OSD is safe to stop."""
+    ret = subprocess.call(['ceph', '--id', 'osd-removal',
+                           'osd', 'ok-to-stop', osd_id])
+    return ret == 0
+
+
+def reweight_osd(osd_id):
+    """Set the weight of the OSD id to zero."""
+    subprocess.check_call(['ceph', '--id', 'osd-removal',
+                           'osd', 'crush', 'reweight', osd_id, '0'])
+
+
+def destroy(osd_id, purge=False):
+    """Destroy or purge an OSD id."""
+    for _ in range(10):
+        # We might get here before the OSD is marked as down. As such,
+        # retry if the error code is EBUSY.
+        try:
+            subprocess.check_call(['ceph', '--id', 'osd-removal', 'osd',
+                                   'purge' if purge else 'destroy',
+                                   osd_id, '--yes-i-really-mean-it'])
+            return
+        except subprocess.CalledProcessError as e:
+            if e.returncode != errno.EBUSY:
+                raise
+            time.sleep(0.1)
+
+
+class RemoveException(Exception):
+    """Exception type used to notify of errors for this action."""
+    pass
+
+
+class ActionOSD:
+
+    """Class used to encapsulate all the needed information to
+    perform OSD removal."""
+
+    def __init__(self, dev_map, dev=None, osd_id=None, aliases={}):
+        """Construct an action-OSD.
+
+        :param dev_map: A map with the same form as that returned by
+            the function 'get_device_map'.
+        :type dev_map: list[dict['id', 'path']]
+
+        :param dev: The device being used by an OSD.
+        :type dev: Option[None, str]
+
+        :param osd_id: The OSD id.
+        :type osd_id: Option[None, int, str]
+        """
+        if dev is not None:
+            if osd_id is not None:
+                raise RemoveException(
+                    'osd-ids and osd-devices are mutually exclusive')
+            elif dev in aliases:
+                self.alias = dev
+                self.device = aliases.get(dev)
+            else:
+                self.device, self.alias = dev, None
+
+            self.osd_id = map_device_to_id(dev_map, self.device)
+            self.bcache_backing, self.bcache_caching = \
+                get_bcache_names(self.device)
+            if self.osd_id is None:
+                raise RemoveException('Device {} is not being used'
+                                      .format(self.device))
+        else:
+            self.alias = None
+            self.osd_id = normalize_osd_id(osd_id)
+            self.device = map_id_to_device(dev_map, self.osd_id)
+            if self.device is None:
+                raise RemoveException('Invalid osd ID: {}'.format(self.osd_id))
+
+            self.bcache_backing, self.bcache_caching = \
+                get_bcache_names(self.device)
+
+        self.report = {}   # maps device -> actions.
+
+    @property
+    def osd_device(self):
+        return self.bcache_backing or self.device
+
+    def remove(self, purge, timeout, force):
+        """Remove the OSD from the cluster.
+
+        :param purge: Whether to purge or just destroy the OSD.
+        :type purge: bool
+
+        :param timeout: The number of minutes to wait until the OSD
+            is safe to destroy.
+        :type timeout: int
+
+        :param force: Whether to proceed with OSD removal, even when
+            it's not safe to do so.
+        :type force: bool
+        """
+        # Set the CRUSH weight to 0.
+        hookenv.log('Reweighting OSD', hookenv.DEBUG)
+        reweight_osd(self.osd_id)
+
+        # Ensure that the OSD is safe to stop and destroy.
+        end = (datetime.datetime.now() +
+               datetime.timedelta(seconds=timeout * 60))
+        safe_stop, safe_destroy = False, False
+
+        while True:
+            if not safe_stop and safe_to_stop(self.osd_id):
+                safe_stop = True
+            if not safe_destroy and safe_to_destroy(self.osd_id):
+                safe_destroy = True
+
+            if safe_stop and safe_destroy:
+                break
+
+            curr = datetime.datetime.now()
+            if curr >= end:
+                if force:
+                    hookenv.log(
+                        'OSD not safe to destroy, but "force" was specified',
+                        hookenv.DEBUG)
+                    break
+
+                raise RemoveException(
+                    'timed out waiting for the OSD to be safe to destroy')
+            time.sleep(min(1, (end - curr).total_seconds()))
+
+        # Stop the OSD service.
+        hookenv.log('Stopping the OSD service', hookenv.DEBUG)
+        charms_ceph.utils.stop_osd(self.osd_id[4:])
+
+        # Remove the OSD from the cluster.
+        hookenv.log('Destroying the OSD', hookenv.DEBUG)
+        destroy(self.osd_id, purge)
+        report = self.report.setdefault(self.osd_device,
+                                        {'osd-ids': self.osd_id})
+
+        if self.bcache_backing:
+            # Remove anything related to bcache.
+            size = int(ceil(device_size(self.bcache_caching)))
+            caching = get_parent_device(self.bcache_caching)
+            report.update({'cache-devices': caching, 'partition-size': size})
+            bcache_remove(self.device, self.bcache_backing,
+                          self.bcache_caching)
+        else:
+            remove_lvm(self.device)
+            wipefs_safely(self.device)
+
+
+def make_same_length(l1, l2):
+    """Make sure 2 lists have the same length, padding out with None values."""
+    ln = max(len(l1), len(l2))
+    l1.extend([None] * (ln - len(l1)))
+    l2.extend([None] * (ln - len(l2)))
+
+
+def write_report(report, ftype):
+    """Generate a report on how to re-establish the removed disks
+    as part of the cluster again, then set the 'message' attribute to
+    either a JSON object or a textual representation.
+
+    :param report: The initial, raw report from the 'ActionOSD' objects.
+    :type report: dict
+
+    :param ftype: Either 'text' or 'json'; specifies the type of report
+    :type ftype: Enum['text', 'json']
+    """
+    if ftype == 'text':
+        msg = '{} disks have been removed\n'.format(len(report))
+        msg += 'To replace them, run:\n'
+        for device, action_args in report.items():
+            args = json.dumps(action_args, separators=(' ', '='))
+            args = args.replace('{', '').replace('}', '').replace('"', '')
+            msg += 'juju run-action {} add-disk {} {}\n'.format(
+                hookenv.local_unit(), 'osd-devices=' + device, args)
+    else:
+        msg = json.dumps(report)
+
+    hookenv.action_set({'message': msg})
+
+
+def get_list(key):
+    """Retrieve an action argument as a list, based on its key."""
+    ret = hookenv.action_get(key)
+    return ret.split() if ret else []
+
+
+def advertise_osd_count(count):
+    """Let the ceph-mon units know the updated OSD count."""
+    for relid in hookenv.relation_ids('mon'):
+        hookenv.relation_set(
+            relation_id=relid,
+            relation_settings={'bootstrapped-osds': count}
+        )
+
+
+def main():
+    osd_ids = get_list('osd-ids')
+    osd_devs = get_list('osd-devices')
+    purge = hookenv.action_get('purge')
+    force = hookenv.action_get('force')
+    timeout = hookenv.action_get('timeout')
+
+    if timeout <= 0:
+        function_fail('timeout must be > 0')
+        sys.exit(1)
+    elif not osd_ids and not osd_devs:
+        function_fail('One of osd-ids or osd-devices must be provided')
+        sys.exit(1)
+
+    make_same_length(osd_ids, osd_devs)
+    errors = []
+    report = {}
+    dev_map = get_device_map()
+    charm_devices = kv().get('osd-devices', [])
+    aliases = kv().get('osd-aliases', {})
+
+    for dev, osd_id in zip(osd_devs, osd_ids):
+        try:
+            action_osd = ActionOSD(dev_map, dev=dev, osd_id=osd_id,
+                                   aliases=aliases)
+            if action_osd.device not in charm_devices:
+                errors.append('Device {} not being used by Ceph'
+                              .format(action_osd.device))
+                continue
+            action_osd.remove(purge, timeout, force)
+            charm_devices.remove(action_osd.device)
+            if action_osd.alias:
+                aliases.pop(action_osd.alias)
+            report.update(action_osd.report)
+        except RemoveException as e:
+            errors.append(str(e))
+
+    kv().set('osd-devices', charm_devices)
+    kv().set('osd-aliases', aliases)
+    kv().flush()
+    advertise_osd_count(len(charm_devices))
+    write_report(report, hookenv.action_get('format'))
+
+    if errors:
+        function_fail('Failed to remove devices: {}'.format(','.join(errors)))
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py
index fb30f221..7c03190b 100755
--- a/hooks/ceph_hooks.py
+++ b/hooks/ceph_hooks.py
@@ -79,6 +79,7 @@ from utils import (
     is_osd_bootstrap_ready,
     import_osd_bootstrap_key,
     import_osd_upgrade_key,
+    import_osd_removal_key,
     get_host_ip,
     get_networks,
     assert_charm_supports_ipv6,
@@ -662,11 +663,14 @@ def get_bdev_enable_discard():
 def mon_relation():
     bootstrap_key = relation_get('osd_bootstrap_key')
     upgrade_key = relation_get('osd_upgrade_key')
+    removal_key = relation_get('osd_disk_removal_key')
     if get_fsid() and get_auth() and bootstrap_key:
         log('mon has provided conf- scanning disks')
         emit_cephconf()
         import_osd_bootstrap_key(bootstrap_key)
         import_osd_upgrade_key(upgrade_key)
+        if removal_key:
+            import_osd_removal_key(removal_key)
         prepare_disks_and_activate()
         _, settings, _ = (ch_ceph.CephOSDConfContext()
                           .filter_osd_from_mon_settings())
diff --git a/hooks/utils.py b/hooks/utils.py
index 26f5f836..44f96c62 100644
--- a/hooks/utils.py
+++ b/hooks/utils.py
@@ -72,6 +72,7 @@ except ImportError:
 
 _bootstrap_keyring = "/var/lib/ceph/bootstrap-osd/ceph.keyring"
 _upgrade_keyring = "/var/lib/ceph/osd/ceph.client.osd-upgrade.keyring"
+_removal_keyring = "/var/lib/ceph/osd/ceph.client.osd-removal.keyring"
 
 
 def is_osd_bootstrap_ready():
@@ -83,6 +84,21 @@ def is_osd_bootstrap_ready():
     return os.path.exists(_bootstrap_keyring)
 
 
+def _import_key(key, path, name):
+    if not os.path.exists(path):
+        cmd = [
+            'sudo',
+            '-u',
+            ceph.ceph_user(),
+            'ceph-authtool',
+            path,
+            '--create-keyring',
+            '--name={}'.format(name),
+            '--add-key={}'.format(key)
+        ]
+        subprocess.check_call(cmd)
+
+
 def import_osd_bootstrap_key(key):
     """
     Ensure that the osd-bootstrap keyring is setup.
@@ -90,18 +106,7 @@ def import_osd_bootstrap_key(key):
     :param key: The cephx key to add to the bootstrap keyring
     :type key: str
     :raises: subprocess.CalledProcessError"""
-    if not os.path.exists(_bootstrap_keyring):
-        cmd = [
-            "sudo",
-            "-u",
-            ceph.ceph_user(),
-            'ceph-authtool',
-            _bootstrap_keyring,
-            '--create-keyring',
-            '--name=client.bootstrap-osd',
-            '--add-key={}'.format(key)
-        ]
-        subprocess.check_call(cmd)
+    _import_key(key, _bootstrap_keyring, 'client.bootstrap-osd')
 
 
 def import_osd_upgrade_key(key):
@@ -111,18 +116,17 @@ def import_osd_upgrade_key(key):
     :param key: The cephx key to add to the upgrade keyring
     :type key: str
     :raises: subprocess.CalledProcessError"""
-    if not os.path.exists(_upgrade_keyring):
-        cmd = [
-            "sudo",
-            "-u",
-            ceph.ceph_user(),
-            'ceph-authtool',
-            _upgrade_keyring,
-            '--create-keyring',
-            '--name=client.osd-upgrade',
-            '--add-key={}'.format(key)
-        ]
-        subprocess.check_call(cmd)
+    _import_key(key, _upgrade_keyring, 'client.osd-upgrade')
+
+
+def import_osd_removal_key(key):
+    """
+    Ensure that the osd-removal keyring is setup.
+
+    :param key: The cephx key to add to the removal keyring
+    :type key: str
+    :raises: subprocess.CalledProcessError"""
+    _import_key(key, _removal_keyring, 'client.osd-removal')
 
 
 def render_template(template_name, context, template_dir=TEMPLATES_DIR):
@@ -348,16 +352,16 @@ class DeviceError(Exception):
     pass
 
 
-def _check_output(args):
+def _check_output(args, **kwargs):
     try:
-        return subprocess.check_output(args).decode('UTF-8')
+        return subprocess.check_output(args, **kwargs).decode('UTF-8')
     except subprocess.CalledProcessError as e:
         raise DeviceError(str(e))
 
 
-def _check_call(args):
+def _check_call(args, **kwargs):
     try:
-        return subprocess.check_call(args)
+        return subprocess.check_call(args, **kwargs)
     except subprocess.CalledProcessError as e:
         raise DeviceError(str(e))
 
@@ -458,16 +462,37 @@ def device_size(dev):
     return ret / (1024 * 1024 * 1024)   # Return size in GB.
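The new 'client.osd-removal' keyring imported above is what the action code authenticates with: every privileged Ceph command in remove_disk.py selects that identity via '--id osd-removal', and ceph finds the matching keyring through the [client.osd-removal] section added to ceph.conf below. A minimal sketch of the call pattern, mirroring safe_to_destroy() in the new action:

    import subprocess

    def osd_safe_to_destroy(osd_id):
        # Returns True when the cluster reports the OSD can be destroyed;
        # '--id osd-removal' authenticates with the keyring imported above.
        rc = subprocess.call(['ceph', '--id', 'osd-removal',
                              'osd', 'safe-to-destroy', osd_id])
        return rc == 0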
-def bcache_remove(bcache, cache_dev):
+def remove_lvm(device):
+    """Remove any physical and logical volumes associated with a device."""
+    vgs = []
+    try:
+        rv = _check_output(['sudo', 'pvdisplay', device])
+    except DeviceError:
+        # Assume no physical volumes.
+        return
+
+    for line in rv.splitlines():
+        line = line.strip()
+        if line.startswith('VG Name'):
+            vgs.append(line.split()[2])
+    if vgs:
+        _check_call(['sudo', 'vgremove', '-y'] + vgs)
+        _check_call(['sudo', 'pvremove', '-y', device])
+
+
+def bcache_remove(bcache, backing, caching):
     """Remove a bcache kernel device, given its caching.
 
     :param bache: The path of the bcache device.
     :type bcache: str
 
-    :param cache_dev: The caching device used for the bcache name.
-    :type cache_dev: str
+    :param backing: The backing device for bcache.
+    :type backing: str
+
+    :param caching: The caching device for bcache.
+    :type caching: str
     """
-    rv = _check_output(['sudo', 'bcache-super-show', cache_dev])
+    rv = _check_output(['sudo', 'bcache-super-show', backing])
     uuid = None
     # Fetch the UUID for the caching device.
     for line in rv.split('\n'):
@@ -478,15 +503,47 @@ def bcache_remove(bcache, cache_dev):
     else:
         return
     bcache_name = bcache[bcache.rfind('/') + 1:]
-    with open('/sys/block/{}/bcache/stop'.format(bcache_name), 'wb') as f:
-        f.write(b'1')
-    with open('/sys/fs/bcache/{}/stop'.format(uuid), 'wb') as f:
-        f.write(b'1')
+
+    def write_one(path):
+        os.system('echo 1 | sudo tee {}'.format(path))
+
+    # The command ceph-volume typically creates PVs and VGs for the
+    # OSD device. Remove them now before deleting the bcache.
+    remove_lvm(bcache)
+
+    # NOTE: We *must* do the following steps in this order. For
+    # kernels 4.x and prior, not doing so will cause the bcache device
+    # to be undeletable.
+    # In addition, we have to use 'sudo tee' as done above, since writing
+    # directly can cause permission issues in some implementations.
+    write_one('/sys/block/{}/bcache/detach'.format(bcache_name))
+    write_one('/sys/block/{}/bcache/stop'.format(bcache_name))
+    write_one('/sys/fs/bcache/{}/stop'.format(uuid))
+
+    # We wipe the bcache signatures here because the bcache tools will not
+    # create the devices otherwise. There is a 'force' option, but it's not
+    # always available, so we do the portable thing here.
+    wipefs_safely(backing)
+    wipefs_safely(caching)
 
 
-def wipe_disk(dev):
+def wipe_disk(dev, timeout=None):
     """Destroy all data in a specific device, including partition tables."""
-    _check_call(['sudo', 'wipefs', '-a', dev])
+    _check_call(['sudo', 'wipefs', '-a', dev], timeout=timeout)
+
+
+def wipefs_safely(dev):
+    for _ in range(10):
+        try:
+            wipe_disk(dev, 1)
+            return
+        except DeviceError:
+            time.sleep(0.3)
+        except subprocess.TimeoutExpired:
+            # If this command times out, then it's likely because
+            # the disk is dead, so give up.
+            return
+    raise DeviceError('Failed to wipe device: {}'.format(dev))
 
 
 class PartitionIter:
@@ -556,11 +613,71 @@ class PartitionIter:
         return ret
 
     def cleanup(self, device):
+        """Destroy any created partitions and bcache names for a device."""
         args = self.created.get(device)
         if not args:
             return
 
+        bcache, caching = args
         try:
-            bcache_remove(*args)
+            bcache_remove(bcache, device, caching)
         except DeviceError:
-            log('Failed to cleanup bcache device: {}'.format(args[0]))
+            log('Failed to cleanup bcache device: {}'.format(bcache))
+
+
+def _device_suffix(dev):
+    ix = dev.rfind('/')
+    if ix >= 0:
+        dev = dev[ix + 1:]
+    return dev
+
+
+def get_bcache_names(dev):
+    """Return the backing and caching devices for a bcache device,
+    in that specific order.
+
+    :param dev: The path to the bcache device, e.g.: /dev/bcache0
+    :type dev: str
+
+    :returns: A tuple with the backing and caching devices.
+    :rtype: tuple[Option[None, str], Option[None, str]]
+    """
+    if dev is None:
+        return None, None
+
+    dev_name = _device_suffix(dev)
+    bcache_path = '/sys/block/{}/slaves'.format(dev_name)
+    if (not os.path.exists('/sys/block/{}/bcache'.format(dev_name)) or
+            not os.path.exists(bcache_path)):
+        return None, None
+
+    cache = os.listdir(bcache_path)
+    if len(cache) < 2:
+        return None, None
+
+    backing = '/dev/' + cache[0]
+    caching = '/dev/' + cache[1]
+    out = _check_output(['sudo', 'bcache-super-show', backing])
+    if 'backing device' not in out:
+        return caching, backing
+    return backing, caching
+
+
+def get_parent_device(dev):
+    """Return the device's parent, assuming it's a block device."""
+    try:
+        rv = subprocess.check_output(['lsblk', '-as', dev, '-J'])
+        rv = json.loads(rv.decode('UTF-8'))
+    except subprocess.CalledProcessError:
+        return dev
+
+    children = rv.get('blockdevices', [])
+    if not children:
+        return dev
+
+    children = children[0].get('children', [])
+    for child in children:
+        if 'children' not in child:
+            return '/dev/' + child['name']
+
+    return dev
diff --git a/lib/charms_ceph/utils.py b/lib/charms_ceph/utils.py
index 643f2e03..429b8900 100644
--- a/lib/charms_ceph/utils.py
+++ b/lib/charms_ceph/utils.py
@@ -1162,6 +1162,10 @@ osd_upgrade_caps = collections.OrderedDict([
         'allow command "osd in"',
         'allow command "osd rm"',
         'allow command "auth del"',
+        'allow command "osd safe-to-destroy"',
+        'allow command "osd crush reweight"',
+        'allow command "osd purge"',
+        'allow command "osd destroy"',
     ])
 ])
diff --git a/templates/ceph.conf b/templates/ceph.conf
index 1284b516..782a231d 100644
--- a/templates/ceph.conf
+++ b/templates/ceph.conf
@@ -51,6 +51,9 @@ enable experimental unrecoverable data corrupting features = bluestore rocksdb
 [client.osd-upgrade]
 keyring = /var/lib/ceph/osd/ceph.client.osd-upgrade.keyring
 
+[client.osd-removal]
+keyring = /var/lib/ceph/osd/ceph.client.osd-removal.keyring
+
 [mon]
 keyring = /var/lib/ceph/mon/$cluster-$id/keyring
 
diff --git a/unit_tests/test_actions_add_disk.py b/unit_tests/test_actions_add_disk.py
index dd2bb64d..1d06394f 100644
--- a/unit_tests/test_actions_add_disk.py
+++ b/unit_tests/test_actions_add_disk.py
@@ -44,7 +44,7 @@ class AddDiskActionTests(CharmTestCase):
         db = mock.MagicMock()
         self.kv.return_value = db
-        db.get.return_value = ['/dev/myosddev']
+        db.get.side_effect = {'osd-devices': ['/dev/myosddev']}.get
 
         request = {'ops': []}
         add_disk.add_device(request, '/dev/myosddev')
@@ -57,11 +57,13 @@ class AddDiskActionTests(CharmTestCase):
                                        True, None)])
 
         piter = add_disk.PartitionIter(['/dev/cache'], 100, ['/dev/myosddev'])
-        mock_create_bcache = 
mock.MagicMock(side_effect=lambda b: b) + mock_create_bcache = mock.MagicMock(side_effect=lambda b: '/dev/cache') with mock.patch.object(add_disk.PartitionIter, 'create_bcache', mock_create_bcache) as mock_call: add_disk.add_device(request, '/dev/myosddev', part_iter=piter) mock_call.assert_called() + db.set.assert_called_with('osd-aliases', + {'/dev/myosddev': '/dev/cache'}) mock_create_bcache.side_effect = lambda b: None with mock.patch.object(add_disk.PartitionIter, 'create_bcache', diff --git a/unit_tests/test_actions_remove_disk.py b/unit_tests/test_actions_remove_disk.py new file mode 100644 index 00000000..369d3f1f --- /dev/null +++ b/unit_tests/test_actions_remove_disk.py @@ -0,0 +1,136 @@ +# Copyright 2021 Canonical Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +from actions import remove_disk + +from test_utils import CharmTestCase + + +class RemoveDiskActionTests(CharmTestCase): + + @mock.patch.object(remove_disk.subprocess, 'check_output') + def test_get_device_map(self, check_output): + check_output.return_value = b''' +{ + "1": [{"devices": ["/dev/sdx1"]}], + "2": [{"devices": ["/dev/sdc2", "/dev/sdc3"]}] +} + ''' + rv = remove_disk.get_device_map() + self.assertEqual(rv[0]['path'], '/dev/sdx1') + self.assertEqual(rv[1]['id'], rv[2]['id']) + + def test_normalize_osd_id(self): + self.assertEqual('osd.1', remove_disk.normalize_osd_id(1)) + self.assertEqual('osd.2', remove_disk.normalize_osd_id('osd.2')) + self.assertEqual('osd.3', remove_disk.normalize_osd_id('3')) + + def test_map_device_id(self): + dev_map = [ + {'id': 'osd.1', 'path': '/dev/sdc1'}, + {'id': 'osd.2', 'path': '/dev/sdd2'}, + {'id': 'osd.2', 'path': '/dev/sdx3'} + ] + self.assertEqual( + 'osd.1', + remove_disk.map_device_to_id(dev_map, '/dev/sdc1')) + self.assertIsNone( + remove_disk.map_device_to_id(dev_map, '/dev/sdx4')) + + self.assertEqual( + '/dev/sdd2', + remove_disk.map_id_to_device(dev_map, 'osd.2')) + self.assertIsNone( + remove_disk.map_id_to_device(dev_map, 'osd.3')) + + @mock.patch.object(remove_disk, 'get_bcache_names') + def test_action_osd_constructor(self, bcache_names): + bcache_names.return_value = ('bcache0', '/dev/bcache0') + dev_map = [ + {'path': '/dev/sdx1', 'id': 'osd.1'} + ] + with self.assertRaises(remove_disk.RemoveException): + remove_disk.ActionOSD(dev_map, dev='/dev/sdx1', osd_id='osd.1') + obj = remove_disk.ActionOSD(dev_map, dev='/dev/sdx1') + self.assertEqual(obj.osd_id, 'osd.1') + obj = remove_disk.ActionOSD(dev_map, osd_id='1') + self.assertEqual(obj.device, '/dev/sdx1') + + @mock.patch.object(remove_disk, 'device_size') + @mock.patch.object(remove_disk.charms_ceph.utils, 'stop_osd') + @mock.patch.object(remove_disk, 'bcache_remove') + @mock.patch.object(remove_disk.subprocess, 'call') + @mock.patch.object(remove_disk.subprocess, 'check_call') + @mock.patch.object(remove_disk, 'get_bcache_names') + def test_action_osd_remove(self, get_bcache_names, check_call, + call, bcache_remove, stop_osd, device_size): + call.return_value = 0 + 
get_bcache_names.return_value = ('/dev/backing', '/dev/caching') + device_size.side_effect = lambda x: 1 if x == '/dev/caching' else 0 + dev_map = [ + {'path': '/dev/bcache0', 'id': 'osd.1'} + ] + prefix_args = ['ceph', '--id', 'osd-removal'] + obj = remove_disk.ActionOSD(dev_map, osd_id='1') + + obj.remove(True, 1, True) + call.assert_any_call(prefix_args + ['osd', 'safe-to-destroy', 'osd.1']) + check_call.assert_any_call(prefix_args + ['osd', 'purge', 'osd.1', + '--yes-i-really-mean-it']) + check_call.assert_any_call(prefix_args + ['osd', 'crush', 'reweight', + 'osd.1', '0']) + bcache_remove.assert_called_with( + '/dev/bcache0', '/dev/backing', '/dev/caching') + report = obj.report + self.assertIn('/dev/backing', report) + report = report['/dev/backing'] + self.assertIn('osd-ids', report) + self.assertIn('osd.1', report['osd-ids']) + self.assertIn('cache-devices', report) + self.assertIn('partition-size', report) + self.assertEqual('/dev/caching', report['cache-devices']) + self.assertEqual(1, report['partition-size']) + + # Test the timeout check. + with self.assertRaises(remove_disk.RemoveException): + call.return_value = 1 + obj.remove(False, 0, False) + + @mock.patch.object(remove_disk.hookenv, 'local_unit') + @mock.patch.object(remove_disk.hookenv, 'action_set') + def test_write_report(self, action_set, local_unit): + output = {} + local_unit.return_value = 'ceph-osd/0' + action_set.side_effect = lambda x: output.update(x) + report = {'dev@': {'osd-ids': 'osd.1', 'cache-devices': 'cache@', + 'partition-size': 5}} + remove_disk.write_report(report, 'text') + self.assertIn('message', output) + msg = output['message'] + self.assertIn('juju run-action ceph-osd/0 add-disk', msg) + self.assertIn('osd-devices=dev@', msg) + self.assertIn('osd-ids=osd.1', msg) + self.assertIn('cache-devices=cache@', msg) + self.assertIn('partition-size=5', msg) + + def test_make_same_length(self): + l1, l2 = [1], [] + remove_disk.make_same_length(l1, l2) + self.assertEqual(len(l1), len(l2)) + self.assertIsNone(l2[0]) + prev_len = len(l1) + remove_disk.make_same_length(l1, l2) + self.assertEqual(len(l1), prev_len) diff --git a/unit_tests/test_ceph_utils.py b/unit_tests/test_ceph_utils.py index f338eb3a..f0fbabd6 100644 --- a/unit_tests/test_ceph_utils.py +++ b/unit_tests/test_ceph_utils.py @@ -15,7 +15,7 @@ import unittest -from unittest.mock import patch, mock_open +from unittest.mock import patch with patch('charmhelpers.contrib.hardening.harden.harden') as mock_dec: mock_dec.side_effect = (lambda *dargs, **dkwargs: lambda f: @@ -204,7 +204,10 @@ class CephUtilsTestCase(unittest.TestCase): self.assertEqual(745, int(utils.device_size(''))) @patch('subprocess.check_output') - def test_bcache_remove(self, check_output): + @patch.object(utils, 'remove_lvm') + @patch.object(utils, 'wipe_disk') + @patch('os.system') + def test_bcache_remove(self, system, wipe_disk, remove_lvm, check_output): check_output.return_value = b''' sb.magic ok sb.first_sector 8 [match] @@ -223,15 +226,93 @@ class CephUtilsTestCase(unittest.TestCase): dev.cache.replacement 0 [lru] cset.uuid 424242 ''' - mo = mock_open() - with patch('builtins.open', mo): - utils.bcache_remove('/dev/bcache0', '/dev/nvme0n1p1') - mo.assert_any_call('/sys/block/bcache0/bcache/stop', 'wb') - mo.assert_any_call('/sys/fs/bcache/424242/stop', 'wb') + utils.bcache_remove('/dev/bcache0', 'backing', 'caching') + system.assert_any_call( + 'echo 1 | sudo tee /sys/block/bcache0/bcache/detach') + system.assert_any_call( + 'echo 1 | sudo tee 
/sys/block/bcache0/bcache/stop') + system.assert_any_call( + 'echo 1 | sudo tee /sys/fs/bcache/424242/stop') + wipe_disk.assert_any_call('backing', 1) + wipe_disk.assert_any_call('caching', 1) + @patch('os.listdir') + @patch('os.path.exists') + @patch('subprocess.check_output') + def test_get_bcache_names(self, check_output, exists, listdir): + exists.return_value = True + check_output.return_value = b''' +sb.magic ok +sb.first_sector 8 [match] +sb.csum A71D96D4364343BF [match] +sb.version 1 [backing device] + +dev.label (empty) +dev.uuid cca84a86-3f68-4ffb-8be1-4449c9fb29a8 +dev.sectors_per_block 1 +dev.sectors_per_bucket 1024 +dev.data.first_sector 16 +dev.data.cache_mode 1 [writeback] +dev.data.cache_state 1 [clean] + +cset.uuid 57add9da-e5de-47c6-8f39-3e16aafb8d31 + ''' + listdir.return_value = ['backing', 'caching'] + values = utils.get_bcache_names('/dev/bcache0') + self.assertEqual(2, len(values)) + self.assertEqual(values[0], '/dev/backing') + check_output.return_value = b''' +sb.magic ok +sb.first_sector 8 [match] +sb.csum 6802E76075FF7B77 [match] +sb.version 3 [cache device] + +dev.label (empty) +dev.uuid fb6e9d06-12e2-46ca-b8fd-797ecec1a126 +dev.sectors_per_block 1 +dev.sectors_per_bucket 1024 +dev.cache.first_sector 1024 +dev.cache.cache_sectors 10238976 +dev.cache.total_sectors 10240000 +dev.cache.ordered yes +dev.cache.discard no +dev.cache.pos 0 +dev.cache.replacement 0 [lru] + +cset.uuid 57add9da-e5de-47c6-8f39-3e16aafb8d31 + ''' + values = utils.get_bcache_names('/dev/bcache0') + self.assertEqual(values[0], '/dev/caching') + + @patch('subprocess.check_output') + @patch('subprocess.check_call') + def test_remove_lvm(self, check_call, check_output): + check_output.return_value = b''' +--- Physical volume --- + PV Name /dev/bcache0 + VG Name ceph-1 + VG Name ceph-2 + ''' + utils.remove_lvm('/dev/bcache0') + check_call.assert_any_call( + ['sudo', 'vgremove', '-y', 'ceph-1', 'ceph-2']) + check_call.assert_any_call(['sudo', 'pvremove', '-y', '/dev/bcache0']) + + check_call.reset_mock() + + def just_raise(*args): + raise utils.DeviceError() + + check_output.side_effect = just_raise + utils.remove_lvm('') + check_call.assert_not_called() + + @patch.object(utils, 'wipe_disk') + @patch.object(utils, 'bcache_remove') @patch.object(utils, 'create_partition') @patch.object(utils, 'setup_bcache') - def test_partition_iter(self, setup_bcache, create_partition): + def test_partition_iter(self, setup_bcache, create_partition, + bcache_remove, wipe_disk): create_partition.side_effect = \ lambda c, s, n: c + '|' + str(s) + '|' + str(n) setup_bcache.side_effect = lambda *args: args @@ -239,6 +320,8 @@ class CephUtilsTestCase(unittest.TestCase): 200, ['dev1', 'dev2', 'dev3']) piter.create_bcache('dev1') setup_bcache.assert_called_with('dev1', '/dev/nvm0n1|200|0') + piter.cleanup('dev1') + bcache_remove.assert_called() setup_bcache.mock_reset() piter.create_bcache('dev2') setup_bcache.assert_called_with('dev2', '/dev/nvm0n2|200|0') @@ -258,3 +341,14 @@ class CephUtilsTestCase(unittest.TestCase): # 300GB across 3 devices, i.e: 100 for each. self.assertEqual(100, next(piter)) self.assertEqual(100, next(piter)) + + @patch.object(utils.subprocess, 'check_output') + def test_parent_device(self, check_output): + check_output.return_value = b''' +{"blockdevices": [ + {"name": "loop1p1", + "children": [ + {"name": "loop1"}] + }] +}''' + self.assertEqual(utils.get_parent_device('/dev/loop1p1'), '/dev/loop1')
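Taken together, ActionOSD.remove() builds the report that write_report() renders, using the same keys the tests above assert on. A minimal sketch of the shape (device paths illustrative):

    # Sketch: report handed to write_report(); keys mirror the
    # 'report.update(...)' calls in ActionOSD.remove().
    report = {
        '/dev/sdb': {                         # backing device of the removed OSD
            'osd-ids': 'osd.1',
            'cache-devices': '/dev/nvme0n1',  # parent of the caching partition
            'partition-size': 5,              # caching partition size in GB
        },
    }
    # With format=text, each entry renders roughly as:
    #   juju run-action ceph-osd/0 add-disk osd-devices=/dev/sdb \
    #       osd-ids=osd.1 cache-devices=/dev/nvme0n1 partition-size=5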