Browse Source

Add pre-flight check for device pristinity

Add `non-pristine` key to `list-disks` action.

No longer attempt to do initializtion of `osd-journal` devices.

Make py27 test noop

Flip pep8 test to py3

Partial-Bug: #1698154
Change-Id: I0ca574fa7f0683b4e8a693b9f62fbf6b39689789
Depends-On: I90a866aa138d18e4242783c42d4c7c587f696d7d
Frode Nordahl 10 months ago
parent
commit
352d699387
No account linked to committer's email address
9 changed files with 252 additions and 115 deletions
  1. 12
    1
      actions.yaml
  2. 19
    4
      actions/list_disks.py
  3. 34
    59
      hooks/ceph_hooks.py
  4. 20
    1
      hooks/utils.py
  5. 35
    10
      lib/ceph/utils.py
  6. 64
    0
      tests/basic_deployment.py
  7. 5
    5
      tox.ini
  8. 0
    35
      unit_tests/test_ceph_hooks.py
  9. 63
    0
      unit_tests/test_ceph_utils.py

+ 12
- 1
actions.yaml View File

@@ -25,7 +25,18 @@ resume:
25 25
     Set the local osd units in the charm to 'in'. Note that the pause option
26 26
     does NOT stop the osd processes.
27 27
 list-disks:
28
-  description: List the unmounted disk on the specified unit
28
+  description: |
29
+    List disks
30
+    .
31
+    The 'disks' key is populated with block devices that are known by udev,
32
+    are not mounted and not mentioned in 'osd-journal' configuration option.
33
+    .
34
+    The 'blacklist' key is populated with osd-devices in the blacklist stored
35
+    in the local kv store of this specific unit.
36
+    .
37
+    The 'non-pristine' key is populated with block devices that are known by
38
+    udev, are not mounted, not mentioned in 'osd-journal' configuration option
39
+    and are currently not eligible for use because of presence of foreign data.
29 40
 add-disk:
30 41
   description: Add disk(s) to Ceph
31 42
   params:

+ 19
- 4
actions/list_disks.py View File

@@ -15,10 +15,17 @@
15 15
 # limitations under the License.
16 16
 
17 17
 """
18
-List unmounted devices.
18
+List disks
19 19
 
20
-This script will get all block devices known by udev and check if they
21
-are mounted so that we can give unmounted devices to the administrator.
20
+The 'disks' key is populated with block devices that are known by udev,
21
+are not mounted and not mentioned in 'osd-journal' configuration option.
22
+
23
+The 'blacklist' key is populated with osd-devices in the blacklist stored
24
+in the local kv store of this specific unit.
25
+
26
+The 'non-pristine' key is populated with block devices that are known by
27
+udev, are not mounted, not mentioned in 'osd-journal' configuration option
28
+and are currently not eligible for use because of presence of foreign data.
22 29
 """
23 30
 
24 31
 import sys
@@ -32,7 +39,15 @@ import ceph.utils
32 39
 import utils
33 40
 
34 41
 if __name__ == '__main__':
42
+    non_pristine = []
43
+    osd_journal = utils.get_journal_devices()
44
+    for dev in list(set(ceph.utils.unmounted_disks()) - set(osd_journal)):
45
+        if (not ceph.utils.is_active_bluestore_device(dev) and
46
+                not ceph.utils.is_pristine_disk(dev)):
47
+            non_pristine.append(dev)
48
+
35 49
     hookenv.action_set({
36
-        'disks': ceph.utils.unmounted_disks(),
50
+        'disks': list(set(ceph.utils.unmounted_disks()) - set(osd_journal)),
37 51
         'blacklist': utils.get_blacklist(),
52
+        'non-pristine': non_pristine,
38 53
     })

+ 34
- 59
hooks/ceph_hooks.py View File

@@ -20,7 +20,6 @@ import glob
20 20
 import os
21 21
 import shutil
22 22
 import sys
23
-import tempfile
24 23
 import socket
25 24
 import subprocess
26 25
 import netifaces
@@ -41,6 +40,7 @@ from charmhelpers.core.hookenv import (
41 40
     Hooks,
42 41
     UnregisteredHookError,
43 42
     service_name,
43
+    status_get,
44 44
     status_set,
45 45
     storage_get,
46 46
     storage_list,
@@ -76,6 +76,7 @@ from utils import (
76 76
     get_public_addr,
77 77
     get_cluster_addr,
78 78
     get_blacklist,
79
+    get_journal_devices,
79 80
 )
80 81
 from charmhelpers.contrib.openstack.alternatives import install_alternative
81 82
 from charmhelpers.contrib.network.ip import (
@@ -85,6 +86,9 @@ from charmhelpers.contrib.network.ip import (
85 86
 )
86 87
 from charmhelpers.contrib.storage.linux.ceph import (
87 88
     CephConfContext)
89
+from charmhelpers.contrib.storage.linux.utils import (
90
+    is_device_mounted,
91
+)
88 92
 from charmhelpers.contrib.charmsupport import nrpe
89 93
 from charmhelpers.contrib.hardening.harden import harden
90 94
 
@@ -357,38 +361,6 @@ def emit_cephconf(upgrading=False):
357 361
                         charm_ceph_conf, 90)
358 362
 
359 363
 
360
-JOURNAL_ZAPPED = '/var/lib/ceph/journal_zapped'
361
-
362
-
363
-def read_zapped_journals():
364
-    if os.path.exists(JOURNAL_ZAPPED):
365
-        with open(JOURNAL_ZAPPED, 'rt', encoding='UTF-8') as zapfile:
366
-            zapped = set(
367
-                filter(None,
368
-                       [l.strip() for l in zapfile.readlines()]))
369
-            log("read zapped: {}".format(zapped), level=DEBUG)
370
-            return zapped
371
-    return set()
372
-
373
-
374
-def write_zapped_journals(journal_devs):
375
-    tmpfh, tmpfile = tempfile.mkstemp()
376
-    with os.fdopen(tmpfh, 'wb') as zapfile:
377
-        log("write zapped: {}".format(journal_devs),
378
-            level=DEBUG)
379
-        zapfile.write('\n'.join(sorted(list(journal_devs))).encode('UTF-8'))
380
-    shutil.move(tmpfile, JOURNAL_ZAPPED)
381
-
382
-
383
-def check_overlap(journaldevs, datadevs):
384
-    if not journaldevs.isdisjoint(datadevs):
385
-        msg = ("Journal/data devices mustn't"
386
-               " overlap; journal: {0}, data: {1}".format(journaldevs,
387
-                                                          datadevs))
388
-        log(msg, level=ERROR)
389
-        raise ValueError(msg)
390
-
391
-
392 364
 @hooks.hook('config-changed')
393 365
 @harden()
394 366
 def config_changed():
@@ -438,13 +410,28 @@ def prepare_disks_and_activate():
438 410
         vaultlocker.write_vaultlocker_conf(context)
439 411
 
440 412
     osd_journal = get_journal_devices()
441
-    check_overlap(osd_journal, set(get_devices()))
413
+    if not osd_journal.isdisjoint(set(get_devices())):
414
+        raise ValueError('`osd-journal` and `osd-devices` options must not'
415
+                         'overlap.')
442 416
     log("got journal devs: {}".format(osd_journal), level=DEBUG)
443
-    already_zapped = read_zapped_journals()
444
-    non_zapped = osd_journal - already_zapped
445
-    for journ in non_zapped:
446
-        ceph.maybe_zap_journal(journ)
447
-    write_zapped_journals(osd_journal)
417
+
418
+    # pre-flight check of eligible device pristinity
419
+    devices = get_devices()
420
+    # filter osd-devices that are file system paths
421
+    devices = [dev for dev in devices if dev.startswith('/dev')]
422
+    # filter osd-devices that does not exist on this unit
423
+    devices = [dev for dev in devices if os.path.exists(dev)]
424
+    # filter osd-devices that are already mounted
425
+    devices = [dev for dev in devices if not is_device_mounted(dev)]
426
+    # filter osd-devices that are active bluestore devices
427
+    devices = [dev for dev in devices
428
+               if not ceph.is_active_bluestore_device(dev)]
429
+    log('Checking for pristine devices: "{}"'.format(devices), level=DEBUG)
430
+    if not all(ceph.is_pristine_disk(dev) for dev in devices):
431
+        status_set('blocked',
432
+                   'Non-pristine devices detected, consult '
433
+                   '`list-disks`, `zap-disk` and `blacklist-*` actions.')
434
+        return
448 435
 
449 436
     if ceph.is_bootstrapped():
450 437
         log('ceph bootstrapped, rescanning disks')
@@ -521,20 +508,6 @@ def get_devices():
521 508
     return [device for device in devices if device not in _blacklist]
522 509
 
523 510
 
524
-def get_journal_devices():
525
-    if config('osd-journal'):
526
-        devices = [l.strip() for l in config('osd-journal').split(' ')]
527
-    else:
528
-        devices = []
529
-    storage_ids = storage_list('osd-journals')
530
-    devices.extend((storage_get('location', s) for s in storage_ids))
531
-
532
-    # Filter out any devices in the action managed unit-local device blacklist
533
-    _blacklist = get_blacklist()
534
-    return set(device for device in devices
535
-               if device not in _blacklist and os.path.exists(device))
536
-
537
-
538 511
 @hooks.hook('mon-relation-changed',
539 512
             'mon-relation-departed')
540 513
 def mon_relation():
@@ -631,13 +604,15 @@ def assess_status():
631 604
 
632 605
     # Check for OSD device creation parity i.e. at least some devices
633 606
     # must have been presented and used for this charm to be operational
607
+    (prev_status, prev_message) = status_get()
634 608
     running_osds = ceph.get_running_osds()
635
-    if not running_osds:
636
-        status_set('blocked',
637
-                   'No block devices detected using current configuration')
638
-    else:
639
-        status_set('active',
640
-                   'Unit is ready ({} OSD)'.format(len(running_osds)))
609
+    if prev_status != 'blocked':
610
+        if not running_osds:
611
+            status_set('blocked',
612
+                       'No block devices detected using current configuration')
613
+        else:
614
+            status_set('active',
615
+                       'Unit is ready ({} OSD)'.format(len(running_osds)))
641 616
 
642 617
 
643 618
 @hooks.hook('update-status')

+ 20
- 1
hooks/utils.py View File

@@ -12,8 +12,10 @@
12 12
 # See the License for the specific language governing permissions and
13 13
 # limitations under the License.
14 14
 
15
-import socket
16 15
 import re
16
+import os
17
+import socket
18
+
17 19
 from charmhelpers.core.hookenv import (
18 20
     unit_get,
19 21
     cached,
@@ -22,6 +24,8 @@ from charmhelpers.core.hookenv import (
22 24
     log,
23 25
     DEBUG,
24 26
     status_set,
27
+    storage_get,
28
+    storage_list,
25 29
 )
26 30
 from charmhelpers.core import unitdata
27 31
 from charmhelpers.fetch import (
@@ -39,6 +43,7 @@ from charmhelpers.contrib.network.ip import (
39 43
     get_ipv6_addr
40 44
 )
41 45
 
46
+
42 47
 TEMPLATES_DIR = 'templates'
43 48
 
44 49
 try:
@@ -213,3 +218,17 @@ def get_blacklist():
213 218
     """Get blacklist stored in the local kv() store"""
214 219
     db = unitdata.kv()
215 220
     return db.get('osd-blacklist', [])
221
+
222
+
223
+def get_journal_devices():
224
+    if config('osd-journal'):
225
+        devices = [l.strip() for l in config('osd-journal').split(' ')]
226
+    else:
227
+        devices = []
228
+    storage_ids = storage_list('osd-journals')
229
+    devices.extend((storage_get('location', s) for s in storage_ids))
230
+
231
+    # Filter out any devices in the action managed unit-local device blacklist
232
+    _blacklist = get_blacklist()
233
+    return set(device for device in devices
234
+               if device not in _blacklist and os.path.exists(device))

+ 35
- 10
lib/ceph/utils.py View File

@@ -66,7 +66,6 @@ from charmhelpers.contrib.storage.linux.ceph import (
66 66
 from charmhelpers.contrib.storage.linux.utils import (
67 67
     is_block_device,
68 68
     is_device_mounted,
69
-    zap_disk,
70 69
 )
71 70
 from charmhelpers.contrib.openstack.utils import (
72 71
     get_os_codename_install_source,
@@ -870,7 +869,42 @@ def get_partition_list(dev):
870 869
         raise
871 870
 
872 871
 
872
+def is_pristine_disk(dev):
873
+    """
874
+    Read first 2048 bytes (LBA 0 - 3) of block device to determine whether it
875
+    is actually all zeros and safe for us to use.
876
+
877
+    Existing partitioning tools does not discern between a failure to read from
878
+    block device, failure to understand a partition table and the fact that a
879
+    block device has no partition table.  Since we need to be positive about
880
+    which is which we need to read the device directly and confirm ourselves.
881
+
882
+    :param dev: Path to block device
883
+    :type dev: str
884
+    :returns: True all 2048 bytes == 0x0, False if not
885
+    :rtype: bool
886
+    """
887
+    want_bytes = 2048
888
+
889
+    f = open(dev, 'rb')
890
+    data = f.read(want_bytes)
891
+    read_bytes = len(data)
892
+    if read_bytes != want_bytes:
893
+        log('{}: short read, got {} bytes expected {}.'
894
+            .format(dev, read_bytes, want_bytes), level=WARNING)
895
+        return False
896
+
897
+    return all(byte == 0x0 for byte in data)
898
+
899
+
873 900
 def is_osd_disk(dev):
901
+    db = kv()
902
+    osd_devices = db.get('osd-devices', [])
903
+    if dev in osd_devices:
904
+        log('Device {} already processed by charm,'
905
+            ' skipping'.format(dev))
906
+        return True
907
+
874 908
     partitions = get_partition_list(dev)
875 909
     for partition in partitions:
876 910
         try:
@@ -1296,15 +1330,6 @@ def update_monfs():
1296 1330
             pass
1297 1331
 
1298 1332
 
1299
-def maybe_zap_journal(journal_dev):
1300
-    if is_osd_disk(journal_dev):
1301
-        log('Looks like {} is already an OSD data'
1302
-            ' or journal, skipping.'.format(journal_dev))
1303
-        return
1304
-    zap_disk(journal_dev)
1305
-    log("Zapped journal device {}".format(journal_dev))
1306
-
1307
-
1308 1333
 def get_partitions(dev):
1309 1334
     cmd = ['partx', '--raw', '--noheadings', dev]
1310 1335
     try:

+ 64
- 0
tests/basic_deployment.py View File

@@ -15,6 +15,7 @@
15 15
 # limitations under the License.
16 16
 
17 17
 import amulet
18
+import re
18 19
 import time
19 20
 
20 21
 import keystoneclient
@@ -712,6 +713,69 @@ class CephOsdBasicDeployment(OpenStackAmuletDeployment):
712 713
                                                   mtime, unit_name))
713 714
             amulet.raise_status('Folder mtime is older than provided mtime')
714 715
 
716
+    def test_901_blocked_when_non_pristine_disk_appears(self):
717
+        """
718
+        Validate that charm goes into blocked state when it is presented with
719
+        new block devices that have foreign data on them.
720
+
721
+        Instances used in UOSCI has a flavour with ephemeral storage in
722
+        addition to the bootable instance storage.  The ephemeral storage
723
+        device is partitioned, formatted and mounted early in the boot process
724
+        by cloud-init.
725
+
726
+        As long as the device is mounted the charm will not attempt to use it.
727
+
728
+        If we unmount it and trigger the config-changed hook the block device
729
+        will appear as a new and previously untouched device for the charm.
730
+
731
+        One of the first steps of device eligibility checks should be to make
732
+        sure we are seeing a pristine and empty device before doing any
733
+        further processing.
734
+
735
+        As the ephemeral device will have data on it we can use it to validate
736
+        that these checks work as intended.
737
+        """
738
+        u.log.debug('Checking behaviour when non-pristine disks appear...')
739
+        u.log.debug('Configuring ephemeral-unmount...')
740
+        self.d.configure('ceph-osd', {'ephemeral-unmount': '/mnt',
741
+                                      'osd-devices': '/dev/vdb'})
742
+        self._auto_wait_for_status(message=re.compile('Non-pristine.*'),
743
+                                   include_only=['ceph-osd'])
744
+        u.log.debug('Units now in blocked state, running zap-disk action...')
745
+        action_ids = []
746
+        self.ceph_osd_sentry = self.d.sentry['ceph-osd'][0]
747
+        for unit in range(0, 3):
748
+            zap_disk_params = {
749
+                'devices': '/dev/vdb',
750
+                'i-really-mean-it': True,
751
+            }
752
+            action_id = u.run_action(self.d.sentry['ceph-osd'][unit],
753
+                                     'zap-disk', params=zap_disk_params)
754
+            action_ids.append(action_id)
755
+        for unit in range(0, 3):
756
+            assert u.wait_on_action(action_ids[unit]), (
757
+                'zap-disk action failed.')
758
+
759
+        u.log.debug('Running add-disk action...')
760
+        action_ids = []
761
+        for unit in range(0, 3):
762
+            add_disk_params = {
763
+                'osd-devices': '/dev/vdb',
764
+            }
765
+            action_id = u.run_action(self.d.sentry['ceph-osd'][unit],
766
+                                     'add-disk', params=add_disk_params)
767
+            action_ids.append(action_id)
768
+
769
+        # NOTE(fnordahl): LP: #1774694
770
+        # for unit in range(0, 3):
771
+        #     assert u.wait_on_action(action_ids[unit]), (
772
+        #         'add-disk action failed.')
773
+
774
+        u.log.debug('Wait for idle/ready status...')
775
+        self._auto_wait_for_status(include_only=['ceph-osd'])
776
+
777
+        u.log.debug('OK')
778
+
715 779
     def test_910_pause_and_resume(self):
716 780
         """The services can be paused and resumed. """
717 781
         u.log.debug('Checking pause and resume actions...')

+ 5
- 5
tox.ini View File

@@ -18,11 +18,11 @@ whitelist_externals = juju
18 18
 passenv = HOME TERM AMULET_* CS_API_*
19 19
 
20 20
 [testenv:py27]
21
-basepython = python2.7
22
-deps = -r{toxinidir}/requirements.txt
23
-       -r{toxinidir}/test-requirements.txt
24
-# temporarily disable py27
25
-commands = /bin/true
21
+# ceph charms are Python3-only, but py27 unit test target
22
+# is required by OpenStack Governance.  Remove this shim as soon as
23
+# permitted.  http://governance.openstack.org/reference/cti/python_cti.html
24
+whitelist_externals = true
25
+commands = true
26 26
 
27 27
 [testenv:py35]
28 28
 basepython = python3.5

+ 0
- 35
unit_tests/test_ceph_hooks.py View File

@@ -405,21 +405,6 @@ class CephHooksTestCase(unittest.TestCase):
405 405
         devices = ceph_hooks.get_devices()
406 406
         self.assertEqual(devices, ['/dev/vda', '/dev/vdb'])
407 407
 
408
-    @patch('os.path.exists')
409
-    @patch.object(ceph_hooks, 'storage_list')
410
-    @patch.object(ceph_hooks, 'config')
411
-    def test_get_journal_devices(self, mock_config, mock_storage_list,
412
-                                 mock_os_path_exists):
413
-        '''Devices returned as expected'''
414
-        config = {'osd-journal': '/dev/vda /dev/vdb'}
415
-        mock_config.side_effect = lambda key: config[key]
416
-        mock_storage_list.return_value = []
417
-        mock_os_path_exists.return_value = True
418
-        devices = ceph_hooks.get_journal_devices()
419
-        mock_storage_list.assert_called()
420
-        mock_os_path_exists.assert_called()
421
-        self.assertEqual(devices, set(['/dev/vda', '/dev/vdb']))
422
-
423 408
     @patch.object(ceph_hooks, 'get_blacklist')
424 409
     @patch.object(ceph_hooks, 'storage_list')
425 410
     @patch.object(ceph_hooks, 'config')
@@ -435,26 +420,6 @@ class CephHooksTestCase(unittest.TestCase):
435 420
         mock_get_blacklist.assert_called()
436 421
         self.assertEqual(devices, ['/dev/vdb'])
437 422
 
438
-    @patch('os.path.exists')
439
-    @patch.object(ceph_hooks, 'get_blacklist')
440
-    @patch.object(ceph_hooks, 'storage_list')
441
-    @patch.object(ceph_hooks, 'config')
442
-    def test_get_journal_devices_blacklist(self, mock_config,
443
-                                           mock_storage_list,
444
-                                           mock_get_blacklist,
445
-                                           mock_os_path_exists):
446
-        '''Devices returned as expected when blacklist in effect'''
447
-        config = {'osd-journal': '/dev/vda /dev/vdb'}
448
-        mock_config.side_effect = lambda key: config[key]
449
-        mock_storage_list.return_value = []
450
-        mock_get_blacklist.return_value = ['/dev/vda']
451
-        mock_os_path_exists.return_value = True
452
-        devices = ceph_hooks.get_journal_devices()
453
-        mock_storage_list.assert_called()
454
-        mock_os_path_exists.assert_called()
455
-        mock_get_blacklist.assert_called()
456
-        self.assertEqual(devices, set(['/dev/vdb']))
457
-
458 423
     @patch.object(ceph_hooks, 'log')
459 424
     @patch.object(ceph_hooks, 'config')
460 425
     @patch('os.environ')

+ 63
- 0
unit_tests/test_ceph_utils.py View File

@@ -0,0 +1,63 @@
1
+# Copyright 2016 Canonical Ltd
2
+
3
+#
4
+# Licensed under the Apache License, Version 2.0 (the "License");
5
+# you may not use this file except in compliance with the License.
6
+# You may obtain a copy of the License at
7
+#
8
+#  http://www.apache.org/licenses/LICENSE-2.0
9
+#
10
+# Unless required by applicable law or agreed to in writing, software
11
+# distributed under the License is distributed on an "AS IS" BASIS,
12
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+# See the License for the specific language governing permissions and
14
+# limitations under the License.
15
+
16
+import unittest
17
+
18
+from mock import patch
19
+
20
+with patch('charmhelpers.contrib.hardening.harden.harden') as mock_dec:
21
+    mock_dec.side_effect = (lambda *dargs, **dkwargs: lambda f:
22
+                            lambda *args, **kwargs: f(*args, **kwargs))
23
+    import utils
24
+
25
+
26
+class CephUtilsTestCase(unittest.TestCase):
27
+    def setUp(self):
28
+        super(CephUtilsTestCase, self).setUp()
29
+
30
+    @patch('os.path.exists')
31
+    @patch.object(utils, 'storage_list')
32
+    @patch.object(utils, 'config')
33
+    def test_get_journal_devices(self, mock_config, mock_storage_list,
34
+                                 mock_os_path_exists):
35
+        '''Devices returned as expected'''
36
+        config = {'osd-journal': '/dev/vda /dev/vdb'}
37
+        mock_config.side_effect = lambda key: config[key]
38
+        mock_storage_list.return_value = []
39
+        mock_os_path_exists.return_value = True
40
+        devices = utils.get_journal_devices()
41
+        mock_storage_list.assert_called()
42
+        mock_os_path_exists.assert_called()
43
+        self.assertEqual(devices, set(['/dev/vda', '/dev/vdb']))
44
+
45
+    @patch('os.path.exists')
46
+    @patch.object(utils, 'get_blacklist')
47
+    @patch.object(utils, 'storage_list')
48
+    @patch.object(utils, 'config')
49
+    def test_get_journal_devices_blacklist(self, mock_config,
50
+                                           mock_storage_list,
51
+                                           mock_get_blacklist,
52
+                                           mock_os_path_exists):
53
+        '''Devices returned as expected when blacklist in effect'''
54
+        config = {'osd-journal': '/dev/vda /dev/vdb'}
55
+        mock_config.side_effect = lambda key: config[key]
56
+        mock_storage_list.return_value = []
57
+        mock_get_blacklist.return_value = ['/dev/vda']
58
+        mock_os_path_exists.return_value = True
59
+        devices = utils.get_journal_devices()
60
+        mock_storage_list.assert_called()
61
+        mock_os_path_exists.assert_called()
62
+        mock_get_blacklist.assert_called()
63
+        self.assertEqual(devices, set(['/dev/vdb']))

Loading…
Cancel
Save