Add handoffs-only mode to DB replicators.

The object reconstructor has a handoffs-only mode that is very useful when a cluster requires rapid rebalancing, like when disks are nearing fullness. This mode's goal is to remove handoff partitions from disks without spending effort on primary partitions. The object replicator has a similar mode, though it varies in some details.

This commit adds a handoffs-only mode to the account and container replicators.

Change-Id: I588b151ee65ae49d204bd6bf58555504c15edf9f
Closes-Bug: 1668399
2018-02-16 16:37:58 -08:00 · 2018-02-16 16:37:58 -08:00 · 47fed6f2f9
commit 47fed6f2f9
parent 2bfd9c6a9b
4 changed files with 219 additions and 10 deletions
--- a/etc/account-server.conf-sample
+++ b/etc/account-server.conf-sample
@ -163,6 +163,25 @@ use = egg:swift#recon
 # Work only with ionice_class.
 # ionice_class =
 # ionice_priority =
+#
+# The handoffs_only mode option is for special-case emergency
+# situations such as full disks in the cluster. This option SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no

 [account-auditor]
 # You can override the default log routing for this app here (don't use set!):
--- a/etc/container-server.conf-sample
+++ b/etc/container-server.conf-sample
@ -172,6 +172,25 @@ use = egg:swift#recon
 # Work only with ionice_class.
 # ionice_class =
 # ionice_priority =
+#
+# The handoffs_only mode option is for special-case emergency
+# situations such as full disks in the cluster. This option SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no

 [container-updater]
 # You can override the default log routing for this app here (don't use set!):
--- a/swift/common/db_replicator.py
+++ b/swift/common/db_replicator.py
@ -87,13 +87,14 @@ def roundrobin_datadirs(datadirs):
    found (in their proper places). The partitions within each data
    dir are walked randomly, however.

-    :param datadirs: a list of (path, node_id) to walk
+    :param datadirs: a list of (path, node_id, partition_filter) to walk
    :returns: A generator of (partition, path_to_db_file, node_id)
    """

-    def walk_datadir(datadir, node_id):
+    def walk_datadir(datadir, node_id, part_filter):
        partitions = [pd for pd in os.listdir(datadir)
-                      if looks_like_partition(pd)]
+                      if looks_like_partition(pd)
+                      and (part_filter is None or part_filter(pd))]
        random.shuffle(partitions)
        for partition in partitions:
            part_dir = os.path.join(datadir, partition)
@ -125,7 +126,8 @@ def roundrobin_datadirs(datadirs):
                            if e.errno != errno.ENOTEMPTY:
                                raise

-    its = [walk_datadir(datadir, node_id) for datadir, node_id in datadirs]
+    its = [walk_datadir(datadir, node_id, filt)
+           for datadir, node_id, filt in datadirs]
    while its:
        for it in its:
            try:
@ -206,6 +208,7 @@ class Replicator(Daemon):
                                   self.recon_replicator)
        self.extract_device_re = re.compile('%s%s([^%s]+)' % (
            self.root, os.path.sep, os.path.sep))
+        self.handoffs_only = config_true_value(conf.get('handoffs_only', 'no'))

    def _zero_stats(self):
        """Zero out the stats."""
@ -631,6 +634,14 @@ class Replicator(Daemon):
            return match.groups()[0]
        return "UNKNOWN"

+    def handoffs_only_filter(self, device_id):
+        """
+        Build the partition filter used when handoffs_only is enabled.
+
+        :param device_id: ring id of the local device whose datadir is
+                          going to be walked
+        :returns: a function that takes a partition directory name and
+                  returns True only when device_id is not the id of any
+                  node returned by self.ring.get_part_nodes() for that
+                  partition
+        """
+        def filt(partition_dir):
+            primaries = self.ring.get_part_nodes(int(partition_dir))
+            # Keep the partition only if no primary node is this device.
+            return all(node['id'] != device_id for node in primaries)
+        return filt
+
    def report_up_to_date(self, full_info):
        return True

@ -642,6 +653,13 @@ class Replicator(Daemon):
        if not ips:
            self.logger.error(_('ERROR Failed to get my own IPs?'))
            return
+
+        if self.handoffs_only:
+            self.logger.warning(
+                'Starting replication pass with handoffs_only enabled. '
+                'This mode is not intended for normal '
+                'operation; use handoffs_only with care.')
+
        self._local_device_ids = set()
        found_local = False
        for node in self.ring.devs:
@ -664,7 +682,9 @@ class Replicator(Daemon):
                datadir = os.path.join(self.root, node['device'], self.datadir)
                if os.path.isdir(datadir):
                    self._local_device_ids.add(node['id'])
-                    dirs.append((datadir, node['id']))
+                    filt = (self.handoffs_only_filter(node['id'])
+                            if self.handoffs_only else None)
+                    dirs.append((datadir, node['id'], filt))
        if not found_local:
            self.logger.error("Can't find itself %s with port %s in ring "
                              "file, not replicating",
@ -675,6 +695,10 @@ class Replicator(Daemon):
                self._replicate_object, part, object_file, node_id)
        self.cpool.waitall()
        self.logger.info(_('Replication run OVER'))
+        if self.handoffs_only:
+            self.logger.warning(
+                'Finished replication pass with handoffs_only enabled. '
+                'If handoffs_only is no longer required, disable it.')
        self._report_stats()

    def run_forever(self, *args, **kwargs):
--- a/test/unit/common/test_db_replicator.py
+++ b/test/unit/common/test_db_replicator.py
@ -1220,7 +1220,8 @@ class TestDBReplicator(unittest.TestCase):
            self.assertTrue(os.path.isdir(dirpath))

        node_id = 1
-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
        expected = [
            ('450', os.path.join(datadir, db_path), node_id),
        ]
@ -1241,12 +1242,14 @@ class TestDBReplicator(unittest.TestCase):
        self.assertEqual({'18', '1054', '1060', '450'},
                         set(os.listdir(datadir)))

-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
        self.assertEqual(results, expected)
        self.assertEqual({'1054', '1060', '450'},
                         set(os.listdir(datadir)))

-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
        self.assertEqual(results, expected)
        # non db file in '1060' dir is not deleted and exception is handled
        self.assertEqual({'1060', '450'},
@ -1333,8 +1336,8 @@ class TestDBReplicator(unittest.TestCase):
                mock.patch(base + 'random.shuffle', _shuffle), \
                mock.patch(base + 'os.rmdir', _rmdir):

-            datadirs = [('/srv/node/sda/containers', 1),
-                        ('/srv/node/sdb/containers', 2)]
+            datadirs = [('/srv/node/sda/containers', 1, None),
+                        ('/srv/node/sdb/containers', 2, None)]
            results = list(db_replicator.roundrobin_datadirs(datadirs))
            # The results show that the .db files are returned, the devices
            # interleaved.
@ -1438,6 +1441,150 @@ class TestDBReplicator(unittest.TestCase):
                      replicator.logger)])


+class TestHandoffsOnly(unittest.TestCase):
+    """Tests for the DB replicator's handoffs_only mode."""
+
+    class FakeRing3Nodes(object):
+        # Minimal ring stand-in: the primaries for partition N are the
+        # _replicas consecutive entries of devs starting at index
+        # N % len(devs); the remaining devices are yielded as handoffs.
+        _replicas = 3
+
+        # Three nodes, two disks each
+        devs = [
+            dict(id=0, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.1', port=6201,
+                 replication_ip='10.0.0.1', replication_port=6201,
+                 device='sdp'),
+            dict(id=1, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.1', port=6201,
+                 replication_ip='10.0.0.1', replication_port=6201,
+                 device='sdq'),
+
+            dict(id=2, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.2', port=6201,
+                 replication_ip='10.0.0.2', replication_port=6201,
+                 device='sdp'),
+            dict(id=3, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.2', port=6201,
+                 replication_ip='10.0.0.2', replication_port=6201,
+                 device='sdq'),
+
+            dict(id=4, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.3', port=6201,
+                 replication_ip='10.0.0.3', replication_port=6201,
+                 device='sdp'),
+            dict(id=5, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.3', port=6201,
+                 replication_ip='10.0.0.3', replication_port=6201,
+                 device='sdq'),
+        ]
+
+        def __init__(self, *a, **kw):
+            pass
+
+        def get_part(self, account, container=None, obj=None):
+            return 0
+
+        def get_part_nodes(self, part):
+            nodes = []
+            for offset in range(self._replicas):
+                i = (part + offset) % len(self.devs)
+                nodes.append(self.devs[i])
+            return nodes
+
+        def get_more_nodes(self, part):
+            for offset in range(self._replicas, len(self.devs)):
+                i = (part + offset) % len(self.devs)
+                yield self.devs[i]
+
+    def _make_fake_db(self, disk, partition, db_hash):
+        # Create <root>/<disk>/containers/<partition>/<suffix>/<hash>/ one
+        # level at a time (tolerating levels that already exist), then drop
+        # an empty <hash>.db file into the innermost directory.
+        directories = [
+            os.path.join(self.root, disk),
+            os.path.join(self.root, disk, 'containers'),
+            os.path.join(self.root, disk, 'containers', str(partition)),
+            os.path.join(self.root, disk, 'containers', str(partition),
+                         db_hash[-3:]),
+            os.path.join(self.root, disk, 'containers', str(partition),
+                         db_hash[-3:], db_hash)]
+
+        for d in directories:
+            try:
+                os.mkdir(d)
+            except OSError as err:
+                if err.errno != errno.EEXIST:
+                    raise
+        file_path = os.path.join(directories[-1], db_hash + ".db")
+        with open(file_path, 'w'):
+            pass
+
+    def setUp(self):
+        self.root = mkdtemp()
+
+        # object disks; they're just here to make sure they don't trip us up
+        os.mkdir(os.path.join(self.root, 'sdc'))
+        os.mkdir(os.path.join(self.root, 'sdc', 'objects'))
+        os.mkdir(os.path.join(self.root, 'sdd'))
+        os.mkdir(os.path.join(self.root, 'sdd', 'objects'))
+
+        # part 0 belongs on sdp
+        self._make_fake_db('sdp', 0, '010101013cf2b7979af9eaa71cb67220')
+
+        # part 1 does not belong on sdp
+        self._make_fake_db('sdp', 1, 'abababab2b5368158355e799323b498d')
+
+        # part 1 belongs on sdq
+        self._make_fake_db('sdq', 1, '02020202e30f696a3cfa63d434a3c94e')
+
+        # part 2 does not belong on sdq
+        self._make_fake_db('sdq', 2, 'bcbcbcbc15d3835053d568c57e2c83b5')
+
+    def tearDown(self):
+        # unittest only calls tearDown() automatically; under any other
+        # name the temporary devices root made in setUp would be leaked.
+        rmtree(self.root, ignore_errors=True)
+
+    def test_scary_warnings(self):
+        # A pass with handoffs_only enabled must log exactly two warnings:
+        # one when the pass starts and one when it finishes.
+        logger = unit.FakeLogger()
+        replicator = TestReplicator({
+            'handoffs_only': 'yes',
+            'devices': self.root,
+            'bind_port': 6201,
+            'mount_check': 'no',
+        }, logger=logger)
+
+        with patch.object(db_replicator, 'whataremyips',
+                          return_value=['10.0.0.1']), \
+                patch.object(replicator, '_replicate_object'), \
+                patch.object(replicator, 'ring', self.FakeRing3Nodes()):
+            replicator.run_once()
+
+        self.assertEqual(
+            logger.get_lines_for_level('warning'),
+            [('Starting replication pass with handoffs_only enabled. This '
+              'mode is not intended for normal operation; use '
+              'handoffs_only with care.'),
+             ('Finished replication pass with handoffs_only enabled. '
+              'If handoffs_only is no longer required, disable it.')])
+
+    def test_skips_primary_partitions(self):
+        # whataremyips is patched to 10.0.0.1, the ip of ring devices
+        # 0 (sdp) and 1 (sdq). Only the DBs in partitions those devices
+        # are not primary for (part 1 on sdp, part 2 on sdq) should be
+        # handed to _replicate_object.
+        replicator = TestReplicator({
+            'handoffs_only': 'yes',
+            'devices': self.root,
+            'bind_port': 6201,
+            'mount_check': 'no',
+        })
+
+        with patch.object(db_replicator, 'whataremyips',
+                          return_value=['10.0.0.1']), \
+                patch.object(replicator, '_replicate_object') as mock_repl, \
+                patch.object(replicator, 'ring', self.FakeRing3Nodes()):
+            replicator.run_once()
+
+        self.assertEqual(sorted(mock_repl.mock_calls), [
+            mock.call('1', os.path.join(
+                self.root, 'sdp', 'containers', '1', '98d',
+                'abababab2b5368158355e799323b498d',
+                'abababab2b5368158355e799323b498d.db'), 0),
+            mock.call('2', os.path.join(
+                self.root, 'sdq', 'containers', '2', '3b5',
+                'bcbcbcbc15d3835053d568c57e2c83b5',
+                'bcbcbcbc15d3835053d568c57e2c83b5.db'), 1)])
+
+
 class TestReplToNode(unittest.TestCase):
    def setUp(self):
        db_replicator.ring = FakeRing()