From e9cd9f74a5264f396783ca2a4548a3da7cee7bff Mon Sep 17 00:00:00 2001
From: Matthew Oliver <matt@oliver.net.au>
Date: Mon, 12 Aug 2019 16:16:17 +1000
Subject: [PATCH] sharder: Keep cleaving on empty shard ranges

When a container is being cleaved there is a possiblity that we're
dealing with an empty or near empty container created on a handoff node.
These containers may have a valid list of shard ranges, so would need
to cleave to the new shards.
Currently, when using a `cleave_batch_size` that is smaller then the
number of shard ranges on the cleaving container, these containers will
have to take a few shard passes to shard, even though there maybe
nothing in them.

This is worse if a really large container is sharding, and due to being
slow, error limitted a node causing a new container on a handoff
location. This empty container would have a large number of shard ranges
and could take a _very_ long time to shard away, slowing the process
down.

This patch eliminates the issue by detecting when no objects are
returned for a shard range. The `_cleave_shard_range` method now
returns 3 possible results:

  - CLEAVE_SUCCESS
  - CLEAVE_FAILED
  - CLEAVE_EMPTY

They all are pretty self explanitory. When `CLEAVE_EMPTY` is returned
the code will:

  - Log
  - Not replicate the empty temp shard container sitting in a
    handoff location
  - Not count the shard range in the `cleave_batch_size` count
  - Update the cleaving context so sharding can move forward

If there already is a shard range DB existing on a handoff node to use
then the sharder wont skip it, even if there are no objects, it'll
replicate it and treat it as normal, including using a `cleave_batch_size`
slot.

Change-Id: Id338f6c3187f93454bcdf025a32a073284a4a159
Closes-Bug: #1839355
---
 swift/container/sharder.py          | 69 ++++++++++++++++++++++-------
 test/unit/container/test_sharder.py | 59 ++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 16 deletions(-)

diff --git a/swift/container/sharder.py b/swift/container/sharder.py
index 3912fb35d6..007d511e87 100644
--- a/swift/container/sharder.py
+++ b/swift/container/sharder.py
@@ -40,6 +40,11 @@ from swift.container.backend import ContainerBroker, \
 from swift.container.replicator import ContainerReplicator
 
 
+CLEAVE_SUCCESS = 0
+CLEAVE_FAILED = 1
+CLEAVE_EMPTY = 2
+
+
 def sharding_enabled(broker):
     # NB all shards will by default have been created with
     # X-Container-Sysmeta-Sharding set and will therefore be candidates for
@@ -636,6 +641,7 @@ class ContainerSharder(ContainerReplicator):
         """
         part = self.ring.get_part(shard_range.account, shard_range.container)
         node = self.find_local_handoff_for_part(part)
+        put_timestamp = Timestamp.now().internal
         if not node:
             raise DeviceUnavailable(
                 'No mounted devices found suitable for creating shard broker '
@@ -644,7 +650,7 @@ class ContainerSharder(ContainerReplicator):
         shard_broker = ContainerBroker.create_broker(
             os.path.join(self.root, node['device']), part, shard_range.account,
             shard_range.container, epoch=shard_range.epoch,
-            storage_policy_index=policy_index)
+            storage_policy_index=policy_index, put_timestamp=put_timestamp)
 
         # Get the valid info into the broker.container, etc
         shard_broker.get_info()
@@ -654,7 +660,7 @@ class ContainerSharder(ContainerReplicator):
             'X-Container-Sysmeta-Sharding':
                 ('True', Timestamp.now().internal)})
 
-        return part, shard_broker, node['id']
+        return part, shard_broker, node['id'], put_timestamp
 
     def _audit_root_container(self, broker):
         # This is the root container, and therefore the tome of knowledge,
@@ -844,7 +850,7 @@ class ContainerSharder(ContainerReplicator):
             last_index = next_index = 0
             for obj in objs:
                 if dest_shard_range is None:
-                    # no more destinations: yield remainder of batch and return
+                    # no more destinations: yield remainder of batch and bail
                     # NB there may be more batches of objects but none of them
                     # will be placed so no point fetching them
                     yield objs[last_index:], None, info
@@ -933,7 +939,7 @@ class ContainerSharder(ContainerReplicator):
                 continue
 
             if dest_shard_range not in dest_brokers:
-                part, dest_broker, node_id = self._get_shard_broker(
+                part, dest_broker, node_id, _junk = self._get_shard_broker(
                     dest_shard_range, src_broker.root_path, policy_index)
                 # save the broker info that was sampled prior to the *first*
                 # yielded objects for this destination
@@ -1162,12 +1168,15 @@ class ContainerSharder(ContainerReplicator):
         start = time.time()
         policy_index = broker.storage_policy_index
         try:
-            shard_part, shard_broker, node_id = self._get_shard_broker(
-                shard_range, broker.root_path, policy_index)
+            shard_part, shard_broker, node_id, put_timestamp = \
+                self._get_shard_broker(shard_range, broker.root_path,
+                                       policy_index)
         except DeviceUnavailable as duex:
             self.logger.warning(str(duex))
             self._increment_stat('cleaved', 'failure', statsd=True)
-            return False
+            return CLEAVE_FAILED
+
+        own_shard_range = broker.get_own_shard_range()
 
         # only cleave from the retiring db - misplaced objects handler will
         # deal with any objects in the fresh db
@@ -1178,13 +1187,36 @@ class ContainerSharder(ContainerReplicator):
         source_db_id = source_broker.get_info()['id']
         source_max_row = source_broker.get_max_row()
         sync_point = shard_broker.get_sync(source_db_id)
-        if sync_point < source_max_row:
+        if sync_point < source_max_row or source_max_row == -1:
             sync_from_row = max(cleaving_context.last_cleave_to_row or -1,
                                 sync_point)
+            objects = None
             for objects, info in self.yield_objects(
                     source_broker, shard_range,
                     since_row=sync_from_row):
                 shard_broker.merge_items(objects)
+            if objects is None:
+                self.logger.info("Cleaving '%s': %r - zero objects found",
+                                 broker.path, shard_range)
+                if shard_broker.get_info()['put_timestamp'] == put_timestamp:
+                    # This was just created; don't need to replicate this
+                    # SR because there was nothing there. So cleanup and
+                    # remove the shard_broker from its hand off location.
+                    self.delete_db(shard_broker)
+                    cleaving_context.cursor = shard_range.upper_str
+                    cleaving_context.ranges_done += 1
+                    cleaving_context.ranges_todo -= 1
+                    if shard_range.upper >= own_shard_range.upper:
+                        # cleaving complete
+                        cleaving_context.cleaving_done = True
+                    cleaving_context.store(broker)
+                    # Because nothing was here we wont count it in the shard
+                    # batch count.
+                    return CLEAVE_EMPTY
+                # Else, it wasn't newly created by us, and
+                # we don't know what's in it or why. Let it get
+                # replicated and counted in the batch count.
+
             # Note: the max row stored as a sync point is sampled *before*
             # objects are yielded to ensure that is less than or equal to
             # the last yielded row. Other sync points are also copied from the
@@ -1199,8 +1231,6 @@ class ContainerSharder(ContainerReplicator):
             self.logger.debug("Cleaving '%s': %r - shard db already in sync",
                               broker.path, shard_range)
 
-        own_shard_range = broker.get_own_shard_range()
-
         replication_quorum = self.existing_shard_replication_quorum
         if shard_range.includes(own_shard_range):
             # When shrinking, include deleted own (donor) shard range in
@@ -1242,7 +1272,7 @@ class ContainerSharder(ContainerReplicator):
                 '%s successes, %s required.', shard_range, broker.path,
                 replication_successes, replication_quorum)
             self._increment_stat('cleaved', 'failure', statsd=True)
-            return False
+            return CLEAVE_FAILED
 
         elapsed = round(time.time() - start, 3)
         self._min_stat('cleaved', 'min_time', elapsed)
@@ -1259,7 +1289,7 @@ class ContainerSharder(ContainerReplicator):
             'Cleaved %s for shard range %s in %gs.',
             broker.path, shard_range, elapsed)
         self._increment_stat('cleaved', 'success', statsd=True)
-        return True
+        return CLEAVE_SUCCESS
 
     def _cleave(self, broker):
         # Returns True if misplaced objects have been moved and the entire
@@ -1304,23 +1334,30 @@ class ContainerSharder(ContainerReplicator):
                               cleaving_context.ranges_todo, broker.path)
 
         ranges_done = []
-        for shard_range in ranges_todo[:self.cleave_batch_size]:
+        for shard_range in ranges_todo:
             if shard_range.state == ShardRange.FOUND:
                 break
             elif shard_range.state in (ShardRange.CREATED,
                                        ShardRange.CLEAVED,
                                        ShardRange.ACTIVE):
-                if self._cleave_shard_range(
-                        broker, cleaving_context, shard_range):
+                cleave_result = self._cleave_shard_range(
+                    broker, cleaving_context, shard_range)
+                if cleave_result == CLEAVE_SUCCESS:
                     ranges_done.append(shard_range)
-                else:
+                    if len(ranges_done) == self.cleave_batch_size:
+                        break
+                elif cleave_result == CLEAVE_FAILED:
                     break
+                # else, no errors, but no rows found either. keep going,
+                # and don't count it against our batch size
             else:
                 self.logger.warning('Unexpected shard range state for cleave',
                                     shard_range.state)
                 break
 
         if not ranges_done:
+            # _cleave_shard_range always store()s the context on success; make
+            # sure we *also* do that if we hit a failure right off the bat
             cleaving_context.store(broker)
         self.logger.debug(
             'Cleaved %s shard ranges for %s', len(ranges_done), broker.path)
diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py
index a563671802..641d885a12 100644
--- a/test/unit/container/test_sharder.py
+++ b/test/unit/container/test_sharder.py
@@ -1321,6 +1321,65 @@ class TestSharder(BaseTestSharder):
         self.assertEqual(8, context.cleave_to_row)
         self.assertEqual(8, context.max_row)
 
+    def test_cleave_root_empty_db_with_ranges(self):
+        broker = self._make_broker()
+        broker.enable_sharding(Timestamp.now())
+
+        shard_bounds = (('', 'd'), ('d', 'x'), ('x', ''))
+        shard_ranges = self._make_shard_ranges(
+            shard_bounds, state=ShardRange.CREATED)
+
+        broker.merge_shard_ranges(shard_ranges)
+        self.assertTrue(broker.set_sharding_state())
+
+        sharder_conf = {'cleave_batch_size': 1}
+        with self._mock_sharder(sharder_conf) as sharder:
+            self.assertTrue(sharder._cleave(broker))
+
+        info_lines = sharder.logger.get_lines_for_level('info')
+        expected_zero_obj = [line for line in info_lines
+                             if " - zero objects found" in line]
+        self.assertEqual(len(expected_zero_obj), len(shard_bounds))
+
+        cleaving_context = CleavingContext.load(broker)
+        # even though there is a cleave_batch_size of 1, we don't count empty
+        # ranges when cleaving seeing as they aren't replicated
+        self.assertEqual(cleaving_context.ranges_done, 3)
+        self.assertEqual(cleaving_context.ranges_todo, 0)
+        self.assertTrue(cleaving_context.cleaving_done)
+
+    def test_cleave_root_empty_db_with_pre_existing_shard_db_handoff(self):
+        broker = self._make_broker()
+        broker.enable_sharding(Timestamp.now())
+
+        shard_bounds = (('', 'd'), ('d', 'x'), ('x', ''))
+        shard_ranges = self._make_shard_ranges(
+            shard_bounds, state=ShardRange.CREATED)
+
+        broker.merge_shard_ranges(shard_ranges)
+        self.assertTrue(broker.set_sharding_state())
+
+        sharder_conf = {'cleave_batch_size': 1}
+        with self._mock_sharder(sharder_conf) as sharder:
+            # pre-create a shard broker on a handoff location. This will force
+            # the sharder to not skip it but instead force to replicate it and
+            # use up a cleave_batch_size count.
+            sharder._get_shard_broker(shard_ranges[0], broker.root_path,
+                                      0)
+            self.assertFalse(sharder._cleave(broker))
+
+        info_lines = sharder.logger.get_lines_for_level('info')
+        expected_zero_obj = [line for line in info_lines
+                             if " - zero objects found" in line]
+        self.assertEqual(len(expected_zero_obj), 1)
+
+        cleaving_context = CleavingContext.load(broker)
+        # even though there is a cleave_batch_size of 1, we don't count empty
+        # ranges when cleaving seeing as they aren't replicated
+        self.assertEqual(cleaving_context.ranges_done, 1)
+        self.assertEqual(cleaving_context.ranges_todo, 2)
+        self.assertFalse(cleaving_context.cleaving_done)
+
     def test_cleave_shard(self):
         broker = self._make_broker(account='.shards_a', container='shard_c')
         own_shard_range = ShardRange(