From 09ead690968a221e28d75f0e5c190f74172df67f Mon Sep 17 00:00:00 2001
From: Matthew Oliver <matt@oliver.net.au>
Date: Wed, 26 May 2021 15:54:52 +1000
Subject: [PATCH] Add warn level logging if we hit un-recoverable EC rebuild

Currently there is an 'all(fragments)' check in the ECAppIter to know
when the iter is done. It assumes that all the fragments will be emtpy
when they are done. But if this is not the case we should log.

We have no idea if how often this can happen but currently we don't log
it so have no idea. But it could be a cause of some EC short read 500
that clusters get from time to time.

Here is an example of the message:

  May 26 23:42:57 saio proxy-server[7942]: Un-recoverable fragment rebuild. Only received 3/4 fragments for '/a/c/o' (txn: txb43f239d78834bdabcaa6-0060aedd01) (client_ip: 127.0.0.1)

Change-Id: If68e72d613685081a3149aedbfd07a69b8669d7f
---
 swift/proxy/controllers/obj.py          |  8 +++++++-
 test/unit/proxy/controllers/test_obj.py | 14 ++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/swift/proxy/controllers/obj.py b/swift/proxy/controllers/obj.py
index e7af91ffd4..e7bd572fca 100644
--- a/swift/proxy/controllers/obj.py
+++ b/swift/proxy/controllers/obj.py
@@ -1438,7 +1438,13 @@ class ECAppIter(object):
                 # with an un-reconstructible list of fragments - so we'll
                 # break out of the iter so WSGI can tear down the broken
                 # connection.
-                if not all(fragments):
+                frags_with_data = sum([1 for f in fragments if f])
+                if frags_with_data < len(fragments):
+                    if frags_with_data > 0:
+                        self.logger.warning(
+                            'Un-recoverable fragment rebuild. Only received '
+                            '%d/%d fragments for %r', frags_with_data,
+                            len(fragments), quote(self.path))
                     break
                 try:
                     segment = self.policy.pyeclib_driver.decode(fragments)
diff --git a/test/unit/proxy/controllers/test_obj.py b/test/unit/proxy/controllers/test_obj.py
index 0ccf86249f..4bbef7e085 100644
--- a/test/unit/proxy/controllers/test_obj.py
+++ b/test/unit/proxy/controllers/test_obj.py
@@ -4920,9 +4920,9 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
         # re-raise the ChunkReadTimeout and still want a txn-id
         with set_http_connect(*status_codes, body_iter=body_iter,
                               headers=headers), \
-            mock.patch(
-                'swift.proxy.controllers.obj.ECFragGetter.fast_forward',
-                side_effect=ValueError()):
+                mock.patch(
+                    'swift.proxy.controllers.obj.ECFragGetter.fast_forward',
+                    side_effect=ValueError()):
             resp = req.get_response(self.app)
             self.assertEqual(resp.status_int, 200)
             self.assertNotEqual(md5(resp.body).hexdigest(), etag)
@@ -4930,7 +4930,13 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
         self.assertEqual(2, len(error_lines))
         self.assertIn('Unable to fast forward', error_lines[0])
         self.assertIn('Timeout fetching', error_lines[1])
-        for line in self.logger.logger.records['ERROR']:
+        warning_lines = self.logger.get_lines_for_level('warning')
+        self.assertEqual(1, len(warning_lines))
+        self.assertIn(
+            'Un-recoverable fragment rebuild. Only received 9/10 fragments',
+            warning_lines[0])
+        for line in self.logger.logger.records['ERROR'] + \
+                self.logger.logger.records['WARNING']:
             self.assertIn(req.headers['x-trans-id'], line)
 
     def test_GET_read_timeout_resume_mixed_etag(self):