Add warn level logging if we hit un-recoverable EC rebuild

Currently there is an 'all(fragments)' check in the ECAppIter to know
when the iter is done. It assumes that all the fragments will be emtpy
when they are done. But if this is not the case we should log.

We have no idea if how often this can happen but currently we don't log
it so have no idea. But it could be a cause of some EC short read 500
that clusters get from time to time.

Here is an example of the message:

  May 26 23:42:57 saio proxy-server[7942]: Un-recoverable fragment rebuild. Only received 3/4 fragments for '/a/c/o' (txn: txb43f239d78834bdabcaa6-0060aedd01) (client_ip: 127.0.0.1)

Change-Id: If68e72d613685081a3149aedbfd07a69b8669d7f
This commit is contained in:
Matthew Oliver 2021-05-26 15:54:52 +10:00
parent b3def185c6
commit 09ead69096
2 changed files with 17 additions and 5 deletions

View File

@ -1438,7 +1438,13 @@ class ECAppIter(object):
# with an un-reconstructible list of fragments - so we'll
# break out of the iter so WSGI can tear down the broken
# connection.
if not all(fragments):
frags_with_data = sum([1 for f in fragments if f])
if frags_with_data < len(fragments):
if frags_with_data > 0:
self.logger.warning(
'Un-recoverable fragment rebuild. Only received '
'%d/%d fragments for %r', frags_with_data,
len(fragments), quote(self.path))
break
try:
segment = self.policy.pyeclib_driver.decode(fragments)

View File

@ -4920,9 +4920,9 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
# re-raise the ChunkReadTimeout and still want a txn-id
with set_http_connect(*status_codes, body_iter=body_iter,
headers=headers), \
mock.patch(
'swift.proxy.controllers.obj.ECFragGetter.fast_forward',
side_effect=ValueError()):
mock.patch(
'swift.proxy.controllers.obj.ECFragGetter.fast_forward',
side_effect=ValueError()):
resp = req.get_response(self.app)
self.assertEqual(resp.status_int, 200)
self.assertNotEqual(md5(resp.body).hexdigest(), etag)
@ -4930,7 +4930,13 @@ class TestECObjController(ECObjectControllerMixin, unittest.TestCase):
self.assertEqual(2, len(error_lines))
self.assertIn('Unable to fast forward', error_lines[0])
self.assertIn('Timeout fetching', error_lines[1])
for line in self.logger.logger.records['ERROR']:
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines))
self.assertIn(
'Un-recoverable fragment rebuild. Only received 9/10 fragments',
warning_lines[0])
for line in self.logger.logger.records['ERROR'] + \
self.logger.logger.records['WARNING']:
self.assertIn(req.headers['x-trans-id'], line)
def test_GET_read_timeout_resume_mixed_etag(self):