Proxy: add metrics related to error limiter.

There are three new metrics added:
 1. 'error_limiter.incremented_limit', when one node has accumulated
    enough errors to trigger error suppression.
 2. 'error_limiter.forced_limit', when one node runs into serious
    error like Insufficient Storage.
 3. 'error_limiter.is_limited', when one node is error limited
    during node selection process due to suppression_limit.

UpgradeImpact:
There is change of log format when 'ERROR Insufficient Storage' is
raised, from '%(msg)s %(node)s' to
'Node will be error limited from now: %s, error: %s'.

Change-Id: Id9bec9b46dad82163517fd71cfdda5b751464588
This commit is contained in:
Jianjian Huo 2022-11-02 17:20:59 -07:00 committed by Tim Burke
parent 6a010499d5
commit 38124221d7
4 changed files with 175 additions and 54 deletions

View File

@ -84,8 +84,10 @@ class ErrorLimiter(object):
the given ``node``. the given ``node``.
:param node: dictionary describing a node. :param node: dictionary describing a node.
:returns: True if suppression_limit is exceeded, False otherwise
""" """
node_key = self.node_key(node) node_key = self.node_key(node)
error_stats = self.stats[node_key] error_stats = self.stats[node_key]
error_stats['errors'] = error_stats.get('errors', 0) + 1 error_stats['errors'] = error_stats.get('errors', 0) + 1
error_stats['last_error'] = time() error_stats['last_error'] = time()
return error_stats['errors'] > self.suppression_limit

View File

@ -657,8 +657,9 @@ class Application(object):
""" """
limited = self.error_limiter.is_limited(node) limited = self.error_limiter.is_limited(node)
if limited: if limited:
self.logger.increment('error_limiter.is_limited')
self.logger.debug( self.logger.debug(
'Node error limited: %s', node_to_string(node)) 'Node is error limited: %s', node_to_string(node))
return limited return limited
def error_limit(self, node, msg): def error_limit(self, node, msg):
@ -672,8 +673,24 @@ class Application(object):
:param msg: error message :param msg: error message
""" """
self.error_limiter.limit(node) self.error_limiter.limit(node)
self.logger.error('%(msg)s %(node)s', self.logger.increment('error_limiter.forced_limit')
{'msg': msg, 'node': node_to_string(node)}) self.logger.error(
'Node will be error limited from now: %s, error: %s',
node_to_string(node),
msg)
def _error_increment(self, node):
"""
Call increment() on error limiter once, emit metrics and log if error
suppression will be triggered.
:param node: dictionary of node to handle errors for
"""
if self.error_limiter.increment(node):
self.logger.increment('error_limiter.incremented_limit')
self.logger.error(
'Node will be error limited from now: %s',
node_to_string(node))
def error_occurred(self, node, msg): def error_occurred(self, node, msg):
""" """
@ -682,11 +699,11 @@ class Application(object):
:param node: dictionary of node to handle errors for :param node: dictionary of node to handle errors for
:param msg: error message :param msg: error message
""" """
self.error_limiter.increment(node)
if isinstance(msg, bytes): if isinstance(msg, bytes):
msg = msg.decode('utf-8') msg = msg.decode('utf-8')
self.logger.error('%(msg)s %(node)s', self.logger.error('%(msg)s %(node)s',
{'msg': msg, 'node': node_to_string(node)}) {'msg': msg, 'node': node_to_string(node)})
self._error_increment(node)
def check_response(self, node, server_type, response, method, path, def check_response(self, node, server_type, response, method, path,
body=None): body=None):
@ -739,7 +756,6 @@ class Application(object):
:param typ: server type :param typ: server type
:param additional_info: additional information to log :param additional_info: additional information to log
""" """
self.error_limiter.increment(node)
if 'level' in kwargs: if 'level' in kwargs:
log = functools.partial(self.logger.log, kwargs.pop('level')) log = functools.partial(self.logger.log, kwargs.pop('level'))
if 'exc_info' not in kwargs: if 'exc_info' not in kwargs:
@ -753,6 +769,7 @@ class Application(object):
{'type': typ, 'node': node_to_string(node), {'type': typ, 'node': node_to_string(node),
'info': additional_info}, 'info': additional_info},
**kwargs) **kwargs)
self._error_increment(node)
def modify_wsgi_pipeline(self, pipe): def modify_wsgi_pipeline(self, pipe):
""" """

View File

@ -74,14 +74,14 @@ class TestErrorLimiter(unittest.TestCase):
last_errors = 0 last_errors = 0
node_key = limiter.node_key(node) node_key = limiter.node_key(node)
for _ in range(limiter.suppression_limit): for _ in range(limiter.suppression_limit):
limiter.increment(node) self.assertFalse(limiter.increment(node))
node_errors = limiter.stats.get(node_key) node_errors = limiter.stats.get(node_key)
self.assertGreater(node_errors['errors'], last_errors) self.assertGreater(node_errors['errors'], last_errors)
self.assertFalse(limiter.is_limited(node)) self.assertFalse(limiter.is_limited(node))
last_errors = node_errors['errors'] last_errors = node_errors['errors']
# One more to make sure it is > suppression_limit # One more to make sure it is > suppression_limit
limiter.increment(node) self.assertTrue(limiter.increment(node))
node_errors = limiter.stats.get(node_key) node_errors = limiter.stats.get(node_key)
self.assertEqual(limiter.suppression_limit + 1, self.assertEqual(limiter.suppression_limit + 1,
node_errors['errors']) node_errors['errors'])

View File

@ -70,7 +70,8 @@ from swift.common.exceptions import ChunkReadTimeout, DiskFileNotExist, \
from swift.common import utils, constraints, registry from swift.common import utils, constraints, registry
from swift.common.utils import hash_path, storage_directory, \ from swift.common.utils import hash_path, storage_directory, \
parse_content_type, parse_mime_headers, StatsdClient, \ parse_content_type, parse_mime_headers, StatsdClient, \
iter_multipart_mime_documents, public, mkdirs, NullLogger, md5 iter_multipart_mime_documents, public, mkdirs, NullLogger, md5, \
node_to_string
from swift.common.wsgi import loadapp, ConfigString, SwiftHttpProtocol from swift.common.wsgi import loadapp, ConfigString, SwiftHttpProtocol
from swift.proxy.controllers import base as proxy_base from swift.proxy.controllers import base as proxy_base
from swift.proxy.controllers.base import get_cache_key, cors_validation, \ from swift.proxy.controllers.base import get_cache_key, cors_validation, \
@ -1167,26 +1168,31 @@ class TestProxyServer(unittest.TestCase):
def test_exception_occurred(self): def test_exception_occurred(self):
def do_test(additional_info): def do_test(additional_info):
logger = debug_logger('test') logger = debug_logger('test')
app = proxy_server.Application({}, suppression_limit = 10
app = proxy_server.Application(
{'error_suppression_limit': suppression_limit},
account_ring=FakeRing(), account_ring=FakeRing(),
container_ring=FakeRing(), container_ring=FakeRing(),
logger=logger) logger=logger)
node = app.container_ring.get_part_nodes(0)[0] node = app.container_ring.get_part_nodes(0)[0]
node_key = app.error_limiter.node_key(node) node_key = app.error_limiter.node_key(node)
self.assertNotIn(node_key, app.error_limiter.stats) # sanity self.assertNotIn(node_key, app.error_limiter.stats) # sanity
if six.PY2:
expected_info = additional_info.decode('utf8')
else:
expected_info = additional_info
for i in range(suppression_limit):
try: try:
raise Exception('kaboom1!') raise Exception('kaboom1!')
except Exception as err: except Exception as err:
caught_exc = err caught_exc = err
app.exception_occurred(node, 'server-type', additional_info) app.exception_occurred(
node, 'server-type', additional_info)
self.assertEqual(1, node_error_count(app, node)) self.assertEqual(i + 1, node_error_count(app, node))
line = logger.get_lines_for_level('error')[-1] line = logger.get_lines_for_level('error')[-1]
self.assertIn('server-type server', line) self.assertIn('server-type server', line)
if six.PY2: self.assertIn(expected_info, line)
self.assertIn(additional_info.decode('utf8'), line)
else:
self.assertIn(additional_info, line)
self.assertIn(node['ip'], line) self.assertIn(node['ip'], line)
self.assertIn(str(node['port']), line) self.assertIn(str(node['port']), line)
self.assertIn(node['device'], line) self.assertIn(node['device'], line)
@ -1194,6 +1200,33 @@ class TestProxyServer(unittest.TestCase):
self.assertTrue(log_kwargs['exc_info']) self.assertTrue(log_kwargs['exc_info'])
self.assertIs(caught_exc, log_kwargs['exc_info'][1]) self.assertIs(caught_exc, log_kwargs['exc_info'][1])
self.assertEqual(0, logger.get_increment_counts().get(
'error_limiter.incremented_limit', 0))
# One more error to go beyond suppression_limit.
try:
raise Exception('kaboom1!')
except Exception as err:
caught_exc = err
app.exception_occurred(node, 'server-type', additional_info)
self.assertEqual(suppression_limit + 1,
node_error_count(app, node))
lines = logger.get_lines_for_level('error')
self.assertIn('server-type server', lines[-2])
self.assertIn(expected_info, lines[-2])
self.assertIn(node['ip'], lines[-2])
self.assertIn(str(node['port']), lines[-2])
self.assertIn(node['device'], lines[-2])
log_args, log_kwargs = logger.log_dict['error'][-2]
self.assertTrue(log_kwargs['exc_info'])
self.assertIs(caught_exc, log_kwargs['exc_info'][1])
self.assertEqual(
('Node will be error limited from now: %s' %
node_to_string(node)),
lines[-1])
self.assertEqual(1, logger.get_increment_counts().get(
'error_limiter.incremented_limit', 0))
do_test('success') do_test('success')
do_test('succès') do_test('succès')
do_test(u'success') do_test(u'success')
@ -1201,7 +1234,9 @@ class TestProxyServer(unittest.TestCase):
def test_error_occurred(self): def test_error_occurred(self):
def do_test(msg): def do_test(msg):
logger = debug_logger('test') logger = debug_logger('test')
app = proxy_server.Application({}, suppression_limit = 10
app = proxy_server.Application(
{'error_suppression_limit': suppression_limit},
account_ring=FakeRing(), account_ring=FakeRing(),
container_ring=FakeRing(), container_ring=FakeRing(),
logger=logger) logger=logger)
@ -1209,18 +1244,38 @@ class TestProxyServer(unittest.TestCase):
node_key = app.error_limiter.node_key(node) node_key = app.error_limiter.node_key(node)
self.assertNotIn(node_key, app.error_limiter.stats) # sanity self.assertNotIn(node_key, app.error_limiter.stats) # sanity
app.error_occurred(node, msg)
self.assertEqual(1, node_error_count(app, node))
line = logger.get_lines_for_level('error')[-1]
if six.PY2: if six.PY2:
self.assertIn(msg.decode('utf8'), line) expected_msg = msg.decode('utf8')
else: else:
self.assertIn(msg, line) expected_msg = msg
for i in range(suppression_limit):
app.error_occurred(node, msg)
self.assertEqual(i + 1, node_error_count(app, node))
line = logger.get_lines_for_level('error')[-1]
self.assertIn(expected_msg, line)
self.assertIn(node['ip'], line) self.assertIn(node['ip'], line)
self.assertIn(str(node['port']), line) self.assertIn(str(node['port']), line)
self.assertIn(node['device'], line) self.assertIn(node['device'], line)
self.assertEqual(0, logger.get_increment_counts().get(
'error_limiter.incremented_limit', 0))
# One more error to go beyond suppression_limit.
app.error_occurred(node, msg)
self.assertEqual(suppression_limit + 1,
node_error_count(app, node))
lines = logger.get_lines_for_level('error')
self.assertIn(expected_msg, lines[-2])
self.assertIn(node['ip'], lines[-2])
self.assertIn(str(node['port']), lines[-2])
self.assertIn(node['device'], lines[-2])
self.assertEqual(
('Node will be error limited from now: %s' %
node_to_string(node)),
lines[-1])
self.assertEqual(1, logger.get_increment_counts().get(
'error_limiter.incremented_limit', 0))
do_test('success') do_test('success')
do_test('succès') do_test('succès')
do_test(u'success') do_test(u'success')
@ -1312,7 +1367,9 @@ class TestProxyServer(unittest.TestCase):
self.assertFalse(ret) self.assertFalse(ret)
error_lines = app.logger.get_lines_for_level('error') error_lines = app.logger.get_lines_for_level('error')
self.assertEqual(1, len(error_lines)) self.assertEqual(1, len(error_lines))
self.assertEqual('ERROR Insufficient Storage 10.0.0.0:1000/sda', self.assertEqual(
'Node will be error limited from now: 10.0.0.0:1000/sda, '
'error: ERROR Insufficient Storage',
error_lines[0]) error_lines[0])
self.assertEqual(11, node_error_count(app, node)) self.assertEqual(11, node_error_count(app, node))
self.assertTrue(app.error_limited(node)) self.assertTrue(app.error_limited(node))
@ -1323,7 +1380,9 @@ class TestProxyServer(unittest.TestCase):
self.assertFalse(ret) self.assertFalse(ret)
error_lines = app.logger.get_lines_for_level('error') error_lines = app.logger.get_lines_for_level('error')
self.assertEqual(1, len(error_lines)) self.assertEqual(1, len(error_lines))
self.assertEqual('ERROR Insufficient Storage 10.0.0.0:1000/sda', self.assertEqual(
'Node will be error limited from now: 10.0.0.0:1000/sda, '
'error: ERROR Insufficient Storage',
error_lines[0]) error_lines[0])
self.assertEqual(11, node_error_count(app, node)) self.assertEqual(11, node_error_count(app, node))
self.assertTrue(app.error_limited(node)) self.assertTrue(app.error_limited(node))
@ -3409,8 +3468,22 @@ class TestReplicatedObjectController(
controller = \ controller = \
ReplicatedObjectController( ReplicatedObjectController(
self.app, 'a', 'c', 'o.jpg') self.app, 'a', 'c', 'o.jpg')
self.app.error_limit( error_node = object_ring.get_part_nodes(1)[0]
object_ring.get_part_nodes(1)[0], 'test') self.app.error_limit(error_node, 'test')
self.assertEqual(
1, self.logger.get_increment_counts().get(
'error_limiter.forced_limit', 0))
line = self.logger.get_lines_for_level('error')[-1]
self.assertEqual(
('Node will be error limited from now: %s, error: %s' %
(node_to_string(error_node),
'test')),
line)
# no error limited checking yet.
self.assertEqual(
0, self.logger.get_increment_counts().get(
'error_limiter.is_limited', 0))
set_http_connect(200, 200, # account, container set_http_connect(200, 200, # account, container
201, 201, 201, # 3 working backends 201, 201, 201, # 3 working backends
give_connect=test_connect) give_connect=test_connect)
@ -3419,6 +3492,10 @@ class TestReplicatedObjectController(
req.body = 'a' req.body = 'a'
res = controller.PUT(req) res = controller.PUT(req)
self.assertTrue(res.status.startswith('201 ')) self.assertTrue(res.status.startswith('201 '))
# error limited happened during PUT.
self.assertEqual(
1, self.logger.get_increment_counts().get(
'error_limiter.is_limited', 0))
# this is kind of a hokey test, but in FakeRing, the port is even when # this is kind of a hokey test, but in FakeRing, the port is even when
# the region is 0, and odd when the region is 1, so this test asserts # the region is 0, and odd when the region is 1, so this test asserts
@ -5164,8 +5241,9 @@ class TestReplicatedObjectController(
self.assertEqual( self.assertEqual(
self.app.logger.get_lines_for_level('warning'), [ self.app.logger.get_lines_for_level('warning'), [
'Handoff requested (5)']) 'Handoff requested (5)'])
self.assertEqual(self.app.logger.get_increments(), self.assertEqual(
['handoff_count']) self.app.logger.get_increments(),
['error_limiter.is_limited', 'handoff_count'])
# two error-limited primary nodes -> two handoff warnings # two error-limited primary nodes -> two handoff warnings
self.app.log_handoffs = True self.app.log_handoffs = True
@ -5186,9 +5264,9 @@ class TestReplicatedObjectController(
'Handoff requested (5)', 'Handoff requested (5)',
'Handoff requested (6)', 'Handoff requested (6)',
]) ])
self.assertEqual(self.app.logger.get_increments(), stats = self.app.logger.get_increment_counts()
['handoff_count', self.assertEqual(2, stats.get('error_limiter.is_limited', 0))
'handoff_count']) self.assertEqual(2, stats.get('handoff_count', 0))
# all error-limited primary nodes -> four handoff warnings, # all error-limited primary nodes -> four handoff warnings,
# plus a handoff-all metric # plus a handoff-all metric
@ -5213,12 +5291,10 @@ class TestReplicatedObjectController(
'Handoff requested (9)', 'Handoff requested (9)',
'Handoff requested (10)', 'Handoff requested (10)',
]) ])
self.assertEqual(self.app.logger.get_increments(), stats = self.app.logger.get_increment_counts()
['handoff_count', self.assertEqual(4, stats.get('error_limiter.is_limited', 0))
'handoff_count', self.assertEqual(4, stats.get('handoff_count', 0))
'handoff_count', self.assertEqual(1, stats.get('handoff_all_count', 0))
'handoff_count',
'handoff_all_count'])
finally: finally:
object_ring.max_more_nodes = 0 object_ring.max_more_nodes = 0
@ -5250,10 +5326,36 @@ class TestReplicatedObjectController(
object_ring, 0, self.logger)) object_ring, 0, self.logger))
self.assertIn(first_nodes[0], second_nodes) self.assertIn(first_nodes[0], second_nodes)
self.assertEqual(
0, self.logger.get_increment_counts().get(
'error_limiter.is_limited', 0))
self.assertEqual(
0, self.logger.get_increment_counts().get(
'error_limiter.forced_limit', 0))
self.app.error_limit(first_nodes[0], 'test') self.app.error_limit(first_nodes[0], 'test')
self.assertEqual(
1, self.logger.get_increment_counts().get(
'error_limiter.forced_limit', 0))
line = self.logger.get_lines_for_level('error')[-1]
self.assertEqual(
('Node will be error limited from now: %s, error: %s' %
(node_to_string(first_nodes[0]),
'test')),
line)
second_nodes = list(self.app.iter_nodes( second_nodes = list(self.app.iter_nodes(
object_ring, 0, self.logger)) object_ring, 0, self.logger))
self.assertNotIn(first_nodes[0], second_nodes) self.assertNotIn(first_nodes[0], second_nodes)
self.assertEqual(
1, self.logger.get_increment_counts().get(
'error_limiter.is_limited', 0))
third_nodes = list(self.app.iter_nodes(
object_ring, 0, self.logger))
self.assertNotIn(first_nodes[0], third_nodes)
self.assertEqual(
2, self.logger.get_increment_counts().get(
'error_limiter.is_limited', 0))
def test_iter_nodes_gives_extra_if_error_limited_inline(self): def test_iter_nodes_gives_extra_if_error_limited_inline(self):
object_ring = self.app.get_object_ring(None) object_ring = self.app.get_object_ring(None)