Fix "Handoff requested (N)" log spam
A long, long time ago, on a GET request, the proxy would go look on 3*
nodes for the requested thing. If one of the primary nodes was
error-limited, it'd look on two primaries and a handoff. Since this
indicated some failure somewhere, the proxy would emit a warning:
"Handoff requested (1)". If two primaries were down, there'd be a
second message "Handoff requested (2)", and so on.
Some StatsD messages were emitted too.
A somewhat shorter time ago (commit d79a67eb
), the proxy started
looking into handoffs if it got 404s from the primaries. While this
was a good idea, it resulted lots of "Handoff requested (N)" log spam;
you'd see these messages on every single 404. Also, the StatsD
handoff_count and handoff_all_count metrics shot way up and turned
into noise.
This commit restores the original intent (and usefulness) of the log
messages and StatsD metrics: if the proxy only looks at the normal
number of handoff nodes, nothing is logged. However, if a primary is
down, then the message "Handoff requested (1)" will be logged,
indicating that the proxy looked at one more handoff than it normally
would, and this happened because a primary node was error-limited.
Closes-Bug: 1297214
* or whatever the replica count was
Change-Id: If1b77c18c880b096e8ab1df3008db40ce313835d
This commit is contained in:
parent
6c256c5057
commit
4bc600ff42
@ -490,6 +490,7 @@ class Application(object):
|
||||
handoff_nodes = node_iter
|
||||
nodes_left = self.request_node_count(len(primary_nodes))
|
||||
|
||||
log_handoffs_threshold = nodes_left - len(primary_nodes)
|
||||
for node in primary_nodes:
|
||||
if not self.error_limited(node):
|
||||
yield node
|
||||
@ -501,11 +502,11 @@ class Application(object):
|
||||
for node in handoff_nodes:
|
||||
if not self.error_limited(node):
|
||||
handoffs += 1
|
||||
if self.log_handoffs:
|
||||
if self.log_handoffs and handoffs > log_handoffs_threshold:
|
||||
self.logger.increment('handoff_count')
|
||||
self.logger.warning(
|
||||
'Handoff requested (%d)' % handoffs)
|
||||
if handoffs == len(primary_nodes):
|
||||
if handoffs - log_handoffs_threshold == len(primary_nodes):
|
||||
self.logger.increment('handoff_all_count')
|
||||
yield node
|
||||
if not self.error_limited(node):
|
||||
|
@ -152,10 +152,7 @@ class FakeRing(Ring):
|
||||
def clear_errors(self):
|
||||
for dev in self.devs:
|
||||
for key in ('errors', 'last_error'):
|
||||
try:
|
||||
del dev[key]
|
||||
except KeyError:
|
||||
pass
|
||||
dev.pop(key, None)
|
||||
|
||||
def set_replicas(self, replicas):
|
||||
self.replicas = replicas
|
||||
|
@ -2381,34 +2381,85 @@ class TestObjectController(unittest.TestCase):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 9)
|
||||
|
||||
# zero error-limited primary nodes -> no handoff warnings
|
||||
self.app.log_handoffs = True
|
||||
self.app.logger = FakeLogger()
|
||||
object_ring.max_more_nodes = 2
|
||||
self.app.request_node_count = lambda r: 7
|
||||
object_ring.max_more_nodes = 20
|
||||
partition, nodes = object_ring.get_nodes('account',
|
||||
'container',
|
||||
'object')
|
||||
collected_nodes = []
|
||||
for node in self.app.iter_nodes(object_ring,
|
||||
partition):
|
||||
for node in self.app.iter_nodes(object_ring, partition):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 5)
|
||||
self.assertEquals(
|
||||
self.app.logger.log_dict['warning'],
|
||||
[(('Handoff requested (1)',), {}),
|
||||
(('Handoff requested (2)',), {})])
|
||||
|
||||
self.app.log_handoffs = False
|
||||
self.app.logger = FakeLogger()
|
||||
object_ring.max_more_nodes = 2
|
||||
partition, nodes = object_ring.get_nodes('account',
|
||||
'container',
|
||||
'object')
|
||||
collected_nodes = []
|
||||
for node in self.app.iter_nodes(object_ring,
|
||||
partition):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 5)
|
||||
self.assertEquals(len(collected_nodes), 7)
|
||||
self.assertEquals(self.app.logger.log_dict['warning'], [])
|
||||
self.assertEquals(self.app.logger.get_increments(), [])
|
||||
|
||||
# one error-limited primary node -> one handoff warning
|
||||
self.app.log_handoffs = True
|
||||
self.app.logger = FakeLogger()
|
||||
self.app.request_node_count = lambda r: 7
|
||||
object_ring.clear_errors()
|
||||
object_ring._devs[0]['errors'] = 999
|
||||
object_ring._devs[0]['last_error'] = 2 ** 63 - 1
|
||||
|
||||
collected_nodes = []
|
||||
for node in self.app.iter_nodes(object_ring, partition):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 7)
|
||||
self.assertEquals(self.app.logger.log_dict['warning'], [
|
||||
(('Handoff requested (5)',), {})])
|
||||
self.assertEquals(self.app.logger.get_increments(),
|
||||
['handoff_count'])
|
||||
|
||||
# two error-limited primary nodes -> two handoff warnings
|
||||
self.app.log_handoffs = True
|
||||
self.app.logger = FakeLogger()
|
||||
self.app.request_node_count = lambda r: 7
|
||||
object_ring.clear_errors()
|
||||
for i in range(2):
|
||||
object_ring._devs[i]['errors'] = 999
|
||||
object_ring._devs[i]['last_error'] = 2 ** 63 - 1
|
||||
|
||||
collected_nodes = []
|
||||
for node in self.app.iter_nodes(object_ring, partition):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 7)
|
||||
self.assertEquals(self.app.logger.log_dict['warning'], [
|
||||
(('Handoff requested (5)',), {}),
|
||||
(('Handoff requested (6)',), {})])
|
||||
self.assertEquals(self.app.logger.get_increments(),
|
||||
['handoff_count',
|
||||
'handoff_count'])
|
||||
|
||||
# all error-limited primary nodes -> four handoff warnings,
|
||||
# plus a handoff-all metric
|
||||
self.app.log_handoffs = True
|
||||
self.app.logger = FakeLogger()
|
||||
self.app.request_node_count = lambda r: 10
|
||||
object_ring.set_replicas(4) # otherwise we run out of handoffs
|
||||
object_ring.clear_errors()
|
||||
for i in range(4):
|
||||
object_ring._devs[i]['errors'] = 999
|
||||
object_ring._devs[i]['last_error'] = 2 ** 63 - 1
|
||||
|
||||
collected_nodes = []
|
||||
for node in self.app.iter_nodes(object_ring, partition):
|
||||
collected_nodes.append(node)
|
||||
self.assertEquals(len(collected_nodes), 10)
|
||||
self.assertEquals(self.app.logger.log_dict['warning'], [
|
||||
(('Handoff requested (7)',), {}),
|
||||
(('Handoff requested (8)',), {}),
|
||||
(('Handoff requested (9)',), {}),
|
||||
(('Handoff requested (10)',), {})])
|
||||
self.assertEquals(self.app.logger.get_increments(),
|
||||
['handoff_count',
|
||||
'handoff_count',
|
||||
'handoff_count',
|
||||
'handoff_count',
|
||||
'handoff_all_count'])
|
||||
|
||||
finally:
|
||||
object_ring.max_more_nodes = 0
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user