Fix "Handoff requested (N)" log spam

A long, long time ago, on a GET request, the proxy would go look on 3*
nodes for the requested thing. If one of the primary nodes was
error-limited, it'd look on two primaries and a handoff. Since this
indicated some failure somewhere, the proxy would emit a warning:
"Handoff requested (1)". If two primaries were down, there'd be a
second message "Handoff requested (2)", and so on.

Some StatsD messages were emitted too.

A somewhat shorter time ago (commit d79a67eb), the proxy started
looking into handoffs if it got 404s from the primaries. While this
was a good idea, it resulted lots of "Handoff requested (N)" log spam;
you'd see these messages on every single 404. Also, the StatsD
handoff_count and handoff_all_count metrics shot way up and turned
into noise.

This commit restores the original intent (and usefulness) of the log
messages and StatsD metrics: if the proxy only looks at the normal
number of handoff nodes, nothing is logged. However, if a primary is
down, then the message "Handoff requested (1)" will be logged,
indicating that the proxy looked at one more handoff than it normally
would, and this happened because a primary node was error-limited.

Closes-Bug: 1297214

* or whatever the replica count was

Change-Id: If1b77c18c880b096e8ab1df3008db40ce313835d
This commit is contained in:
Samuel Merritt 2014-07-25 18:50:19 -07:00
parent 6c256c5057
commit 4bc600ff42
3 changed files with 75 additions and 26 deletions

View File

@ -490,6 +490,7 @@ class Application(object):
handoff_nodes = node_iter
nodes_left = self.request_node_count(len(primary_nodes))
log_handoffs_threshold = nodes_left - len(primary_nodes)
for node in primary_nodes:
if not self.error_limited(node):
yield node
@ -501,11 +502,11 @@ class Application(object):
for node in handoff_nodes:
if not self.error_limited(node):
handoffs += 1
if self.log_handoffs:
if self.log_handoffs and handoffs > log_handoffs_threshold:
self.logger.increment('handoff_count')
self.logger.warning(
'Handoff requested (%d)' % handoffs)
if handoffs == len(primary_nodes):
if handoffs - log_handoffs_threshold == len(primary_nodes):
self.logger.increment('handoff_all_count')
yield node
if not self.error_limited(node):

View File

@ -152,10 +152,7 @@ class FakeRing(Ring):
def clear_errors(self):
for dev in self.devs:
for key in ('errors', 'last_error'):
try:
del dev[key]
except KeyError:
pass
dev.pop(key, None)
def set_replicas(self, replicas):
self.replicas = replicas

View File

@ -2381,34 +2381,85 @@ class TestObjectController(unittest.TestCase):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 9)
# zero error-limited primary nodes -> no handoff warnings
self.app.log_handoffs = True
self.app.logger = FakeLogger()
object_ring.max_more_nodes = 2
self.app.request_node_count = lambda r: 7
object_ring.max_more_nodes = 20
partition, nodes = object_ring.get_nodes('account',
'container',
'object')
collected_nodes = []
for node in self.app.iter_nodes(object_ring,
partition):
for node in self.app.iter_nodes(object_ring, partition):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 5)
self.assertEquals(
self.app.logger.log_dict['warning'],
[(('Handoff requested (1)',), {}),
(('Handoff requested (2)',), {})])
self.app.log_handoffs = False
self.app.logger = FakeLogger()
object_ring.max_more_nodes = 2
partition, nodes = object_ring.get_nodes('account',
'container',
'object')
collected_nodes = []
for node in self.app.iter_nodes(object_ring,
partition):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 5)
self.assertEquals(len(collected_nodes), 7)
self.assertEquals(self.app.logger.log_dict['warning'], [])
self.assertEquals(self.app.logger.get_increments(), [])
# one error-limited primary node -> one handoff warning
self.app.log_handoffs = True
self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 7
object_ring.clear_errors()
object_ring._devs[0]['errors'] = 999
object_ring._devs[0]['last_error'] = 2 ** 63 - 1
collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 7)
self.assertEquals(self.app.logger.log_dict['warning'], [
(('Handoff requested (5)',), {})])
self.assertEquals(self.app.logger.get_increments(),
['handoff_count'])
# two error-limited primary nodes -> two handoff warnings
self.app.log_handoffs = True
self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 7
object_ring.clear_errors()
for i in range(2):
object_ring._devs[i]['errors'] = 999
object_ring._devs[i]['last_error'] = 2 ** 63 - 1
collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 7)
self.assertEquals(self.app.logger.log_dict['warning'], [
(('Handoff requested (5)',), {}),
(('Handoff requested (6)',), {})])
self.assertEquals(self.app.logger.get_increments(),
['handoff_count',
'handoff_count'])
# all error-limited primary nodes -> four handoff warnings,
# plus a handoff-all metric
self.app.log_handoffs = True
self.app.logger = FakeLogger()
self.app.request_node_count = lambda r: 10
object_ring.set_replicas(4) # otherwise we run out of handoffs
object_ring.clear_errors()
for i in range(4):
object_ring._devs[i]['errors'] = 999
object_ring._devs[i]['last_error'] = 2 ** 63 - 1
collected_nodes = []
for node in self.app.iter_nodes(object_ring, partition):
collected_nodes.append(node)
self.assertEquals(len(collected_nodes), 10)
self.assertEquals(self.app.logger.log_dict['warning'], [
(('Handoff requested (7)',), {}),
(('Handoff requested (8)',), {}),
(('Handoff requested (9)',), {}),
(('Handoff requested (10)',), {})])
self.assertEquals(self.app.logger.get_increments(),
['handoff_count',
'handoff_count',
'handoff_count',
'handoff_count',
'handoff_all_count'])
finally:
object_ring.max_more_nodes = 0