Fix "Handoff requested (N)" log spam

A long, long time ago, on a GET request, the proxy would go look on 3* nodes for the requested thing. If one of the primary nodes was error-limited, it'd look on two primaries and a handoff. Since this indicated some failure somewhere, the proxy would emit a warning: "Handoff requested (1)". If two primaries were down, there'd be a second message "Handoff requested (2)", and so on. Some StatsD messages were emitted too. A somewhat shorter time ago (commit d79a67eb), the proxy started looking into handoffs if it got 404s from the primaries. While this was a good idea, it resulted lots of "Handoff requested (N)" log spam; you'd see these messages on every single 404. Also, the StatsD handoff_count and handoff_all_count metrics shot way up and turned into noise. This commit restores the original intent (and usefulness) of the log messages and StatsD metrics: if the proxy only looks at the normal number of handoff nodes, nothing is logged. However, if a primary is down, then the message "Handoff requested (1)" will be logged, indicating that the proxy looked at one more handoff than it normally would, and this happened because a primary node was error-limited. Closes-Bug: 1297214 * or whatever the replica count was Change-Id: If1b77c18c880b096e8ab1df3008db40ce313835d
2014-07-25 18:50:19 -07:00 · 2014-07-25 18:50:19 -07:00 · 4bc600ff42
commit 4bc600ff42
parent 6c256c5057
3 changed files with 75 additions and 26 deletions
--- a/swift/proxy/server.py
+++ b/swift/proxy/server.py
@ -490,6 +490,7 @@ class Application(object):
        handoff_nodes = node_iter
        nodes_left = self.request_node_count(len(primary_nodes))

+        log_handoffs_threshold = nodes_left - len(primary_nodes)
        for node in primary_nodes:
            if not self.error_limited(node):
                yield node
@ -501,11 +502,11 @@ class Application(object):
        for node in handoff_nodes:
            if not self.error_limited(node):
                handoffs += 1
-                if self.log_handoffs:
+                if self.log_handoffs and handoffs > log_handoffs_threshold:
                    self.logger.increment('handoff_count')
                    self.logger.warning(
                        'Handoff requested (%d)' % handoffs)
-                    if handoffs == len(primary_nodes):
+                    if handoffs - log_handoffs_threshold == len(primary_nodes):
                        self.logger.increment('handoff_all_count')
                yield node
                if not self.error_limited(node):
--- a/test/unit/init.py
+++ b/test/unit/init.py
@ -152,10 +152,7 @@ class FakeRing(Ring):
    def clear_errors(self):
        for dev in self.devs:
            for key in ('errors', 'last_error'):
-                try:
-                    del dev[key]
-                except KeyError:
-                    pass
+                dev.pop(key, None)

    def set_replicas(self, replicas):
        self.replicas = replicas
--- a/test/unit/proxy/test_server.py
+++ b/test/unit/proxy/test_server.py
@ -2381,34 +2381,85 @@ class TestObjectController(unittest.TestCase):
                    collected_nodes.append(node)
                self.assertEquals(len(collected_nodes), 9)

+                # zero error-limited primary nodes -> no handoff warnings
                self.app.log_handoffs = True
                self.app.logger = FakeLogger()
-                object_ring.max_more_nodes = 2
+                self.app.request_node_count = lambda r: 7
+                object_ring.max_more_nodes = 20
                partition, nodes = object_ring.get_nodes('account',
                                                         'container',
                                                         'object')
                collected_nodes = []
-                for node in self.app.iter_nodes(object_ring,
-                                                partition):
+                for node in self.app.iter_nodes(object_ring, partition):
                    collected_nodes.append(node)
-                self.assertEquals(len(collected_nodes), 5)
-                self.assertEquals(
-                    self.app.logger.log_dict['warning'],
-                    [(('Handoff requested (1)',), {}),
-                     (('Handoff requested (2)',), {})])
-
-                self.app.log_handoffs = False
-                self.app.logger = FakeLogger()
-                object_ring.max_more_nodes = 2
-                partition, nodes = object_ring.get_nodes('account',
-                                                         'container',
-                                                         'object')
-                collected_nodes = []
-                for node in self.app.iter_nodes(object_ring,
-                                                partition):
-                    collected_nodes.append(node)
-                self.assertEquals(len(collected_nodes), 5)
+                self.assertEquals(len(collected_nodes), 7)
                self.assertEquals(self.app.logger.log_dict['warning'], [])
+                self.assertEquals(self.app.logger.get_increments(), [])
+
+                # one error-limited primary node -> one handoff warning
+                self.app.log_handoffs = True
+                self.app.logger = FakeLogger()
+                self.app.request_node_count = lambda r: 7
+                object_ring.clear_errors()
+                object_ring._devs[0]['errors'] = 999
+                object_ring._devs[0]['last_error'] = 2 ** 63 - 1
+
+                collected_nodes = []
+                for node in self.app.iter_nodes(object_ring, partition):
+                    collected_nodes.append(node)
+                self.assertEquals(len(collected_nodes), 7)
+                self.assertEquals(self.app.logger.log_dict['warning'], [
+                    (('Handoff requested (5)',), {})])
+                self.assertEquals(self.app.logger.get_increments(),
+                                  ['handoff_count'])
+
+                # two error-limited primary nodes -> two handoff warnings
+                self.app.log_handoffs = True
+                self.app.logger = FakeLogger()
+                self.app.request_node_count = lambda r: 7
+                object_ring.clear_errors()
+                for i in range(2):
+                    object_ring._devs[i]['errors'] = 999
+                    object_ring._devs[i]['last_error'] = 2 ** 63 - 1
+
+                collected_nodes = []
+                for node in self.app.iter_nodes(object_ring, partition):
+                    collected_nodes.append(node)
+                self.assertEquals(len(collected_nodes), 7)
+                self.assertEquals(self.app.logger.log_dict['warning'], [
+                    (('Handoff requested (5)',), {}),
+                    (('Handoff requested (6)',), {})])
+                self.assertEquals(self.app.logger.get_increments(),
+                                  ['handoff_count',
+                                   'handoff_count'])
+
+                # all error-limited primary nodes -> four handoff warnings,
+                # plus a handoff-all metric
+                self.app.log_handoffs = True
+                self.app.logger = FakeLogger()
+                self.app.request_node_count = lambda r: 10
+                object_ring.set_replicas(4)  # otherwise we run out of handoffs
+                object_ring.clear_errors()
+                for i in range(4):
+                    object_ring._devs[i]['errors'] = 999
+                    object_ring._devs[i]['last_error'] = 2 ** 63 - 1
+
+                collected_nodes = []
+                for node in self.app.iter_nodes(object_ring, partition):
+                    collected_nodes.append(node)
+                self.assertEquals(len(collected_nodes), 10)
+                self.assertEquals(self.app.logger.log_dict['warning'], [
+                    (('Handoff requested (7)',), {}),
+                    (('Handoff requested (8)',), {}),
+                    (('Handoff requested (9)',), {}),
+                    (('Handoff requested (10)',), {})])
+                self.assertEquals(self.app.logger.get_increments(),
+                                  ['handoff_count',
+                                   'handoff_count',
+                                   'handoff_count',
+                                   'handoff_count',
+                                   'handoff_all_count'])
+
            finally:
                object_ring.max_more_nodes = 0