RetryHelper: Add metric to count number of failures on auto retry
Since we retry automatically only on non-recoverable failures, it's expected that the number of failures on auto retry is the same as the number of auto retries. The number of auto retries is captured by the action/auto_retry_count metric. By comparing the value of that metric with the value of the new action/failures_on_auto_retry_count metric, we can verify whether this assumption is correct. A mismatch between the two values means that some exceptions are treated as non-recoverable although they are actually recoverable. In that case we should change the code to treat them as recoverable. Signed-off-by: Edwin Kempin <ekempin@google.com> Change-Id: Id52026bf2d1a27e7c0668bcdd63ca1effdf8db09
This commit is contained in:
@@ -20,6 +20,7 @@ by RetryHelper to execute an action (0 == single attempt, no retry)
|
||||
* `action/retry_timeout_count`: Number of action executions of RetryHelper
|
||||
that ultimately timed out
|
||||
* `action/auto_retry_count`: Number of automatic retries with tracing
|
||||
* `action/failures_on_auto_retry_count`: Number of failures on auto retry
|
||||
|
||||
=== Pushes
|
||||
|
||||
|
||||
@@ -122,6 +122,7 @@ public class RetryHelper {
|
||||
final Counter1<ActionType> attemptCounts;
|
||||
final Counter1<ActionType> timeoutCount;
|
||||
final Counter2<ActionType, String> autoRetryCount;
|
||||
final Counter2<ActionType, String> failuresOnAutoRetryCount;
|
||||
|
||||
@Inject
|
||||
Metrics(MetricMaker metricMaker) {
|
||||
@@ -154,6 +155,16 @@ public class RetryHelper {
|
||||
Field.ofString("operation_name", Metadata.Builder::operationName)
|
||||
.description("The name of the operation that was retried.")
|
||||
.build());
|
||||
failuresOnAutoRetryCount =
|
||||
metricMaker.newCounter(
|
||||
"action/failures_on_auto_retry_count",
|
||||
new Description("Number of failures on auto retry")
|
||||
.setCumulative()
|
||||
.setUnit("failures"),
|
||||
actionTypeField,
|
||||
Field.ofString("operation_name", Metadata.Builder::operationName)
|
||||
.description("The name of the operation that was retried.")
|
||||
.build());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,6 +324,7 @@ public class RetryHelper {
|
||||
// enabled and it failed again. Log the failure so that admin can see if it
|
||||
// differs from the failure that triggered the retry.
|
||||
logger.atFine().withCause(t).log("auto-retry of %s has failed", caller);
|
||||
metrics.failuresOnAutoRetryCount.increment(actionType, caller);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user