Add metric to count how often requests are automatically retried

Signed-off-by: Edwin Kempin <ekempin@google.com>
Change-Id: Idd0e11babea2de2813ffbd741f58b3d0a43d7d4a
This commit is contained in:
Edwin Kempin
2019-08-02 13:16:55 +02:00
parent 4313344073
commit ae84332a99
3 changed files with 22 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ The following metrics are reported.
by RetryHelper to execute an action (0 == single attempt, no retry) by RetryHelper to execute an action (0 == single attempt, no retry)
* `action/retry_timeout_count`: Number of action executions of RetryHelper * `action/retry_timeout_count`: Number of action executions of RetryHelper
that ultimately timed out that ultimately timed out
* `action/auto_retry_count`: Number of times RetryHelper automatically retried a failed action with tracing enabled
=== Pushes === Pushes

View File

@@ -89,6 +89,9 @@ public abstract class Metadata {
// One or more resources // One or more resources
public abstract Optional<Boolean> multiple(); public abstract Optional<Boolean> multiple();
// The name of an operation that is performed.
public abstract Optional<String> operationName();
// Partial or full computation // Partial or full computation
public abstract Optional<Boolean> partial(); public abstract Optional<Boolean> partial();
@@ -185,6 +188,8 @@ public abstract class Metadata {
public abstract Builder multiple(boolean multiple); public abstract Builder multiple(boolean multiple);
public abstract Builder operationName(String operationName);
public abstract Builder partial(boolean partial); public abstract Builder partial(boolean partial);
public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath); public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath);

View File

@@ -35,6 +35,7 @@ import com.google.gerrit.common.Nullable;
import com.google.gerrit.extensions.restapi.RestApiException; import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.git.LockFailureException; import com.google.gerrit.git.LockFailureException;
import com.google.gerrit.metrics.Counter1; import com.google.gerrit.metrics.Counter1;
import com.google.gerrit.metrics.Counter2;
import com.google.gerrit.metrics.Description; import com.google.gerrit.metrics.Description;
import com.google.gerrit.metrics.Field; import com.google.gerrit.metrics.Field;
import com.google.gerrit.metrics.MetricMaker; import com.google.gerrit.metrics.MetricMaker;
@@ -120,6 +121,7 @@ public class RetryHelper {
public static class Metrics { public static class Metrics {
final Counter1<ActionType> attemptCounts; final Counter1<ActionType> attemptCounts;
final Counter1<ActionType> timeoutCount; final Counter1<ActionType> timeoutCount;
final Counter2<ActionType, String> autoRetryCount;
@Inject @Inject
Metrics(MetricMaker metricMaker) { Metrics(MetricMaker metricMaker) {
@@ -142,6 +144,16 @@ public class RetryHelper {
.setCumulative() .setCumulative()
.setUnit("timeouts"), .setUnit("timeouts"),
actionTypeField); actionTypeField);
autoRetryCount =
metricMaker.newCounter(
"action/auto_retry_count",
new Description("Number of automatic retries with tracing")
.setCumulative()
.setUnit("retries"),
actionTypeField,
Field.ofString("operation_name", Metadata.Builder::operationName)
.description("The name of the operation that was retried.")
.build());
} }
} }
@@ -286,22 +298,21 @@ public class RetryHelper {
if (retryWithTraceOnFailure if (retryWithTraceOnFailure
&& opts.retryWithTrace().isPresent() && opts.retryWithTrace().isPresent()
&& opts.retryWithTrace().get().test(t)) { && opts.retryWithTrace().get().test(t)) {
String caller = opts.caller().map(Class::getSimpleName).orElse("N/A");
if (!traceContext.isTracing()) { if (!traceContext.isTracing()) {
traceContext traceContext
.addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId()) .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
.forceLogging(); .forceLogging();
logger.atFine().withCause(t).log( logger.atFine().withCause(t).log(
"%s failed, retry with tracing enabled", "%s failed, retry with tracing enabled", caller);
opts.caller().map(Class::getSimpleName).orElse("N/A")); metrics.autoRetryCount.increment(actionType, caller);
return true; return true;
} }
// A non-recoverable failure occurred. We retried the operation with tracing // A non-recoverable failure occurred. We retried the operation with tracing
// enabled and it failed again. Log the failure so that admin can see if it // enabled and it failed again. Log the failure so that admin can see if it
// differs from the failure that triggered the retry. // differs from the failure that triggered the retry.
logger.atFine().withCause(t).log( logger.atFine().withCause(t).log("auto-retry of %s has failed", caller);
"auto-retry of %s has failed",
opts.caller().map(Class::getSimpleName).orElse("N/A"));
return false; return false;
} }