Add metric to count how often requests are automatically retried
Signed-off-by: Edwin Kempin <ekempin@google.com>
Change-Id: Idd0e11babea2de2813ffbd741f58b3d0a43d7d4a
This commit is contained in:
@@ -19,6 +19,7 @@ The following metrics are reported.
|
|||||||
by RetryHelper to execute an action (0 == single attempt, no retry)
|
by RetryHelper to execute an action (0 == single attempt, no retry)
|
||||||
* `action/retry_timeout_count`: Number of action executions of RetryHelper
|
* `action/retry_timeout_count`: Number of action executions of RetryHelper
|
||||||
that ultimately timed out
|
that ultimately timed out
|
||||||
|
* `action/auto_retry_count`: Number of times a failed action was automatically retried with tracing enabled
|
||||||
|
|
||||||
=== Pushes
|
=== Pushes
|
||||||
|
|
||||||
|
|||||||
@@ -89,6 +89,9 @@ public abstract class Metadata {
|
|||||||
// One or more resources
|
// One or more resources
|
||||||
public abstract Optional<Boolean> multiple();
|
public abstract Optional<Boolean> multiple();
|
||||||
|
|
||||||
|
// The name of an operation that is performed.
|
||||||
|
public abstract Optional<String> operationName();
|
||||||
|
|
||||||
// Partial or full computation
|
// Partial or full computation
|
||||||
public abstract Optional<Boolean> partial();
|
public abstract Optional<Boolean> partial();
|
||||||
|
|
||||||
@@ -185,6 +188,8 @@ public abstract class Metadata {
|
|||||||
|
|
||||||
public abstract Builder multiple(boolean multiple);
|
public abstract Builder multiple(boolean multiple);
|
||||||
|
|
||||||
|
public abstract Builder operationName(String operationName);
|
||||||
|
|
||||||
public abstract Builder partial(boolean partial);
|
public abstract Builder partial(boolean partial);
|
||||||
|
|
||||||
public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath);
|
public abstract Builder noteDbFilePath(@Nullable String noteDbFilePath);
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ import com.google.gerrit.common.Nullable;
|
|||||||
import com.google.gerrit.extensions.restapi.RestApiException;
|
import com.google.gerrit.extensions.restapi.RestApiException;
|
||||||
import com.google.gerrit.git.LockFailureException;
|
import com.google.gerrit.git.LockFailureException;
|
||||||
import com.google.gerrit.metrics.Counter1;
|
import com.google.gerrit.metrics.Counter1;
|
||||||
|
import com.google.gerrit.metrics.Counter2;
|
||||||
import com.google.gerrit.metrics.Description;
|
import com.google.gerrit.metrics.Description;
|
||||||
import com.google.gerrit.metrics.Field;
|
import com.google.gerrit.metrics.Field;
|
||||||
import com.google.gerrit.metrics.MetricMaker;
|
import com.google.gerrit.metrics.MetricMaker;
|
||||||
@@ -120,6 +121,7 @@ public class RetryHelper {
|
|||||||
public static class Metrics {
|
public static class Metrics {
|
||||||
final Counter1<ActionType> attemptCounts;
|
final Counter1<ActionType> attemptCounts;
|
||||||
final Counter1<ActionType> timeoutCount;
|
final Counter1<ActionType> timeoutCount;
|
||||||
|
final Counter2<ActionType, String> autoRetryCount;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
Metrics(MetricMaker metricMaker) {
|
Metrics(MetricMaker metricMaker) {
|
||||||
@@ -142,6 +144,16 @@ public class RetryHelper {
|
|||||||
.setCumulative()
|
.setCumulative()
|
||||||
.setUnit("timeouts"),
|
.setUnit("timeouts"),
|
||||||
actionTypeField);
|
actionTypeField);
|
||||||
|
autoRetryCount =
|
||||||
|
metricMaker.newCounter(
|
||||||
|
"action/auto_retry_count",
|
||||||
|
new Description("Number of automatic retries with tracing")
|
||||||
|
.setCumulative()
|
||||||
|
.setUnit("retries"),
|
||||||
|
actionTypeField,
|
||||||
|
Field.ofString("operation_name", Metadata.Builder::operationName)
|
||||||
|
.description("The name of the operation that was retried.")
|
||||||
|
.build());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -286,22 +298,21 @@ public class RetryHelper {
|
|||||||
if (retryWithTraceOnFailure
|
if (retryWithTraceOnFailure
|
||||||
&& opts.retryWithTrace().isPresent()
|
&& opts.retryWithTrace().isPresent()
|
||||||
&& opts.retryWithTrace().get().test(t)) {
|
&& opts.retryWithTrace().get().test(t)) {
|
||||||
|
String caller = opts.caller().map(Class::getSimpleName).orElse("N/A");
|
||||||
if (!traceContext.isTracing()) {
|
if (!traceContext.isTracing()) {
|
||||||
traceContext
|
traceContext
|
||||||
.addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
|
.addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
|
||||||
.forceLogging();
|
.forceLogging();
|
||||||
logger.atFine().withCause(t).log(
|
logger.atFine().withCause(t).log(
|
||||||
"%s failed, retry with tracing enabled",
|
"%s failed, retry with tracing enabled", caller);
|
||||||
opts.caller().map(Class::getSimpleName).orElse("N/A"));
|
metrics.autoRetryCount.increment(actionType, caller);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// A non-recoverable failure occurred. We retried the operation with tracing
|
// A non-recoverable failure occurred. We retried the operation with tracing
|
||||||
// enabled and it failed again. Log the failure so that admin can see if it
|
// enabled and it failed again. Log the failure so that admin can see if it
|
||||||
// differs from the failure that triggered the retry.
|
// differs from the failure that triggered the retry.
|
||||||
logger.atFine().withCause(t).log(
|
logger.atFine().withCause(t).log("auto-retry of %s has failed", caller);
|
||||||
"auto-retry of %s has failed",
|
|
||||||
opts.caller().map(Class::getSimpleName).orElse("N/A"));
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user