Include cause for retries into retry metrics
By looking at the metrics we want to know how often a certain error causes requests to be retried. E.g. it's good that we retry requests on LOCK_FAILURE, but if LOCK_FAILURES cause too many retries we want to investigate why they are occurring that often. At the moment the retry metrics only tell us how often requests were retried but not why. Signed-off-by: Edwin Kempin <ekempin@google.com> Change-Id: If58d87d05bdb64606cb9e37c215727e221c5e327
This commit is contained in:
committed by
David Pursehouse
parent
65c1650256
commit
d31e337bf1
@@ -59,6 +59,9 @@ public abstract class Metadata {
|
||||
// The type of change ID which the user used to identify a change (e.g. numeric ID, triplet etc.).
|
||||
public abstract Optional<String> changeIdType();
|
||||
|
||||
// The cause of an error.
|
||||
public abstract Optional<String> cause();
|
||||
|
||||
// The type of an event.
|
||||
public abstract Optional<String> eventType();
|
||||
|
||||
@@ -158,12 +161,12 @@ public abstract class Metadata {
|
||||
* Metadata{accountId=Optional.empty, actionType=Optional.empty, authDomainName=Optional.empty,
|
||||
* branchName=Optional.empty, cacheKey=Optional.empty, cacheName=Optional.empty,
|
||||
* className=Optional.empty, changeId=Optional[9212550], changeIdType=Optional.empty,
|
||||
* eventType=Optional.empty, exportValue=Optional.empty, filePath=Optional.empty,
|
||||
* garbageCollectorName=Optional.empty, gitOperation=Optional.empty, groupId=Optional.empty,
|
||||
* groupName=Optional.empty, groupUuid=Optional.empty, httpStatus=Optional.empty,
|
||||
* indexName=Optional.empty, indexVersion=Optional[0], methodName=Optional.empty,
|
||||
* multiple=Optional.empty, operationName=Optional.empty, partial=Optional.empty,
|
||||
* noteDbFilePath=Optional.empty, noteDbRefName=Optional.empty,
|
||||
* cause=Optional.empty, eventType=Optional.empty, exportValue=Optional.empty,
|
||||
* filePath=Optional.empty, garbageCollectorName=Optional.empty, gitOperation=Optional.empty,
|
||||
* groupId=Optional.empty, groupName=Optional.empty, groupUuid=Optional.empty,
|
||||
* httpStatus=Optional.empty, indexName=Optional.empty, indexVersion=Optional[0],
|
||||
* methodName=Optional.empty, multiple=Optional.empty, operationName=Optional.empty,
|
||||
* partial=Optional.empty, noteDbFilePath=Optional.empty, noteDbRefName=Optional.empty,
|
||||
* noteDbSequenceType=Optional.empty, noteDbTable=Optional.empty, patchSetId=Optional.empty,
|
||||
* pluginMetadata=[], pluginName=Optional.empty, projectName=Optional.empty,
|
||||
* pushType=Optional.empty, resourceCount=Optional.empty, restViewName=Optional.empty,
|
||||
@@ -276,6 +279,8 @@ public abstract class Metadata {
|
||||
|
||||
public abstract Builder changeIdType(@Nullable String changeIdType);
|
||||
|
||||
public abstract Builder cause(@Nullable String cause);
|
||||
|
||||
public abstract Builder eventType(@Nullable String eventType);
|
||||
|
||||
public abstract Builder exportValue(@Nullable String exportValue);
|
||||
|
||||
@@ -32,10 +32,12 @@ import com.google.common.base.Throwables;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.flogger.FluentLogger;
|
||||
import com.google.gerrit.common.Nullable;
|
||||
import com.google.gerrit.exceptions.StorageException;
|
||||
import com.google.gerrit.extensions.restapi.RestApiException;
|
||||
import com.google.gerrit.git.LockFailureException;
|
||||
import com.google.gerrit.metrics.Counter1;
|
||||
import com.google.gerrit.metrics.Counter2;
|
||||
import com.google.gerrit.metrics.Counter3;
|
||||
import com.google.gerrit.metrics.Description;
|
||||
import com.google.gerrit.metrics.Field;
|
||||
import com.google.gerrit.metrics.MetricMaker;
|
||||
@@ -55,6 +57,7 @@ import java.util.concurrent.ExecutionException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
import org.eclipse.jgit.lib.Config;
|
||||
import org.eclipse.jgit.lib.RefUpdate;
|
||||
|
||||
@Singleton
|
||||
public class RetryHelper {
|
||||
@@ -127,10 +130,10 @@ public class RetryHelper {
|
||||
@VisibleForTesting
|
||||
@Singleton
|
||||
public static class Metrics {
|
||||
final Counter1<ActionType> attemptCounts;
|
||||
final Counter2<ActionType, String> attemptCounts;
|
||||
final Counter1<ActionType> timeoutCount;
|
||||
final Counter2<ActionType, String> autoRetryCount;
|
||||
final Counter2<ActionType, String> failuresOnAutoRetryCount;
|
||||
final Counter3<ActionType, String, String> autoRetryCount;
|
||||
final Counter3<ActionType, String, String> failuresOnAutoRetryCount;
|
||||
|
||||
@Inject
|
||||
Metrics(MetricMaker metricMaker) {
|
||||
@@ -144,7 +147,10 @@ public class RetryHelper {
|
||||
+ " (0 == single attempt, no retry)")
|
||||
.setCumulative()
|
||||
.setUnit("attempts"),
|
||||
actionTypeField);
|
||||
actionTypeField,
|
||||
Field.ofString("cause", Metadata.Builder::cause)
|
||||
.description("The cause for the last attempt.")
|
||||
.build());
|
||||
timeoutCount =
|
||||
metricMaker.newCounter(
|
||||
"action/retry_timeout_count",
|
||||
@@ -162,6 +168,9 @@ public class RetryHelper {
|
||||
actionTypeField,
|
||||
Field.ofString("operation_name", Metadata.Builder::operationName)
|
||||
.description("The name of the operation that was retried.")
|
||||
.build(),
|
||||
Field.ofString("cause", Metadata.Builder::cause)
|
||||
.description("The cause for the retry.")
|
||||
.build());
|
||||
failuresOnAutoRetryCount =
|
||||
metricMaker.newCounter(
|
||||
@@ -172,6 +181,9 @@ public class RetryHelper {
|
||||
actionTypeField,
|
||||
Field.ofString("operation_name", Metadata.Builder::operationName)
|
||||
.description("The name of the operation that was retried.")
|
||||
.build(),
|
||||
Field.ofString("cause", Metadata.Builder::cause)
|
||||
.description("The cause for the retry.")
|
||||
.build());
|
||||
}
|
||||
}
|
||||
@@ -330,13 +342,14 @@ public class RetryHelper {
|
||||
&& opts.retryWithTrace().isPresent()
|
||||
&& opts.retryWithTrace().get().test(t)) {
|
||||
String caller = opts.caller().orElse("N/A");
|
||||
String cause = formatCause(t);
|
||||
if (!traceContext.isTracing()) {
|
||||
String traceId = "retry-on-failure-" + new RequestId();
|
||||
traceContext.addTag(RequestId.Type.TRACE_ID, traceId).forceLogging();
|
||||
opts.onAutoTrace().ifPresent(c -> c.accept(traceId));
|
||||
logger.atFine().withCause(t).log(
|
||||
"AutoRetry: %s failed, retry with tracing enabled", caller);
|
||||
metrics.autoRetryCount.increment(actionType, caller);
|
||||
metrics.autoRetryCount.increment(actionType, caller, cause);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -345,7 +358,7 @@ public class RetryHelper {
|
||||
// differs from the failure that triggered the retry.
|
||||
logger.atFine().withCause(t).log(
|
||||
"AutoRetry: auto-retry of %s has failed", caller);
|
||||
metrics.failuresOnAutoRetryCount.increment(actionType, caller);
|
||||
metrics.failuresOnAutoRetryCount.increment(actionType, caller, cause);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -356,11 +369,25 @@ public class RetryHelper {
|
||||
} finally {
|
||||
if (listener.getAttemptCount() > 1) {
|
||||
logger.atFine().log("%s was attempted %d times", actionType, listener.getAttemptCount());
|
||||
metrics.attemptCounts.incrementBy(actionType, listener.getAttemptCount() - 1);
|
||||
metrics.attemptCounts.incrementBy(
|
||||
actionType,
|
||||
listener.getCause().map(RetryHelper::formatCause).orElse("_unknown"),
|
||||
listener.getAttemptCount() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String formatCause(Throwable t) {
|
||||
if (t instanceof UpdateException || t instanceof StorageException) {
|
||||
t = t.getCause();
|
||||
}
|
||||
|
||||
if (t instanceof LockFailureException) {
|
||||
return RefUpdate.Result.LOCK_FAILURE.name();
|
||||
}
|
||||
return t.getClass().getSimpleName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes an action and records the timeout as metric.
|
||||
*
|
||||
@@ -409,18 +436,27 @@ public class RetryHelper {
|
||||
|
||||
private static class MetricListener implements RetryListener {
|
||||
private long attemptCount;
|
||||
private Optional<Throwable> cause;
|
||||
|
||||
MetricListener() {
|
||||
attemptCount = 1;
|
||||
cause = Optional.empty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <V> void onRetry(Attempt<V> attempt) {
|
||||
attemptCount = attempt.getAttemptNumber();
|
||||
if (attempt.hasException()) {
|
||||
cause = Optional.of(attempt.getExceptionCause());
|
||||
}
|
||||
}
|
||||
|
||||
long getAttemptCount() {
|
||||
return attemptCount;
|
||||
}
|
||||
|
||||
Optional<Throwable> getCause() {
|
||||
return cause;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user