From d31e337bf1a3fb10cbf07476220006f7218b6e31 Mon Sep 17 00:00:00 2001 From: Edwin Kempin Date: Thu, 31 Oct 2019 10:59:02 +0100 Subject: [PATCH] Include cause for retries into retry metrics By looking at the metrics we want to know how often a certain error causes requests to be retried. E.g. it's good that we retry requests on LOCK_FAILURE, but if LOCK_FAILURES cause too many retries we want to investigate why they are occurring that often. At the moment the retry metrics only tell us how often requests were retried but not why. Signed-off-by: Edwin Kempin Change-Id: If58d87d05bdb64606cb9e37c215727e221c5e327 --- .../gerrit/server/logging/Metadata.java | 17 ++++--- .../gerrit/server/update/RetryHelper.java | 50 ++++++++++++++++--- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/java/com/google/gerrit/server/logging/Metadata.java b/java/com/google/gerrit/server/logging/Metadata.java index 60e41ed998..64d156fd32 100644 --- a/java/com/google/gerrit/server/logging/Metadata.java +++ b/java/com/google/gerrit/server/logging/Metadata.java @@ -59,6 +59,9 @@ public abstract class Metadata { // The type of change ID which the user used to identify a change (e.g. numeric ID, triplet etc.). public abstract Optional changeIdType(); + // The cause of an error. + public abstract Optional cause(); + // The type of an event. public abstract Optional eventType(); @@ -158,12 +161,12 @@ public abstract class Metadata { * Metadata{accountId=Optional.empty, actionType=Optional.empty, authDomainName=Optional.empty, * branchName=Optional.empty, cacheKey=Optional.empty, cacheName=Optional.empty, * className=Optional.empty, changeId=Optional[9212550], changeIdType=Optional.empty, - * eventType=Optional.empty, exportValue=Optional.empty, filePath=Optional.empty, - * garbageCollectorName=Optional.empty, gitOperation=Optional.empty, groupId=Optional.empty, - * groupName=Optional.empty, groupUuid=Optional.empty, httpStatus=Optional.empty, - * indexName=Optional.empty, indexVersion=Optional[0], methodName=Optional.empty, - * multiple=Optional.empty, operationName=Optional.empty, partial=Optional.empty, - * noteDbFilePath=Optional.empty, noteDbRefName=Optional.empty, + * cause=Optional.empty, eventType=Optional.empty, exportValue=Optional.empty, + * filePath=Optional.empty, garbageCollectorName=Optional.empty, gitOperation=Optional.empty, + * groupId=Optional.empty, groupName=Optional.empty, groupUuid=Optional.empty, + * httpStatus=Optional.empty, indexName=Optional.empty, indexVersion=Optional[0], + * methodName=Optional.empty, multiple=Optional.empty, operationName=Optional.empty, + * partial=Optional.empty, noteDbFilePath=Optional.empty, noteDbRefName=Optional.empty, * noteDbSequenceType=Optional.empty, noteDbTable=Optional.empty, patchSetId=Optional.empty, * pluginMetadata=[], pluginName=Optional.empty, projectName=Optional.empty, * pushType=Optional.empty, resourceCount=Optional.empty, restViewName=Optional.empty, @@ -276,6 +279,8 @@ public abstract class Metadata { public abstract Builder changeIdType(@Nullable String changeIdType); + public abstract Builder cause(@Nullable String cause); + public abstract Builder eventType(@Nullable String eventType); public abstract Builder exportValue(@Nullable String exportValue); diff --git a/java/com/google/gerrit/server/update/RetryHelper.java b/java/com/google/gerrit/server/update/RetryHelper.java index 14aa523ef0..7c020dac2f 100644 --- a/java/com/google/gerrit/server/update/RetryHelper.java +++ b/java/com/google/gerrit/server/update/RetryHelper.java @@ -32,10 +32,12 @@ import com.google.common.base.Throwables; import com.google.common.collect.Maps; import com.google.common.flogger.FluentLogger; import com.google.gerrit.common.Nullable; +import com.google.gerrit.exceptions.StorageException; import com.google.gerrit.extensions.restapi.RestApiException; import com.google.gerrit.git.LockFailureException; import com.google.gerrit.metrics.Counter1; import com.google.gerrit.metrics.Counter2; +import com.google.gerrit.metrics.Counter3; import com.google.gerrit.metrics.Description; import com.google.gerrit.metrics.Field; import com.google.gerrit.metrics.MetricMaker; @@ -55,6 +57,7 @@ import java.util.concurrent.ExecutionException; import java.util.function.Consumer; import java.util.function.Predicate; import org.eclipse.jgit.lib.Config; +import org.eclipse.jgit.lib.RefUpdate; @Singleton public class RetryHelper { @@ -127,10 +130,10 @@ public class RetryHelper { @VisibleForTesting @Singleton public static class Metrics { - final Counter1 attemptCounts; + final Counter2 attemptCounts; final Counter1 timeoutCount; - final Counter2 autoRetryCount; - final Counter2 failuresOnAutoRetryCount; + final Counter3 autoRetryCount; + final Counter3 failuresOnAutoRetryCount; @Inject Metrics(MetricMaker metricMaker) { @@ -144,7 +147,10 @@ public class RetryHelper { + " (0 == single attempt, no retry)") .setCumulative() .setUnit("attempts"), - actionTypeField); + actionTypeField, + Field.ofString("cause", Metadata.Builder::cause) + .description("The cause for the last attempt.") + .build()); timeoutCount = metricMaker.newCounter( "action/retry_timeout_count", @@ -162,6 +168,9 @@ public class RetryHelper { actionTypeField, Field.ofString("operation_name", Metadata.Builder::operationName) .description("The name of the operation that was retried.") + .build(), + Field.ofString("cause", Metadata.Builder::cause) + .description("The cause for the retry.") .build()); failuresOnAutoRetryCount = metricMaker.newCounter( @@ -172,6 +181,9 @@ public class RetryHelper { actionTypeField, Field.ofString("operation_name", Metadata.Builder::operationName) .description("The name of the operation that was retried.") + .build(), + Field.ofString("cause", Metadata.Builder::cause) + .description("The cause for the retry.") .build()); } } @@ -330,13 +342,14 @@ public class RetryHelper { && opts.retryWithTrace().isPresent() && opts.retryWithTrace().get().test(t)) { String caller = opts.caller().orElse("N/A"); + String cause = formatCause(t); if (!traceContext.isTracing()) { String traceId = "retry-on-failure-" + new RequestId(); traceContext.addTag(RequestId.Type.TRACE_ID, traceId).forceLogging(); opts.onAutoTrace().ifPresent(c -> c.accept(traceId)); logger.atFine().withCause(t).log( "AutoRetry: %s failed, retry with tracing enabled", caller); - metrics.autoRetryCount.increment(actionType, caller); + metrics.autoRetryCount.increment(actionType, caller, cause); return true; } @@ -345,7 +358,7 @@ public class RetryHelper { // differs from the failure that triggered the retry. logger.atFine().withCause(t).log( "AutoRetry: auto-retry of %s has failed", caller); - metrics.failuresOnAutoRetryCount.increment(actionType, caller); + metrics.failuresOnAutoRetryCount.increment(actionType, caller, cause); return false; } @@ -356,11 +369,25 @@ public class RetryHelper { } finally { if (listener.getAttemptCount() > 1) { logger.atFine().log("%s was attempted %d times", actionType, listener.getAttemptCount()); - metrics.attemptCounts.incrementBy(actionType, listener.getAttemptCount() - 1); + metrics.attemptCounts.incrementBy( + actionType, + listener.getCause().map(RetryHelper::formatCause).orElse("_unknown"), + listener.getAttemptCount() - 1); } } } + private static String formatCause(Throwable t) { + if (t instanceof UpdateException || t instanceof StorageException) { + t = t.getCause(); + } + + if (t instanceof LockFailureException) { + return RefUpdate.Result.LOCK_FAILURE.name(); + } + return t.getClass().getSimpleName(); + } + /** * Executes an action and records the timeout as metric. * @@ -409,18 +436,27 @@ public class RetryHelper { private static class MetricListener implements RetryListener { private long attemptCount; + private Optional cause; MetricListener() { attemptCount = 1; + cause = Optional.empty(); } @Override public void onRetry(Attempt attempt) { attemptCount = attempt.getAttemptNumber(); + if (attempt.hasException()) { + cause = Optional.of(attempt.getExceptionCause()); + } } long getAttemptCount() { return attemptCount; } + + Optional getCause() { + return cause; + } } }