diff --git a/Documentation/config-gerrit.txt b/Documentation/config-gerrit.txt index 1601d7a6d2..91cb87f0fa 100644 --- a/Documentation/config-gerrit.txt +++ b/Documentation/config-gerrit.txt @@ -3790,6 +3790,14 @@ attempt fails. `` can be `ACCOUNT_UPDATE`, `CHANGE_UPDATE`, Defaults to link:#retry.timeout[`retry.timeout`]; unit suffixes are supported, and assumes milliseconds if not specified. +[[retry.retryWithTraceOnFailure]]retry.retryWithTraceOnFailure:: ++ +Whether Gerrit should automatically retry operations on failure with tracing +enabled. The automatically generated traces can help with debugging. Please +note that only some of the REST endpoints support automatic retry. ++ +By default this is set to false. + [[rules]] === Section rules diff --git a/java/com/google/gerrit/server/submit/MergeOp.java b/java/com/google/gerrit/server/submit/MergeOp.java index 06c52c7a1a..6c3d48b66d 100644 --- a/java/com/google/gerrit/server/submit/MergeOp.java +++ b/java/com/google/gerrit/server/submit/MergeOp.java @@ -511,6 +511,8 @@ public class MergeOp implements AutoCloseable { retryHelper .getDefaultTimeout(ActionType.CHANGE_UPDATE) .multipliedBy(cs.projects().size())) + .caller(getClass()) + .retryWithTrace(t -> !(t instanceof RestApiException)) .build()); if (projects > 1) { diff --git a/java/com/google/gerrit/server/update/RetryHelper.java b/java/com/google/gerrit/server/update/RetryHelper.java index 14d04cb45d..695febab11 100644 --- a/java/com/google/gerrit/server/update/RetryHelper.java +++ b/java/com/google/gerrit/server/update/RetryHelper.java @@ -40,11 +40,14 @@ import com.google.gerrit.metrics.Field; import com.google.gerrit.metrics.MetricMaker; import com.google.gerrit.server.config.GerritServerConfig; import com.google.gerrit.server.logging.Metadata; +import com.google.gerrit.server.logging.RequestId; +import com.google.gerrit.server.logging.TraceContext; import com.google.inject.Inject; import com.google.inject.Singleton; import java.time.Duration; import java.util.Arrays; import java.util.Map; +import java.util.Optional; import java.util.concurrent.ExecutionException; import java.util.function.Consumer; import java.util.function.Predicate; @@ -94,12 +97,20 @@ public class RetryHelper { @Nullable abstract Duration timeout(); + abstract Optional> caller(); + + abstract Optional> retryWithTrace(); + @AutoValue.Builder public abstract static class Builder { public abstract Builder listener(RetryListener listener); public abstract Builder timeout(Duration timeout); + public abstract Builder caller(Class caller); + + public abstract Builder retryWithTrace(Predicate exceptionPredicate); + public abstract Options build(); } } @@ -147,6 +158,7 @@ public class RetryHelper { private final Map defaultTimeouts; private final WaitStrategy waitStrategy; @Nullable private final Consumer> overwriteDefaultRetryerStrategySetup; + private final boolean retryWithTraceOnFailure; @Inject RetryHelper(@GerritServerConfig Config cfg, Metrics metrics, BatchUpdate.Factory updateFactory) { @@ -186,6 +198,7 @@ public class RetryHelper { MILLISECONDS), WaitStrategies.randomWait(50, MILLISECONDS)); this.overwriteDefaultRetryerStrategySetup = overwriteDefaultRetryerStrategySetup; + this.retryWithTraceOnFailure = cfg.getBoolean("retry", "retryWithTraceOnFailure", false); } public Duration getDefaultTimeout(ActionType actionType) { @@ -256,8 +269,35 @@ public class RetryHelper { Predicate exceptionPredicate) throws Throwable { MetricListener listener = new MetricListener(); - try { - RetryerBuilder retryerBuilder = createRetryerBuilder(actionType, opts, exceptionPredicate); + try (TraceContext traceContext = TraceContext.open()) { + RetryerBuilder retryerBuilder = + createRetryerBuilder( + actionType, + opts, + t -> { + // exceptionPredicate checks for temporary errors for which the operation should be + // retried (e.g. LockFailure). The retry has good chances to succeed. + if (exceptionPredicate.test(t)) { + return true; + } + + // A non-recoverable failure occurred. Check if we should retry to capture a trace + // of the failure. If a trace was already done there is no need to retry. + if (retryWithTraceOnFailure + && opts.retryWithTrace().isPresent() + && opts.retryWithTrace().get().test(t) + && !traceContext.isTracing()) { + traceContext + .addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId()) + .forceLogging(); + logger.atFine().withCause(t).log( + "%s failed, retry with tracing eanbled", + opts.caller().map(Class::getSimpleName).orElse("N/A")); + return true; + } + + return false; + }); retryerBuilder.withRetryListener(listener); return executeWithTimeoutCount(actionType, action, retryerBuilder.build()); } finally { diff --git a/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java b/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java index 76203867dd..9204565352 100644 --- a/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java +++ b/java/com/google/gerrit/server/update/RetryingRestCollectionModifyView.java @@ -17,6 +17,7 @@ package com.google.gerrit.server.update; import com.google.gerrit.extensions.restapi.AuthException; import com.google.gerrit.extensions.restapi.BadRequestException; import com.google.gerrit.extensions.restapi.ResourceConflictException; +import com.google.gerrit.extensions.restapi.RestApiException; import com.google.gerrit.extensions.restapi.RestCollectionModifyView; import com.google.gerrit.extensions.restapi.RestResource; @@ -32,7 +33,13 @@ public abstract class RetryingRestCollectionModifyView< @Override public final O apply(P parentResource, I input) throws AuthException, BadRequestException, ResourceConflictException, Exception { - return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, parentResource, input)); + RetryHelper.Options retryOptions = + RetryHelper.options() + .caller(getClass()) + .retryWithTrace(t -> !(t instanceof RestApiException)) + .build(); + return retryHelper.execute( + (updateFactory) -> applyImpl(updateFactory, parentResource, input), retryOptions); } protected abstract O applyImpl(BatchUpdate.Factory updateFactory, P parentResource, I input) diff --git a/java/com/google/gerrit/server/update/RetryingRestModifyView.java b/java/com/google/gerrit/server/update/RetryingRestModifyView.java index e2f4a020b0..b471d70a14 100644 --- a/java/com/google/gerrit/server/update/RetryingRestModifyView.java +++ b/java/com/google/gerrit/server/update/RetryingRestModifyView.java @@ -14,6 +14,7 @@ package com.google.gerrit.server.update; +import com.google.gerrit.extensions.restapi.RestApiException; import com.google.gerrit.extensions.restapi.RestModifyView; import com.google.gerrit.extensions.restapi.RestResource; @@ -27,7 +28,13 @@ public abstract class RetryingRestModifyView @Override public final O apply(R resource, I input) throws Exception { - return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, resource, input)); + RetryHelper.Options retryOptions = + RetryHelper.options() + .caller(getClass()) + .retryWithTrace(t -> !(t instanceof RestApiException)) + .build(); + return retryHelper.execute( + (updateFactory) -> applyImpl(updateFactory, resource, input), retryOptions); } protected abstract O applyImpl(BatchUpdate.Factory updateFactory, R resource, I input) diff --git a/javatests/com/google/gerrit/acceptance/rest/TraceIT.java b/javatests/com/google/gerrit/acceptance/rest/TraceIT.java index 3a0151721b..56a9b69c4e 100644 --- a/javatests/com/google/gerrit/acceptance/rest/TraceIT.java +++ b/javatests/com/google/gerrit/acceptance/rest/TraceIT.java @@ -16,6 +16,7 @@ package com.google.gerrit.acceptance.rest; import static com.google.common.truth.Truth.assertThat; import static org.apache.http.HttpStatus.SC_CREATED; +import static org.apache.http.HttpStatus.SC_INTERNAL_SERVER_ERROR; import static org.apache.http.HttpStatus.SC_OK; import com.google.auto.value.AutoValue; @@ -28,6 +29,7 @@ import com.google.gerrit.acceptance.AbstractDaemonTest; import com.google.gerrit.acceptance.GerritConfig; import com.google.gerrit.acceptance.PushOneCommit; import com.google.gerrit.acceptance.RestResponse; +import com.google.gerrit.common.data.SubmitRecord; import com.google.gerrit.extensions.api.changes.ReviewInput; import com.google.gerrit.extensions.events.ChangeIndexedListener; import com.google.gerrit.extensions.registration.DynamicSet; @@ -44,10 +46,14 @@ import com.google.gerrit.server.logging.Metadata; import com.google.gerrit.server.logging.PerformanceLogger; import com.google.gerrit.server.logging.TraceContext; import com.google.gerrit.server.project.CreateProjectArgs; +import com.google.gerrit.server.project.SubmitRuleOptions; +import com.google.gerrit.server.query.change.ChangeData; +import com.google.gerrit.server.rules.SubmitRule; import com.google.gerrit.server.validators.ProjectCreationValidationListener; import com.google.gerrit.server.validators.ValidationException; import com.google.inject.Inject; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.SortedMap; import java.util.SortedSet; @@ -79,6 +85,7 @@ public class TraceIT extends AbstractDaemonTest { @Inject private DynamicSet commitValidationListeners; @Inject private DynamicSet changeIndexedListeners; @Inject private DynamicSet performanceLoggers; + @Inject private DynamicSet submitRules; @Inject private WorkQueue workQueue; private TraceValidatingProjectCreationValidationListener projectCreationListener; @@ -579,6 +586,44 @@ public class TraceIT extends AbstractDaemonTest { assertThat(projectCreationListener.tags.get("project")).containsExactly("new24"); } + @GerritConfig(name = "retry.retryWithTraceOnFailure", value = "true") + public void autoRetryWithTrace() throws Exception { + String changeId = createChange().getChangeId(); + approve(changeId); + + TraceSubmitRule traceSubmitRule = new TraceSubmitRule(); + traceSubmitRule.failOnce = true; + RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule); + try { + RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit"); + assertThat(response.getStatusCode()).isEqualTo(SC_OK); + assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull(); + assertThat(traceSubmitRule.traceId).startsWith("retry-on-failure-"); + assertThat(traceSubmitRule.isLoggingForced).isTrue(); + } finally { + submitRuleRegistrationHandle.remove(); + } + } + + @Test + public void noAutoRetryWithTraceIfDisabled() throws Exception { + String changeId = createChange().getChangeId(); + approve(changeId); + + TraceSubmitRule traceSubmitRule = new TraceSubmitRule(); + traceSubmitRule.failOnce = true; + RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule); + try { + RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit"); + assertThat(response.getStatusCode()).isEqualTo(SC_INTERNAL_SERVER_ERROR); + assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull(); + assertThat(traceSubmitRule.traceId).isNull(); + assertThat(traceSubmitRule.isLoggingForced).isNull(); + } finally { + submitRuleRegistrationHandle.remove(); + } + } + private void assertForceLogging(boolean expected) { assertThat(LoggingContext.getInstance().shouldForceLogging(null, null, false)) .isEqualTo(expected); @@ -629,6 +674,28 @@ public class TraceIT extends AbstractDaemonTest { public void onChangeDeleted(int id) {} } + private static class TraceSubmitRule implements SubmitRule { + String traceId; + Boolean isLoggingForced; + boolean failOnce; + + @Override + public Collection evaluate(ChangeData changeData, SubmitRuleOptions options) { + if (failOnce) { + failOnce = false; + throw new IllegalStateException("forced failure from test"); + } + + this.traceId = + Iterables.getFirst(LoggingContext.getInstance().getTagsAsMap().get("TRACE_ID"), null); + this.isLoggingForced = LoggingContext.getInstance().shouldForceLogging(null, null, false); + + SubmitRecord submitRecord = new SubmitRecord(); + submitRecord.status = SubmitRecord.Status.OK; + return ImmutableList.of(submitRecord); + } + } + private static class TestPerformanceLogger implements PerformanceLogger { private List logEntries = new ArrayList<>();