Automatically retry REST operations on failure with tracing enabled
On REST endpoint level we already retry the REST operation if there was a potentially temporary failure such as a LockFailure. Extend this to also retry once on non-recoverable failures, but with request tracing switched on. This is the same as if the user would retry the REST call with request tracing after it has failed. Capturing traces for failures automatically can help with debugging issues and safe roundtrips between user and supporting engineer. This only works for REST endpoints which support retrying (REST endpoints that extend RetryingRestModifyView or RetryingRestCollectionModifyView). Enabling more REST endpoints to retry is outside the scope of this change, but surely feasible. Submit has custom retry login in MergeOp. Adapt the retry in there too. Whether retry on failure should be done can be configured by setting retry.retryWithTraceOnFailure in gerrit.config (default = false). Since not all REST endpoints are atomic it can happen that a REST endpoint did some partial updates before it failed. If the operation is retried, in this case it can happen that this partial update is done again or that the failure is different now. This is OK since the chances for this are low and the same would happen if the user retries the operation. Change-Id: I2b78cbef50e5c3ce4fafce0d7cdcb3b1835abc86 Signed-off-by: Edwin Kempin <ekempin@google.com>
This commit is contained in:
@@ -3790,6 +3790,14 @@ attempt fails. `<operationType>` can be `ACCOUNT_UPDATE`, `CHANGE_UPDATE`,
|
||||
Defaults to link:#retry.timeout[`retry.timeout`]; unit suffixes are supported,
|
||||
and assumes milliseconds if not specified.
|
||||
|
||||
[[retry.retryWithTraceOnFailure]]retry.retryWithTraceOnFailure::
|
||||
+
|
||||
Whether Gerrit should automatically retry operations on failure with tracing
|
||||
enabled. The automatically generated traces can help with debugging. Please
|
||||
note that only some of the REST endpoints support automatic retry.
|
||||
+
|
||||
By default this is set to false.
|
||||
|
||||
[[rules]]
|
||||
=== Section rules
|
||||
|
||||
|
||||
@@ -511,6 +511,8 @@ public class MergeOp implements AutoCloseable {
|
||||
retryHelper
|
||||
.getDefaultTimeout(ActionType.CHANGE_UPDATE)
|
||||
.multipliedBy(cs.projects().size()))
|
||||
.caller(getClass())
|
||||
.retryWithTrace(t -> !(t instanceof RestApiException))
|
||||
.build());
|
||||
|
||||
if (projects > 1) {
|
||||
|
||||
@@ -40,11 +40,14 @@ import com.google.gerrit.metrics.Field;
|
||||
import com.google.gerrit.metrics.MetricMaker;
|
||||
import com.google.gerrit.server.config.GerritServerConfig;
|
||||
import com.google.gerrit.server.logging.Metadata;
|
||||
import com.google.gerrit.server.logging.RequestId;
|
||||
import com.google.gerrit.server.logging.TraceContext;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.time.Duration;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
@@ -94,12 +97,20 @@ public class RetryHelper {
|
||||
@Nullable
|
||||
abstract Duration timeout();
|
||||
|
||||
abstract Optional<Class<?>> caller();
|
||||
|
||||
abstract Optional<Predicate<Throwable>> retryWithTrace();
|
||||
|
||||
@AutoValue.Builder
|
||||
public abstract static class Builder {
|
||||
public abstract Builder listener(RetryListener listener);
|
||||
|
||||
public abstract Builder timeout(Duration timeout);
|
||||
|
||||
public abstract Builder caller(Class<?> caller);
|
||||
|
||||
public abstract Builder retryWithTrace(Predicate<Throwable> exceptionPredicate);
|
||||
|
||||
public abstract Options build();
|
||||
}
|
||||
}
|
||||
@@ -147,6 +158,7 @@ public class RetryHelper {
|
||||
private final Map<ActionType, Duration> defaultTimeouts;
|
||||
private final WaitStrategy waitStrategy;
|
||||
@Nullable private final Consumer<RetryerBuilder<?>> overwriteDefaultRetryerStrategySetup;
|
||||
private final boolean retryWithTraceOnFailure;
|
||||
|
||||
@Inject
|
||||
RetryHelper(@GerritServerConfig Config cfg, Metrics metrics, BatchUpdate.Factory updateFactory) {
|
||||
@@ -186,6 +198,7 @@ public class RetryHelper {
|
||||
MILLISECONDS),
|
||||
WaitStrategies.randomWait(50, MILLISECONDS));
|
||||
this.overwriteDefaultRetryerStrategySetup = overwriteDefaultRetryerStrategySetup;
|
||||
this.retryWithTraceOnFailure = cfg.getBoolean("retry", "retryWithTraceOnFailure", false);
|
||||
}
|
||||
|
||||
public Duration getDefaultTimeout(ActionType actionType) {
|
||||
@@ -256,8 +269,35 @@ public class RetryHelper {
|
||||
Predicate<Throwable> exceptionPredicate)
|
||||
throws Throwable {
|
||||
MetricListener listener = new MetricListener();
|
||||
try {
|
||||
RetryerBuilder<T> retryerBuilder = createRetryerBuilder(actionType, opts, exceptionPredicate);
|
||||
try (TraceContext traceContext = TraceContext.open()) {
|
||||
RetryerBuilder<T> retryerBuilder =
|
||||
createRetryerBuilder(
|
||||
actionType,
|
||||
opts,
|
||||
t -> {
|
||||
// exceptionPredicate checks for temporary errors for which the operation should be
|
||||
// retried (e.g. LockFailure). The retry has good chances to succeed.
|
||||
if (exceptionPredicate.test(t)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// A non-recoverable failure occurred. Check if we should retry to capture a trace
|
||||
// of the failure. If a trace was already done there is no need to retry.
|
||||
if (retryWithTraceOnFailure
|
||||
&& opts.retryWithTrace().isPresent()
|
||||
&& opts.retryWithTrace().get().test(t)
|
||||
&& !traceContext.isTracing()) {
|
||||
traceContext
|
||||
.addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
|
||||
.forceLogging();
|
||||
logger.atFine().withCause(t).log(
|
||||
"%s failed, retry with tracing eanbled",
|
||||
opts.caller().map(Class::getSimpleName).orElse("N/A"));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
retryerBuilder.withRetryListener(listener);
|
||||
return executeWithTimeoutCount(actionType, action, retryerBuilder.build());
|
||||
} finally {
|
||||
|
||||
@@ -17,6 +17,7 @@ package com.google.gerrit.server.update;
|
||||
import com.google.gerrit.extensions.restapi.AuthException;
|
||||
import com.google.gerrit.extensions.restapi.BadRequestException;
|
||||
import com.google.gerrit.extensions.restapi.ResourceConflictException;
|
||||
import com.google.gerrit.extensions.restapi.RestApiException;
|
||||
import com.google.gerrit.extensions.restapi.RestCollectionModifyView;
|
||||
import com.google.gerrit.extensions.restapi.RestResource;
|
||||
|
||||
@@ -32,7 +33,13 @@ public abstract class RetryingRestCollectionModifyView<
|
||||
@Override
|
||||
public final O apply(P parentResource, I input)
|
||||
throws AuthException, BadRequestException, ResourceConflictException, Exception {
|
||||
return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, parentResource, input));
|
||||
RetryHelper.Options retryOptions =
|
||||
RetryHelper.options()
|
||||
.caller(getClass())
|
||||
.retryWithTrace(t -> !(t instanceof RestApiException))
|
||||
.build();
|
||||
return retryHelper.execute(
|
||||
(updateFactory) -> applyImpl(updateFactory, parentResource, input), retryOptions);
|
||||
}
|
||||
|
||||
protected abstract O applyImpl(BatchUpdate.Factory updateFactory, P parentResource, I input)
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
package com.google.gerrit.server.update;
|
||||
|
||||
import com.google.gerrit.extensions.restapi.RestApiException;
|
||||
import com.google.gerrit.extensions.restapi.RestModifyView;
|
||||
import com.google.gerrit.extensions.restapi.RestResource;
|
||||
|
||||
@@ -27,7 +28,13 @@ public abstract class RetryingRestModifyView<R extends RestResource, I, O>
|
||||
|
||||
@Override
|
||||
public final O apply(R resource, I input) throws Exception {
|
||||
return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, resource, input));
|
||||
RetryHelper.Options retryOptions =
|
||||
RetryHelper.options()
|
||||
.caller(getClass())
|
||||
.retryWithTrace(t -> !(t instanceof RestApiException))
|
||||
.build();
|
||||
return retryHelper.execute(
|
||||
(updateFactory) -> applyImpl(updateFactory, resource, input), retryOptions);
|
||||
}
|
||||
|
||||
protected abstract O applyImpl(BatchUpdate.Factory updateFactory, R resource, I input)
|
||||
|
||||
@@ -16,6 +16,7 @@ package com.google.gerrit.acceptance.rest;
|
||||
|
||||
import static com.google.common.truth.Truth.assertThat;
|
||||
import static org.apache.http.HttpStatus.SC_CREATED;
|
||||
import static org.apache.http.HttpStatus.SC_INTERNAL_SERVER_ERROR;
|
||||
import static org.apache.http.HttpStatus.SC_OK;
|
||||
|
||||
import com.google.auto.value.AutoValue;
|
||||
@@ -28,6 +29,7 @@ import com.google.gerrit.acceptance.AbstractDaemonTest;
|
||||
import com.google.gerrit.acceptance.GerritConfig;
|
||||
import com.google.gerrit.acceptance.PushOneCommit;
|
||||
import com.google.gerrit.acceptance.RestResponse;
|
||||
import com.google.gerrit.common.data.SubmitRecord;
|
||||
import com.google.gerrit.extensions.api.changes.ReviewInput;
|
||||
import com.google.gerrit.extensions.events.ChangeIndexedListener;
|
||||
import com.google.gerrit.extensions.registration.DynamicSet;
|
||||
@@ -44,10 +46,14 @@ import com.google.gerrit.server.logging.Metadata;
|
||||
import com.google.gerrit.server.logging.PerformanceLogger;
|
||||
import com.google.gerrit.server.logging.TraceContext;
|
||||
import com.google.gerrit.server.project.CreateProjectArgs;
|
||||
import com.google.gerrit.server.project.SubmitRuleOptions;
|
||||
import com.google.gerrit.server.query.change.ChangeData;
|
||||
import com.google.gerrit.server.rules.SubmitRule;
|
||||
import com.google.gerrit.server.validators.ProjectCreationValidationListener;
|
||||
import com.google.gerrit.server.validators.ValidationException;
|
||||
import com.google.inject.Inject;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.SortedMap;
|
||||
import java.util.SortedSet;
|
||||
@@ -79,6 +85,7 @@ public class TraceIT extends AbstractDaemonTest {
|
||||
@Inject private DynamicSet<CommitValidationListener> commitValidationListeners;
|
||||
@Inject private DynamicSet<ChangeIndexedListener> changeIndexedListeners;
|
||||
@Inject private DynamicSet<PerformanceLogger> performanceLoggers;
|
||||
@Inject private DynamicSet<SubmitRule> submitRules;
|
||||
@Inject private WorkQueue workQueue;
|
||||
|
||||
private TraceValidatingProjectCreationValidationListener projectCreationListener;
|
||||
@@ -579,6 +586,44 @@ public class TraceIT extends AbstractDaemonTest {
|
||||
assertThat(projectCreationListener.tags.get("project")).containsExactly("new24");
|
||||
}
|
||||
|
||||
@GerritConfig(name = "retry.retryWithTraceOnFailure", value = "true")
|
||||
public void autoRetryWithTrace() throws Exception {
|
||||
String changeId = createChange().getChangeId();
|
||||
approve(changeId);
|
||||
|
||||
TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
|
||||
traceSubmitRule.failOnce = true;
|
||||
RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
|
||||
try {
|
||||
RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
|
||||
assertThat(response.getStatusCode()).isEqualTo(SC_OK);
|
||||
assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
|
||||
assertThat(traceSubmitRule.traceId).startsWith("retry-on-failure-");
|
||||
assertThat(traceSubmitRule.isLoggingForced).isTrue();
|
||||
} finally {
|
||||
submitRuleRegistrationHandle.remove();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void noAutoRetryWithTraceIfDisabled() throws Exception {
|
||||
String changeId = createChange().getChangeId();
|
||||
approve(changeId);
|
||||
|
||||
TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
|
||||
traceSubmitRule.failOnce = true;
|
||||
RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
|
||||
try {
|
||||
RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
|
||||
assertThat(response.getStatusCode()).isEqualTo(SC_INTERNAL_SERVER_ERROR);
|
||||
assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
|
||||
assertThat(traceSubmitRule.traceId).isNull();
|
||||
assertThat(traceSubmitRule.isLoggingForced).isNull();
|
||||
} finally {
|
||||
submitRuleRegistrationHandle.remove();
|
||||
}
|
||||
}
|
||||
|
||||
private void assertForceLogging(boolean expected) {
|
||||
assertThat(LoggingContext.getInstance().shouldForceLogging(null, null, false))
|
||||
.isEqualTo(expected);
|
||||
@@ -629,6 +674,28 @@ public class TraceIT extends AbstractDaemonTest {
|
||||
public void onChangeDeleted(int id) {}
|
||||
}
|
||||
|
||||
private static class TraceSubmitRule implements SubmitRule {
|
||||
String traceId;
|
||||
Boolean isLoggingForced;
|
||||
boolean failOnce;
|
||||
|
||||
@Override
|
||||
public Collection<SubmitRecord> evaluate(ChangeData changeData, SubmitRuleOptions options) {
|
||||
if (failOnce) {
|
||||
failOnce = false;
|
||||
throw new IllegalStateException("forced failure from test");
|
||||
}
|
||||
|
||||
this.traceId =
|
||||
Iterables.getFirst(LoggingContext.getInstance().getTagsAsMap().get("TRACE_ID"), null);
|
||||
this.isLoggingForced = LoggingContext.getInstance().shouldForceLogging(null, null, false);
|
||||
|
||||
SubmitRecord submitRecord = new SubmitRecord();
|
||||
submitRecord.status = SubmitRecord.Status.OK;
|
||||
return ImmutableList.of(submitRecord);
|
||||
}
|
||||
}
|
||||
|
||||
private static class TestPerformanceLogger implements PerformanceLogger {
|
||||
private List<PerformanceLogEntry> logEntries = new ArrayList<>();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user