Merge "Automatically retry REST operations on failure with tracing enabled"

This commit is contained in:
Edwin Kempin
2019-07-19 07:32:37 +00:00
committed by Gerrit Code Review
6 changed files with 135 additions and 4 deletions

View File

@@ -3790,6 +3790,14 @@ attempt fails. `<operationType>` can be `ACCOUNT_UPDATE`, `CHANGE_UPDATE`,
Defaults to link:#retry.timeout[`retry.timeout`]; unit suffixes are supported,
and assumes milliseconds if not specified.
[[retry.retryWithTraceOnFailure]]retry.retryWithTraceOnFailure::
+
Whether Gerrit should automatically retry operations on failure with tracing
enabled. The automatically generated traces can help with debugging. Please
note that only some of the REST endpoints support automatic retry.
+
By default this is set to false.
[[rules]]
=== Section rules

View File

@@ -511,6 +511,8 @@ public class MergeOp implements AutoCloseable {
retryHelper
.getDefaultTimeout(ActionType.CHANGE_UPDATE)
.multipliedBy(cs.projects().size()))
.caller(getClass())
.retryWithTrace(t -> !(t instanceof RestApiException))
.build());
if (projects > 1) {

View File

@@ -40,11 +40,14 @@ import com.google.gerrit.metrics.Field;
import com.google.gerrit.metrics.MetricMaker;
import com.google.gerrit.server.config.GerritServerConfig;
import com.google.gerrit.server.logging.Metadata;
import com.google.gerrit.server.logging.RequestId;
import com.google.gerrit.server.logging.TraceContext;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.time.Duration;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.function.Consumer;
import java.util.function.Predicate;
@@ -94,12 +97,20 @@ public class RetryHelper {
@Nullable
abstract Duration timeout();
abstract Optional<Class<?>> caller();
abstract Optional<Predicate<Throwable>> retryWithTrace();
@AutoValue.Builder
public abstract static class Builder {
public abstract Builder listener(RetryListener listener);
public abstract Builder timeout(Duration timeout);
public abstract Builder caller(Class<?> caller);
public abstract Builder retryWithTrace(Predicate<Throwable> exceptionPredicate);
public abstract Options build();
}
}
@@ -147,6 +158,7 @@ public class RetryHelper {
private final Map<ActionType, Duration> defaultTimeouts;
private final WaitStrategy waitStrategy;
@Nullable private final Consumer<RetryerBuilder<?>> overwriteDefaultRetryerStrategySetup;
private final boolean retryWithTraceOnFailure;
@Inject
RetryHelper(@GerritServerConfig Config cfg, Metrics metrics, BatchUpdate.Factory updateFactory) {
@@ -186,6 +198,7 @@ public class RetryHelper {
MILLISECONDS),
WaitStrategies.randomWait(50, MILLISECONDS));
this.overwriteDefaultRetryerStrategySetup = overwriteDefaultRetryerStrategySetup;
this.retryWithTraceOnFailure = cfg.getBoolean("retry", "retryWithTraceOnFailure", false);
}
public Duration getDefaultTimeout(ActionType actionType) {
@@ -256,8 +269,35 @@ public class RetryHelper {
Predicate<Throwable> exceptionPredicate)
throws Throwable {
MetricListener listener = new MetricListener();
try {
RetryerBuilder<T> retryerBuilder = createRetryerBuilder(actionType, opts, exceptionPredicate);
try (TraceContext traceContext = TraceContext.open()) {
RetryerBuilder<T> retryerBuilder =
createRetryerBuilder(
actionType,
opts,
t -> {
// exceptionPredicate checks for temporary errors for which the operation should be
// retried (e.g. LockFailure). The retry has good chances to succeed.
if (exceptionPredicate.test(t)) {
return true;
}
// A non-recoverable failure occurred. Check if we should retry to capture a trace
// of the failure. If a trace was already done there is no need to retry.
if (retryWithTraceOnFailure
&& opts.retryWithTrace().isPresent()
&& opts.retryWithTrace().get().test(t)
&& !traceContext.isTracing()) {
traceContext
.addTag(RequestId.Type.TRACE_ID, "retry-on-failure-" + new RequestId())
.forceLogging();
logger.atFine().withCause(t).log(
"%s failed, retry with tracing eanbled",
opts.caller().map(Class::getSimpleName).orElse("N/A"));
return true;
}
return false;
});
retryerBuilder.withRetryListener(listener);
return executeWithTimeoutCount(actionType, action, retryerBuilder.build());
} finally {

View File

@@ -17,6 +17,7 @@ package com.google.gerrit.server.update;
import com.google.gerrit.extensions.restapi.AuthException;
import com.google.gerrit.extensions.restapi.BadRequestException;
import com.google.gerrit.extensions.restapi.ResourceConflictException;
import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.extensions.restapi.RestCollectionModifyView;
import com.google.gerrit.extensions.restapi.RestResource;
@@ -32,7 +33,13 @@ public abstract class RetryingRestCollectionModifyView<
@Override
public final O apply(P parentResource, I input)
throws AuthException, BadRequestException, ResourceConflictException, Exception {
return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, parentResource, input));
RetryHelper.Options retryOptions =
RetryHelper.options()
.caller(getClass())
.retryWithTrace(t -> !(t instanceof RestApiException))
.build();
return retryHelper.execute(
(updateFactory) -> applyImpl(updateFactory, parentResource, input), retryOptions);
}
protected abstract O applyImpl(BatchUpdate.Factory updateFactory, P parentResource, I input)

View File

@@ -14,6 +14,7 @@
package com.google.gerrit.server.update;
import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.extensions.restapi.RestModifyView;
import com.google.gerrit.extensions.restapi.RestResource;
@@ -27,7 +28,13 @@ public abstract class RetryingRestModifyView<R extends RestResource, I, O>
@Override
public final O apply(R resource, I input) throws Exception {
return retryHelper.execute((updateFactory) -> applyImpl(updateFactory, resource, input));
RetryHelper.Options retryOptions =
RetryHelper.options()
.caller(getClass())
.retryWithTrace(t -> !(t instanceof RestApiException))
.build();
return retryHelper.execute(
(updateFactory) -> applyImpl(updateFactory, resource, input), retryOptions);
}
protected abstract O applyImpl(BatchUpdate.Factory updateFactory, R resource, I input)

View File

@@ -16,6 +16,7 @@ package com.google.gerrit.acceptance.rest;
import static com.google.common.truth.Truth.assertThat;
import static org.apache.http.HttpStatus.SC_CREATED;
import static org.apache.http.HttpStatus.SC_INTERNAL_SERVER_ERROR;
import static org.apache.http.HttpStatus.SC_OK;
import com.google.auto.value.AutoValue;
@@ -28,6 +29,7 @@ import com.google.gerrit.acceptance.AbstractDaemonTest;
import com.google.gerrit.acceptance.GerritConfig;
import com.google.gerrit.acceptance.PushOneCommit;
import com.google.gerrit.acceptance.RestResponse;
import com.google.gerrit.common.data.SubmitRecord;
import com.google.gerrit.extensions.api.changes.ReviewInput;
import com.google.gerrit.extensions.events.ChangeIndexedListener;
import com.google.gerrit.extensions.registration.DynamicSet;
@@ -44,10 +46,14 @@ import com.google.gerrit.server.logging.Metadata;
import com.google.gerrit.server.logging.PerformanceLogger;
import com.google.gerrit.server.logging.TraceContext;
import com.google.gerrit.server.project.CreateProjectArgs;
import com.google.gerrit.server.project.SubmitRuleOptions;
import com.google.gerrit.server.query.change.ChangeData;
import com.google.gerrit.server.rules.SubmitRule;
import com.google.gerrit.server.validators.ProjectCreationValidationListener;
import com.google.gerrit.server.validators.ValidationException;
import com.google.inject.Inject;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.SortedMap;
import java.util.SortedSet;
@@ -79,6 +85,7 @@ public class TraceIT extends AbstractDaemonTest {
@Inject private DynamicSet<CommitValidationListener> commitValidationListeners;
@Inject private DynamicSet<ChangeIndexedListener> changeIndexedListeners;
@Inject private DynamicSet<PerformanceLogger> performanceLoggers;
@Inject private DynamicSet<SubmitRule> submitRules;
@Inject private WorkQueue workQueue;
private TraceValidatingProjectCreationValidationListener projectCreationListener;
@@ -579,6 +586,44 @@ public class TraceIT extends AbstractDaemonTest {
assertThat(projectCreationListener.tags.get("project")).containsExactly("new24");
}
@GerritConfig(name = "retry.retryWithTraceOnFailure", value = "true")
public void autoRetryWithTrace() throws Exception {
String changeId = createChange().getChangeId();
approve(changeId);
TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
traceSubmitRule.failOnce = true;
RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
try {
RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
assertThat(response.getStatusCode()).isEqualTo(SC_OK);
assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
assertThat(traceSubmitRule.traceId).startsWith("retry-on-failure-");
assertThat(traceSubmitRule.isLoggingForced).isTrue();
} finally {
submitRuleRegistrationHandle.remove();
}
}
@Test
public void noAutoRetryWithTraceIfDisabled() throws Exception {
String changeId = createChange().getChangeId();
approve(changeId);
TraceSubmitRule traceSubmitRule = new TraceSubmitRule();
traceSubmitRule.failOnce = true;
RegistrationHandle submitRuleRegistrationHandle = submitRules.add("gerrit", traceSubmitRule);
try {
RestResponse response = adminRestSession.post("/changes/" + changeId + "/submit");
assertThat(response.getStatusCode()).isEqualTo(SC_INTERNAL_SERVER_ERROR);
assertThat(response.getHeader(RestApiServlet.X_GERRIT_TRACE)).isNull();
assertThat(traceSubmitRule.traceId).isNull();
assertThat(traceSubmitRule.isLoggingForced).isNull();
} finally {
submitRuleRegistrationHandle.remove();
}
}
private void assertForceLogging(boolean expected) {
assertThat(LoggingContext.getInstance().shouldForceLogging(null, null, false))
.isEqualTo(expected);
@@ -629,6 +674,28 @@ public class TraceIT extends AbstractDaemonTest {
public void onChangeDeleted(int id) {}
}
private static class TraceSubmitRule implements SubmitRule {
String traceId;
Boolean isLoggingForced;
boolean failOnce;
@Override
public Collection<SubmitRecord> evaluate(ChangeData changeData, SubmitRuleOptions options) {
if (failOnce) {
failOnce = false;
throw new IllegalStateException("forced failure from test");
}
this.traceId =
Iterables.getFirst(LoggingContext.getInstance().getTagsAsMap().get("TRACE_ID"), null);
this.isLoggingForced = LoggingContext.getInstance().shouldForceLogging(null, null, false);
SubmitRecord submitRecord = new SubmitRecord();
submitRecord.status = SubmitRecord.Status.OK;
return ImmutableList.of(submitRecord);
}
}
private static class TestPerformanceLogger implements PerformanceLogger {
private List<PerformanceLogEntry> logEntries = new ArrayList<>();