Make config parameter names for RetryHelper more generic

Initially RetryHelper was exclusively used to retry change updates in
NoteDb. However since change I8c16487780 we use RetryHelper also to
retry non-NoteDb operations, e.g. index queries. This means RetryHelper
is no longer specific to NoteDb updates and hence its configuration
parameters shouldn't be part of the noteDb configuration. Hence rename
notedb.retryMaxWait to retry.maxWait and notedb.retryTimeout to
retry.timeout. Since it may make sense to have a different retry timeout
for certain operation types it is possible to overwrite retry.timeout
by operation type by setting notedb.<operationType>.retryTimeout.

Change-Id: I1a869b0f7d593a3145848db74317fd97483265d1
Signed-off-by: Edwin Kempin <ekempin@google.com>
This commit is contained in:
Edwin Kempin
2018-01-22 11:55:07 +01:00
parent d0d9331b5b
commit 22687fae75
4 changed files with 70 additions and 31 deletions

View File

@@ -3437,26 +3437,6 @@ each process retrieves at once.
+
By default, 1.
[[noteDb.retryMaxWait]]noteDb.retryMaxWait::
+
Maximum time to wait between attempts to retry update operations when one
attempt fails due to contention (aka lock failure) on the underlying ref
storage. Operations are retried with exponential backoff, plus some random
jitter, until the interval reaches this limit. After that, retries continue to
occur after a fixed timeout (plus jitter), up to
link:#noteDb.retryTimeout[`noteDb.retryTimeout`].
+
Defaults to 5 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.
[[noteDb.retryTimeout]]noteDb.retryTimeout::
+
Total timeout for retrying update operations when one attempt fails due to
contention (aka lock failure) on the underlying ref storage.
+
Defaults to 20 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.
[[oauth]]
=== Section oauth
@@ -3780,6 +3760,40 @@ A name of a group which exists in the database. Zero, one or many
groups are allowed. Each on its own line. Groups which don't exist
in the database are ignored.
[[retry]]
=== Section retry
[[retry.maxWait]]retry.maxWait::
+
Maximum time to wait between attempts to retry an operations when one attempt
fails (e.g. on NoteDb updates due to contention, aka lock failure, on the
underlying ref storage). Operations are retried with exponential backoff, plus
some random jitter, until the interval reaches this limit. After that, retries
continue to occur after a fixed timeout (plus jitter), up to
link:#retry.timeout[`retry.timeout`].
+
Defaults to 5 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.
[[retry.timeout]]retry.timeout::
+
Total timeout for retrying operations when one attempt fails.
+
It is possible to overwrite this default timeout based on operation types by
setting link:#retry.operationType.timeout[`retry.<operationType>.timeout`].
+
Defaults to 20 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.
[[retry.operationType.timeout]]retry.<operationType>.timeout::
+
Total timeout for retrying operations of type `<operationType>` when one
attempt fails. `<operationType>` can be `ACCOUNT_UPDATE`, `CHANGE_UPDATE`,
`GROUP_UPDATE` and `INDEX_QUERY`.
+
Defaults to link:#retry.timeout[`retry.timeout`]; unit suffixes are supported,
and assumes milliseconds if not specified.
[[rules]]
=== Section rules

View File

@@ -73,6 +73,7 @@ import com.google.gerrit.server.update.BatchUpdate;
import com.google.gerrit.server.update.BatchUpdateOp;
import com.google.gerrit.server.update.ChangeContext;
import com.google.gerrit.server.update.RetryHelper;
import com.google.gerrit.server.update.RetryHelper.ActionType;
import com.google.gerrit.server.update.UpdateException;
import com.google.gerrit.server.util.RequestId;
import com.google.gwtorm.server.OrmException;
@@ -489,7 +490,10 @@ public class MergeOp implements AutoCloseable {
.listener(retryTracker)
// Up to the entire submit operation is retried, including possibly many projects.
// Multiply the timeout by the number of projects we're actually attempting to submit.
.timeout(retryHelper.getDefaultTimeout().multipliedBy(cs.projects().size()))
.timeout(
retryHelper
.getDefaultTimeout(ActionType.CHANGE_UPDATE)
.multipliedBy(cs.projects().size()))
.build());
if (projects > 1) {

View File

@@ -2954,7 +2954,9 @@ class ReceiveCommits {
},
// Use a multiple of the default timeout to account for inner retries that may otherwise
// eat up the whole timeout so that no time is left to retry this outer action.
RetryHelper.options().timeout(retryHelper.getDefaultTimeout().multipliedBy(5)).build());
RetryHelper.options()
.timeout(retryHelper.getDefaultTimeout(ActionType.CHANGE_UPDATE).multipliedBy(5))
.build());
} catch (RestApiException e) {
logError("Can't insert patchset", e);
} catch (UpdateException e) {

View File

@@ -30,6 +30,7 @@ import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.collect.Maps;
import com.google.gerrit.common.Nullable;
import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.metrics.Counter1;
@@ -43,6 +44,8 @@ import com.google.gerrit.server.notedb.NotesMigration;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.time.Duration;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.function.Consumer;
import org.eclipse.jgit.lib.Config;
@@ -138,7 +141,7 @@ public class RetryHelper {
private final NotesMigration migration;
private final Metrics metrics;
private final BatchUpdate.Factory updateFactory;
private final Duration defaultTimeout;
private final Map<ActionType, Duration> defaultTimeouts;
private final WaitStrategy waitStrategy;
@Nullable private final Consumer<RetryerBuilder<?>> overwriteDefaultRetryerStrategySetup;
@@ -164,20 +167,35 @@ public class RetryHelper {
this.migration = migration;
this.updateFactory =
new BatchUpdate.Factory(migration, reviewDbBatchUpdateFactory, noteDbBatchUpdateFactory);
this.defaultTimeout =
Duration defaultTimeout =
Duration.ofMillis(
cfg.getTimeUnit("noteDb", null, "retryTimeout", SECONDS.toMillis(20), MILLISECONDS));
cfg.getTimeUnit("retry", null, "timeout", SECONDS.toMillis(20), MILLISECONDS));
this.defaultTimeouts = Maps.newEnumMap(ActionType.class);
Arrays.stream(ActionType.values())
.forEach(
at ->
defaultTimeouts.put(
at,
Duration.ofMillis(
cfg.getTimeUnit(
"retry",
at.name(),
"timeout",
SECONDS.toMillis(defaultTimeout.getSeconds()),
MILLISECONDS))));
this.waitStrategy =
WaitStrategies.join(
WaitStrategies.exponentialWait(
cfg.getTimeUnit("noteDb", null, "retryMaxWait", SECONDS.toMillis(5), MILLISECONDS),
cfg.getTimeUnit("retry", null, "maxWait", SECONDS.toMillis(5), MILLISECONDS),
MILLISECONDS),
WaitStrategies.randomWait(50, MILLISECONDS));
this.overwriteDefaultRetryerStrategySetup = overwriteDefaultRetryerStrategySetup;
}
public Duration getDefaultTimeout() {
return defaultTimeout;
public Duration getDefaultTimeout(ActionType actionType) {
return defaultTimeouts.get(actionType);
}
public <T> T execute(
@@ -255,7 +273,7 @@ public class RetryHelper {
throws Throwable {
MetricListener listener = new MetricListener();
try {
RetryerBuilder<T> retryerBuilder = createRetryerBuilder(opts, exceptionPredicate);
RetryerBuilder<T> retryerBuilder = createRetryerBuilder(actionType, opts, exceptionPredicate);
retryerBuilder.withRetryListener(listener);
return executeWithTimeoutCount(actionType, action, retryerBuilder.build());
} finally {
@@ -289,7 +307,7 @@ public class RetryHelper {
}
private <O> RetryerBuilder<O> createRetryerBuilder(
Options opts, Predicate<Throwable> exceptionPredicate) {
ActionType actionType, Options opts, Predicate<Throwable> exceptionPredicate) {
RetryerBuilder<O> retryerBuilder =
RetryerBuilder.<O>newBuilder().retryIfException(exceptionPredicate);
if (opts.listener() != null) {
@@ -304,7 +322,8 @@ public class RetryHelper {
return retryerBuilder
.withStopStrategy(
StopStrategies.stopAfterDelay(
firstNonNull(opts.timeout(), defaultTimeout).toMillis(), MILLISECONDS))
firstNonNull(opts.timeout(), getDefaultTimeout(actionType)).toMillis(),
MILLISECONDS))
.withWaitStrategy(waitStrategy);
}