RetryHelper: Make time limits configurable

Use an arbitrarily high limit during NoteDbOnlyIT, where we explicitly
want retries to happen, in order to avoid spurious timeouts caused by
scheduling delays on overloaded test machines. For other tests,
LockFailureExceptions generally shouldn't crop up during updates anyway,
and if they do, we would rather the updates time out in finite amount of
time so we can see the resulting stack trace.

Change-Id: I5bad32749b994e6473176cdd45cc80d9d4e195eb
This commit is contained in:
Dave Borowitz
2017-05-16 18:48:41 -04:00
parent c08066c1e6
commit 2b87d7f862
3 changed files with 63 additions and 7 deletions

View File

@@ -146,3 +146,34 @@ Here, the migration process looks like:
- Run a Flume to migrate all existing changes to NoteDb primary. (Also - Run a Flume to migrate all existing changes to NoteDb primary. (Also
closed-source, but basically just a wrapper around `PrimaryStorageMigrator`.) closed-source, but basically just a wrapper around `PrimaryStorageMigrator`.)
- Turn off access to ReviewDb changes tables. - Turn off access to ReviewDb changes tables.
== Configuration
This section describes general configuration for the NoteDb backend that is not
directly related to the migration process. These options will continue to be
supported after the migration is complete and the migration-related options are
obsolete.
[[noteDb.retryMaxWait]]noteDb.retryMaxWait::
+
Maximum time to wait between attempts to retry update operations when one
attempt fails due to contention (aka lock failure) on the underlying ref
storage. Operations are retried with exponential backoff, plus some random
jitter, until the interval reaches this limit. After that, retries continue to
occur after a fixed timeout (plus jitter), up to
link:#noteDb.retryTimeout[`noteDb.retryTimeout`].
+
Only applicable when `noteDb.changes.fuseUpdates=true`.
+
Defaults to 5 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.
[[noteDb.retryTimeout]]noteDb.retryTimeout::
+
Total timeout for retrying update operations when one attempt fails due to
contention (aka lock failure) on the underlying ref storage.
+
Only applicable when `noteDb.changes.fuseUpdates=true`.
+
Defaults to 20 seconds; unit suffixes are supported, and assumes milliseconds if
not specified.

View File

@@ -36,6 +36,7 @@ import com.google.gerrit.server.update.BatchUpdateOp;
import com.google.gerrit.server.update.ChangeContext; import com.google.gerrit.server.update.ChangeContext;
import com.google.gerrit.server.update.RepoContext; import com.google.gerrit.server.update.RepoContext;
import com.google.gerrit.server.update.RetryHelper; import com.google.gerrit.server.update.RetryHelper;
import com.google.gerrit.testutil.ConfigSuite;
import com.google.inject.Inject; import com.google.inject.Inject;
import java.io.IOException; import java.io.IOException;
import java.util.Collections; import java.util.Collections;
@@ -45,6 +46,7 @@ import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.eclipse.jgit.internal.storage.dfs.InMemoryRepository; import org.eclipse.jgit.internal.storage.dfs.InMemoryRepository;
import org.eclipse.jgit.lib.CommitBuilder; import org.eclipse.jgit.lib.CommitBuilder;
import org.eclipse.jgit.lib.Config;
import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.ObjectId; import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectInserter; import org.eclipse.jgit.lib.ObjectInserter;
@@ -58,6 +60,14 @@ import org.junit.Before;
import org.junit.Test; import org.junit.Test;
public class NoteDbOnlyIT extends AbstractDaemonTest { public class NoteDbOnlyIT extends AbstractDaemonTest {
@ConfigSuite.Default
public static Config defaultConfig() {
Config cfg = new Config();
// Avoid spurious timeouts during intentional retries due to overloaded test machines.
cfg.setString("noteDb", null, "retryTimeout", Integer.MAX_VALUE + "s");
return cfg;
}
@Inject private RetryHelper retryHelper; @Inject private RetryHelper retryHelper;
@Before @Before

View File

@@ -14,18 +14,24 @@
package com.google.gerrit.server.update; package com.google.gerrit.server.update;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import com.github.rholder.retry.RetryException; import com.github.rholder.retry.RetryException;
import com.github.rholder.retry.RetryerBuilder; import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies; import com.github.rholder.retry.StopStrategies;
import com.github.rholder.retry.StopStrategy;
import com.github.rholder.retry.WaitStrategies; import com.github.rholder.retry.WaitStrategies;
import com.github.rholder.retry.WaitStrategy;
import com.google.common.base.Throwables; import com.google.common.base.Throwables;
import com.google.gerrit.extensions.restapi.RestApiException; import com.google.gerrit.extensions.restapi.RestApiException;
import com.google.gerrit.server.config.GerritServerConfig;
import com.google.gerrit.server.git.LockFailureException; import com.google.gerrit.server.git.LockFailureException;
import com.google.gerrit.server.notedb.NotesMigration; import com.google.gerrit.server.notedb.NotesMigration;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit; import org.eclipse.jgit.lib.Config;
@Singleton @Singleton
public class RetryHelper { public class RetryHelper {
@@ -35,9 +41,12 @@ public class RetryHelper {
private final NotesMigration migration; private final NotesMigration migration;
private final BatchUpdate.Factory updateFactory; private final BatchUpdate.Factory updateFactory;
private final StopStrategy stopStrategy;
private final WaitStrategy waitStrategy;
@Inject @Inject
RetryHelper( RetryHelper(
@GerritServerConfig Config cfg,
NotesMigration migration, NotesMigration migration,
ReviewDbBatchUpdate.AssistedFactory reviewDbBatchUpdateFactory, ReviewDbBatchUpdate.AssistedFactory reviewDbBatchUpdateFactory,
FusedNoteDbBatchUpdate.AssistedFactory fusedNoteDbBatchUpdateFactory, FusedNoteDbBatchUpdate.AssistedFactory fusedNoteDbBatchUpdateFactory,
@@ -49,19 +58,25 @@ public class RetryHelper {
reviewDbBatchUpdateFactory, reviewDbBatchUpdateFactory,
fusedNoteDbBatchUpdateFactory, fusedNoteDbBatchUpdateFactory,
unfusedNoteDbBatchUpdateFactory); unfusedNoteDbBatchUpdateFactory);
this.stopStrategy =
StopStrategies.stopAfterDelay(
cfg.getTimeUnit("noteDb", null, "retryTimeout", SECONDS.toMillis(5), MILLISECONDS),
MILLISECONDS);
this.waitStrategy =
WaitStrategies.join(
WaitStrategies.exponentialWait(
cfg.getTimeUnit("noteDb", null, "retryMaxWait", SECONDS.toMillis(20), MILLISECONDS),
MILLISECONDS),
WaitStrategies.randomWait(50, MILLISECONDS));
} }
public <T> T execute(Action<T> action) throws RestApiException, UpdateException { public <T> T execute(Action<T> action) throws RestApiException, UpdateException {
try { try {
RetryerBuilder<T> builder = RetryerBuilder.newBuilder(); RetryerBuilder<T> builder = RetryerBuilder.newBuilder();
if (migration.disableChangeReviewDb() && migration.fuseUpdates()) { if (migration.disableChangeReviewDb() && migration.fuseUpdates()) {
// TODO(dborowitz): Make configurable.
builder builder
.withStopStrategy(StopStrategies.stopAfterDelay(20, TimeUnit.SECONDS)) .withStopStrategy(stopStrategy)
.withWaitStrategy( .withWaitStrategy(waitStrategy)
WaitStrategies.join(
WaitStrategies.exponentialWait(5, TimeUnit.SECONDS),
WaitStrategies.randomWait(50, TimeUnit.MILLISECONDS)))
.retryIfException(RetryHelper::isLockFailure); .retryIfException(RetryHelper::isLockFailure);
} else { } else {
// Either we aren't full-NoteDb, or the underlying ref storage doesn't support atomic // Either we aren't full-NoteDb, or the underlying ref storage doesn't support atomic