RetryHelper: Add metrics for attempt counts and failures

There is currently no way to inspect metrics from acceptance tests,
which use DisabledMetricMaker, so this had to be tested manually. To do
this, I used a test NoteDb site with the example-consoleMetricReporter
plugin.
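
Since DisabledMetricMaker discards every observation, there is nothing for a
test to assert on. Purely as a hypothetical sketch (not part of this change),
a recording MetricMaker along the following lines would let acceptance tests
capture histogram values; the Histogram0 methods overridden here are
assumptions, inferred from how this change uses the API:

import com.google.gerrit.metrics.Description;
import com.google.gerrit.metrics.DisabledMetricMaker;
import com.google.gerrit.metrics.Histogram0;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical test helper, not part of this change: capture histogram
// observations instead of discarding them like DisabledMetricMaker does.
class RecordingMetricMaker extends DisabledMetricMaker {
  final Map<String, List<Long>> histograms = new HashMap<>();

  @Override
  public Histogram0 newHistogram(String name, Description desc) {
    List<Long> values = histograms.computeIfAbsent(name, k -> new ArrayList<>());
    return new Histogram0() {
      @Override
      public void record(long value) {
        values.add(value); // keep each observation for later assertions
      }

      @Override
      public void remove() {} // assumed no-op unregistration hook
    };
  }
}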

First, I made a single update and verified that the only observation in
the attempt count histogram was 1 attempt:

batch_update/retry_attempt_counts
             count = 1
               min = 1
               max = 1

Then, I added a hack in PostReview[1] that throws LockFailureException
every other attempt, with a long series of failures after 10 attempts. This showed
at least 2 attempts for every RetryHelper invocation, and a single
timeout:

batch_update/retry_timeout_count
             count = 1
batch_update/retry_attempt_counts
             count = 6
               min = 2
               max = 16

[1] https://gerrit-review.googlesource.com/122630
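
For reference, a hypothetical reconstruction of what such a hack could look
like (the real one is in the linked review; the class name, method name, and
the LockFailureException constructor used here are assumptions):

import com.google.gerrit.server.git.LockFailureException;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical fault injector matching the behavior described above: fail
// every other attempt, and fail unconditionally once more than 10 attempts
// have been made, so the retryer's stop strategy eventually gives up and
// records the single timeout shown in retry_timeout_count.
class RetryFaultInjector {
  private static final AtomicInteger ATTEMPTS = new AtomicInteger();

  static void maybeFail() throws LockFailureException {
    int n = ATTEMPTS.incrementAndGet();
    if (n > 10 || n % 2 == 1) {
      // Constructor signature is an assumption for illustration.
      throw new LockFailureException("injected failure, attempt " + n);
    }
  }
}

Called at the top of the PostReview update op, this makes the first attempt of
every invocation fail and the retry succeed, matching the minimum of 2
attempts observed above.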

Change-Id: If0f42acac42da19cb2f9a64320816bc3fedfbed6
Author: Dave Borowitz
Date: 2017-08-23 13:16:33 -04:00
Commit: 44f346a9a1 (parent: 974c81db02)
2 changed files with 68 additions and 3 deletions

Documentation/metrics.txt

@@ -86,6 +86,10 @@ topic submissions that concluded successfully.
 * `batch_update/execute_change_ops`: BatchUpdate change update latency,
   excluding reindexing
+* `batch_update/retry_attempt_counts`: Distribution of number of attempts made
+  by RetryHelper (1 == single attempt, no retry)
+* `batch_update/retry_timeout_count`: Number of executions of RetryHelper that
+  ultimately timed out
 
 === NoteDb

gerrit-server/src/main/java/com/google/gerrit/server/update/RetryHelper.java

@@ -18,6 +18,7 @@ import static com.google.common.base.MoreObjects.firstNonNull;
 import static java.util.concurrent.TimeUnit.MILLISECONDS;
 import static java.util.concurrent.TimeUnit.SECONDS;
 
+import com.github.rholder.retry.Attempt;
 import com.github.rholder.retry.RetryException;
 import com.github.rholder.retry.RetryListener;
 import com.github.rholder.retry.RetryerBuilder;
@@ -28,6 +29,10 @@ import com.google.auto.value.AutoValue;
 import com.google.common.base.Throwables;
 import com.google.gerrit.common.Nullable;
 import com.google.gerrit.extensions.restapi.RestApiException;
+import com.google.gerrit.metrics.Counter0;
+import com.google.gerrit.metrics.Description;
+import com.google.gerrit.metrics.Histogram0;
+import com.google.gerrit.metrics.MetricMaker;
 import com.google.gerrit.server.config.GerritServerConfig;
 import com.google.gerrit.server.git.LockFailureException;
 import com.google.gerrit.server.notedb.NotesMigration;
@@ -75,6 +80,30 @@ public class RetryHelper {
     }
   }
 
+  @Singleton
+  private static class Metrics {
+    final Histogram0 attemptCounts;
+    final Counter0 timeoutCount;
+
+    @Inject
+    Metrics(MetricMaker metricMaker) {
+      attemptCounts =
+          metricMaker.newHistogram(
+              "batch_update/retry_attempt_counts",
+              new Description(
+                      "Distribution of number of attempts made by RetryHelper"
+                          + " (1 == single attempt, no retry)")
+                  .setCumulative()
+                  .setUnit("attempts"));
+      timeoutCount =
+          metricMaker.newCounter(
+              "batch_update/retry_timeout_count",
+              new Description("Number of executions of RetryHelper that ultimately timed out")
+                  .setCumulative()
+                  .setUnit("timeouts"));
+    }
+  }
+
   public static Options.Builder options() {
     return new AutoValue_RetryHelper_Options.Builder();
   }
@@ -84,6 +113,7 @@
   }
 
   private final NotesMigration migration;
+  private final Metrics metrics;
   private final BatchUpdate.Factory updateFactory;
   private final Duration defaultTimeout;
   private final WaitStrategy waitStrategy;
@@ -91,9 +121,11 @@
   @Inject
   RetryHelper(
       @GerritServerConfig Config cfg,
+      Metrics metrics,
       NotesMigration migration,
       ReviewDbBatchUpdate.AssistedFactory reviewDbBatchUpdateFactory,
       NoteDbBatchUpdate.AssistedFactory noteDbBatchUpdateFactory) {
+    this.metrics = metrics;
     this.migration = migration;
     this.updateFactory =
         new BatchUpdate.Factory(migration, reviewDbBatchUpdateFactory, noteDbBatchUpdateFactory);
@@ -117,18 +149,18 @@
   }
 
   public <T> T execute(Action<T> action, Options opts) throws RestApiException, UpdateException {
+    MetricListener listener = null;
     try {
       RetryerBuilder<T> builder = RetryerBuilder.newBuilder();
       if (migration.disableChangeReviewDb()) {
+        listener = new MetricListener(opts.listener());
         builder
+            .withRetryListener(listener)
            .withStopStrategy(
                StopStrategies.stopAfterDelay(
                    firstNonNull(opts.timeout(), defaultTimeout).toMillis(), MILLISECONDS))
            .withWaitStrategy(waitStrategy)
            .retryIfException(RetryHelper::isLockFailure);
-        if (opts.listener() != null) {
-          builder.withRetryListener(opts.listener());
-        }
       } else {
         // Either we aren't full-NoteDb, or the underlying ref storage doesn't support atomic
         // transactions. Either way, retrying a partially-failed operation is not idempotent, so
@@ -136,11 +168,18 @@
       }
 
       return builder.build().call(() -> action.call(updateFactory));
     } catch (ExecutionException | RetryException e) {
+      if (e instanceof RetryException) {
+        metrics.timeoutCount.increment();
+      }
       if (e.getCause() != null) {
         Throwables.throwIfInstanceOf(e.getCause(), UpdateException.class);
         Throwables.throwIfInstanceOf(e.getCause(), RestApiException.class);
       }
       throw new UpdateException(e);
+    } finally {
+      if (listener != null) {
+        metrics.attemptCounts.record(listener.getAttemptCount());
+      }
     }
   }
@@ -150,4 +189,26 @@
     }
     return t instanceof LockFailureException;
   }
+
+  private static class MetricListener implements RetryListener {
+    private final RetryListener delegate;
+    private long attemptCount;
+
+    MetricListener(@Nullable RetryListener delegate) {
+      this.delegate = delegate;
+      attemptCount = 1;
+    }
+
+    @Override
+    public <V> void onRetry(Attempt<V> attempt) {
+      attemptCount = attempt.getAttemptNumber();
+      if (delegate != null) {
+        delegate.onRetry(attempt);
+      }
+    }
+
+    long getAttemptCount() {
+      return attemptCount;
+    }
+  }
 }
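
For context, the guava-retrying behavior MetricListener depends on: the
RetryListener fires after every attempt, including the final successful one,
so the last attempt number it sees equals the total attempt count (hence the
histogram records 1 for a call that never retries). A minimal standalone
sketch, assuming guava-retrying 2.0 (class and variable names are
illustrative):

import com.github.rholder.retry.Attempt;
import com.github.rholder.retry.RetryListener;
import com.github.rholder.retry.Retryer;
import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

public class RetryListenerDemo {
  public static void main(String[] args) throws Exception {
    AtomicLong attempts = new AtomicLong();
    Retryer<String> retryer =
        RetryerBuilder.<String>newBuilder()
            .retryIfExceptionOfType(IOException.class)
            .withStopStrategy(StopStrategies.stopAfterDelay(30, TimeUnit.SECONDS))
            .withRetryListener(
                new RetryListener() {
                  @Override
                  public <V> void onRetry(Attempt<V> attempt) {
                    // Fires after every attempt, successful or not.
                    attempts.set(attempt.getAttemptNumber());
                  }
                })
            .build();

    // Fails once, then succeeds: the listener observes attempts 1 and 2.
    AtomicLong calls = new AtomicLong();
    String result =
        retryer.call(
            () -> {
              if (calls.incrementAndGet() == 1) {
                throw new IOException("first attempt fails");
              }
              return "ok";
            });
    System.out.println(result + " after " + attempts.get() + " attempts");
  }
}

Running this prints "ok after 2 attempts": the first call throws a retryable
exception, the second succeeds, and the listener's final observation is the
total attempt count, which is exactly what execute() records in the finally
block above.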