RetryHelper: Add metrics for attempt counts and failures
There is currently no way to inspect metrics from acceptance tests, which use DisabledMetricMaker, so this had to be tested manually. To do this, I used a test NoteDb site with the example-consoleMetricReporter plugin. First, I made a single update and verified that the only observation in the attempt count histogram was 1 attempt: batch_update/retry_attempt_counts count = 1 min = 1 max = 1 Then, I added a hack in PostReview[1] that throws LockFailureException every other attempt, with a long series after 10 attempts. This showed at least 2 attempts for every RetryHelper invocation, and a single timeout: batch_update/retry_timeout_count count = 1 batch_update/retry_attempt_counts count = 6 min = 2 max = 16 [1] https://gerrit-review.googlesource.com/122630 Change-Id: If0f42acac42da19cb2f9a64320816bc3fedfbed6
This commit is contained in:
parent
974c81db02
commit
44f346a9a1
@ -86,6 +86,10 @@ topic submissions that concluded successfully.
|
||||
|
||||
* `batch_update/execute_change_ops`: BatchUpdate change update latency,
|
||||
excluding reindexing
|
||||
* `batch_update/retry_attempt_counts`: Distribution of number of attempts made
|
||||
by RetryHelper (1 == single attempt, no retry)
|
||||
* `batch_update/retry_timeout_count`: Number of executions of RetryHelper that
|
||||
ultimately timed out
|
||||
|
||||
=== NoteDb
|
||||
|
||||
|
@ -18,6 +18,7 @@ import static com.google.common.base.MoreObjects.firstNonNull;
|
||||
import static java.util.concurrent.TimeUnit.MILLISECONDS;
|
||||
import static java.util.concurrent.TimeUnit.SECONDS;
|
||||
|
||||
import com.github.rholder.retry.Attempt;
|
||||
import com.github.rholder.retry.RetryException;
|
||||
import com.github.rholder.retry.RetryListener;
|
||||
import com.github.rholder.retry.RetryerBuilder;
|
||||
@ -28,6 +29,10 @@ import com.google.auto.value.AutoValue;
|
||||
import com.google.common.base.Throwables;
|
||||
import com.google.gerrit.common.Nullable;
|
||||
import com.google.gerrit.extensions.restapi.RestApiException;
|
||||
import com.google.gerrit.metrics.Counter0;
|
||||
import com.google.gerrit.metrics.Description;
|
||||
import com.google.gerrit.metrics.Histogram0;
|
||||
import com.google.gerrit.metrics.MetricMaker;
|
||||
import com.google.gerrit.server.config.GerritServerConfig;
|
||||
import com.google.gerrit.server.git.LockFailureException;
|
||||
import com.google.gerrit.server.notedb.NotesMigration;
|
||||
@ -75,6 +80,30 @@ public class RetryHelper {
|
||||
}
|
||||
}
|
||||
|
||||
@Singleton
|
||||
private static class Metrics {
|
||||
final Histogram0 attemptCounts;
|
||||
final Counter0 timeoutCount;
|
||||
|
||||
@Inject
|
||||
Metrics(MetricMaker metricMaker) {
|
||||
attemptCounts =
|
||||
metricMaker.newHistogram(
|
||||
"batch_update/retry_attempt_counts",
|
||||
new Description(
|
||||
"Distribution of number of attempts made by RetryHelper"
|
||||
+ " (1 == single attempt, no retry)")
|
||||
.setCumulative()
|
||||
.setUnit("attempts"));
|
||||
timeoutCount =
|
||||
metricMaker.newCounter(
|
||||
"batch_update/retry_timeout_count",
|
||||
new Description("Number of executions of RetryHelper that ultimately timed out")
|
||||
.setCumulative()
|
||||
.setUnit("timeouts"));
|
||||
}
|
||||
}
|
||||
|
||||
public static Options.Builder options() {
|
||||
return new AutoValue_RetryHelper_Options.Builder();
|
||||
}
|
||||
@ -84,6 +113,7 @@ public class RetryHelper {
|
||||
}
|
||||
|
||||
private final NotesMigration migration;
|
||||
private final Metrics metrics;
|
||||
private final BatchUpdate.Factory updateFactory;
|
||||
private final Duration defaultTimeout;
|
||||
private final WaitStrategy waitStrategy;
|
||||
@ -91,9 +121,11 @@ public class RetryHelper {
|
||||
@Inject
|
||||
RetryHelper(
|
||||
@GerritServerConfig Config cfg,
|
||||
Metrics metrics,
|
||||
NotesMigration migration,
|
||||
ReviewDbBatchUpdate.AssistedFactory reviewDbBatchUpdateFactory,
|
||||
NoteDbBatchUpdate.AssistedFactory noteDbBatchUpdateFactory) {
|
||||
this.metrics = metrics;
|
||||
this.migration = migration;
|
||||
this.updateFactory =
|
||||
new BatchUpdate.Factory(migration, reviewDbBatchUpdateFactory, noteDbBatchUpdateFactory);
|
||||
@ -117,18 +149,18 @@ public class RetryHelper {
|
||||
}
|
||||
|
||||
public <T> T execute(Action<T> action, Options opts) throws RestApiException, UpdateException {
|
||||
MetricListener listener = null;
|
||||
try {
|
||||
RetryerBuilder<T> builder = RetryerBuilder.newBuilder();
|
||||
if (migration.disableChangeReviewDb()) {
|
||||
listener = new MetricListener(opts.listener());
|
||||
builder
|
||||
.withRetryListener(listener)
|
||||
.withStopStrategy(
|
||||
StopStrategies.stopAfterDelay(
|
||||
firstNonNull(opts.timeout(), defaultTimeout).toMillis(), MILLISECONDS))
|
||||
.withWaitStrategy(waitStrategy)
|
||||
.retryIfException(RetryHelper::isLockFailure);
|
||||
if (opts.listener() != null) {
|
||||
builder.withRetryListener(opts.listener());
|
||||
}
|
||||
} else {
|
||||
// Either we aren't full-NoteDb, or the underlying ref storage doesn't support atomic
|
||||
// transactions. Either way, retrying a partially-failed operation is not idempotent, so
|
||||
@ -136,11 +168,18 @@ public class RetryHelper {
|
||||
}
|
||||
return builder.build().call(() -> action.call(updateFactory));
|
||||
} catch (ExecutionException | RetryException e) {
|
||||
if (e instanceof RetryException) {
|
||||
metrics.timeoutCount.increment();
|
||||
}
|
||||
if (e.getCause() != null) {
|
||||
Throwables.throwIfInstanceOf(e.getCause(), UpdateException.class);
|
||||
Throwables.throwIfInstanceOf(e.getCause(), RestApiException.class);
|
||||
}
|
||||
throw new UpdateException(e);
|
||||
} finally {
|
||||
if (listener != null) {
|
||||
metrics.attemptCounts.record(listener.getAttemptCount());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -150,4 +189,26 @@ public class RetryHelper {
|
||||
}
|
||||
return t instanceof LockFailureException;
|
||||
}
|
||||
|
||||
private static class MetricListener implements RetryListener {
|
||||
private final RetryListener delegate;
|
||||
private long attemptCount;
|
||||
|
||||
MetricListener(@Nullable RetryListener delegate) {
|
||||
this.delegate = delegate;
|
||||
attemptCount = 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public <V> void onRetry(Attempt<V> attempt) {
|
||||
attemptCount = attempt.getAttemptNumber();
|
||||
if (delegate != null) {
|
||||
delegate.onRetry(attempt);
|
||||
}
|
||||
}
|
||||
|
||||
long getAttemptCount() {
|
||||
return attemptCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user