Don't abort retries if we're over the rate limit
Signed-off-by: Dimitar Dimitrov <[email protected]>
dimitarvdimitrov committed Jan 9, 2025
1 parent f19496b commit 93cf6a1
Showing 2 changed files with 13 additions and 38 deletions.
23 changes: 13 additions & 10 deletions pkg/ruler/remotequerier.go
@@ -356,18 +356,21 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPRequest
         if !retry.Ongoing() {
             return nil, err
         }
-        retry.Wait()

-        // The backoff wait spreads the retries. We check the rate limiter only after that spread.
-        // That should give queries a bigger chance at passing through the rate limiter
-        // in cases of short error bursts and the rate of failed queries being larger than the rate limit.
-        if !q.retryLimiter.Allow() {
-            return nil, fmt.Errorf("exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate); last error was: %w", err)
+        retryReservation := q.retryLimiter.Reserve()
+        if !retryReservation.OK() {
+            // This should only happen if we've misconfigured the limiter.
+            return nil, fmt.Errorf("couldn't reserve a retry token")
         }
-        level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err)
-
-        // Avoid masking last known error if context was cancelled while waiting.
-        if ctx.Err() != nil {
+        // We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
+        // All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
+        retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+        level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
+
+        select {
+        case <-time.After(retryDelay):
+        case <-ctx.Done():
+            // Avoid masking last known error if context was cancelled while waiting.
             return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err)
         }
     }
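Note on the new retry path: q.retryLimiter exposes Allow(), Reserve(), OK(), and Delay(), which matches the token-bucket Limiter from golang.org/x/time/rate. The change swaps Allow(), which dropped the retry once the budget was spent, for Reserve(), which instead reports how long to wait for the next token. The sketch below runs three back-to-back "retries" against a small limiter so the computed delay visibly grows; the limiter rate, burst, backoffDelay, and maxBackoff values are assumptions for illustration only and are not the ruler's real configuration.

package main

import (
    "fmt"
    "time"

    "golang.org/x/time/rate"
)

func main() {
    // Illustrative values only: one retry token per second with a burst of 1,
    // a 200ms backoff suggestion, and a 3s cap standing in for the max backoff.
    limiter := rate.NewLimiter(rate.Limit(1), 1)
    backoffDelay := 200 * time.Millisecond
    maxBackoff := 3 * time.Second

    for attempt := 1; attempt <= 3; attempt++ {
        // Unlike Allow(), Reserve() does not refuse a request when the bucket is
        // empty; it returns a reservation whose Delay() says how long to wait.
        reservation := limiter.Reserve()
        if !reservation.OK() {
            // Only possible if the limiter can never grant a token (e.g. a burst of 0).
            fmt.Println("couldn't reserve a retry token")
            return
        }

        // Wait at least the suggested backoff; honour the limiter's delay too,
        // but cap that part at maxBackoff. min and max are the Go 1.21+ builtins,
        // as used in the diff above.
        retryDelay := max(backoffDelay, min(maxBackoff, reservation.Delay()))
        fmt.Printf("attempt %d: sleeping %s before retrying\n", attempt, retryDelay)
        time.Sleep(retryDelay)
    }
}

With Allow(), calls made after the budget is spent fail outright; with Reserve(), they simply observe a growing Delay() that the caller sleeps through (capped at maxBackoff), which is the behaviour change the commit title describes.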
28 changes: 0 additions & 28 deletions pkg/ruler/remotequerier_test.go
@@ -299,34 +299,6 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
         })
     }
 }
-func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) {
-    ctx := context.Background()
-    attempts := &atomic.Int64{}
-
-    // Mock client that always returns a 500 error to trigger retries
-    mockClientFn := func(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
-        attempts.Add(1)
-        return nil, httpgrpc.Errorf(http.StatusInternalServerError, "some error")
-    }
-
-    // Create querier with very low retry rate to trigger budget exhaustion quickly
-    q := NewRemoteQuerier(
-        mockHTTPGRPCClient(mockClientFn),
-        time.Minute, // timeout
-        0.0001,      // retry rate
-        formatJSON,
-        "/prometheus",
-        log.NewNopLogger(),
-    )
-
-    _, err := q.Query(ctx, "test_query", time.Now())
-
-    require.Error(t, err)
-    require.ErrorContains(t, err, "exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate)")
-
-    // Should have attempted at least one retry before exhausting budget
-    require.GreaterOrEqual(t, attempts.Load(), int64(2))
-}

 func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
     scenarios := map[string]struct {
