diff --git a/CHANGELOG.md b/CHANGELOG.md
index 277755851d0..b7312e580c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@
 * [CHANGE] Querier: The `.` pattern in regular expressions in PromQL matches newline characters. With this change regular expressions like `.*` match strings that include `\n`. To maintain the old behaviour, you will have to change regular expressions by replacing all `.` patterns with `[^\n]`, e.g. `foo[^\n]*`. This upgrades PromQL compatibility from Prometheus 2.0 to 3.0. #9844
 * [CHANGE] Querier: Lookback and range selectors are left open and right closed (previously left closed and right closed). This change affects queries when the evaluation time perfectly aligns with the sample timestamps. For example assume querying a timeseries with evenly spaced samples exactly 1 minute apart. Previously, a range query with `5m` would usually return 5 samples, or 6 samples if the query evaluation aligns perfectly with a scrape. Now, queries like this will always return 5 samples. This upgrades PromQL compatibility from Prometheus 2.0 to 3.0. #9844
 * [CHANGE] Querier: promql(native histograms): Introduce exponential interpolation. #9844
+* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375
 * [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #10067
 * [FEATURE] Distributor: Add support for `lz4` OTLP compression. #9763
 * [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient responses to be cached. When set to `true` error responses from hitting limits or bad data are cached for a short TTL. #9028
diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index 7ee589d29ca..0d2a2ab6b0c 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -13237,6 +13237,16 @@
           "fieldDefaultValue": "protobuf",
           "fieldFlag": "ruler.query-frontend.query-result-response-format",
           "fieldType": "string"
+        },
+        {
+          "kind": "field",
+          "name": "max_retries_rate",
+          "required": false,
+          "desc": "Maximum number of retries for failed queries per second.",
+          "fieldValue": null,
+          "fieldDefaultValue": 170,
+          "fieldFlag": "ruler.query-frontend.max-retries-rate",
+          "fieldType": "float"
         }
       ],
       "fieldValue": null,
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index bcf39c1bf34..9b1a2a524df 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -2989,6 +2989,8 @@ Usage of ./cmd/mimir/mimir:
       Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13
   -ruler.query-frontend.grpc-client-config.tls-server-name string
       Override the expected name on the server certificate.
+  -ruler.query-frontend.max-retries-rate float
+      Maximum number of retries for failed queries per second. (default 170)
   -ruler.query-frontend.query-result-response-format string
       Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf")
   -ruler.query-stats-enabled
diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl
index 45bf1322da6..07e9fd78019 100644
--- a/cmd/mimir/help.txt.tmpl
+++ b/cmd/mimir/help.txt.tmpl
@@ -739,6 +739,8 @@ Usage of ./cmd/mimir/mimir:
      Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {})
   -ruler.query-frontend.address string
      GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing.
+  -ruler.query-frontend.max-retries-rate float
+     Maximum number of retries for failed queries per second. (default 170)
   -ruler.query-frontend.query-result-response-format string
      Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf")
   -ruler.recording-rules-evaluation-enabled
diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
index 67325aab69d..20df3e87660 100644
--- a/docs/sources/mimir/configure/configuration-parameters/index.md
+++ b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -2114,6 +2114,10 @@ query_frontend:
   # CLI flag: -ruler.query-frontend.query-result-response-format
   [query_result_response_format: <string> | default = "protobuf"]
 
+  # Maximum number of retries for failed queries per second.
+  # CLI flag: -ruler.query-frontend.max-retries-rate
+  [max_retries_rate: <float> | default = 170]
+
 tenant_federation:
   # Enable rule groups to query against multiple tenants. The tenant IDs
   # involved need to be in the rule group's 'source_tenants' field. If this flag
diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go
index c66d15c6474..6b91735f996 100644
--- a/pkg/mimir/modules.go
+++ b/pkg/mimir/modules.go
@@ -852,7 +852,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) {
 		if err != nil {
 			return nil, err
 		}
-		remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware)
+		remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.MaxRetriesRate, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware)
 
 		embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient(
 			remoteQuerier,
diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 3a25145c960..f02eb4d8669 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -31,6 +31,7 @@ import (
 	"github.com/prometheus/prometheus/storage"
 	"github.com/prometheus/prometheus/storage/remote"
 	"golang.org/x/exp/slices"
+	"golang.org/x/time/rate"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 
@@ -67,6 +68,8 @@ type QueryFrontendConfig struct {
 	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."`
 
 	QueryResultResponseFormat string `yaml:"query_result_response_format"`
+
+	MaxRetriesRate float64 `yaml:"max_retries_rate"`
 }
 
 func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) {
@@ -80,6 +83,7 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) {
 	c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f)
 
 	f.StringVar(&c.QueryResultResponseFormat, "ruler.query-frontend.query-result-response-format", formatProtobuf, fmt.Sprintf("Format to use when retrieving query results from query-frontends. Supported values: %s", strings.Join(allFormats, ", ")))
+	f.Float64Var(&c.MaxRetriesRate, "ruler.query-frontend.max-retries-rate", 170, "Maximum number of retries for failed queries per second.")
 }
 
 func (c *QueryFrontendConfig) Validate() error {
@@ -115,6 +119,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error
 // RemoteQuerier executes read operations against a httpgrpc.HTTPClient.
 type RemoteQuerier struct {
 	client         httpgrpc.HTTPClient
+	retryLimiter   *rate.Limiter
 	timeout        time.Duration
 	middlewares    []Middleware
 	promHTTPPrefix string
@@ -130,6 +135,7 @@ var protobufDecoderInstance = protobufDecoder{}
 func NewRemoteQuerier(
 	client httpgrpc.HTTPClient,
 	timeout time.Duration,
+	maxRetryRate float64, // maxRetryRate is the maximum number of retries for failed queries per second.
 	preferredQueryResultResponseFormat string,
 	prometheusHTTPPrefix string,
 	logger log.Logger,
@@ -138,6 +144,7 @@
 	return &RemoteQuerier{
 		client:         client,
 		timeout:        timeout,
+		retryLimiter:   rate.NewLimiter(rate.Limit(maxRetryRate), 1),
 		middlewares:    middlewares,
 		promHTTPPrefix: prometheusHTTPPrefix,
 		logger:         logger,
@@ -349,12 +356,22 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		if !retry.Ongoing() {
 			return nil, err
 		}
-		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err)
-		retry.Wait()
-
-		// Avoid masking last known error if context was cancelled while waiting.
-		if ctx.Err() != nil {
-			return nil, fmt.Errorf("%s while retrying request, last err was: %w", ctx.Err(), err)
+		retryReservation := q.retryLimiter.Reserve()
+		if !retryReservation.OK() {
+			// This should only happen if we've misconfigured the limiter.
+			return nil, fmt.Errorf("couldn't reserve a retry token")
+		}
+		// We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
+		// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
+		retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
+		select {
+		case <-time.After(retryDelay):
+		case <-ctx.Done():
+			retryReservation.Cancel()
+			// Avoid masking last known error if context was cancelled while waiting.
+			return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err)
 		}
 	}
 }
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go
index 1105bb77093..39b7df17755 100644
--- a/pkg/ruler/remotequerier_test.go
+++ b/pkg/ruler/remotequerier_test.go
@@ -64,7 +64,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should issue a remote read request", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
@@ -76,7 +76,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
@@ -86,7 +86,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Read(ctx, &prompb.Query{}, false)
@@ -101,7 +101,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	_, err := q.Read(context.Background(), &prompb.Query{}, false)
 	require.Error(t, err)
@@ -139,7 +139,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 		t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) {
 			client, inReq := setup()
 
-			q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(client, time.Minute, 1, format, "/prometheus", log.NewNopLogger())
 
 			_, err := q.Query(context.Background(), "qs", tm)
 			require.NoError(t, err)
@@ -165,7 +165,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Query(context.Background(), "qs", tm)
 		require.NoError(t, err)
@@ -175,7 +175,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Query(ctx, "qs", tm)
@@ -276,7 +276,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 				}
 				return testCase.response, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 			require.Equal(t, int64(0), count.Load())
 			_, err := q.Query(ctx, "qs", time.Now())
 			if testCase.err == nil {
@@ -284,12 +284,12 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 					require.NoError(t, err)
 				} else {
 					require.Error(t, err)
-					require.EqualError(t, err, testCase.expectedError.Error())
+					require.ErrorContains(t, err, testCase.expectedError.Error())
 				}
 				require.Equal(t, int64(1), count.Load())
 			} else {
 				require.Error(t, err)
-				require.EqualError(t, err, testCase.expectedError.Error())
+				require.ErrorContains(t, err, testCase.expectedError.Error())
 				if testCase.expectedRetries {
 					require.Greater(t, count.Load(), int64(1))
 				} else {
@@ -405,7 +405,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
 					Body: []byte(scenario.body),
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -678,7 +678,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) {
 					Body: b,
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatProtobuf, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -701,7 +701,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) {
 			Body: []byte("some body content"),
 		}, nil
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -713,7 +713,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -771,7 +771,7 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
 				return testCase.resp, testCase.err
 			}
 			logger := newLoggerWithCounter()
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger)
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", logger)
 			tm := time.Unix(1649092025, 515834)
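
For readers who want the retry-throttling idea in isolation: the sketch below reproduces the pattern the patch applies in `sendRequest`, combining a shared retry rate limit (`rate.NewLimiter(rate.Limit(maxRetriesRate), 1)`, default 170/sec) with an existing per-request backoff. It is a minimal, self-contained approximation, not the Mimir code itself: `waitBeforeRetry`, `backoffDelay`, and `maxBackoff` are hypothetical stand-ins for the ruler's `retry.NextDelay()` and `retryConfig.MaxBackoff`.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

// waitBeforeRetry sleeps before the next retry attempt. It waits at least
// backoffDelay, and if the shared limiter requires a longer pause, it waits
// for that instead, but never longer than maxBackoff. The context cancels
// the wait early.
func waitBeforeRetry(ctx context.Context, limiter *rate.Limiter, backoffDelay, maxBackoff time.Duration) error {
	r := limiter.Reserve()
	if !r.OK() {
		// Only possible if the limiter is misconfigured (e.g. burst of 0).
		return fmt.Errorf("couldn't reserve a retry token")
	}
	// Combine backoff and rate limit, capped at maxBackoff (min/max builtins need Go 1.21+).
	delay := max(backoffDelay, min(maxBackoff, r.Delay()))
	select {
	case <-time.After(delay):
		return nil
	case <-ctx.Done():
		// Return the unused token to the limiter so an abandoned retry
		// doesn't count against the shared budget.
		r.Cancel()
		return ctx.Err()
	}
}

func main() {
	// 170 retries/sec with a burst of 1, matching the new flag's default.
	limiter := rate.NewLimiter(rate.Limit(170), 1)
	if err := waitBeforeRetry(context.Background(), limiter, 100*time.Millisecond, 2*time.Second); err != nil {
		fmt.Println("giving up:", err)
		return
	}
	fmt.Println("ok to retry now")
}
```

Two design points the sketch tries to mirror from the patch: cancelling the reservation on context cancellation hands the retry token back to the limiter, so an abandoned wait does not eat into the shared retry budget, and capping the limiter-imposed delay at the maximum backoff keeps a single retry from drifting into the next rule evaluation.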