From c7bd8bdd51f6508cef8575d17b122fa02b243184 Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Fri, 7 Mar 2025 14:51:47 -0500 Subject: [PATCH 1/2] Compactor: make -compactor.max-lookback a per-tenant option (#10794) * resolve conflicts * update comment --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 22 +++++++++---------- .../configuration-parameters/index.md | 14 ++++++------ pkg/compactor/blocks_cleaner_test.go | 9 ++++++++ pkg/compactor/compactor.go | 10 +++++---- pkg/util/validation/limits.go | 8 +++++++ 6 files changed, 42 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3db8af5aee..1f192c6e76a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,7 +52,7 @@ * [ENHANCEMENT] Querier, ingester: The series API respects passed `limit` parameter. #10620 #10652 * [ENHANCEMENT] Store-gateway: Add experimental settings under `-store-gateway.dynamic-replication` to allow more than the default of 3 store-gateways to own recent blocks. #10382 #10637 * [ENHANCEMENT] Ingester: Add reactive concurrency limiters to protect push and read operations from overload. #10574 -* [ENHANCEMENT] Compactor: Add experimental `-compactor.max-lookback` option to limit blocks considered in each compaction cycle. Blocks uploaded prior to the lookback period aren't processed. This option helps reduce CPU utilization in tenants with large block metadata files that are processed before each compaction. #10585 +* [ENHANCEMENT] Compactor: Add experimental `-compactor.max-lookback` option to limit blocks considered in each compaction cycle. Blocks uploaded prior to the lookback period aren't processed. This option helps reduce CPU utilization in tenants with large block metadata files that are processed before each compaction. #10585 #10794 * [ENHANCEMENT] Distributor: Optionally expose the current HA replica for each tenant in the `cortex_ha_tracker_elected_replica_status` metric. This is enabled with the `-distributor.ha-tracker.enable-elected-replica-metric=true` flag. #10644 * [ENHANCEMENT] Enable three Go runtime metrics: #10641 * `go_cpu_classes_gc_total_cpu_seconds_total` diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 8c0000adfd4..b0671a3ee59 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -5071,6 +5071,17 @@ "fieldType": "int", "fieldCategory": "advanced" }, + { + "kind": "field", + "name": "compactor_max_lookback", + "required": false, + "desc": "Blocks uploaded before the lookback aren't considered in compactor cycles. If set, this value should be larger than all values in `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all blocks are considered regardless of their upload time.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "compactor.max-lookback", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "s3_sse_type", @@ -11608,17 +11619,6 @@ "fieldType": "string", "fieldCategory": "advanced" }, - { - "kind": "field", - "name": "max_lookback", - "required": false, - "desc": "Blocks uploaded before the lookback aren't considered in compactor cycles. If set, this value should be larger than all values in `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all blocks are considered regardless of their upload time.", - "fieldValue": null, - "fieldDefaultValue": 0, - "fieldFlag": "compactor.max-lookback", - "fieldType": "duration", - "fieldCategory": "experimental" - }, { "kind": "field", "name": "upload_sparse_index_headers", diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 00cd96b5311..764ada15828 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3932,6 +3932,13 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -compactor.block-upload-max-block-size-bytes [compactor_block_upload_max_block_size_bytes: | default = 0] +# (experimental) Blocks uploaded before the lookback aren't considered in +# compactor cycles. If set, this value should be larger than all values in +# `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all +# blocks are considered regardless of their upload time. +# CLI flag: -compactor.max-lookback +[compactor_max_lookback: | default = 0s] + # S3 server-side encryption type. Required to enable server-side encryption # overrides for a specific tenant. If not set, the default S3 client settings # are used. @@ -4965,13 +4972,6 @@ sharding_ring: # CLI flag: -compactor.compaction-jobs-order [compaction_jobs_order: | default = "smallest-range-oldest-blocks-first"] -# (experimental) Blocks uploaded before the lookback aren't considered in -# compactor cycles. If set, this value should be larger than all values in -# `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all -# blocks are considered regardless of their upload time. -# CLI flag: -compactor.max-lookback -[max_lookback: | default = 0s] - # (experimental) If enabled, the compactor constructs and uploads sparse index # headers to object storage during each compaction cycle. This allows # store-gateway instances to use the sparse headers from object storage instead diff --git a/pkg/compactor/blocks_cleaner_test.go b/pkg/compactor/blocks_cleaner_test.go index c19c15bab58..efc5a0d988a 100644 --- a/pkg/compactor/blocks_cleaner_test.go +++ b/pkg/compactor/blocks_cleaner_test.go @@ -1544,6 +1544,7 @@ type mockConfigProvider struct { userPartialBlockDelayInvalid map[string]bool verifyChunks map[string]bool perTenantInMemoryCache map[string]int + maxLookback map[string]time.Duration } func newMockConfigProvider() *mockConfigProvider { @@ -1558,6 +1559,7 @@ func newMockConfigProvider() *mockConfigProvider { userPartialBlockDelayInvalid: make(map[string]bool), verifyChunks: make(map[string]bool), perTenantInMemoryCache: make(map[string]int), + maxLookback: make(map[string]time.Duration), } } @@ -1625,6 +1627,13 @@ func (m *mockConfigProvider) S3SSEKMSEncryptionContext(string) string { return "" } +func (m *mockConfigProvider) CompactorMaxLookback(user string) time.Duration { + if result, ok := m.maxLookback[user]; ok { + return result + } + return 0 +} + func (c *BlocksCleaner) runCleanupWithErr(ctx context.Context) error { allUsers, isDeleted, err := c.refreshOwnedUsers(ctx) if err != nil { diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index 44d074e3e63..3be3b0361a5 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -119,8 +119,7 @@ type Config struct { // Compactors sharding. ShardingRing RingConfig `yaml:"sharding_ring"` - CompactionJobsOrder string `yaml:"compaction_jobs_order" category:"advanced"` - MaxLookback time.Duration `yaml:"max_lookback" category:"experimental"` + CompactionJobsOrder string `yaml:"compaction_jobs_order" category:"advanced"` // No need to add options to customize the retry backoff, // given the defaults should be fine, but allow to override @@ -163,7 +162,6 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { "If 0, blocks will be deleted straight away. Note that deleting blocks immediately can cause query failures.") f.DurationVar(&cfg.TenantCleanupDelay, "compactor.tenant-cleanup-delay", 6*time.Hour, "For tenants marked for deletion, this is the time between deletion of the last block, and doing final cleanup (marker files, debug files) of the tenant.") f.BoolVar(&cfg.NoBlocksFileCleanupEnabled, "compactor.no-blocks-file-cleanup-enabled", false, "If enabled, will delete the bucket-index, markers and debug files in the tenant bucket when there are no blocks left in the index.") - f.DurationVar(&cfg.MaxLookback, "compactor.max-lookback", 0*time.Second, "Blocks uploaded before the lookback aren't considered in compactor cycles. If set, this value should be larger than all values in `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all blocks are considered regardless of their upload time.") f.BoolVar(&cfg.UploadSparseIndexHeaders, "compactor.upload-sparse-index-headers", false, "If enabled, the compactor constructs and uploads sparse index headers to object storage during each compaction cycle. This allows store-gateway instances to use the sparse headers from object storage instead of recreating them locally.") // compactor concurrency options @@ -248,6 +246,10 @@ type ConfigProvider interface { // CompactorInMemoryTenantMetaCacheSize returns number of parsed *Meta objects that we can keep in memory for the user between compactions. CompactorInMemoryTenantMetaCacheSize(userID string) int + + // CompactorMaxLookback returns the duration of the compactor lookback period, blocks uploaded before the lookback period aren't + // considered in compactor cycles + CompactorMaxLookback(userID string) time.Duration } // MultitenantCompactor is a multi-tenant TSDB block compactor based on Thanos. @@ -795,7 +797,7 @@ func (c *MultitenantCompactor) compactUser(ctx context.Context, userID string) e // Disable maxLookback (set to 0s) when block upload is enabled, block upload enabled implies there will be blocks // beyond the lookback period, we don't want the compactor to skip these - var maxLookback = c.compactorCfg.MaxLookback + var maxLookback = c.cfgProvider.CompactorMaxLookback(userID) if c.cfgProvider.CompactorBlockUploadEnabled(userID) { maxLookback = 0 } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 6fed2983f7f..6f5e8280836 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -232,6 +232,7 @@ type Limits struct { CompactorBlockUploadVerifyChunks bool `yaml:"compactor_block_upload_verify_chunks" json:"compactor_block_upload_verify_chunks"` CompactorBlockUploadMaxBlockSizeBytes int64 `yaml:"compactor_block_upload_max_block_size_bytes" json:"compactor_block_upload_max_block_size_bytes" category:"advanced"` CompactorInMemoryTenantMetaCacheSize int `yaml:"compactor_in_memory_tenant_meta_cache_size" json:"compactor_in_memory_tenant_meta_cache_size" category:"experimental" doc:"hidden"` + CompactorMaxLookback model.Duration `yaml:"compactor_max_lookback" json:"compactor_max_lookback" category:"experimental"` // This config doesn't have a CLI flag registered here because they're registered in // their own original config struct. @@ -381,6 +382,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.BoolVar(&l.CompactorBlockUploadVerifyChunks, "compactor.block-upload-verify-chunks", true, "Verify chunks when uploading blocks via the upload API for the tenant.") f.Int64Var(&l.CompactorBlockUploadMaxBlockSizeBytes, "compactor.block-upload-max-block-size-bytes", 0, "Maximum size in bytes of a block that is allowed to be uploaded or validated. 0 = no limit.") f.IntVar(&l.CompactorInMemoryTenantMetaCacheSize, "compactor.in-memory-tenant-meta-cache-size", 0, "Size of per-tenant in-memory cache for parsed meta.json files. This is useful when meta.json files are big and parsing is expensive. Small meta.json files are not cached. 0 means this cache is disabled.") + f.Var(&l.CompactorMaxLookback, "compactor.max-lookback", "Blocks uploaded before the lookback aren't considered in compactor cycles. If set, this value should be larger than all values in `-blocks-storage.tsdb.block-ranges-period`. A value of 0s means that all blocks are considered regardless of their upload time.") // Query-frontend. f.Var(&l.MaxTotalQueryLength, MaxTotalQueryLengthFlag, "Limit the total query time range (end - start time). This limit is enforced in the query-frontend on the received instant, range or remote read query.") @@ -908,6 +910,12 @@ func (o *Overrides) EvaluationDelay(userID string) time.Duration { return time.Duration(o.getOverridesForUser(userID).RulerEvaluationDelay) } +// CompactorMaxLookback returns the duration of the compactor lookback period, blocks uploaded before the lookback period aren't +// considered in compactor cycles +func (o *Overrides) CompactorMaxLookback(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).CompactorMaxLookback) +} + // CompactorBlocksRetentionPeriod returns the retention period for a given user. func (o *Overrides) CompactorBlocksRetentionPeriod(userID string) time.Duration { return time.Duration(o.getOverridesForUser(userID).CompactorBlocksRetentionPeriod) From d65b32cf43febef53eb4ba15dda8c230ef87856c Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Fri, 7 Mar 2025 14:52:18 -0500 Subject: [PATCH 2/2] mixin: Add req/sec floor to `MimirCacheRequestErrors` (#10832) Require at least 10 req/sec to the cache to consider alerting on request errors. This avoids noisy alerts in low-traffic clusters Fixes #10831 Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 1 + .../templates/metamonitoring/mixin-alerts.yaml | 2 +- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled-gem/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/alerts.libsonnet | 5 +++-- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f192c6e76a..1a7f4252619 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,7 @@ ### Mixin +* [CHANGE] Alerts: Only alert on errors performing cache operations if there are over 10 request/sec to avoid flapping. #10832 * [FEATURE] Add compiled mixin for GEM installations in `operations/mimir-mixin-compiled-gem`. #10690 * [ENHANCEMENT] Dashboards: clarify that the ingester and store-gateway panels on the 'Reads' dashboard show data from all query requests to that component, not just requests from the main query path (ie. requests from the ruler query path are included as well). #10598 * [ENHANCEMENT] Dashboards: add ingester and store-gateway panels from the 'Reads' dashboard to the 'Remote ruler reads' dashboard as well. #10598 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 7bbc8d36677..f104c49f0a6 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -124,7 +124,7 @@ spec: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 774ca2d529a..f171b5c4f0d 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled-gem/alerts.yaml b/operations/mimir-mixin-compiled-gem/alerts.yaml index 8b652c93799..a162fd690e3 100644 --- a/operations/mimir-mixin-compiled-gem/alerts.yaml +++ b/operations/mimir-mixin-compiled-gem/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index dfc68b652ce..06c090c8bc8 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 0f53637875d..03fd1a4f35c 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -204,7 +204,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('CacheRequestErrors'), // Specifically exclude "add" and "delete" operations which are used for cache invalidation and "locking" // since they are expected to sometimes fail in normal operation (such as when a "lock" already exists or - // key being invalidated does not exist). + // key being invalidated does not exist). We also only alert when there at least 10 req/sec to the cache + // to avoid flapping alerts in low-traffic environments. expr: ||| ( sum by(%(group_by)s, name, operation) ( @@ -213,7 +214,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum by(%(group_by)s, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[%(range_interval)s]) - ) + ) > 10 ) * 100 > 5 ||| % { group_by: $._config.alert_aggregation_labels,