From 1596710e7217e1cc49e21b70f4cda798337b20b7 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Fri, 28 Feb 2025 15:22:13 -0600 Subject: [PATCH 01/15] Extract notifier config, introduce alertmanager client config --- pkg/ruler/notifier.go | 43 ---------------- pkg/ruler/notifier/notifier_config.go | 72 +++++++++++++++++++++++++++ pkg/ruler/notifier_test.go | 31 ++++++------ pkg/ruler/ruler.go | 3 +- 4 files changed, 90 insertions(+), 59 deletions(-) create mode 100644 pkg/ruler/notifier/notifier_config.go diff --git a/pkg/ruler/notifier.go b/pkg/ruler/notifier.go index ab21552125..ba60eb6f83 100644 --- a/pkg/ruler/notifier.go +++ b/pkg/ruler/notifier.go @@ -8,7 +8,6 @@ package ruler import ( "context" "errors" - "flag" "net/url" "strings" "sync" @@ -16,15 +15,12 @@ import ( gklog "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/cancellation" - "github.com/grafana/dskit/crypto/tls" - "github.com/grafana/dskit/flagext" config_util "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" "github.com/prometheus/prometheus/notifier" - "github.com/grafana/mimir/pkg/util" util_log "github.com/grafana/mimir/pkg/util/log" ) @@ -33,45 +29,6 @@ var ( errRulerSimultaneousBasicAuthAndOAuth = errors.New("cannot use both Basic Auth and OAuth2 simultaneously") ) -type NotifierConfig struct { - TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` - TLS tls.ClientConfig `yaml:",inline"` - BasicAuth util.BasicAuth `yaml:",inline"` - OAuth2 OAuth2Config `yaml:"oauth2"` - ProxyURL string `yaml:"proxy_url" category:"advanced"` -} - -func (cfg *NotifierConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.TLSEnabled, "ruler.alertmanager-client.tls-enabled", true, "Enable TLS for gRPC client connecting to alertmanager.") - cfg.TLS.RegisterFlagsWithPrefix("ruler.alertmanager-client", f) - cfg.BasicAuth.RegisterFlagsWithPrefix("ruler.alertmanager-client.", f) - cfg.OAuth2.RegisterFlagsWithPrefix("ruler.alertmanager-client.oauth.", f) - f.StringVar(&cfg.ProxyURL, "ruler.alertmanager-client.proxy-url", "", "Optional HTTP, HTTPS via CONNECT, or SOCKS5 proxy URL to route requests through. Applies to all requests, including auxiliary traffic, such as OAuth token requests.") -} - -type OAuth2Config struct { - ClientID string `yaml:"client_id"` - ClientSecret flagext.Secret `yaml:"client_secret"` - TokenURL string `yaml:"token_url"` - Scopes flagext.StringSliceCSV `yaml:"scopes,omitempty"` - EndpointParams flagext.LimitsMap[string] `yaml:"endpoint_params" category:"advanced"` -} - -func (cfg *OAuth2Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { - f.StringVar(&cfg.ClientID, prefix+"client_id", "", "OAuth2 client ID. Enables the use of OAuth2 for authenticating with Alertmanager.") - f.Var(&cfg.ClientSecret, prefix+"client_secret", "OAuth2 client secret.") - f.StringVar(&cfg.TokenURL, prefix+"token_url", "", "Endpoint used to fetch access token.") - f.Var(&cfg.Scopes, prefix+"scopes", "Optional scopes to include with the token request.") - if !cfg.EndpointParams.IsInitialized() { - cfg.EndpointParams = flagext.NewLimitsMap[string](nil) - } - f.Var(&cfg.EndpointParams, prefix+"endpoint-params", "Optional additional URL parameters to send to the token URL.") -} - -func (cfg *OAuth2Config) IsEnabled() bool { - return cfg.ClientID != "" || cfg.TokenURL != "" -} - // rulerNotifier bundles a notifier.Manager together with an associated // Alertmanager service discovery manager and handles the lifecycle // of both actors. diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go new file mode 100644 index 0000000000..0239f1f66f --- /dev/null +++ b/pkg/ruler/notifier/notifier_config.go @@ -0,0 +1,72 @@ +package notifier + +import ( + "encoding/json" + "flag" + "fmt" + + "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/dskit/flagext" + "github.com/grafana/mimir/pkg/util" +) + +type AlertmanagerClientConfig struct { + AlertmanagerURL string `yaml:"alertmanager_url"` + NotifierConfig Config `yaml:",inline" json:",inline"` +} + +func (acc *AlertmanagerClientConfig) String() string { + out, err := json.Marshal(acc) + if err != nil { + return fmt.Sprintf("failed to marshal: %v", err) + } + return string(out) +} + +func (acc *AlertmanagerClientConfig) Set(s string) error { + new := AlertmanagerClientConfig{} + if err := json.Unmarshal([]byte(s), &new); err != nil { + return err + } + *acc = new + return nil +} + +type Config struct { + TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` + TLS tls.ClientConfig `yaml:",inline"` + BasicAuth util.BasicAuth `yaml:",inline"` + OAuth2 OAuth2Config `yaml:"oauth2"` + ProxyURL string `yaml:"proxy_url" category:"advanced"` +} + +func (cfg *Config) RegisterFlags(f *flag.FlagSet) { + f.BoolVar(&cfg.TLSEnabled, "ruler.alertmanager-client.tls-enabled", true, "Enable TLS for gRPC client connecting to alertmanager.") + cfg.TLS.RegisterFlagsWithPrefix("ruler.alertmanager-client", f) + cfg.BasicAuth.RegisterFlagsWithPrefix("ruler.alertmanager-client.", f) + cfg.OAuth2.RegisterFlagsWithPrefix("ruler.alertmanager-client.oauth.", f) + f.StringVar(&cfg.ProxyURL, "ruler.alertmanager-client.proxy-url", "", "Optional HTTP, HTTPS via CONNECT, or SOCKS5 proxy URL to route requests through. Applies to all requests, including auxiliary traffic, such as OAuth token requests.") +} + +type OAuth2Config struct { + ClientID string `yaml:"client_id"` + ClientSecret flagext.Secret `yaml:"client_secret"` + TokenURL string `yaml:"token_url"` + Scopes flagext.StringSliceCSV `yaml:"scopes,omitempty"` + EndpointParams flagext.LimitsMap[string] `yaml:"endpoint_params" category:"advanced"` +} + +func (cfg *OAuth2Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { + f.StringVar(&cfg.ClientID, prefix+"client_id", "", "OAuth2 client ID. Enables the use of OAuth2 for authenticating with Alertmanager.") + f.Var(&cfg.ClientSecret, prefix+"client_secret", "OAuth2 client secret.") + f.StringVar(&cfg.TokenURL, prefix+"token_url", "", "Endpoint used to fetch access token.") + f.Var(&cfg.Scopes, prefix+"scopes", "Optional scopes to include with the token request.") + if !cfg.EndpointParams.IsInitialized() { + cfg.EndpointParams = flagext.NewLimitsMap[string](nil) + } + f.Var(&cfg.EndpointParams, prefix+"endpoint-params", "Optional additional URL parameters to send to the token URL.") +} + +func (cfg *OAuth2Config) IsEnabled() bool { + return cfg.ClientID != "" || cfg.TokenURL != "" +} diff --git a/pkg/ruler/notifier_test.go b/pkg/ruler/notifier_test.go index 99f1ed8d84..a08a983c56 100644 --- a/pkg/ruler/notifier_test.go +++ b/pkg/ruler/notifier_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/util" ) @@ -203,7 +204,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with basic authentication URL, no service discovery, and explicit config", cfg: &Config{ AlertmanagerURL: "http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ BasicAuth: util.BasicAuth{ Username: "jacob", Password: flagext.SecretWithValue("test"), @@ -273,7 +274,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with service discovery URL, basic auth, and proxy URL", cfg: &Config{ AlertmanagerURL: "dnssrv+https://marco:hunter2@_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://my-proxy.proxy-namespace.svc.cluster.local.:1234", }, }, @@ -305,8 +306,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -342,8 +343,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and optional scopes", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -381,8 +382,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and optional endpoint params", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -426,9 +427,9 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and proxy_url simultaneously, inheriting proxy", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://my-proxy.proxy-namespace.svc.cluster.local.:1234", - OAuth2: OAuth2Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -493,7 +494,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "misspelled proxy URL", cfg: &Config{ AlertmanagerURL: "http://alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://example.local" + string(rune(0x7f)), }, }, @@ -503,11 +504,11 @@ func TestBuildNotifierConfig(t *testing.T) { name: "basic auth and oauth provided at the same time", cfg: &Config{ AlertmanagerURL: "http://alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ BasicAuth: util.BasicAuth{ Username: "test-user", }, - OAuth2: OAuth2Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -521,8 +522,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "basic auth via URL and oauth provided at the same time", cfg: &Config{ AlertmanagerURL: "http://marco:hunter2@alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 607eeec5fe..52fe2d0619 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -41,6 +41,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/mimirpb" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/ruler/rulespb" "github.com/grafana/mimir/pkg/ruler/rulestore" "github.com/grafana/mimir/pkg/util" @@ -114,7 +115,7 @@ type Config struct { // HTTP timeout duration when sending notifications to the Alertmanager. NotificationTimeout time.Duration `yaml:"notification_timeout" category:"advanced"` // Client configs for interacting with the Alertmanager - Notifier NotifierConfig `yaml:"alertmanager_client"` + Notifier notifier.Config `yaml:"alertmanager_client"` // Max time to tolerate outage for restoring "for" state of alert. OutageTolerance time.Duration `yaml:"for_outage_tolerance" category:"advanced"` From fdf3c22cfe4272bb0135d94311c03eb46a087abc Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Tue, 4 Mar 2025 15:33:36 -0600 Subject: [PATCH 02/15] Wire up notifier config per tenant --- pkg/ruler/compat.go | 2 ++ pkg/util/validation/limits.go | 29 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 87e951a90d..841d06dfb0 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -28,6 +28,7 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier" querier_stats "github.com/grafana/mimir/pkg/querier/stats" + notifierCfg "github.com/grafana/mimir/pkg/ruler/notifier" util_log "github.com/grafana/mimir/pkg/util/log" ) @@ -213,6 +214,7 @@ type RulesLimits interface { RulerSyncRulesOnChangesEnabled(userID string) bool RulerProtectedNamespaces(userID string) []string RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID string) int64 + RulerAlertmanagerClientConfig(userID string) notifierCfg.AlertmanagerClientConfig } func MetricsQueryFunc(qf rules.QueryFunc, userID string, queries, failedQueries *prometheus.CounterVec, remoteQuerier bool) rules.QueryFunc { diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 6fed2983f7..abcf4e90da 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -25,6 +25,7 @@ import ( asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/querier/api" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/util" ) @@ -206,17 +207,18 @@ type Limits struct { CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` // Ruler defaults and limits. - RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` - RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` - RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` - RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` - RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` - RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` - RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` - RulerMaxRulesPerRuleGroupByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` - RulerMaxRuleGroupsPerTenantByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` - RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` - RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"experimental"` + RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` + RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` + RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` + RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` + RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` + RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` + RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` + RulerMaxRulesPerRuleGroupByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` + RulerMaxRuleGroupsPerTenantByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` + RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` + RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"experimental"` + RulerAlertmanagerClientConfig notifier.AlertmanagerClientConfig `yaml:"ruler_alertmanager_client_config" json:"ruler_alertmanager_client_config" category:"experimental"` // Store-gateway. StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -369,6 +371,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.Var(&l.RulerMaxRuleGroupsPerTenantByNamespace, "ruler.max-rule-groups-per-tenant-by-namespace", "Maximum number of rule groups per tenant by namespace. Value is a map, where each key is the namespace and value is the number of rule groups allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rule groups specified has the same meaning as -ruler.max-rule-groups-per-tenant, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rule-groups-per-tenant.") f.Var(&l.RulerProtectedNamespaces, "ruler.protected-namespaces", "List of namespaces that are protected from modification unless a special HTTP header is used. If a namespace is protected, it can only be read, not modified via the ruler's configuration API. The value is a list of strings, where each string is a namespace name. On the command line, this list is given as a comma-separated list.") f.Int64Var(&l.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant, "ruler.max-independent-rule-evaluation-concurrency-per-tenant", 4, "Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable.") + f.Var(&l.RulerAlertmanagerClientConfig, "ruler.alertmanager-client-config", "Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default.") f.Var(&l.CompactorBlocksRetentionPeriod, "compactor.blocks-retention-period", "Delete blocks containing samples older than the specified retention period. Also used by query-frontend to avoid querying beyond the retention period by instant, range or remote read queries. 0 to disable.") f.IntVar(&l.CompactorSplitAndMergeShards, "compactor.split-and-merge-shards", 0, "The number of shards to use when splitting blocks. 0 to disable splitting.") @@ -1042,6 +1045,10 @@ func (o *Overrides) RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID return o.getOverridesForUser(userID).RulerMaxIndependentRuleEvaluationConcurrencyPerTenant } +func (o *Overrides) RulerAlertmanagerClientConfig(userID string) notifier.AlertmanagerClientConfig { + return o.getOverridesForUser(userID).RulerAlertmanagerClientConfig +} + // StoreGatewayTenantShardSize returns the store-gateway shard size for a given user. func (o *Overrides) StoreGatewayTenantShardSize(userID string) int { return o.getOverridesForUser(userID).StoreGatewayTenantShardSize From e2be10c977f41860a1abc1df2a55ae7e471a5774 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Tue, 4 Mar 2025 16:43:54 -0600 Subject: [PATCH 03/15] Tests and fix initialization --- pkg/util/validation/limits.go | 3 + pkg/util/validation/limits_test.go | 110 +++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index abcf4e90da..f18a7c9a72 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -371,6 +371,9 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.Var(&l.RulerMaxRuleGroupsPerTenantByNamespace, "ruler.max-rule-groups-per-tenant-by-namespace", "Maximum number of rule groups per tenant by namespace. Value is a map, where each key is the namespace and value is the number of rule groups allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rule groups specified has the same meaning as -ruler.max-rule-groups-per-tenant, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rule-groups-per-tenant.") f.Var(&l.RulerProtectedNamespaces, "ruler.protected-namespaces", "List of namespaces that are protected from modification unless a special HTTP header is used. If a namespace is protected, it can only be read, not modified via the ruler's configuration API. The value is a list of strings, where each string is a namespace name. On the command line, this list is given as a comma-separated list.") f.Int64Var(&l.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant, "ruler.max-independent-rule-evaluation-concurrency-per-tenant", 4, "Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable.") + if !l.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams.IsInitialized() { + l.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams = flagext.NewLimitsMap[string](nil) + } f.Var(&l.RulerAlertmanagerClientConfig, "ruler.alertmanager-client-config", "Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default.") f.Var(&l.CompactorBlocksRetentionPeriod, "compactor.blocks-retention-period", "Delete blocks containing samples older than the specified retention period. Also used by query-frontend to avoid querying beyond the retention period by instant, range or remote read queries. 0 to disable.") diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 67b86f7463..544e98d35e 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -17,6 +17,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/grafana/dskit/flagext" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/model/relabel" @@ -1026,6 +1027,115 @@ user1: } } +func TestRulerAlertmanagerClientConfig(t *testing.T) { + tc := map[string]struct { + baseYAML string + overrides string + expectedConfig notifier.AlertmanagerClientConfig + }{ + "no override provided": { + baseYAML: ``, + expectedConfig: notifier.AlertmanagerClientConfig{ + NotifierConfig: notifier.Config{ + OAuth2: notifier.OAuth2Config{ + EndpointParams: flagext.NewLimitsMap[string](nil), + }, + }, + }, + }, + "no user specific client config": { + baseYAML: ` +ruler_alertmanager_client_config: + alertmanager_url: http://custom-url:8080 + proxy_url: http://some-proxy:1234 + oauth2: + client_id: myclient + client_secret: mysecret + token_url: http://token-url + scopes: abc,def + endpoint_params: + key1: value1 +`, + expectedConfig: notifier.AlertmanagerClientConfig{ + AlertmanagerURL: "http://custom-url:8080", + NotifierConfig: notifier.Config{ + ProxyURL: "http://some-proxy:1234", + OAuth2: notifier.OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + }, + "overridden config for specific user": { + baseYAML: ` +ruler_alertmanager_client_config: + alertmanager_url: http://some-base-url:8080 +`, + overrides: ` +user1: + ruler_alertmanager_client_config: + alertmanager_url: http://custom-url-for-this-tenant:8080 + proxy_url: http://some-proxy:1234 + oauth2: + client_id: myclient + client_secret: mysecret + token_url: http://token-url + scopes: abc,def + endpoint_params: + key1: value1 +`, + expectedConfig: notifier.AlertmanagerClientConfig{ + AlertmanagerURL: "http://custom-url-for-this-tenant:8080", + NotifierConfig: notifier.Config{ + ProxyURL: "http://some-proxy:1234", + OAuth2: notifier.OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + }, + } + + for name, tt := range tc { + t.Run(name, func(t *testing.T) { + t.Cleanup(func() { + SetDefaultLimitsForYAMLUnmarshalling(getDefaultLimits()) + }) + + SetDefaultLimitsForYAMLUnmarshalling(getDefaultLimits()) + + var limitsYAML Limits + limitsYAML.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams = flagext.NewLimitsMap[string](nil) + err := yaml.Unmarshal([]byte(tt.baseYAML), &limitsYAML) + require.NoError(t, err) + + SetDefaultLimitsForYAMLUnmarshalling(limitsYAML) + + overrides := map[string]*Limits{} + err = yaml.Unmarshal([]byte(tt.overrides), &overrides) + require.NoError(t, err) + + tl := NewMockTenantLimits(overrides) + ov, err := NewOverrides(limitsYAML, tl) + require.NoError(t, err) + + require.Equal(t, tt.expectedConfig, ov.RulerAlertmanagerClientConfig("user1")) + }) + } +} + func TestActiveSeriesCustomTrackersConfig(t *testing.T) { tests := map[string]struct { cfg string From 0ba0239434a4e852c1c7269db6d753e8ee290663 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Wed, 5 Mar 2025 14:03:13 -0600 Subject: [PATCH 04/15] Improve comparison, add hash --- pkg/ruler/notifier/notifier_config.go | 24 ++++++++++++++++++++++++ pkg/util/validation/limits_test.go | 11 +++-------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go index 0239f1f66f..90043dd19d 100644 --- a/pkg/ruler/notifier/notifier_config.go +++ b/pkg/ruler/notifier/notifier_config.go @@ -4,12 +4,21 @@ import ( "encoding/json" "flag" "fmt" + "hash/fnv" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/flagext" "github.com/grafana/mimir/pkg/util" ) +var DefaultAlertmanagerClientConfig = AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMap[string](nil), + }, + }, +} + type AlertmanagerClientConfig struct { AlertmanagerURL string `yaml:"alertmanager_url"` NotifierConfig Config `yaml:",inline" json:",inline"` @@ -32,6 +41,21 @@ func (acc *AlertmanagerClientConfig) Set(s string) error { return nil } +// Hash calculates a cryptographically weak, insecure hash of the configuration. +func (acc *AlertmanagerClientConfig) Hash() uint64 { + h := fnv.New64a() + h.Write([]byte(acc.String())) + return h.Sum64() +} + +func (acc *AlertmanagerClientConfig) Equal(other AlertmanagerClientConfig) bool { + return acc.Hash() == other.Hash() +} + +func (acc *AlertmanagerClientConfig) IsDefault() bool { + return acc.Equal(DefaultAlertmanagerClientConfig) +} + type Config struct { TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` TLS tls.ClientConfig `yaml:",inline"` diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 544e98d35e..a643664e36 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1034,14 +1034,8 @@ func TestRulerAlertmanagerClientConfig(t *testing.T) { expectedConfig notifier.AlertmanagerClientConfig }{ "no override provided": { - baseYAML: ``, - expectedConfig: notifier.AlertmanagerClientConfig{ - NotifierConfig: notifier.Config{ - OAuth2: notifier.OAuth2Config{ - EndpointParams: flagext.NewLimitsMap[string](nil), - }, - }, - }, + baseYAML: ``, + expectedConfig: notifier.DefaultAlertmanagerClientConfig, }, "no user specific client config": { baseYAML: ` @@ -1132,6 +1126,7 @@ user1: require.NoError(t, err) require.Equal(t, tt.expectedConfig, ov.RulerAlertmanagerClientConfig("user1")) + require.True(t, tt.expectedConfig.Equal(ov.RulerAlertmanagerClientConfig("user1"))) }) } } From 3c3be9b6f1a09ca0eec0cbba58d5c68eef6a2926 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Wed, 5 Mar 2025 14:45:03 -0600 Subject: [PATCH 05/15] Build user specific notifier if limits provided --- pkg/mimir/modules.go | 2 +- pkg/ruler/manager.go | 19 +++++++++++++++++-- pkg/ruler/manager_test.go | 7 ++++--- pkg/ruler/ruler_test.go | 2 +- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 0459d10c5c..50306dac63 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -966,7 +966,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { ) dnsResolver := dns.NewProvider(util_log.Logger, dnsProviderReg, dns.GolangResolverType) - manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver) + manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver, t.Overrides) if err != nil { return nil, err } diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 1d4de52f76..5999b5a3aa 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -35,6 +35,9 @@ type DefaultMultiTenantManager struct { cfg Config notifierCfg *config.Config managerFactory ManagerFactory + limits RulesLimits + dnsResolver AddressProvider + refreshMetrics discovery.RefreshMetricsManager mapper *mapper @@ -59,7 +62,7 @@ type DefaultMultiTenantManager struct { rulerIsRunning atomic.Bool } -func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider) (*DefaultMultiTenantManager, error) { +func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider, limits RulesLimits) (*DefaultMultiTenantManager, error) { refreshMetrics := discovery.NewRefreshMetrics(reg) ncfg, err := buildNotifierConfig(&cfg, dnsResolver, refreshMetrics) if err != nil { @@ -75,6 +78,9 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg cfg: cfg, notifierCfg: ncfg, managerFactory: managerFactory, + limits: limits, + dnsResolver: dnsResolver, + refreshMetrics: refreshMetrics, notifiers: map[string]*rulerNotifier{}, mapper: newMapper(cfg.RulePath, logger), userManagers: map[string]RulesManager{}, @@ -321,8 +327,17 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie n.run() + notifierCfg := r.notifierCfg + userSpecificCfg := r.limits.RulerAlertmanagerClientConfig(userID) + if !userSpecificCfg.IsDefault() { + notifierCfg, err = buildNotifierConfig(nil, r.dnsResolver, r.refreshMetrics) + if err != nil { + return nil, err + } + } + // This should never fail, unless there's a programming mistake. - if err := n.applyConfig(r.notifierCfg); err != nil { + if err := n.applyConfig(notifierCfg); err != nil { return nil, err } diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index ac0766e84f..4e1782b14a 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -31,6 +31,7 @@ import ( "github.com/grafana/mimir/pkg/ruler/rulespb" testutil "github.com/grafana/mimir/pkg/util/test" + "github.com/grafana/mimir/pkg/util/validation" ) func TestDefaultMultiTenantManager_SyncFullRuleGroups(t *testing.T) { @@ -46,7 +47,7 @@ func TestDefaultMultiTenantManager_SyncFullRuleGroups(t *testing.T) { user2Group1 = createRuleGroup("group-1", user2, createRecordingRule("sum:metric_1", "sum(metric_1)")) ) - m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) // Initialise the manager with some rules and start it. @@ -132,7 +133,7 @@ func TestDefaultMultiTenantManager_SyncPartialRuleGroups(t *testing.T) { user2Group1 = createRuleGroup("group-1", user2, createRecordingRule("sum:metric_1", "sum(metric_1)")) ) - m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) t.Cleanup(m.Stop) @@ -308,7 +309,7 @@ func TestDefaultMultiTenantManager_WaitsToDrainPendingNotificationsOnShutdown(t NotificationQueueCapacity: 1000, NotificationTimeout: 10 * time.Second, } - m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) m.SyncFullRuleGroups(ctx, map[string]rulespb.RuleGroupList{ diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 298fe24f50..87ad8dc136 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -250,7 +250,7 @@ func prepareRulerManager(t *testing.T, cfg Config, opts ...prepareOption) *Defau pusher.MockPush(&mimirpb.WriteResponse{}, nil) managerFactory := DefaultTenantManagerFactory(cfg, pusher, noopQueryable, queryFunc, &NoopMultiTenantConcurrencyController{}, options.limits, options.registerer) - manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, prometheus.NewRegistry(), options.logger, nil) + manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, prometheus.NewRegistry(), options.logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) return manager From f81d974049daee828486bcd06a1a2cf7288b8e19 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Wed, 5 Mar 2025 15:52:56 -0600 Subject: [PATCH 06/15] Build user specific config --- pkg/ruler/manager.go | 4 +- pkg/ruler/notifier.go | 68 +++++++++++++++--------------- pkg/ruler/notifier_test.go | 2 +- pkg/ruler/service_discovery.go | 2 +- pkg/util/validation/limits_mock.go | 4 +- 5 files changed, 42 insertions(+), 38 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 5999b5a3aa..027f964b8c 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -64,7 +64,7 @@ type DefaultMultiTenantManager struct { func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider, limits RulesLimits) (*DefaultMultiTenantManager, error) { refreshMetrics := discovery.NewRefreshMetrics(reg) - ncfg, err := buildNotifierConfig(&cfg, dnsResolver, refreshMetrics) + ncfg, err := buildNotifierConfig(cfg.AlertmanagerURL, cfg.Notifier, cfg, dnsResolver, refreshMetrics) if err != nil { return nil, err } @@ -330,7 +330,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie notifierCfg := r.notifierCfg userSpecificCfg := r.limits.RulerAlertmanagerClientConfig(userID) if !userSpecificCfg.IsDefault() { - notifierCfg, err = buildNotifierConfig(nil, r.dnsResolver, r.refreshMetrics) + notifierCfg, err = buildNotifierConfig(userSpecificCfg.AlertmanagerURL, userSpecificCfg.NotifierConfig, r.cfg, r.dnsResolver, r.refreshMetrics) if err != nil { return nil, err } diff --git a/pkg/ruler/notifier.go b/pkg/ruler/notifier.go index ba60eb6f83..7babea231c 100644 --- a/pkg/ruler/notifier.go +++ b/pkg/ruler/notifier.go @@ -11,6 +11,7 @@ import ( "net/url" "strings" "sync" + "time" gklog "github.com/go-kit/log" "github.com/go-kit/log/level" @@ -21,6 +22,7 @@ import ( "github.com/prometheus/prometheus/discovery" "github.com/prometheus/prometheus/notifier" + rulernotifier "github.com/grafana/mimir/pkg/ruler/notifier" util_log "github.com/grafana/mimir/pkg/util/log" ) @@ -98,13 +100,13 @@ func (rn *rulerNotifier) stop() { // Builds a Prometheus config.Config from a ruler.Config with just the required // options to configure notifications to Alertmanager. -func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi discovery.RefreshMetricsManager) (*config.Config, error) { - if rulerConfig.AlertmanagerURL == "" { +func buildNotifierConfig(amURL string, notifierCfg rulernotifier.Config, rulerConfig Config, resolver AddressProvider, rmi discovery.RefreshMetricsManager) (*config.Config, error) { + if amURL == "" { // no AM URLs were provided, so we can just return a default config without errors return &config.Config{}, nil } - amURLs := strings.Split(rulerConfig.AlertmanagerURL, ",") + amURLs := strings.Split(amURL, ",") amConfigs := make([]*config.AlertmanagerConfig, 0, len(amURLs)) for _, rawURL := range amURLs { @@ -120,7 +122,7 @@ func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi disc sdConfig = staticTarget(url) } - amCfgWithSD, err := amConfigWithSD(rulerConfig, url, sdConfig) + amCfgWithSD, err := amConfigWithSD(url, notifierCfg, sdConfig, rulerConfig.NotificationTimeout) if err != nil { return nil, err } @@ -136,12 +138,12 @@ func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi disc return promConfig, nil } -func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config) (*config.AlertmanagerConfig, error) { +func amConfigWithSD(url *url.URL, notifierCfg rulernotifier.Config, sdConfig discovery.Config, notifyTimeout time.Duration) (*config.AlertmanagerConfig, error) { amConfig := &config.AlertmanagerConfig{ APIVersion: config.AlertmanagerAPIVersionV2, Scheme: url.Scheme, PathPrefix: url.Path, - Timeout: model.Duration(rulerConfig.NotificationTimeout), + Timeout: model.Duration(notifyTimeout), ServiceDiscoveryConfigs: discovery.Configs{sdConfig}, HTTPClientConfig: config_util.HTTPClientConfig{}, } @@ -158,16 +160,16 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Override URL basic authentication configs with hard coded config values if present - if rulerConfig.Notifier.BasicAuth.IsEnabled() { + if notifierCfg.BasicAuth.IsEnabled() { amConfig.HTTPClientConfig.BasicAuth = &config_util.BasicAuth{ - Username: rulerConfig.Notifier.BasicAuth.Username, - Password: config_util.Secret(rulerConfig.Notifier.BasicAuth.Password.String()), + Username: notifierCfg.BasicAuth.Username, + Password: config_util.Secret(notifierCfg.BasicAuth.Password.String()), } } // Whether to use an optional HTTP, HTTP+CONNECT, or SOCKS5 proxy. - if rulerConfig.Notifier.ProxyURL != "" { - url, err := url.Parse(rulerConfig.Notifier.ProxyURL) + if notifierCfg.ProxyURL != "" { + url, err := url.Parse(notifierCfg.ProxyURL) if err != nil { return nil, err } @@ -175,24 +177,24 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Whether to use OAuth2 or not. - if rulerConfig.Notifier.OAuth2.IsEnabled() { + if notifierCfg.OAuth2.IsEnabled() { if amConfig.HTTPClientConfig.BasicAuth != nil { return nil, errRulerSimultaneousBasicAuthAndOAuth } amConfig.HTTPClientConfig.OAuth2 = &config_util.OAuth2{ - ClientID: rulerConfig.Notifier.OAuth2.ClientID, - ClientSecret: config_util.Secret(rulerConfig.Notifier.OAuth2.ClientSecret.String()), - TokenURL: rulerConfig.Notifier.OAuth2.TokenURL, - Scopes: rulerConfig.Notifier.OAuth2.Scopes, + ClientID: notifierCfg.OAuth2.ClientID, + ClientSecret: config_util.Secret(notifierCfg.OAuth2.ClientSecret.String()), + TokenURL: notifierCfg.OAuth2.TokenURL, + Scopes: notifierCfg.OAuth2.Scopes, } - if rulerConfig.Notifier.OAuth2.EndpointParams.IsInitialized() { - amConfig.HTTPClientConfig.OAuth2.EndpointParams = rulerConfig.Notifier.OAuth2.EndpointParams.Read() + if notifierCfg.OAuth2.EndpointParams.IsInitialized() { + amConfig.HTTPClientConfig.OAuth2.EndpointParams = notifierCfg.OAuth2.EndpointParams.Read() } - if rulerConfig.Notifier.ProxyURL != "" { - url, err := url.Parse(rulerConfig.Notifier.ProxyURL) + if notifierCfg.ProxyURL != "" { + url, err := url.Parse(notifierCfg.ProxyURL) if err != nil { return nil, err } @@ -201,29 +203,29 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Whether to use TLS or not. - if rulerConfig.Notifier.TLSEnabled { - if rulerConfig.Notifier.TLS.Reader == nil { + if notifierCfg.TLSEnabled { + if notifierCfg.TLS.Reader == nil { amConfig.HTTPClientConfig.TLSConfig = config_util.TLSConfig{ - CAFile: rulerConfig.Notifier.TLS.CAPath, - CertFile: rulerConfig.Notifier.TLS.CertPath, - KeyFile: rulerConfig.Notifier.TLS.KeyPath, - InsecureSkipVerify: rulerConfig.Notifier.TLS.InsecureSkipVerify, - ServerName: rulerConfig.Notifier.TLS.ServerName, + CAFile: notifierCfg.TLS.CAPath, + CertFile: notifierCfg.TLS.CertPath, + KeyFile: notifierCfg.TLS.KeyPath, + InsecureSkipVerify: notifierCfg.TLS.InsecureSkipVerify, + ServerName: notifierCfg.TLS.ServerName, } } else { - cert, err := rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.CertPath) + cert, err := notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.CertPath) if err != nil { return nil, err } - key, err := rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.KeyPath) + key, err := notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.KeyPath) if err != nil { return nil, err } var ca []byte - if rulerConfig.Notifier.TLS.CAPath != "" { - ca, err = rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.CAPath) + if notifierCfg.TLS.CAPath != "" { + ca, err = notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.CAPath) if err != nil { return nil, err } @@ -233,8 +235,8 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config CA: string(ca), Cert: string(cert), Key: config_util.Secret(key), - InsecureSkipVerify: rulerConfig.Notifier.TLS.InsecureSkipVerify, - ServerName: rulerConfig.Notifier.TLS.ServerName, + InsecureSkipVerify: notifierCfg.TLS.InsecureSkipVerify, + ServerName: notifierCfg.TLS.ServerName, } } } diff --git a/pkg/ruler/notifier_test.go b/pkg/ruler/notifier_test.go index a08a983c56..958c62d1b0 100644 --- a/pkg/ruler/notifier_test.go +++ b/pkg/ruler/notifier_test.go @@ -537,7 +537,7 @@ func TestBuildNotifierConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - ncfg, err := buildNotifierConfig(tt.cfg, nil, nil) + ncfg, err := buildNotifierConfig(tt.cfg.AlertmanagerURL, tt.cfg.Notifier, *tt.cfg, nil, nil) if tt.err == nil { require.NoError(t, err) require.Equal(t, tt.ncfg, ncfg) diff --git a/pkg/ruler/service_discovery.go b/pkg/ruler/service_discovery.go index 33a9ca5fc1..72016dda85 100644 --- a/pkg/ruler/service_discovery.go +++ b/pkg/ruler/service_discovery.go @@ -42,7 +42,7 @@ type dnsServiceDiscovery struct { host string } -func dnsSD(rulerConfig *Config, resolver AddressProvider, qType dns.QType, url *url.URL, rmi discovery.RefreshMetricsInstantiator) discovery.Config { +func dnsSD(rulerConfig Config, resolver AddressProvider, qType dns.QType, url *url.URL, rmi discovery.RefreshMetricsInstantiator) discovery.Config { return dnsServiceDiscovery{ resolver: resolver, refreshInterval: rulerConfig.AlertmanagerRefreshInterval, diff --git a/pkg/util/validation/limits_mock.go b/pkg/util/validation/limits_mock.go index b22ce0fc9f..7520e790ae 100644 --- a/pkg/util/validation/limits_mock.go +++ b/pkg/util/validation/limits_mock.go @@ -31,7 +31,9 @@ func (l *mockTenantLimits) AllByUserID() map[string]*Limits { func MockOverrides(customize func(defaults *Limits, tenantLimits map[string]*Limits)) *Overrides { defaults := MockDefaultLimits() tenantLimits := map[string]*Limits{} - customize(defaults, tenantLimits) + if customize != nil { + customize(defaults, tenantLimits) + } overrides, err := NewOverrides(*defaults, NewMockTenantLimits(tenantLimits)) if err != nil { From ad628ed5d4321b45cf23e3bc95cbd223ed103c0f Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Wed, 5 Mar 2025 16:01:00 -0600 Subject: [PATCH 07/15] generate docs --- cmd/mimir/config-descriptor.json | 182 ++++++++++++++++++ cmd/mimir/help-all.txt.tmpl | 2 + cmd/mimir/help.txt.tmpl | 2 + .../configuration-parameters/index.md | 79 ++++++++ 4 files changed, 265 insertions(+) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index f1f7582d57..d077b7d59d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4970,6 +4970,188 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "block", + "name": "ruler_alertmanager_client_config", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "alertmanager_url", + "required": false, + "desc": "Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default.", + "fieldValue": null, + "fieldDefaultValue": "{\"AlertmanagerURL\":\"\",\"NotifierConfig\":{\"TLSEnabled\":false,\"TLS\":{\"CertPath\":\"\",\"KeyPath\":\"\",\"CAPath\":\"\",\"ServerName\":\"\",\"InsecureSkipVerify\":false,\"CipherSuites\":\"\",\"MinVersion\":\"\",\"Reader\":null},\"BasicAuth\":{\"Username\":\"\",\"Password\":{}},\"OAuth2\":{\"ClientID\":\"\",\"ClientSecret\":{},\"TokenURL\":\"\",\"Scopes\":null,\"EndpointParams\":{}},\"ProxyURL\":\"\"}}", + "fieldFlag": "ruler.alertmanager-client-config", + "fieldType": "string" + }, + { + "kind": "field", + "name": "tls_enabled", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldType": "boolean", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_cert_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_key_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_ca_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_server_name", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_insecure_skip_verify", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldType": "boolean", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_cipher_suites", + "required": false, + "desc": "Override the default cipher suite list (separated by commas). Allowed values:\n\nSecure Ciphers:\n- TLS_AES_128_GCM_SHA256\n- TLS_AES_256_GCM_SHA384\n- TLS_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256\n\nInsecure Ciphers:\n- TLS_RSA_WITH_RC4_128_SHA\n- TLS_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA\n- TLS_RSA_WITH_AES_256_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA256\n- TLS_RSA_WITH_AES_128_GCM_SHA256\n- TLS_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_ECDSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256\n", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_min_version", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "basic_auth_username", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "block", + "name": "basic_auth_password", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "block", + "name": "oauth2", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "client_id", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "block", + "name": "client_secret", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "field", + "name": "token_url", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "field", + "name": "scopes", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "field", + "name": "endpoint_params", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": {}, + "fieldType": "map of string to string", + "fieldCategory": "advanced" + } + ], + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "field", + "name": "proxy_url", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + } + ], + "fieldValue": null, + "fieldDefaultValue": null + }, { "kind": "field", "name": "store_gateway_tenant_shard_size", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index f1aa9b7273..f64f1e328d 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2909,6 +2909,8 @@ Usage of ./cmd/mimir/mimir: OpenStack Swift username. -ruler.alerting-rules-evaluation-enabled Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true) + -ruler.alertmanager-client-config value + Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default. -ruler.alertmanager-client.basic-auth-password string HTTP Basic authentication password. It overrides the password set in the URL (if any). -ruler.alertmanager-client.basic-auth-username string diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 9b3daf16e3..3ac153df73 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -713,6 +713,8 @@ Usage of ./cmd/mimir/mimir: OpenStack Swift username. -ruler.alerting-rules-evaluation-enabled Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true) + -ruler.alertmanager-client-config value + Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default. -ruler.alertmanager-client.basic-auth-password string HTTP Basic authentication password. It overrides the password set in the URL (if any). -ruler.alertmanager-client.basic-auth-username string diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 43222c07f8..158e246bf0 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3881,6 +3881,85 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -ruler.max-independent-rule-evaluation-concurrency-per-tenant [ruler_max_independent_rule_evaluation_concurrency_per_tenant: | default = 4] +ruler_alertmanager_client_config: + # Per-tenant alertmanager client configuration. If not supplied, the tenant's + # notifications will be sent to the ruler-wide default. + # CLI flag: -ruler.alertmanager-client-config + [alertmanager_url: | default = "{\"AlertmanagerURL\":\"\",\"NotifierConfig\":{\"TLSEnabled\":false,\"TLS\":{\"CertPath\":\"\",\"KeyPath\":\"\",\"CAPath\":\"\",\"ServerName\":\"\",\"InsecureSkipVerify\":false,\"CipherSuites\":\"\",\"MinVersion\":\"\",\"Reader\":null},\"BasicAuth\":{\"Username\":\"\",\"Password\":{}},\"OAuth2\":{\"ClientID\":\"\",\"ClientSecret\":{},\"TokenURL\":\"\",\"Scopes\":null,\"EndpointParams\":{}},\"ProxyURL\":\"\"}}"] + + # (advanced) + [tls_enabled: | default = ] + + # (advanced) + [tls_cert_path: | default = ""] + + # (advanced) + [tls_key_path: | default = ""] + + # (advanced) + [tls_ca_path: | default = ""] + + # (advanced) + [tls_server_name: | default = ""] + + # (advanced) + [tls_insecure_skip_verify: | default = ] + + # (advanced) Override the default cipher suite list (separated by commas). + # Allowed values: + # + # Secure Ciphers: + # - TLS_AES_128_GCM_SHA256 + # - TLS_AES_256_GCM_SHA384 + # - TLS_CHACHA20_POLY1305_SHA256 + # - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA + # - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA + # - TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 + # - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + # - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256 + # - TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256 + # + # Insecure Ciphers: + # - TLS_RSA_WITH_RC4_128_SHA + # - TLS_RSA_WITH_3DES_EDE_CBC_SHA + # - TLS_RSA_WITH_AES_128_CBC_SHA + # - TLS_RSA_WITH_AES_256_CBC_SHA + # - TLS_RSA_WITH_AES_128_CBC_SHA256 + # - TLS_RSA_WITH_AES_128_GCM_SHA256 + # - TLS_RSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_ECDSA_WITH_RC4_128_SHA + # - TLS_ECDHE_RSA_WITH_RC4_128_SHA + # - TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 + # - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 + [tls_cipher_suites: | default = ""] + + # (advanced) + [tls_min_version: | default = ""] + + [basic_auth_username: | default = ""] + + basic_auth_password: + + oauth2: + [client_id: | default = ""] + + client_secret: + + [token_url: | default = ""] + + [scopes: | default = ""] + + # (advanced) + [endpoint_params: | default = ] + + # (advanced) + [proxy_url: | default = ""] + # The tenant's shard size, used when store-gateway sharding is enabled. Value of # 0 disables shuffle sharding for the tenant, that is all tenant blocks are # sharded across all store-gateway replicas. From 4a278d05334b132dee98cebd4ec0f61cf360e5a2 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Wed, 5 Mar 2025 17:28:34 -0600 Subject: [PATCH 08/15] Adjust and debug wrong levelled flags in docs generator, use a workaround by double suppressing docs --- cmd/mimir/config-descriptor.json | 5 ++--- .../mimir/configure/configuration-parameters/index.md | 7 +++---- pkg/ruler/notifier/notifier_config.go | 2 +- pkg/util/validation/limits.go | 2 +- tools/doc-generator/parse/parser.go | 3 +++ 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index d077b7d59d..35cb9e6d45 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4980,10 +4980,9 @@ "kind": "field", "name": "alertmanager_url", "required": false, - "desc": "Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default.", + "desc": "", "fieldValue": null, - "fieldDefaultValue": "{\"AlertmanagerURL\":\"\",\"NotifierConfig\":{\"TLSEnabled\":false,\"TLS\":{\"CertPath\":\"\",\"KeyPath\":\"\",\"CAPath\":\"\",\"ServerName\":\"\",\"InsecureSkipVerify\":false,\"CipherSuites\":\"\",\"MinVersion\":\"\",\"Reader\":null},\"BasicAuth\":{\"Username\":\"\",\"Password\":{}},\"OAuth2\":{\"ClientID\":\"\",\"ClientSecret\":{},\"TokenURL\":\"\",\"Scopes\":null,\"EndpointParams\":{}},\"ProxyURL\":\"\"}}", - "fieldFlag": "ruler.alertmanager-client-config", + "fieldDefaultValue": "", "fieldType": "string" }, { diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 158e246bf0..588dd8ae9a 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3881,11 +3881,10 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -ruler.max-independent-rule-evaluation-concurrency-per-tenant [ruler_max_independent_rule_evaluation_concurrency_per_tenant: | default = 4] +# Per-tenant alertmanager client configuration. If not supplied, the tenant's +# notifications will be sent to the ruler-wide default. ruler_alertmanager_client_config: - # Per-tenant alertmanager client configuration. If not supplied, the tenant's - # notifications will be sent to the ruler-wide default. - # CLI flag: -ruler.alertmanager-client-config - [alertmanager_url: | default = "{\"AlertmanagerURL\":\"\",\"NotifierConfig\":{\"TLSEnabled\":false,\"TLS\":{\"CertPath\":\"\",\"KeyPath\":\"\",\"CAPath\":\"\",\"ServerName\":\"\",\"InsecureSkipVerify\":false,\"CipherSuites\":\"\",\"MinVersion\":\"\",\"Reader\":null},\"BasicAuth\":{\"Username\":\"\",\"Password\":{}},\"OAuth2\":{\"ClientID\":\"\",\"ClientSecret\":{},\"TokenURL\":\"\",\"Scopes\":null,\"EndpointParams\":{}},\"ProxyURL\":\"\"}}"] + [alertmanager_url: | default = ""] # (advanced) [tls_enabled: | default = ] diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go index 90043dd19d..5c5b02a025 100644 --- a/pkg/ruler/notifier/notifier_config.go +++ b/pkg/ruler/notifier/notifier_config.go @@ -20,7 +20,7 @@ var DefaultAlertmanagerClientConfig = AlertmanagerClientConfig{ } type AlertmanagerClientConfig struct { - AlertmanagerURL string `yaml:"alertmanager_url"` + AlertmanagerURL string `yaml:"alertmanager_url" doc:"nocli"` NotifierConfig Config `yaml:",inline" json:",inline"` } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index f18a7c9a72..597858472a 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -218,7 +218,7 @@ type Limits struct { RulerMaxRuleGroupsPerTenantByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"experimental"` - RulerAlertmanagerClientConfig notifier.AlertmanagerClientConfig `yaml:"ruler_alertmanager_client_config" json:"ruler_alertmanager_client_config" category:"experimental"` + RulerAlertmanagerClientConfig notifier.AlertmanagerClientConfig `yaml:"ruler_alertmanager_client_config" json:"ruler_alertmanager_client_config" category:"experimental" doc:"nocli|description=Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default."` // Store-gateway. StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` diff --git a/tools/doc-generator/parse/parser.go b/tools/doc-generator/parse/parser.go index de94561968..e1fac8de8e 100644 --- a/tools/doc-generator/parse/parser.go +++ b/tools/doc-generator/parse/parser.go @@ -24,6 +24,7 @@ import ( "github.com/thanos-io/objstore/providers/s3" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/storage/tsdb" "github.com/grafana/mimir/pkg/util/configdoc" "github.com/grafana/mimir/pkg/util/validation" @@ -494,6 +495,8 @@ func ReflectType(typ string) reflect.Type { return reflect.TypeOf([]*validation.BlockedQuery{}) case "blocked_requests_config...": return reflect.TypeOf([]*validation.BlockedRequest{}) + case "ruler_alertmanager_client_config...": + return reflect.TypeOf(notifier.AlertmanagerClientConfig{}) case "map of string to float64": return reflect.TypeOf(flagext.LimitsMap[float64]{}) case "map of string to int": From 7e5b393b9486e2f910eb7e99fc34f302fa03c9d6 Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 10:54:28 -0600 Subject: [PATCH 09/15] lint --- pkg/ruler/notifier/notifier_config.go | 7 ++++--- pkg/util/validation/limits_test.go | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go index 5c5b02a025..cafcae8d95 100644 --- a/pkg/ruler/notifier/notifier_config.go +++ b/pkg/ruler/notifier/notifier_config.go @@ -8,6 +8,7 @@ import ( "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/flagext" + "github.com/grafana/mimir/pkg/util" ) @@ -33,11 +34,11 @@ func (acc *AlertmanagerClientConfig) String() string { } func (acc *AlertmanagerClientConfig) Set(s string) error { - new := AlertmanagerClientConfig{} - if err := json.Unmarshal([]byte(s), &new); err != nil { + cfg := AlertmanagerClientConfig{} + if err := json.Unmarshal([]byte(s), &cfg); err != nil { return err } - *acc = new + *acc = cfg return nil } diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index a643664e36..12630a377c 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -17,7 +17,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/grafana/dskit/flagext" - "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/model/relabel" @@ -25,6 +24,8 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" "gopkg.in/yaml.v3" + + "github.com/grafana/mimir/pkg/ruler/notifier" ) func TestMain(m *testing.M) { From f8fd2e378ae1a2a2dcac601149b745efe7e43e9e Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 11:10:02 -0600 Subject: [PATCH 10/15] license header --- pkg/ruler/notifier/notifier_config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go index cafcae8d95..163979f70c 100644 --- a/pkg/ruler/notifier/notifier_config.go +++ b/pkg/ruler/notifier/notifier_config.go @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: AGPL-3.0-only + package notifier import ( From 1276bfd539b3a6c6f4e421136b1adbc61debf21a Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 14:10:04 -0600 Subject: [PATCH 11/15] Tests for notifier config --- pkg/ruler/notifier/notifier_config.go | 6 +- pkg/ruler/notifier/notifier_config_test.go | 300 +++++++++++++++++++++ 2 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 pkg/ruler/notifier/notifier_config_test.go diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go index 163979f70c..323c05eec7 100644 --- a/pkg/ruler/notifier/notifier_config.go +++ b/pkg/ruler/notifier/notifier_config.go @@ -3,13 +3,13 @@ package notifier import ( - "encoding/json" "flag" "fmt" "hash/fnv" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/flagext" + "gopkg.in/yaml.v3" "github.com/grafana/mimir/pkg/util" ) @@ -28,7 +28,7 @@ type AlertmanagerClientConfig struct { } func (acc *AlertmanagerClientConfig) String() string { - out, err := json.Marshal(acc) + out, err := yaml.Marshal(acc) if err != nil { return fmt.Sprintf("failed to marshal: %v", err) } @@ -37,7 +37,7 @@ func (acc *AlertmanagerClientConfig) String() string { func (acc *AlertmanagerClientConfig) Set(s string) error { cfg := AlertmanagerClientConfig{} - if err := json.Unmarshal([]byte(s), &cfg); err != nil { + if err := yaml.Unmarshal([]byte(s), &cfg); err != nil { return err } *acc = cfg diff --git a/pkg/ruler/notifier/notifier_config_test.go b/pkg/ruler/notifier/notifier_config_test.go new file mode 100644 index 0000000000..4d195436d4 --- /dev/null +++ b/pkg/ruler/notifier/notifier_config_test.go @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package notifier + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/require" + + "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/dskit/flagext" +) + +func TestAlertmanagerClientConfig(t *testing.T) { + t.Run("IsDefault", func(t *testing.T) { + tc := []struct { + name string + cfg *AlertmanagerClientConfig + exp bool + }{ + { + name: "default", + cfg: &DefaultAlertmanagerClientConfig, + exp: true, + }, + { + name: "initialized limits map", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMap[string](nil), + }, + }, + }, + exp: true, + }, + { + name: "empty scopes", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{}, + }, + }, + }, + exp: true, + }, + { + name: "golang default", + cfg: &AlertmanagerClientConfig{}, + exp: true, + }, + { + name: "custom TLS reader ignored", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: &fakeSecretReader{}, + }, + }, + }, + exp: true, + }, + { + name: "modified field", + cfg: &AlertmanagerClientConfig{ + AlertmanagerURL: "test", + }, + exp: false, + }, + { + name: "modified endpoint param", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{"k1": "v1"}, nil), + }, + }, + }, + exp: false, + }, + { + name: "modified scope", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"asdf"}, + }, + }, + }, + exp: false, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.exp, tt.cfg.IsDefault(), tt.cfg) + }) + } + }) + + t.Run("Equal", func(t *testing.T) { + tc := []struct { + name string + cfg1 AlertmanagerClientConfig + cfg2 AlertmanagerClientConfig + exp bool + }{ + { + name: "same values", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + exp: true, + }, + { + name: "differing value", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://another-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + exp: false, + }, + { + name: "different endpoint params order", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + "key2": "value2", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key2": "value2", + "key1": "value1", + }, nil), + }, + }, + }, + exp: true, + }, + { + name: "different scopes order", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"s1", "s2"}, + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"s2", "s1"}, + }, + }, + }, + exp: false, + }, + { + name: "ignores different secrets reader", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: &fakeSecretReader{}, + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: nil, + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + }, + }, + }, + exp: true, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.exp, tt.cfg1.Equal(tt.cfg2), cmp.Diff(tt.cfg1, tt.cfg2)) + }) + } + }) +} + +type fakeSecretReader struct{} + +func (fsr *fakeSecretReader) ReadSecret(path string) ([]byte, error) { + return []byte{}, nil +} From 346608cea7d2066079695ab44f8bd3f75f9b784d Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 14:13:13 -0600 Subject: [PATCH 12/15] lint --- pkg/ruler/notifier/notifier_config_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/ruler/notifier/notifier_config_test.go b/pkg/ruler/notifier/notifier_config_test.go index 4d195436d4..cf058f61c0 100644 --- a/pkg/ruler/notifier/notifier_config_test.go +++ b/pkg/ruler/notifier/notifier_config_test.go @@ -6,10 +6,9 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/require" - "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/flagext" + "github.com/stretchr/testify/require" ) func TestAlertmanagerClientConfig(t *testing.T) { @@ -295,6 +294,6 @@ func TestAlertmanagerClientConfig(t *testing.T) { type fakeSecretReader struct{} -func (fsr *fakeSecretReader) ReadSecret(path string) ([]byte, error) { +func (fsr *fakeSecretReader) ReadSecret(_ string) ([]byte, error) { return []byte{}, nil } From 85a7c820dd6e94ad7ee5939e9adfd757c907c6ce Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 15:16:50 -0600 Subject: [PATCH 13/15] Add integration test --- pkg/ruler/manager_test.go | 125 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index 4e1782b14a..4bc744c2ec 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -29,6 +29,7 @@ import ( "go.uber.org/atomic" "gopkg.in/yaml.v3" + rulernotifier "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/ruler/rulespb" testutil "github.com/grafana/mimir/pkg/util/test" "github.com/grafana/mimir/pkg/util/validation" @@ -270,6 +271,111 @@ func TestFilterRuleGroupsByNotEmptyUsers(t *testing.T) { } } +func TestDefaultMultiTenantManager_NotifierConfiguration(t *testing.T) { + // We have two alertmanagers. + alertmanager1ReceivedRequest := make(chan struct{}, 2) + alertmanager1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + alertmanager1ReceivedRequest <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer func() { + alertmanager1.Close() + }() + + alertmanager2ReceivedRequest := make(chan struct{}, 2) + alertmanager2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + alertmanager2ReceivedRequest <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer func() { + alertmanager2.Close() + }() + + ctx := context.Background() + logger := testutil.NewTestingLogger(t) + + // We have two users with rules. + const user1 = "user-1" + user1Group := createRuleGroup("group-1", user1, createRecordingRule("count:metric_1", "count(metric_1)")) + + const user2 = "user-2" + user2Group := createRuleGroup("group-1", user2, createRecordingRule("count:metric_1", "count(metric_1)")) + + // The ruler config points at alertmanager 1. + cfg := Config{ + RulePath: t.TempDir(), + AlertmanagerURL: alertmanager1.URL, + NotificationQueueCapacity: 1000, + NotificationTimeout: 10 * time.Second, + } + + // user-2's tenant configuration is overriddent to point at alertmanager 2. + overrides := validation.MockOverrides(func(_ *validation.Limits, tenantLimits map[string]*validation.Limits) { + tenantLimits[user1] = validation.MockDefaultLimits() + tenantLimits[user2] = validation.MockDefaultLimits() + tenantLimits[user2].RulerAlertmanagerClientConfig = rulernotifier.AlertmanagerClientConfig{ + AlertmanagerURL: alertmanager2.URL, + } + }) + + // Start a manager. + m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil, overrides) + require.NoError(t, err) + defer m.Stop() + m.SyncFullRuleGroups(ctx, map[string]rulespb.RuleGroupList{ + user1: {user1Group}, + user2: {user2Group}, + }) + m.Start() + + t.Run("creating notifier with global alertmanager settings sends to correct alertmanager", func(t *testing.T) { + _ = assertManagerMockRunningForUser(t, m, user1) + userNotifier := assertNotifierRunningForUser(t, m, user1) + waitForAlertmanagerToBeDiscovered(t, userNotifier.notifier) + + require.Equal(t, 1, len(userNotifier.notifier.Alertmanagers())) + require.Contains(t, userNotifier.notifier.Alertmanagers()[0].String(), alertmanager1.URL) + + // Send an alert. + userNotifier.notifier.Send(¬ifier.Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")}) + + // Wait for the Alertmanager to receive the request. + select { + case <-alertmanager1ReceivedRequest: + // We can continue. + case <-alertmanager2ReceivedRequest: + require.FailNow(t, "wrong alertmanager received the alert") + case <-time.After(time.Second): + require.FailNow(t, "gave up waiting for first notification request to be sent") + } + }) + + // Clear out anything potentially leftover + clearStructChannel(alertmanager1ReceivedRequest) + clearStructChannel(alertmanager2ReceivedRequest) + + t.Run("creating notifier with tenant-overridden alertmanager settings sends to correct alertmanager", func(t *testing.T) { + _ = assertManagerMockRunningForUser(t, m, user2) + userNotifier := assertNotifierRunningForUser(t, m, user2) + waitForAlertmanagerToBeDiscovered(t, userNotifier.notifier) + + require.Equal(t, 1, len(userNotifier.notifier.Alertmanagers())) + require.Contains(t, userNotifier.notifier.Alertmanagers()[0].String(), alertmanager2.URL) + + // Send an alert. + userNotifier.notifier.Send(¬ifier.Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")}) + // Wait for the Alertmanager to receive the request. + select { + case <-alertmanager1ReceivedRequest: + require.FailNow(t, "wrong alertmanager received the alert") + case <-alertmanager2ReceivedRequest: + // We can continue. + case <-time.After(time.Second): + require.FailNow(t, "gave up waiting for first notification request to be sent") + } + }) +} + func TestDefaultMultiTenantManager_WaitsToDrainPendingNotificationsOnShutdown(t *testing.T) { releaseReceiver := make(chan struct{}) receiverReceivedRequest := make(chan struct{}, 2) @@ -378,6 +484,12 @@ func getManager(m *DefaultMultiTenantManager, user string) RulesManager { return m.userManagers[user] } +func getNotifier(m *DefaultMultiTenantManager, user string) *rulerNotifier { + m.notifiersMtx.Lock() + defer m.notifiersMtx.Unlock() + return m.notifiers[user] +} + func assertManagerMockRunningForUser(t *testing.T, m *DefaultMultiTenantManager, userID string) *managerMock { t.Helper() @@ -408,6 +520,13 @@ func assertManagerMockStopped(t *testing.T, m *managerMock) { }) } +func assertNotifierRunningForUser(t *testing.T, m *DefaultMultiTenantManager, userID string) *rulerNotifier { + t.Helper() + n := getNotifier(m, userID) + require.NotNil(t, n) + return n +} + func assertRuleGroupsMappedOnDisk(t *testing.T, m *DefaultMultiTenantManager, userID string, expectedRuleGroups rulespb.RuleGroupList) { t.Helper() @@ -444,6 +563,12 @@ func assertRuleGroupsMappedOnDisk(t *testing.T, m *DefaultMultiTenantManager, us } } +func clearStructChannel(ch chan struct{}) { + for len(ch) > 0 { + <-ch + } +} + func managerMockFactory(_ context.Context, _ string, n *notifier.Manager, _ log.Logger, _ prometheus.Registerer) RulesManager { return &managerMock{done: make(chan struct{}), notifier: n} } From 69b8e928e6c5edc5d7f1c68e0ba598f44f53f2ac Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 15:22:01 -0600 Subject: [PATCH 14/15] lint --- pkg/ruler/manager_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index 4bc744c2ec..98f8e23835 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -274,7 +274,7 @@ func TestFilterRuleGroupsByNotEmptyUsers(t *testing.T) { func TestDefaultMultiTenantManager_NotifierConfiguration(t *testing.T) { // We have two alertmanagers. alertmanager1ReceivedRequest := make(chan struct{}, 2) - alertmanager1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + alertmanager1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { alertmanager1ReceivedRequest <- struct{}{} w.WriteHeader(http.StatusOK) })) @@ -283,7 +283,7 @@ func TestDefaultMultiTenantManager_NotifierConfiguration(t *testing.T) { }() alertmanager2ReceivedRequest := make(chan struct{}, 2) - alertmanager2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + alertmanager2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { alertmanager2ReceivedRequest <- struct{}{} w.WriteHeader(http.StatusOK) })) From 1fc443c3a7ff689a8308b277f897100955907baa Mon Sep 17 00:00:00 2001 From: Alex Weaver Date: Thu, 6 Mar 2025 15:29:08 -0600 Subject: [PATCH 15/15] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0a69acab5..23745fe2c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ * [ENHANCEMENT] All: Add `cortex_client_request_invalid_cluster_validation_labels_total` metrics, that is used by Mimir's gRPC clients to track invalid cluster validations. #10767 * [ENHANCEMENT] Ingester client: Add support to configure cluster validation for ingester clients. Failed cluster validations are tracked by `cortex_client_request_invalid_cluster_validation_labels_total` with label `client=ingester`. #10767 * [ENHANCEMENT] Add experimental metric `cortex_distributor_dropped_native_histograms_total` to measure native histograms silently dropped when native histograms are disabled for a tenant. #10760 +* [ENHANCEMENT] Add tenant configuration block `ruler_alertmanager_client_config` which allows the Ruler's Alertmanager client options to be specified on a per-tenant basis. #10816 * [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185 * [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154 * [BUGFIX] Query-frontend and querier: show warning/info annotations in some cases where they were missing (if a lazy querier was used). #10277