diff --git a/CHANGELOG.md b/CHANGELOG.md index c0a69acab5..23745fe2c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ * [ENHANCEMENT] All: Add `cortex_client_request_invalid_cluster_validation_labels_total` metrics, that is used by Mimir's gRPC clients to track invalid cluster validations. #10767 * [ENHANCEMENT] Ingester client: Add support to configure cluster validation for ingester clients. Failed cluster validations are tracked by `cortex_client_request_invalid_cluster_validation_labels_total` with label `client=ingester`. #10767 * [ENHANCEMENT] Add experimental metric `cortex_distributor_dropped_native_histograms_total` to measure native histograms silently dropped when native histograms are disabled for a tenant. #10760 +* [ENHANCEMENT] Add tenant configuration block `ruler_alertmanager_client_config` which allows the Ruler's Alertmanager client options to be specified on a per-tenant basis. #10816 * [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185 * [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154 * [BUGFIX] Query-frontend and querier: show warning/info annotations in some cases where they were missing (if a lazy querier was used). #10277 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index f1f7582d57..35cb9e6d45 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4970,6 +4970,187 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "block", + "name": "ruler_alertmanager_client_config", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "alertmanager_url", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "field", + "name": "tls_enabled", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldType": "boolean", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_cert_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_key_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_ca_path", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_server_name", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_insecure_skip_verify", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldType": "boolean", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_cipher_suites", + "required": false, + "desc": "Override the default cipher suite list (separated by commas). Allowed values:\n\nSecure Ciphers:\n- TLS_AES_128_GCM_SHA256\n- TLS_AES_256_GCM_SHA384\n- TLS_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256\n\nInsecure Ciphers:\n- TLS_RSA_WITH_RC4_128_SHA\n- TLS_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA\n- TLS_RSA_WITH_AES_256_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA256\n- TLS_RSA_WITH_AES_128_GCM_SHA256\n- TLS_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_ECDSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256\n", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "tls_min_version", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + }, + { + "kind": "field", + "name": "basic_auth_username", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "block", + "name": "basic_auth_password", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "block", + "name": "oauth2", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "client_id", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "block", + "name": "client_secret", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "field", + "name": "token_url", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "field", + "name": "scopes", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string" + }, + { + "kind": "field", + "name": "endpoint_params", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": {}, + "fieldType": "map of string to string", + "fieldCategory": "advanced" + } + ], + "fieldValue": null, + "fieldDefaultValue": null + }, + { + "kind": "field", + "name": "proxy_url", + "required": false, + "desc": "", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldType": "string", + "fieldCategory": "advanced" + } + ], + "fieldValue": null, + "fieldDefaultValue": null + }, { "kind": "field", "name": "store_gateway_tenant_shard_size", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index f1aa9b7273..f64f1e328d 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2909,6 +2909,8 @@ Usage of ./cmd/mimir/mimir: OpenStack Swift username. -ruler.alerting-rules-evaluation-enabled Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true) + -ruler.alertmanager-client-config value + Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default. -ruler.alertmanager-client.basic-auth-password string HTTP Basic authentication password. It overrides the password set in the URL (if any). -ruler.alertmanager-client.basic-auth-username string diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 9b3daf16e3..3ac153df73 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -713,6 +713,8 @@ Usage of ./cmd/mimir/mimir: OpenStack Swift username. -ruler.alerting-rules-evaluation-enabled Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true) + -ruler.alertmanager-client-config value + Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default. -ruler.alertmanager-client.basic-auth-password string HTTP Basic authentication password. It overrides the password set in the URL (if any). -ruler.alertmanager-client.basic-auth-username string diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 43222c07f8..588dd8ae9a 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -3881,6 +3881,84 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -ruler.max-independent-rule-evaluation-concurrency-per-tenant [ruler_max_independent_rule_evaluation_concurrency_per_tenant: | default = 4] +# Per-tenant alertmanager client configuration. If not supplied, the tenant's +# notifications will be sent to the ruler-wide default. +ruler_alertmanager_client_config: + [alertmanager_url: | default = ""] + + # (advanced) + [tls_enabled: | default = ] + + # (advanced) + [tls_cert_path: | default = ""] + + # (advanced) + [tls_key_path: | default = ""] + + # (advanced) + [tls_ca_path: | default = ""] + + # (advanced) + [tls_server_name: | default = ""] + + # (advanced) + [tls_insecure_skip_verify: | default = ] + + # (advanced) Override the default cipher suite list (separated by commas). + # Allowed values: + # + # Secure Ciphers: + # - TLS_AES_128_GCM_SHA256 + # - TLS_AES_256_GCM_SHA384 + # - TLS_CHACHA20_POLY1305_SHA256 + # - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA + # - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA + # - TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 + # - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 + # - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256 + # - TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256 + # + # Insecure Ciphers: + # - TLS_RSA_WITH_RC4_128_SHA + # - TLS_RSA_WITH_3DES_EDE_CBC_SHA + # - TLS_RSA_WITH_AES_128_CBC_SHA + # - TLS_RSA_WITH_AES_256_CBC_SHA + # - TLS_RSA_WITH_AES_128_CBC_SHA256 + # - TLS_RSA_WITH_AES_128_GCM_SHA256 + # - TLS_RSA_WITH_AES_256_GCM_SHA384 + # - TLS_ECDHE_ECDSA_WITH_RC4_128_SHA + # - TLS_ECDHE_RSA_WITH_RC4_128_SHA + # - TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA + # - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 + # - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 + [tls_cipher_suites: | default = ""] + + # (advanced) + [tls_min_version: | default = ""] + + [basic_auth_username: | default = ""] + + basic_auth_password: + + oauth2: + [client_id: | default = ""] + + client_secret: + + [token_url: | default = ""] + + [scopes: | default = ""] + + # (advanced) + [endpoint_params: | default = ] + + # (advanced) + [proxy_url: | default = ""] + # The tenant's shard size, used when store-gateway sharding is enabled. Value of # 0 disables shuffle sharding for the tenant, that is all tenant blocks are # sharded across all store-gateway replicas. diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 0459d10c5c..50306dac63 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -966,7 +966,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { ) dnsResolver := dns.NewProvider(util_log.Logger, dnsProviderReg, dns.GolangResolverType) - manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver) + manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver, t.Overrides) if err != nil { return nil, err } diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 87e951a90d..841d06dfb0 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -28,6 +28,7 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier" querier_stats "github.com/grafana/mimir/pkg/querier/stats" + notifierCfg "github.com/grafana/mimir/pkg/ruler/notifier" util_log "github.com/grafana/mimir/pkg/util/log" ) @@ -213,6 +214,7 @@ type RulesLimits interface { RulerSyncRulesOnChangesEnabled(userID string) bool RulerProtectedNamespaces(userID string) []string RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID string) int64 + RulerAlertmanagerClientConfig(userID string) notifierCfg.AlertmanagerClientConfig } func MetricsQueryFunc(qf rules.QueryFunc, userID string, queries, failedQueries *prometheus.CounterVec, remoteQuerier bool) rules.QueryFunc { diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 1d4de52f76..027f964b8c 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -35,6 +35,9 @@ type DefaultMultiTenantManager struct { cfg Config notifierCfg *config.Config managerFactory ManagerFactory + limits RulesLimits + dnsResolver AddressProvider + refreshMetrics discovery.RefreshMetricsManager mapper *mapper @@ -59,9 +62,9 @@ type DefaultMultiTenantManager struct { rulerIsRunning atomic.Bool } -func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider) (*DefaultMultiTenantManager, error) { +func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider, limits RulesLimits) (*DefaultMultiTenantManager, error) { refreshMetrics := discovery.NewRefreshMetrics(reg) - ncfg, err := buildNotifierConfig(&cfg, dnsResolver, refreshMetrics) + ncfg, err := buildNotifierConfig(cfg.AlertmanagerURL, cfg.Notifier, cfg, dnsResolver, refreshMetrics) if err != nil { return nil, err } @@ -75,6 +78,9 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg cfg: cfg, notifierCfg: ncfg, managerFactory: managerFactory, + limits: limits, + dnsResolver: dnsResolver, + refreshMetrics: refreshMetrics, notifiers: map[string]*rulerNotifier{}, mapper: newMapper(cfg.RulePath, logger), userManagers: map[string]RulesManager{}, @@ -321,8 +327,17 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie n.run() + notifierCfg := r.notifierCfg + userSpecificCfg := r.limits.RulerAlertmanagerClientConfig(userID) + if !userSpecificCfg.IsDefault() { + notifierCfg, err = buildNotifierConfig(userSpecificCfg.AlertmanagerURL, userSpecificCfg.NotifierConfig, r.cfg, r.dnsResolver, r.refreshMetrics) + if err != nil { + return nil, err + } + } + // This should never fail, unless there's a programming mistake. - if err := n.applyConfig(r.notifierCfg); err != nil { + if err := n.applyConfig(notifierCfg); err != nil { return nil, err } diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index ac0766e84f..98f8e23835 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -29,8 +29,10 @@ import ( "go.uber.org/atomic" "gopkg.in/yaml.v3" + rulernotifier "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/ruler/rulespb" testutil "github.com/grafana/mimir/pkg/util/test" + "github.com/grafana/mimir/pkg/util/validation" ) func TestDefaultMultiTenantManager_SyncFullRuleGroups(t *testing.T) { @@ -46,7 +48,7 @@ func TestDefaultMultiTenantManager_SyncFullRuleGroups(t *testing.T) { user2Group1 = createRuleGroup("group-1", user2, createRecordingRule("sum:metric_1", "sum(metric_1)")) ) - m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) // Initialise the manager with some rules and start it. @@ -132,7 +134,7 @@ func TestDefaultMultiTenantManager_SyncPartialRuleGroups(t *testing.T) { user2Group1 = createRuleGroup("group-1", user2, createRecordingRule("sum:metric_1", "sum(metric_1)")) ) - m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) t.Cleanup(m.Stop) @@ -269,6 +271,111 @@ func TestFilterRuleGroupsByNotEmptyUsers(t *testing.T) { } } +func TestDefaultMultiTenantManager_NotifierConfiguration(t *testing.T) { + // We have two alertmanagers. + alertmanager1ReceivedRequest := make(chan struct{}, 2) + alertmanager1 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + alertmanager1ReceivedRequest <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer func() { + alertmanager1.Close() + }() + + alertmanager2ReceivedRequest := make(chan struct{}, 2) + alertmanager2 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + alertmanager2ReceivedRequest <- struct{}{} + w.WriteHeader(http.StatusOK) + })) + defer func() { + alertmanager2.Close() + }() + + ctx := context.Background() + logger := testutil.NewTestingLogger(t) + + // We have two users with rules. + const user1 = "user-1" + user1Group := createRuleGroup("group-1", user1, createRecordingRule("count:metric_1", "count(metric_1)")) + + const user2 = "user-2" + user2Group := createRuleGroup("group-1", user2, createRecordingRule("count:metric_1", "count(metric_1)")) + + // The ruler config points at alertmanager 1. + cfg := Config{ + RulePath: t.TempDir(), + AlertmanagerURL: alertmanager1.URL, + NotificationQueueCapacity: 1000, + NotificationTimeout: 10 * time.Second, + } + + // user-2's tenant configuration is overriddent to point at alertmanager 2. + overrides := validation.MockOverrides(func(_ *validation.Limits, tenantLimits map[string]*validation.Limits) { + tenantLimits[user1] = validation.MockDefaultLimits() + tenantLimits[user2] = validation.MockDefaultLimits() + tenantLimits[user2].RulerAlertmanagerClientConfig = rulernotifier.AlertmanagerClientConfig{ + AlertmanagerURL: alertmanager2.URL, + } + }) + + // Start a manager. + m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil, overrides) + require.NoError(t, err) + defer m.Stop() + m.SyncFullRuleGroups(ctx, map[string]rulespb.RuleGroupList{ + user1: {user1Group}, + user2: {user2Group}, + }) + m.Start() + + t.Run("creating notifier with global alertmanager settings sends to correct alertmanager", func(t *testing.T) { + _ = assertManagerMockRunningForUser(t, m, user1) + userNotifier := assertNotifierRunningForUser(t, m, user1) + waitForAlertmanagerToBeDiscovered(t, userNotifier.notifier) + + require.Equal(t, 1, len(userNotifier.notifier.Alertmanagers())) + require.Contains(t, userNotifier.notifier.Alertmanagers()[0].String(), alertmanager1.URL) + + // Send an alert. + userNotifier.notifier.Send(¬ifier.Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")}) + + // Wait for the Alertmanager to receive the request. + select { + case <-alertmanager1ReceivedRequest: + // We can continue. + case <-alertmanager2ReceivedRequest: + require.FailNow(t, "wrong alertmanager received the alert") + case <-time.After(time.Second): + require.FailNow(t, "gave up waiting for first notification request to be sent") + } + }) + + // Clear out anything potentially leftover + clearStructChannel(alertmanager1ReceivedRequest) + clearStructChannel(alertmanager2ReceivedRequest) + + t.Run("creating notifier with tenant-overridden alertmanager settings sends to correct alertmanager", func(t *testing.T) { + _ = assertManagerMockRunningForUser(t, m, user2) + userNotifier := assertNotifierRunningForUser(t, m, user2) + waitForAlertmanagerToBeDiscovered(t, userNotifier.notifier) + + require.Equal(t, 1, len(userNotifier.notifier.Alertmanagers())) + require.Contains(t, userNotifier.notifier.Alertmanagers()[0].String(), alertmanager2.URL) + + // Send an alert. + userNotifier.notifier.Send(¬ifier.Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")}) + // Wait for the Alertmanager to receive the request. + select { + case <-alertmanager1ReceivedRequest: + require.FailNow(t, "wrong alertmanager received the alert") + case <-alertmanager2ReceivedRequest: + // We can continue. + case <-time.After(time.Second): + require.FailNow(t, "gave up waiting for first notification request to be sent") + } + }) +} + func TestDefaultMultiTenantManager_WaitsToDrainPendingNotificationsOnShutdown(t *testing.T) { releaseReceiver := make(chan struct{}) receiverReceivedRequest := make(chan struct{}, 2) @@ -308,7 +415,7 @@ func TestDefaultMultiTenantManager_WaitsToDrainPendingNotificationsOnShutdown(t NotificationQueueCapacity: 1000, NotificationTimeout: 10 * time.Second, } - m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil) + m, err := NewDefaultMultiTenantManager(cfg, managerMockFactory, nil, logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) m.SyncFullRuleGroups(ctx, map[string]rulespb.RuleGroupList{ @@ -377,6 +484,12 @@ func getManager(m *DefaultMultiTenantManager, user string) RulesManager { return m.userManagers[user] } +func getNotifier(m *DefaultMultiTenantManager, user string) *rulerNotifier { + m.notifiersMtx.Lock() + defer m.notifiersMtx.Unlock() + return m.notifiers[user] +} + func assertManagerMockRunningForUser(t *testing.T, m *DefaultMultiTenantManager, userID string) *managerMock { t.Helper() @@ -407,6 +520,13 @@ func assertManagerMockStopped(t *testing.T, m *managerMock) { }) } +func assertNotifierRunningForUser(t *testing.T, m *DefaultMultiTenantManager, userID string) *rulerNotifier { + t.Helper() + n := getNotifier(m, userID) + require.NotNil(t, n) + return n +} + func assertRuleGroupsMappedOnDisk(t *testing.T, m *DefaultMultiTenantManager, userID string, expectedRuleGroups rulespb.RuleGroupList) { t.Helper() @@ -443,6 +563,12 @@ func assertRuleGroupsMappedOnDisk(t *testing.T, m *DefaultMultiTenantManager, us } } +func clearStructChannel(ch chan struct{}) { + for len(ch) > 0 { + <-ch + } +} + func managerMockFactory(_ context.Context, _ string, n *notifier.Manager, _ log.Logger, _ prometheus.Registerer) RulesManager { return &managerMock{done: make(chan struct{}), notifier: n} } diff --git a/pkg/ruler/notifier.go b/pkg/ruler/notifier.go index ab21552125..7babea231c 100644 --- a/pkg/ruler/notifier.go +++ b/pkg/ruler/notifier.go @@ -8,23 +8,21 @@ package ruler import ( "context" "errors" - "flag" "net/url" "strings" "sync" + "time" gklog "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/cancellation" - "github.com/grafana/dskit/crypto/tls" - "github.com/grafana/dskit/flagext" config_util "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" "github.com/prometheus/prometheus/notifier" - "github.com/grafana/mimir/pkg/util" + rulernotifier "github.com/grafana/mimir/pkg/ruler/notifier" util_log "github.com/grafana/mimir/pkg/util/log" ) @@ -33,45 +31,6 @@ var ( errRulerSimultaneousBasicAuthAndOAuth = errors.New("cannot use both Basic Auth and OAuth2 simultaneously") ) -type NotifierConfig struct { - TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` - TLS tls.ClientConfig `yaml:",inline"` - BasicAuth util.BasicAuth `yaml:",inline"` - OAuth2 OAuth2Config `yaml:"oauth2"` - ProxyURL string `yaml:"proxy_url" category:"advanced"` -} - -func (cfg *NotifierConfig) RegisterFlags(f *flag.FlagSet) { - f.BoolVar(&cfg.TLSEnabled, "ruler.alertmanager-client.tls-enabled", true, "Enable TLS for gRPC client connecting to alertmanager.") - cfg.TLS.RegisterFlagsWithPrefix("ruler.alertmanager-client", f) - cfg.BasicAuth.RegisterFlagsWithPrefix("ruler.alertmanager-client.", f) - cfg.OAuth2.RegisterFlagsWithPrefix("ruler.alertmanager-client.oauth.", f) - f.StringVar(&cfg.ProxyURL, "ruler.alertmanager-client.proxy-url", "", "Optional HTTP, HTTPS via CONNECT, or SOCKS5 proxy URL to route requests through. Applies to all requests, including auxiliary traffic, such as OAuth token requests.") -} - -type OAuth2Config struct { - ClientID string `yaml:"client_id"` - ClientSecret flagext.Secret `yaml:"client_secret"` - TokenURL string `yaml:"token_url"` - Scopes flagext.StringSliceCSV `yaml:"scopes,omitempty"` - EndpointParams flagext.LimitsMap[string] `yaml:"endpoint_params" category:"advanced"` -} - -func (cfg *OAuth2Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { - f.StringVar(&cfg.ClientID, prefix+"client_id", "", "OAuth2 client ID. Enables the use of OAuth2 for authenticating with Alertmanager.") - f.Var(&cfg.ClientSecret, prefix+"client_secret", "OAuth2 client secret.") - f.StringVar(&cfg.TokenURL, prefix+"token_url", "", "Endpoint used to fetch access token.") - f.Var(&cfg.Scopes, prefix+"scopes", "Optional scopes to include with the token request.") - if !cfg.EndpointParams.IsInitialized() { - cfg.EndpointParams = flagext.NewLimitsMap[string](nil) - } - f.Var(&cfg.EndpointParams, prefix+"endpoint-params", "Optional additional URL parameters to send to the token URL.") -} - -func (cfg *OAuth2Config) IsEnabled() bool { - return cfg.ClientID != "" || cfg.TokenURL != "" -} - // rulerNotifier bundles a notifier.Manager together with an associated // Alertmanager service discovery manager and handles the lifecycle // of both actors. @@ -141,13 +100,13 @@ func (rn *rulerNotifier) stop() { // Builds a Prometheus config.Config from a ruler.Config with just the required // options to configure notifications to Alertmanager. -func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi discovery.RefreshMetricsManager) (*config.Config, error) { - if rulerConfig.AlertmanagerURL == "" { +func buildNotifierConfig(amURL string, notifierCfg rulernotifier.Config, rulerConfig Config, resolver AddressProvider, rmi discovery.RefreshMetricsManager) (*config.Config, error) { + if amURL == "" { // no AM URLs were provided, so we can just return a default config without errors return &config.Config{}, nil } - amURLs := strings.Split(rulerConfig.AlertmanagerURL, ",") + amURLs := strings.Split(amURL, ",") amConfigs := make([]*config.AlertmanagerConfig, 0, len(amURLs)) for _, rawURL := range amURLs { @@ -163,7 +122,7 @@ func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi disc sdConfig = staticTarget(url) } - amCfgWithSD, err := amConfigWithSD(rulerConfig, url, sdConfig) + amCfgWithSD, err := amConfigWithSD(url, notifierCfg, sdConfig, rulerConfig.NotificationTimeout) if err != nil { return nil, err } @@ -179,12 +138,12 @@ func buildNotifierConfig(rulerConfig *Config, resolver AddressProvider, rmi disc return promConfig, nil } -func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config) (*config.AlertmanagerConfig, error) { +func amConfigWithSD(url *url.URL, notifierCfg rulernotifier.Config, sdConfig discovery.Config, notifyTimeout time.Duration) (*config.AlertmanagerConfig, error) { amConfig := &config.AlertmanagerConfig{ APIVersion: config.AlertmanagerAPIVersionV2, Scheme: url.Scheme, PathPrefix: url.Path, - Timeout: model.Duration(rulerConfig.NotificationTimeout), + Timeout: model.Duration(notifyTimeout), ServiceDiscoveryConfigs: discovery.Configs{sdConfig}, HTTPClientConfig: config_util.HTTPClientConfig{}, } @@ -201,16 +160,16 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Override URL basic authentication configs with hard coded config values if present - if rulerConfig.Notifier.BasicAuth.IsEnabled() { + if notifierCfg.BasicAuth.IsEnabled() { amConfig.HTTPClientConfig.BasicAuth = &config_util.BasicAuth{ - Username: rulerConfig.Notifier.BasicAuth.Username, - Password: config_util.Secret(rulerConfig.Notifier.BasicAuth.Password.String()), + Username: notifierCfg.BasicAuth.Username, + Password: config_util.Secret(notifierCfg.BasicAuth.Password.String()), } } // Whether to use an optional HTTP, HTTP+CONNECT, or SOCKS5 proxy. - if rulerConfig.Notifier.ProxyURL != "" { - url, err := url.Parse(rulerConfig.Notifier.ProxyURL) + if notifierCfg.ProxyURL != "" { + url, err := url.Parse(notifierCfg.ProxyURL) if err != nil { return nil, err } @@ -218,24 +177,24 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Whether to use OAuth2 or not. - if rulerConfig.Notifier.OAuth2.IsEnabled() { + if notifierCfg.OAuth2.IsEnabled() { if amConfig.HTTPClientConfig.BasicAuth != nil { return nil, errRulerSimultaneousBasicAuthAndOAuth } amConfig.HTTPClientConfig.OAuth2 = &config_util.OAuth2{ - ClientID: rulerConfig.Notifier.OAuth2.ClientID, - ClientSecret: config_util.Secret(rulerConfig.Notifier.OAuth2.ClientSecret.String()), - TokenURL: rulerConfig.Notifier.OAuth2.TokenURL, - Scopes: rulerConfig.Notifier.OAuth2.Scopes, + ClientID: notifierCfg.OAuth2.ClientID, + ClientSecret: config_util.Secret(notifierCfg.OAuth2.ClientSecret.String()), + TokenURL: notifierCfg.OAuth2.TokenURL, + Scopes: notifierCfg.OAuth2.Scopes, } - if rulerConfig.Notifier.OAuth2.EndpointParams.IsInitialized() { - amConfig.HTTPClientConfig.OAuth2.EndpointParams = rulerConfig.Notifier.OAuth2.EndpointParams.Read() + if notifierCfg.OAuth2.EndpointParams.IsInitialized() { + amConfig.HTTPClientConfig.OAuth2.EndpointParams = notifierCfg.OAuth2.EndpointParams.Read() } - if rulerConfig.Notifier.ProxyURL != "" { - url, err := url.Parse(rulerConfig.Notifier.ProxyURL) + if notifierCfg.ProxyURL != "" { + url, err := url.Parse(notifierCfg.ProxyURL) if err != nil { return nil, err } @@ -244,29 +203,29 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config } // Whether to use TLS or not. - if rulerConfig.Notifier.TLSEnabled { - if rulerConfig.Notifier.TLS.Reader == nil { + if notifierCfg.TLSEnabled { + if notifierCfg.TLS.Reader == nil { amConfig.HTTPClientConfig.TLSConfig = config_util.TLSConfig{ - CAFile: rulerConfig.Notifier.TLS.CAPath, - CertFile: rulerConfig.Notifier.TLS.CertPath, - KeyFile: rulerConfig.Notifier.TLS.KeyPath, - InsecureSkipVerify: rulerConfig.Notifier.TLS.InsecureSkipVerify, - ServerName: rulerConfig.Notifier.TLS.ServerName, + CAFile: notifierCfg.TLS.CAPath, + CertFile: notifierCfg.TLS.CertPath, + KeyFile: notifierCfg.TLS.KeyPath, + InsecureSkipVerify: notifierCfg.TLS.InsecureSkipVerify, + ServerName: notifierCfg.TLS.ServerName, } } else { - cert, err := rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.CertPath) + cert, err := notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.CertPath) if err != nil { return nil, err } - key, err := rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.KeyPath) + key, err := notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.KeyPath) if err != nil { return nil, err } var ca []byte - if rulerConfig.Notifier.TLS.CAPath != "" { - ca, err = rulerConfig.Notifier.TLS.Reader.ReadSecret(rulerConfig.Notifier.TLS.CAPath) + if notifierCfg.TLS.CAPath != "" { + ca, err = notifierCfg.TLS.Reader.ReadSecret(notifierCfg.TLS.CAPath) if err != nil { return nil, err } @@ -276,8 +235,8 @@ func amConfigWithSD(rulerConfig *Config, url *url.URL, sdConfig discovery.Config CA: string(ca), Cert: string(cert), Key: config_util.Secret(key), - InsecureSkipVerify: rulerConfig.Notifier.TLS.InsecureSkipVerify, - ServerName: rulerConfig.Notifier.TLS.ServerName, + InsecureSkipVerify: notifierCfg.TLS.InsecureSkipVerify, + ServerName: notifierCfg.TLS.ServerName, } } } diff --git a/pkg/ruler/notifier/notifier_config.go b/pkg/ruler/notifier/notifier_config.go new file mode 100644 index 0000000000..323c05eec7 --- /dev/null +++ b/pkg/ruler/notifier/notifier_config.go @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package notifier + +import ( + "flag" + "fmt" + "hash/fnv" + + "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/dskit/flagext" + "gopkg.in/yaml.v3" + + "github.com/grafana/mimir/pkg/util" +) + +var DefaultAlertmanagerClientConfig = AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMap[string](nil), + }, + }, +} + +type AlertmanagerClientConfig struct { + AlertmanagerURL string `yaml:"alertmanager_url" doc:"nocli"` + NotifierConfig Config `yaml:",inline" json:",inline"` +} + +func (acc *AlertmanagerClientConfig) String() string { + out, err := yaml.Marshal(acc) + if err != nil { + return fmt.Sprintf("failed to marshal: %v", err) + } + return string(out) +} + +func (acc *AlertmanagerClientConfig) Set(s string) error { + cfg := AlertmanagerClientConfig{} + if err := yaml.Unmarshal([]byte(s), &cfg); err != nil { + return err + } + *acc = cfg + return nil +} + +// Hash calculates a cryptographically weak, insecure hash of the configuration. +func (acc *AlertmanagerClientConfig) Hash() uint64 { + h := fnv.New64a() + h.Write([]byte(acc.String())) + return h.Sum64() +} + +func (acc *AlertmanagerClientConfig) Equal(other AlertmanagerClientConfig) bool { + return acc.Hash() == other.Hash() +} + +func (acc *AlertmanagerClientConfig) IsDefault() bool { + return acc.Equal(DefaultAlertmanagerClientConfig) +} + +type Config struct { + TLSEnabled bool `yaml:"tls_enabled" category:"advanced"` + TLS tls.ClientConfig `yaml:",inline"` + BasicAuth util.BasicAuth `yaml:",inline"` + OAuth2 OAuth2Config `yaml:"oauth2"` + ProxyURL string `yaml:"proxy_url" category:"advanced"` +} + +func (cfg *Config) RegisterFlags(f *flag.FlagSet) { + f.BoolVar(&cfg.TLSEnabled, "ruler.alertmanager-client.tls-enabled", true, "Enable TLS for gRPC client connecting to alertmanager.") + cfg.TLS.RegisterFlagsWithPrefix("ruler.alertmanager-client", f) + cfg.BasicAuth.RegisterFlagsWithPrefix("ruler.alertmanager-client.", f) + cfg.OAuth2.RegisterFlagsWithPrefix("ruler.alertmanager-client.oauth.", f) + f.StringVar(&cfg.ProxyURL, "ruler.alertmanager-client.proxy-url", "", "Optional HTTP, HTTPS via CONNECT, or SOCKS5 proxy URL to route requests through. Applies to all requests, including auxiliary traffic, such as OAuth token requests.") +} + +type OAuth2Config struct { + ClientID string `yaml:"client_id"` + ClientSecret flagext.Secret `yaml:"client_secret"` + TokenURL string `yaml:"token_url"` + Scopes flagext.StringSliceCSV `yaml:"scopes,omitempty"` + EndpointParams flagext.LimitsMap[string] `yaml:"endpoint_params" category:"advanced"` +} + +func (cfg *OAuth2Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { + f.StringVar(&cfg.ClientID, prefix+"client_id", "", "OAuth2 client ID. Enables the use of OAuth2 for authenticating with Alertmanager.") + f.Var(&cfg.ClientSecret, prefix+"client_secret", "OAuth2 client secret.") + f.StringVar(&cfg.TokenURL, prefix+"token_url", "", "Endpoint used to fetch access token.") + f.Var(&cfg.Scopes, prefix+"scopes", "Optional scopes to include with the token request.") + if !cfg.EndpointParams.IsInitialized() { + cfg.EndpointParams = flagext.NewLimitsMap[string](nil) + } + f.Var(&cfg.EndpointParams, prefix+"endpoint-params", "Optional additional URL parameters to send to the token URL.") +} + +func (cfg *OAuth2Config) IsEnabled() bool { + return cfg.ClientID != "" || cfg.TokenURL != "" +} diff --git a/pkg/ruler/notifier/notifier_config_test.go b/pkg/ruler/notifier/notifier_config_test.go new file mode 100644 index 0000000000..cf058f61c0 --- /dev/null +++ b/pkg/ruler/notifier/notifier_config_test.go @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package notifier + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/dskit/flagext" + "github.com/stretchr/testify/require" +) + +func TestAlertmanagerClientConfig(t *testing.T) { + t.Run("IsDefault", func(t *testing.T) { + tc := []struct { + name string + cfg *AlertmanagerClientConfig + exp bool + }{ + { + name: "default", + cfg: &DefaultAlertmanagerClientConfig, + exp: true, + }, + { + name: "initialized limits map", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMap[string](nil), + }, + }, + }, + exp: true, + }, + { + name: "empty scopes", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{}, + }, + }, + }, + exp: true, + }, + { + name: "golang default", + cfg: &AlertmanagerClientConfig{}, + exp: true, + }, + { + name: "custom TLS reader ignored", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: &fakeSecretReader{}, + }, + }, + }, + exp: true, + }, + { + name: "modified field", + cfg: &AlertmanagerClientConfig{ + AlertmanagerURL: "test", + }, + exp: false, + }, + { + name: "modified endpoint param", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{"k1": "v1"}, nil), + }, + }, + }, + exp: false, + }, + { + name: "modified scope", + cfg: &AlertmanagerClientConfig{ + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"asdf"}, + }, + }, + }, + exp: false, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.exp, tt.cfg.IsDefault(), tt.cfg) + }) + } + }) + + t.Run("Equal", func(t *testing.T) { + tc := []struct { + name string + cfg1 AlertmanagerClientConfig + cfg2 AlertmanagerClientConfig + exp bool + }{ + { + name: "same values", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + exp: true, + }, + { + name: "differing value", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://another-url", + NotifierConfig: Config{ + ProxyURL: "http://some-proxy:1234", + TLS: tls.ClientConfig{ + CertPath: "cert-path", + KeyPath: "key-path", + CAPath: "ca-path", + ServerName: "server", + InsecureSkipVerify: true, + CipherSuites: "TLS_AES_256_GCM_SHA384", + MinVersion: "1.3", + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + exp: false, + }, + { + name: "different endpoint params order", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + "key2": "value2", + }, nil), + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key2": "value2", + "key1": "value1", + }, nil), + }, + }, + }, + exp: true, + }, + { + name: "different scopes order", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"s1", "s2"}, + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + OAuth2: OAuth2Config{ + Scopes: []string{"s2", "s1"}, + }, + }, + }, + exp: false, + }, + { + name: "ignores different secrets reader", + cfg1: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: &fakeSecretReader{}, + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + }, + }, + }, + cfg2: AlertmanagerClientConfig{ + AlertmanagerURL: "http://some-url", + NotifierConfig: Config{ + TLS: tls.ClientConfig{ + Reader: nil, + }, + OAuth2: OAuth2Config{ + ClientID: "myclient", + }, + }, + }, + exp: true, + }, + } + + for _, tt := range tc { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.exp, tt.cfg1.Equal(tt.cfg2), cmp.Diff(tt.cfg1, tt.cfg2)) + }) + } + }) +} + +type fakeSecretReader struct{} + +func (fsr *fakeSecretReader) ReadSecret(_ string) ([]byte, error) { + return []byte{}, nil +} diff --git a/pkg/ruler/notifier_test.go b/pkg/ruler/notifier_test.go index 99f1ed8d84..958c62d1b0 100644 --- a/pkg/ruler/notifier_test.go +++ b/pkg/ruler/notifier_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/util" ) @@ -203,7 +204,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with basic authentication URL, no service discovery, and explicit config", cfg: &Config{ AlertmanagerURL: "http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ BasicAuth: util.BasicAuth{ Username: "jacob", Password: flagext.SecretWithValue("test"), @@ -273,7 +274,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with service discovery URL, basic auth, and proxy URL", cfg: &Config{ AlertmanagerURL: "dnssrv+https://marco:hunter2@_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://my-proxy.proxy-namespace.svc.cluster.local.:1234", }, }, @@ -305,8 +306,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -342,8 +343,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and optional scopes", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -381,8 +382,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and optional endpoint params", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -426,9 +427,9 @@ func TestBuildNotifierConfig(t *testing.T) { name: "with OAuth2 and proxy_url simultaneously, inheriting proxy", cfg: &Config{ AlertmanagerURL: "dnssrv+https://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://my-proxy.proxy-namespace.svc.cluster.local.:1234", - OAuth2: OAuth2Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -493,7 +494,7 @@ func TestBuildNotifierConfig(t *testing.T) { name: "misspelled proxy URL", cfg: &Config{ AlertmanagerURL: "http://alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ ProxyURL: "http://example.local" + string(rune(0x7f)), }, }, @@ -503,11 +504,11 @@ func TestBuildNotifierConfig(t *testing.T) { name: "basic auth and oauth provided at the same time", cfg: &Config{ AlertmanagerURL: "http://alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ + Notifier: notifier.Config{ BasicAuth: util.BasicAuth{ Username: "test-user", }, - OAuth2: OAuth2Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -521,8 +522,8 @@ func TestBuildNotifierConfig(t *testing.T) { name: "basic auth via URL and oauth provided at the same time", cfg: &Config{ AlertmanagerURL: "http://marco:hunter2@alertmanager.default.svc.cluster.local/alertmanager", - Notifier: NotifierConfig{ - OAuth2: OAuth2Config{ + Notifier: notifier.Config{ + OAuth2: notifier.OAuth2Config{ ClientID: "oauth2-client-id", ClientSecret: flagext.SecretWithValue("test"), TokenURL: "https://oauth2-token-endpoint.local/token", @@ -536,7 +537,7 @@ func TestBuildNotifierConfig(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - ncfg, err := buildNotifierConfig(tt.cfg, nil, nil) + ncfg, err := buildNotifierConfig(tt.cfg.AlertmanagerURL, tt.cfg.Notifier, *tt.cfg, nil, nil) if tt.err == nil { require.NoError(t, err) require.Equal(t, tt.ncfg, ncfg) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 607eeec5fe..52fe2d0619 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -41,6 +41,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/mimirpb" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/ruler/rulespb" "github.com/grafana/mimir/pkg/ruler/rulestore" "github.com/grafana/mimir/pkg/util" @@ -114,7 +115,7 @@ type Config struct { // HTTP timeout duration when sending notifications to the Alertmanager. NotificationTimeout time.Duration `yaml:"notification_timeout" category:"advanced"` // Client configs for interacting with the Alertmanager - Notifier NotifierConfig `yaml:"alertmanager_client"` + Notifier notifier.Config `yaml:"alertmanager_client"` // Max time to tolerate outage for restoring "for" state of alert. OutageTolerance time.Duration `yaml:"for_outage_tolerance" category:"advanced"` diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 298fe24f50..87ad8dc136 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -250,7 +250,7 @@ func prepareRulerManager(t *testing.T, cfg Config, opts ...prepareOption) *Defau pusher.MockPush(&mimirpb.WriteResponse{}, nil) managerFactory := DefaultTenantManagerFactory(cfg, pusher, noopQueryable, queryFunc, &NoopMultiTenantConcurrencyController{}, options.limits, options.registerer) - manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, prometheus.NewRegistry(), options.logger, nil) + manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, prometheus.NewRegistry(), options.logger, nil, validation.MockOverrides(nil)) require.NoError(t, err) return manager diff --git a/pkg/ruler/service_discovery.go b/pkg/ruler/service_discovery.go index 33a9ca5fc1..72016dda85 100644 --- a/pkg/ruler/service_discovery.go +++ b/pkg/ruler/service_discovery.go @@ -42,7 +42,7 @@ type dnsServiceDiscovery struct { host string } -func dnsSD(rulerConfig *Config, resolver AddressProvider, qType dns.QType, url *url.URL, rmi discovery.RefreshMetricsInstantiator) discovery.Config { +func dnsSD(rulerConfig Config, resolver AddressProvider, qType dns.QType, url *url.URL, rmi discovery.RefreshMetricsInstantiator) discovery.Config { return dnsServiceDiscovery{ resolver: resolver, refreshInterval: rulerConfig.AlertmanagerRefreshInterval, diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 6fed2983f7..597858472a 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -25,6 +25,7 @@ import ( asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/querier/api" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/util" ) @@ -206,17 +207,18 @@ type Limits struct { CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` // Ruler defaults and limits. - RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` - RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` - RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` - RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` - RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` - RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` - RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` - RulerMaxRulesPerRuleGroupByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` - RulerMaxRuleGroupsPerTenantByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` - RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` - RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"experimental"` + RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` + RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` + RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` + RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` + RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` + RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` + RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` + RulerMaxRulesPerRuleGroupByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` + RulerMaxRuleGroupsPerTenantByNamespace flagext.LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` + RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` + RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"experimental"` + RulerAlertmanagerClientConfig notifier.AlertmanagerClientConfig `yaml:"ruler_alertmanager_client_config" json:"ruler_alertmanager_client_config" category:"experimental" doc:"nocli|description=Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default."` // Store-gateway. StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -369,6 +371,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.Var(&l.RulerMaxRuleGroupsPerTenantByNamespace, "ruler.max-rule-groups-per-tenant-by-namespace", "Maximum number of rule groups per tenant by namespace. Value is a map, where each key is the namespace and value is the number of rule groups allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rule groups specified has the same meaning as -ruler.max-rule-groups-per-tenant, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rule-groups-per-tenant.") f.Var(&l.RulerProtectedNamespaces, "ruler.protected-namespaces", "List of namespaces that are protected from modification unless a special HTTP header is used. If a namespace is protected, it can only be read, not modified via the ruler's configuration API. The value is a list of strings, where each string is a namespace name. On the command line, this list is given as a comma-separated list.") f.Int64Var(&l.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant, "ruler.max-independent-rule-evaluation-concurrency-per-tenant", 4, "Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable.") + if !l.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams.IsInitialized() { + l.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams = flagext.NewLimitsMap[string](nil) + } + f.Var(&l.RulerAlertmanagerClientConfig, "ruler.alertmanager-client-config", "Per-tenant alertmanager client configuration. If not supplied, the tenant's notifications will be sent to the ruler-wide default.") f.Var(&l.CompactorBlocksRetentionPeriod, "compactor.blocks-retention-period", "Delete blocks containing samples older than the specified retention period. Also used by query-frontend to avoid querying beyond the retention period by instant, range or remote read queries. 0 to disable.") f.IntVar(&l.CompactorSplitAndMergeShards, "compactor.split-and-merge-shards", 0, "The number of shards to use when splitting blocks. 0 to disable splitting.") @@ -1042,6 +1048,10 @@ func (o *Overrides) RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID return o.getOverridesForUser(userID).RulerMaxIndependentRuleEvaluationConcurrencyPerTenant } +func (o *Overrides) RulerAlertmanagerClientConfig(userID string) notifier.AlertmanagerClientConfig { + return o.getOverridesForUser(userID).RulerAlertmanagerClientConfig +} + // StoreGatewayTenantShardSize returns the store-gateway shard size for a given user. func (o *Overrides) StoreGatewayTenantShardSize(userID string) int { return o.getOverridesForUser(userID).StoreGatewayTenantShardSize diff --git a/pkg/util/validation/limits_mock.go b/pkg/util/validation/limits_mock.go index b22ce0fc9f..7520e790ae 100644 --- a/pkg/util/validation/limits_mock.go +++ b/pkg/util/validation/limits_mock.go @@ -31,7 +31,9 @@ func (l *mockTenantLimits) AllByUserID() map[string]*Limits { func MockOverrides(customize func(defaults *Limits, tenantLimits map[string]*Limits)) *Overrides { defaults := MockDefaultLimits() tenantLimits := map[string]*Limits{} - customize(defaults, tenantLimits) + if customize != nil { + customize(defaults, tenantLimits) + } overrides, err := NewOverrides(*defaults, NewMockTenantLimits(tenantLimits)) if err != nil { diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 67b86f7463..12630a377c 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -24,6 +24,8 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" "gopkg.in/yaml.v3" + + "github.com/grafana/mimir/pkg/ruler/notifier" ) func TestMain(m *testing.M) { @@ -1026,6 +1028,110 @@ user1: } } +func TestRulerAlertmanagerClientConfig(t *testing.T) { + tc := map[string]struct { + baseYAML string + overrides string + expectedConfig notifier.AlertmanagerClientConfig + }{ + "no override provided": { + baseYAML: ``, + expectedConfig: notifier.DefaultAlertmanagerClientConfig, + }, + "no user specific client config": { + baseYAML: ` +ruler_alertmanager_client_config: + alertmanager_url: http://custom-url:8080 + proxy_url: http://some-proxy:1234 + oauth2: + client_id: myclient + client_secret: mysecret + token_url: http://token-url + scopes: abc,def + endpoint_params: + key1: value1 +`, + expectedConfig: notifier.AlertmanagerClientConfig{ + AlertmanagerURL: "http://custom-url:8080", + NotifierConfig: notifier.Config{ + ProxyURL: "http://some-proxy:1234", + OAuth2: notifier.OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + }, + "overridden config for specific user": { + baseYAML: ` +ruler_alertmanager_client_config: + alertmanager_url: http://some-base-url:8080 +`, + overrides: ` +user1: + ruler_alertmanager_client_config: + alertmanager_url: http://custom-url-for-this-tenant:8080 + proxy_url: http://some-proxy:1234 + oauth2: + client_id: myclient + client_secret: mysecret + token_url: http://token-url + scopes: abc,def + endpoint_params: + key1: value1 +`, + expectedConfig: notifier.AlertmanagerClientConfig{ + AlertmanagerURL: "http://custom-url-for-this-tenant:8080", + NotifierConfig: notifier.Config{ + ProxyURL: "http://some-proxy:1234", + OAuth2: notifier.OAuth2Config{ + ClientID: "myclient", + ClientSecret: flagext.SecretWithValue("mysecret"), + TokenURL: "http://token-url", + Scopes: []string{"abc", "def"}, + EndpointParams: flagext.NewLimitsMapWithData(map[string]string{ + "key1": "value1", + }, nil), + }, + }, + }, + }, + } + + for name, tt := range tc { + t.Run(name, func(t *testing.T) { + t.Cleanup(func() { + SetDefaultLimitsForYAMLUnmarshalling(getDefaultLimits()) + }) + + SetDefaultLimitsForYAMLUnmarshalling(getDefaultLimits()) + + var limitsYAML Limits + limitsYAML.RulerAlertmanagerClientConfig.NotifierConfig.OAuth2.EndpointParams = flagext.NewLimitsMap[string](nil) + err := yaml.Unmarshal([]byte(tt.baseYAML), &limitsYAML) + require.NoError(t, err) + + SetDefaultLimitsForYAMLUnmarshalling(limitsYAML) + + overrides := map[string]*Limits{} + err = yaml.Unmarshal([]byte(tt.overrides), &overrides) + require.NoError(t, err) + + tl := NewMockTenantLimits(overrides) + ov, err := NewOverrides(limitsYAML, tl) + require.NoError(t, err) + + require.Equal(t, tt.expectedConfig, ov.RulerAlertmanagerClientConfig("user1")) + require.True(t, tt.expectedConfig.Equal(ov.RulerAlertmanagerClientConfig("user1"))) + }) + } +} + func TestActiveSeriesCustomTrackersConfig(t *testing.T) { tests := map[string]struct { cfg string diff --git a/tools/doc-generator/parse/parser.go b/tools/doc-generator/parse/parser.go index de94561968..e1fac8de8e 100644 --- a/tools/doc-generator/parse/parser.go +++ b/tools/doc-generator/parse/parser.go @@ -24,6 +24,7 @@ import ( "github.com/thanos-io/objstore/providers/s3" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" + "github.com/grafana/mimir/pkg/ruler/notifier" "github.com/grafana/mimir/pkg/storage/tsdb" "github.com/grafana/mimir/pkg/util/configdoc" "github.com/grafana/mimir/pkg/util/validation" @@ -494,6 +495,8 @@ func ReflectType(typ string) reflect.Type { return reflect.TypeOf([]*validation.BlockedQuery{}) case "blocked_requests_config...": return reflect.TypeOf([]*validation.BlockedRequest{}) + case "ruler_alertmanager_client_config...": + return reflect.TypeOf(notifier.AlertmanagerClientConfig{}) case "map of string to float64": return reflect.TypeOf(flagext.LimitsMap[float64]{}) case "map of string to int":