Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ruler: Make alertmanager client and all related config per-tenant configurable #10816

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
* [ENHANCEMENT] All: Add `cortex_client_request_invalid_cluster_validation_labels_total` metrics, that is used by Mimir's gRPC clients to track invalid cluster validations. #10767
* [ENHANCEMENT] Ingester client: Add support to configure cluster validation for ingester clients. Failed cluster validations are tracked by `cortex_client_request_invalid_cluster_validation_labels_total` with label `client=ingester`. #10767
* [ENHANCEMENT] Add experimental metric `cortex_distributor_dropped_native_histograms_total` to measure native histograms silently dropped when native histograms are disabled for a tenant. #10760
* [ENHANCEMENT] Add tenant configuration block `ruler_alertmanager_client_config` which allows the Ruler's Alertmanager client options to be specified on a per-tenant basis. #10816
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* [ENHANCEMENT] Add tenant configuration block `ruler_alertmanager_client_config` which allows the Ruler's Alertmanager client options to be specified on a per-tenant basis. #10816
* [ENHANCEMENT] Add tenant configuration block `ruler_alertmanager_client_config`, which allows you to specify the ruler's Alertmanager client options on a per-tenant basis. #10816

* [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185
* [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154
* [BUGFIX] Query-frontend and querier: show warning/info annotations in some cases where they were missing (if a lazy querier was used). #10277
Expand Down
181 changes: 181 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -4970,6 +4970,187 @@
"fieldType": "int",
"fieldCategory": "experimental"
},
{
"kind": "block",
"name": "ruler_alertmanager_client_config",
"required": false,
"desc": "",
"blockEntries": [
{
"kind": "field",
"name": "alertmanager_url",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string"
},
{
"kind": "field",
"name": "tls_enabled",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_cert_path",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_key_path",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_ca_path",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_server_name",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_insecure_skip_verify",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldType": "boolean",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_cipher_suites",
"required": false,
"desc": "Override the default cipher suite list (separated by commas). Allowed values:\n\nSecure Ciphers:\n- TLS_AES_128_GCM_SHA256\n- TLS_AES_256_GCM_SHA384\n- TLS_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA\n- TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256\n- TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256\n- TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256\n\nInsecure Ciphers:\n- TLS_RSA_WITH_RC4_128_SHA\n- TLS_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA\n- TLS_RSA_WITH_AES_256_CBC_SHA\n- TLS_RSA_WITH_AES_128_CBC_SHA256\n- TLS_RSA_WITH_AES_128_GCM_SHA256\n- TLS_RSA_WITH_AES_256_GCM_SHA384\n- TLS_ECDHE_ECDSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_RC4_128_SHA\n- TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA\n- TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256\n- TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256\n",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "tls_min_version",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
},
{
"kind": "field",
"name": "basic_auth_username",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string"
},
{
"kind": "block",
"name": "basic_auth_password",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "block",
"name": "oauth2",
"required": false,
"desc": "",
"blockEntries": [
{
"kind": "field",
"name": "client_id",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string"
},
{
"kind": "block",
"name": "client_secret",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "token_url",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string"
},
{
"kind": "field",
"name": "scopes",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string"
},
{
"kind": "field",
"name": "endpoint_params",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": {},
"fieldType": "map of string to string",
"fieldCategory": "advanced"
}
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "proxy_url",
"required": false,
"desc": "",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldType": "string",
"fieldCategory": "advanced"
}
],
"fieldValue": null,
"fieldDefaultValue": null
},
{
"kind": "field",
"name": "store_gateway_tenant_shard_size",
Expand Down
2 changes: 2 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -2909,6 +2909,8 @@ Usage of ./cmd/mimir/mimir:
OpenStack Swift username.
-ruler.alerting-rules-evaluation-enabled
Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true)
-ruler.alertmanager-client-config value
Per-tenant Alertmanager client configuration. If not supplied, the tenant's notifications are sent to the ruler-wide default.
-ruler.alertmanager-client.basic-auth-password string
HTTP Basic authentication password. It overrides the password set in the URL (if any).
-ruler.alertmanager-client.basic-auth-username string
Expand Down
2 changes: 2 additions & 0 deletions cmd/mimir/help.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,8 @@ Usage of ./cmd/mimir/mimir:
OpenStack Swift username.
-ruler.alerting-rules-evaluation-enabled
Controls whether alerting rules evaluation is enabled. This configuration option can be used to forcefully disable alerting rules evaluation on a per-tenant basis. (default true)
-ruler.alertmanager-client-config value
Per-tenant Alertmanager client configuration. If not supplied, the tenant's notifications are sent to the ruler-wide default.
-ruler.alertmanager-client.basic-auth-password string
HTTP Basic authentication password. It overrides the password set in the URL (if any).
-ruler.alertmanager-client.basic-auth-username string
Expand Down
78 changes: 78 additions & 0 deletions docs/sources/mimir/configure/configuration-parameters/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3881,6 +3881,84 @@ The `limits` block configures default and per-tenant limits imposed by component
# CLI flag: -ruler.max-independent-rule-evaluation-concurrency-per-tenant
[ruler_max_independent_rule_evaluation_concurrency_per_tenant: <int> | default = 4]

# Per-tenant alertmanager client configuration. If not supplied, the tenant's
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Per-tenant alertmanager client configuration. If not supplied, the tenant's
# Per-tenant Alertmanager client configuration. If not supplied, the tenant's

# notifications will be sent to the ruler-wide default.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# notifications will be sent to the ruler-wide default.
# notifications are sent to the ruler-wide default.

ruler_alertmanager_client_config:
[alertmanager_url: <string> | default = ""]

# (advanced)
[tls_enabled: <boolean> | default = false]

# (advanced)
[tls_cert_path: <string> | default = ""]

# (advanced)
[tls_key_path: <string> | default = ""]

# (advanced)
[tls_ca_path: <string> | default = ""]

# (advanced)
[tls_server_name: <string> | default = ""]

# (advanced)
[tls_insecure_skip_verify: <boolean> | default = false]

# (advanced) Override the default cipher suite list (separated by commas).
# Allowed values:
#
# Secure Ciphers:
# - TLS_AES_128_GCM_SHA256
# - TLS_AES_256_GCM_SHA384
# - TLS_CHACHA20_POLY1305_SHA256
# - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA
# - TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA
# - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA
# - TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA
# - TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256
# - TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384
# - TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
# - TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384
# - TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256
# - TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256
#
# Insecure Ciphers:
# - TLS_RSA_WITH_RC4_128_SHA
# - TLS_RSA_WITH_3DES_EDE_CBC_SHA
# - TLS_RSA_WITH_AES_128_CBC_SHA
# - TLS_RSA_WITH_AES_256_CBC_SHA
# - TLS_RSA_WITH_AES_128_CBC_SHA256
# - TLS_RSA_WITH_AES_128_GCM_SHA256
# - TLS_RSA_WITH_AES_256_GCM_SHA384
# - TLS_ECDHE_ECDSA_WITH_RC4_128_SHA
# - TLS_ECDHE_RSA_WITH_RC4_128_SHA
# - TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA
# - TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256
# - TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
[tls_cipher_suites: <string> | default = ""]

# (advanced)
[tls_min_version: <string> | default = ""]

[basic_auth_username: <string> | default = ""]

basic_auth_password:

oauth2:
[client_id: <string> | default = ""]

client_secret:

[token_url: <string> | default = ""]

[scopes: <string> | default = ""]

# (advanced)
[endpoint_params: <map of string to string> | default = {}]

# (advanced)
[proxy_url: <string> | default = ""]

# The tenant's shard size, used when store-gateway sharding is enabled. Value of
# 0 disables shuffle sharding for the tenant, that is all tenant blocks are
# sharded across all store-gateway replicas.
Expand Down
2 changes: 1 addition & 1 deletion pkg/mimir/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -966,7 +966,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) {
)

dnsResolver := dns.NewProvider(util_log.Logger, dnsProviderReg, dns.GolangResolverType)
manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver)
manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, t.Registerer, util_log.Logger, dnsResolver, t.Overrides)
if err != nil {
return nil, err
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/ruler/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/grafana/mimir/pkg/mimirpb"
"github.com/grafana/mimir/pkg/querier"
querier_stats "github.com/grafana/mimir/pkg/querier/stats"
notifierCfg "github.com/grafana/mimir/pkg/ruler/notifier"
util_log "github.com/grafana/mimir/pkg/util/log"
)

Expand Down Expand Up @@ -213,6 +214,7 @@ type RulesLimits interface {
RulerSyncRulesOnChangesEnabled(userID string) bool
RulerProtectedNamespaces(userID string) []string
RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID string) int64
RulerAlertmanagerClientConfig(userID string) notifierCfg.AlertmanagerClientConfig
}

func MetricsQueryFunc(qf rules.QueryFunc, userID string, queries, failedQueries *prometheus.CounterVec, remoteQuerier bool) rules.QueryFunc {
Expand Down
21 changes: 18 additions & 3 deletions pkg/ruler/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ type DefaultMultiTenantManager struct {
cfg Config
notifierCfg *config.Config
managerFactory ManagerFactory
limits RulesLimits
dnsResolver AddressProvider
refreshMetrics discovery.RefreshMetricsManager

mapper *mapper

Expand All @@ -59,9 +62,9 @@ type DefaultMultiTenantManager struct {
rulerIsRunning atomic.Bool
}

func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider) (*DefaultMultiTenantManager, error) {
func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger, dnsResolver AddressProvider, limits RulesLimits) (*DefaultMultiTenantManager, error) {
refreshMetrics := discovery.NewRefreshMetrics(reg)
ncfg, err := buildNotifierConfig(&cfg, dnsResolver, refreshMetrics)
ncfg, err := buildNotifierConfig(cfg.AlertmanagerURL, cfg.Notifier, cfg, dnsResolver, refreshMetrics)
if err != nil {
return nil, err
}
Expand All @@ -75,6 +78,9 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
cfg: cfg,
notifierCfg: ncfg,
managerFactory: managerFactory,
limits: limits,
dnsResolver: dnsResolver,
refreshMetrics: refreshMetrics,
notifiers: map[string]*rulerNotifier{},
mapper: newMapper(cfg.RulePath, logger),
userManagers: map[string]RulesManager{},
Expand Down Expand Up @@ -321,8 +327,17 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie

n.run()

notifierCfg := r.notifierCfg
userSpecificCfg := r.limits.RulerAlertmanagerClientConfig(userID)
if !userSpecificCfg.IsDefault() {
notifierCfg, err = buildNotifierConfig(userSpecificCfg.AlertmanagerURL, userSpecificCfg.NotifierConfig, r.cfg, r.dnsResolver, r.refreshMetrics)
if err != nil {
return nil, err
}
}

// This should never fail, unless there's a programming mistake.
if err := n.applyConfig(r.notifierCfg); err != nil {
if err := n.applyConfig(notifierCfg); err != nil {
return nil, err
}

Expand Down
Loading
Loading