Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alertmanager: Initialize skipped Grafana Alertmanagers receiving requests #10691

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e63e411
(WIP) Alertmanager: Initialize skipped Grafana Alertmanagers receivin…
santihernandezc Feb 19, 2025
8bbf5de
remove unnecessary lines, refactor,
santihernandezc Feb 20, 2025
eefbcbb
use sync.Map instead of map + mutex
santihernandezc Feb 20, 2025
7883412
add gauge for number of Alertmanagers skipped during the last config …
santihernandezc Feb 20, 2025
40572a2
make doc, make reference-help
santihernandezc Feb 20, 2025
d7bd126
reduce the amount of store operations by only storing a zero-value ti…
santihernandezc Feb 21, 2025
395cf74
remove unnecessary zeroTimeUnix var
santihernandezc Feb 21, 2025
7699941
wording in logs
santihernandezc Feb 24, 2025
e23a766
Merge branch 'main' of https://github.com/grafana/mimir into santiher…
santihernandezc Feb 24, 2025
2201e08
use LoadOrStore()
santihernandezc Feb 28, 2025
83d8c88
receivingRequests -> lastRequestTime
santihernandezc Feb 28, 2025
67cafb6
fix receiving alerts -> receiving requests
santihernandezc Feb 28, 2025
51524d0
Merge branch 'main' of https://github.com/grafana/mimir into santiher…
santihernandezc Feb 28, 2025
79c89f8
improve readability in computeConfig()
santihernandezc Feb 28, 2025
373336b
Add counter for on-request initializations
santihernandezc Mar 3, 2025
45246b6
fix custom mimir config being ignored in grafana tenants, tests
santihernandezc Mar 3, 2025
39da214
fix order of expects in tests
santihernandezc Mar 3, 2025
5ba975f
make test diff smaller
santihernandezc Mar 3, 2025
01303a9
Merge branch 'main' of https://github.com/grafana/mimir into santiher…
santihernandezc Mar 3, 2025
c688474
handle errNotUploadingFallback errors
santihernandezc Mar 5, 2025
8e88697
delete tenant from skipped list if it's not owned by the instance, al…
santihernandezc Mar 5, 2025
c3eb098
prevent race conditions when starting Alertmanagers, refactor
santihernandezc Mar 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -16075,13 +16075,24 @@
"kind": "field",
"name": "grafana_alertmanager_conditionally_skip_tenant_suffix",
"required": false,
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.",
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving requests.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "grafana_alertmanager_idle_grace_period",
"required": false,
"desc": "Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration.",
"fieldValue": null,
"fieldDefaultValue": 300000000000,
"fieldFlag": "alertmanager.grafana-alertmanager-grace-period",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
Expand Down
4 changes: 3 additions & 1 deletion cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,9 @@ Usage of ./cmd/mimir/mimir:
-alertmanager.grafana-alertmanager-compatibility-enabled
[experimental] Enable routes to support the migration and operation of the Grafana Alertmanager.
-alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix string
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving requests.
-alertmanager.grafana-alertmanager-grace-period duration
[experimental] Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration. (default 5m0s)
-alertmanager.log-parsing-label-matchers
[experimental] Enable logging when parsing label matchers. This flag is intended to be used with -alertmanager.utf8-strict-mode-enabled to validate UTF-8 strict mode is working as intended.
-alertmanager.max-alerts-count int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2520,10 +2520,17 @@ sharding_ring:
[grafana_alertmanager_compatibility_enabled: <boolean> | default = false]

# (experimental) Skip starting the Alertmanager for tenants matching this suffix
# unless they have a promoted, non-default Grafana Alertmanager configuration.
# unless they have a promoted, non-default Grafana Alertmanager configuration or
# they are receiving requests.
# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]

# (experimental) Duration to wait before shutting down an idle Alertmanager for
# a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix
# and is using an unpromoted or default configuration.
# CLI flag: -alertmanager.grafana-alertmanager-grace-period
[grafana_alertmanager_idle_grace_period: <duration> | default = 5m]

# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
Expand Down
137 changes: 124 additions & 13 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,9 @@ type MultitenantAlertmanagerConfig struct {

EnableAPI bool `yaml:"enable_api" category:"advanced"`

GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
GrafanaAlertmanagerIdleGracePeriod time.Duration `yaml:"grafana_alertmanager_idle_grace_period" category:"experimental"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

Expand Down Expand Up @@ -114,7 +115,8 @@ type MultitenantAlertmanagerConfig struct {
}

const (
defaultPeerTimeout = 15 * time.Second
defaultGrafanaAlertmanagerGracePeriod = 5 * time.Minute
defaultPeerTimeout = 15 * time.Second
)

// RegisterFlags adds the features required to config this to the given FlagSet.
Expand All @@ -131,7 +133,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger

f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving requests.")
f.DurationVar(&cfg.GrafanaAlertmanagerIdleGracePeriod, "alertmanager.grafana-alertmanager-grace-period", defaultGrafanaAlertmanagerGracePeriod, "Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
Expand Down Expand Up @@ -195,6 +198,8 @@ type multitenantAlertmanagerMetrics struct {
grafanaStateSize *prometheus.GaugeVec
lastReloadSuccessful *prometheus.GaugeVec
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
initializationsOnRequestTotal *prometheus.CounterVec
tenantsSkipped prometheus.Gauge
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
Expand All @@ -218,6 +223,18 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
Help: "Timestamp of the last successful configuration reload.",
}, []string{"user"})

m.initializationsOnRequestTotal = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Namespace: "cortex",
Name: "alertmanager_initializations_on_request_total",
Help: "Total number of on-request initializations for Alertmanagers that were previously skipped.",
}, []string{"user"})

m.tenantsSkipped = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_tenants_skipped",
Help: "Number of per-tenant alertmanagers that were skipped during the last configuration sync.",
})

return m
}

Expand Down Expand Up @@ -326,6 +343,11 @@ type MultitenantAlertmanager struct {
limits Limits
features featurecontrol.Flagger

// lastRequestTime tracks request timestamps for conditionally-started Grafana Alertmanagers.
// A zero-value timestamp for a tenant means that their Alertmanager was skipped during the last config sync.
// This map is used alongside the configured grace period to determine when to shut down idle Alertmanagers.
lastRequestTime sync.Map

registry prometheus.Registerer
ringCheckErrors prometheus.Counter
tenantsOwned prometheus.Gauge
Expand Down Expand Up @@ -398,6 +420,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
fallbackConfig: string(fallbackConfig),
cfgs: map[string]alertspb.AlertConfigDesc{},
alertmanagers: map[string]*Alertmanager{},
lastRequestTime: sync.Map{},
alertmanagerMetrics: newAlertmanagerMetrics(logger),
multitenantMetrics: newMultitenantAlertmanagerMetrics(registerer),
store: store,
Expand Down Expand Up @@ -703,6 +726,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}
am.multitenantMetrics.tenantsSkipped.Set(float64(len(amInitSkipped)))

userAlertmanagersToStop := map[string]*Alertmanager{}
am.alertmanagersMtx.Lock()
Expand Down Expand Up @@ -731,29 +755,69 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
// computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations.
// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) {
userID := cfgs.Mimir.User
cfg := amConfig{
AlertConfigDesc: cfgs.Mimir,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Concern: This method is doing a lot and appears to have code that doesn't match our current -grafana tenant strategy. For example, it's not clear to me what the following is meant to be doing anymore and why we don't need it in the new startAlertmanager code:

	if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
		level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
		cfg, err := am.createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
		return cfg, true, err
	}

If it is, in fact, not needed in startAlertmanager, we might want to start cleaning up some of the redundant code that doesn't fit the current strategy, or at least extract it somewhere where it is clear it's not part of the functional flow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That bit of code gets executed after we've checked for usable (promoted, non-default, non-empty) Grafana configuration. If we reach this far, the tenant has a Grafana configuration we can use to start their Alertmanager.

It does indeed not match our current approach. It's part of the original one, where Grafana and non-Grafana tenants were the same, and they couldn't be distinguished by a suffix.

It's not necessary to add computeConfig() to the startAlertmanager() function as it will only be called for skipped Grafana Alertmanagers. If a Grafana Alertmanager was skipped, it had no usable configuration, so we can just use a default config.

tmplExternalURL: am.cfg.ExternalURL.URL,
}

// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
return cfg, !isGrafanaTenant, nil
// Check if this tenant can be skipped (Grafana suffix matching).
skippable := am.canSkipTenant(userID)

// Check if the mimir config is non-empty and non-default.
isMimirConfigCustom := cfgs.Mimir.RawConfig != am.fallbackConfig && cfgs.Mimir.RawConfig != ""

// If Grafana config is not usable, skip if possible.
if !isGrafanaConfigUsable(cfgs.Grafana) {
if !skippable || isMimirConfigCustom {
return cfg, true, nil
}

// For skippable tenants, only run if receiving requests recently.
createdAt, loaded := am.lastRequestTime.LoadOrStore(userID, time.Time{}.Unix())
if !loaded || time.Unix(createdAt.(int64), 0).IsZero() {
return cfg, false, nil
}

gracePeriodExpired := time.Since(time.Unix(createdAt.(int64), 0)) >= am.cfg.GrafanaAlertmanagerIdleGracePeriod

// Use the zero value to signal that the tenant was skipped.
// If the value stored is not what we have in memory, the tenant received a request since the last read.
if gracePeriodExpired && am.lastRequestTime.CompareAndSwap(userID, createdAt, time.Time{}.Unix()) {
return cfg, false, nil
}

level.Debug(am.logger).Log("msg", "user has no usable config but is receiving requests, keeping Alertmanager active", "user", userID)
return cfg, true, nil
}

// Clear any previous skipped status since we now have a usable config.
if skippable {
if _, ok := am.lastRequestTime.LoadAndDelete(userID); ok {
level.Debug(am.logger).Log("msg", "user now has a usable config, removing it from skipped list", "user", userID)
}
}

// If the Mimir configuration is either default or empty, use the Grafana configuration.
if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
if !isMimirConfigCustom {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", userID)
cfg, err := am.createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
return cfg, true, err
}

level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User)
level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", userID)
return cfg, true, nil
}

// isGrafanaConfigUsable reports whether a Grafana configuration can be used as-is:
// it must be promoted, must not be the default one, and must carry a non-empty raw config.
func isGrafanaConfigUsable(cfg alertspb.GrafanaAlertConfigDesc) bool {
	// Unpromoted or default configurations are never usable, regardless of content.
	if !cfg.Promoted || cfg.Default {
		return false
	}
	return cfg.RawConfig != ""
}

// canSkipTenant reports whether the tenant is eligible for skipping, i.e. a Grafana
// tenant suffix is configured and userID ends with it.
func (am *MultitenantAlertmanager) canSkipTenant(userID string) bool {
	suffix := am.cfg.GrafanaAlertmanagerTenantSuffix
	// With no suffix configured, no tenant is ever skippable.
	if suffix == "" {
		return false
	}
	return strings.HasSuffix(userID, suffix)
}

// syncStates promotes/unpromotes the Grafana state and updates the 'promoted' flag if needed.
func (am *MultitenantAlertmanager) syncStates(ctx context.Context, cfg amConfig) error {
// fetching grafana state first so we can register its size independently of it being promoted or not
Expand Down Expand Up @@ -1011,6 +1075,33 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http

if ok {
userAM.mux.ServeHTTP(w, req)

// If needed, update the last time the Alertmanager received requests.
if _, ok := am.lastRequestTime.Load(userID); ok {
level.Debug(am.logger).Log("msg", "updating last request reception time", "user", userID)
am.lastRequestTime.Store(userID, time.Now().Unix())
}
return
}

// If the Alertmanager initialization was skipped, start the Alertmanager.
if ok := am.lastRequestTime.CompareAndSwap(userID, time.Time{}.Unix(), time.Now().Unix()); ok {
userAM, err = am.startAlertmanager(userID)
if err != nil {
if errors.Is(err, errNotUploadingFallback) {
level.Warn(am.logger).Log("msg", "not initializing Alertmanager", "user", userID, "err", err)
http.Error(w, "Not initializing the Alertmanager", http.StatusNotAcceptable)
return
}
level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err)
http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError)
return
}

am.lastRequestTime.Store(userID, time.Now().Unix())
am.multitenantMetrics.initializationsOnRequestTotal.WithLabelValues(userID).Inc()
level.Debug(am.logger).Log("msg", "Alertmanager initialized after receiving request", "user", userID)
userAM.mux.ServeHTTP(w, req)
return
}

Expand All @@ -1034,6 +1125,26 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed)
}

// startAlertmanager starts the Alertmanager for a tenant and returns the running instance.
//
// Tenants not owned by this instance are rejected: their entry is removed from
// lastRequestTime and an error wrapping errNotUploadingFallback is returned.
// Otherwise an empty configuration descriptor for the tenant is applied through
// setConfig (presumably resolved to the fallback configuration there; confirm
// against setConfig) and the instance is looked up in am.alertmanagers.
//
// A nil *Alertmanager is never returned together with a nil error, so callers can
// use the result without an additional nil check.
func (am *MultitenantAlertmanager) startAlertmanager(userID string) (*Alertmanager, error) {
	// Avoid starting the Alertmanager for tenants not owned by this instance.
	if !am.isUserOwned(userID) {
		am.lastRequestTime.Delete(userID)
		return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance")
	}

	// Named cfg rather than amConfig so the local does not shadow the amConfig type.
	cfg := amConfig{
		AlertConfigDesc: alertspb.ToProto("", nil, userID),
		tmplExternalURL: am.cfg.ExternalURL.URL,
	}
	if err := am.setConfig(cfg); err != nil {
		return nil, err
	}

	am.alertmanagersMtx.Lock()
	defer am.alertmanagersMtx.Unlock()

	// The map is only read after setConfig has returned, so the entry may be
	// missing (e.g. removed concurrently). Surface that as an error instead of
	// handing a nil instance to the caller, which would dereference it.
	userAM, ok := am.alertmanagers[userID]
	if !ok || userAM == nil {
		return nil, errors.New("alertmanager not found after applying configuration")
	}
	return userAM, nil
}

func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) {
// Make sure we never create fallback instances for a user not owned by this instance.
// This check is not strictly necessary as the configuration polling loop will deactivate
Expand Down
Loading
Loading