-
Notifications
You must be signed in to change notification settings - Fork 569
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Alertmanager: Initialize skipped Grafana Alertmanagers receiving requests #10691
base: main
Are you sure you want to change the base?
Changes from 5 commits
e63e411
8bbf5de
eefbcbb
7883412
40572a2
d7bd126
395cf74
7699941
e23a766
2201e08
83d8c88
67cafb6
51524d0
79c89f8
373336b
45246b6
39da214
5ba975f
01303a9
c688474
8e88697
c3eb098
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,8 @@ var ( | |
errInvalidExternalURLMissingHostname = errors.New("the configured external URL is invalid because it's missing the hostname") | ||
errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set") | ||
errNotUploadingFallback = errors.New("not uploading fallback configuration") | ||
|
||
zeroTimeUnix = time.Time{}.Unix() | ||
) | ||
|
||
// MultitenantAlertmanagerConfig is the configuration for a multitenant Alertmanager. | ||
|
@@ -83,8 +85,9 @@ type MultitenantAlertmanagerConfig struct { | |
|
||
EnableAPI bool `yaml:"enable_api" category:"advanced"` | ||
|
||
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` | ||
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"` | ||
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` | ||
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"` | ||
GrafanaAlertmanagerIdleGracePeriod time.Duration `yaml:"grafana_alertmanager_idle_grace_period" category:"experimental"` | ||
|
||
MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"` | ||
|
||
|
@@ -111,7 +114,8 @@ type MultitenantAlertmanagerConfig struct { | |
} | ||
|
||
const ( | ||
defaultPeerTimeout = 15 * time.Second | ||
defaultGrafanaAlertmanagerGracePeriod = 5 * time.Minute | ||
defaultPeerTimeout = 15 * time.Second | ||
) | ||
|
||
// RegisterFlags adds the features required to config this to the given FlagSet. | ||
|
@@ -128,7 +132,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger | |
|
||
f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.") | ||
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.") | ||
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.") | ||
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving alerts.") | ||
f.DurationVar(&cfg.GrafanaAlertmanagerIdleGracePeriod, "alertmanager.grafana-alertmanager-grace-period", defaultGrafanaAlertmanagerGracePeriod, "Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration.") | ||
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.") | ||
|
||
f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.") | ||
|
@@ -189,6 +194,7 @@ type multitenantAlertmanagerMetrics struct { | |
grafanaStateSize *prometheus.GaugeVec | ||
lastReloadSuccessful *prometheus.GaugeVec | ||
lastReloadSuccessfulTimestamp *prometheus.GaugeVec | ||
tenantsSkipped prometheus.Gauge | ||
} | ||
|
||
func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics { | ||
|
@@ -212,6 +218,12 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl | |
Help: "Timestamp of the last successful configuration reload.", | ||
}, []string{"user"}) | ||
|
||
m.tenantsSkipped = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ | ||
Namespace: "cortex", | ||
Name: "alertmanager_tenants_skipped", | ||
Help: "Number of per-tenant alertmanagers that were skipped during the last configuration sync.", | ||
}) | ||
|
||
return m | ||
} | ||
|
||
|
@@ -318,6 +330,10 @@ type MultitenantAlertmanager struct { | |
limits Limits | ||
features featurecontrol.Flagger | ||
|
||
// Record the last time we received a request for a given Grafana tenant. | ||
// We can shut down an idle Alertmanager after the grace period elapses. | ||
receivingRequests sync.Map | ||
santihernandezc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
registry prometheus.Registerer | ||
ringCheckErrors prometheus.Counter | ||
tenantsOwned prometheus.Gauge | ||
|
@@ -390,6 +406,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC | |
fallbackConfig: string(fallbackConfig), | ||
cfgs: map[string]alertspb.AlertConfigDesc{}, | ||
alertmanagers: map[string]*Alertmanager{}, | ||
receivingRequests: sync.Map{}, | ||
alertmanagerMetrics: newAlertmanagerMetrics(logger), | ||
multitenantMetrics: newMultitenantAlertmanagerMetrics(registerer), | ||
store: store, | ||
|
@@ -695,6 +712,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s | |
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1)) | ||
am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() | ||
} | ||
am.multitenantMetrics.tenantsSkipped.Set(float64(len(amInitSkipped))) | ||
|
||
userAlertmanagersToStop := map[string]*Alertmanager{} | ||
am.alertmanagersMtx.Lock() | ||
|
@@ -727,12 +745,31 @@ func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) | |
AlertConfigDesc: cfgs.Mimir, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Concern: This method is doing a lot and appears to have code that doesn't match our current approach, for example: if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
cfg, err := am.createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
return cfg, true, err
} If it is in fact, not needed in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That bit of code gets executed after we've checked for usable (promoted, non-default, non-empty) Grafana configuration. If we reach this far, the tenant has a Grafana configuration we can use to start their Alertmanager. It does indeed not match our current approach. It's part of the original one, where Grafana and non-Grafana tenants were the same, and they couldn't be distinguished by a suffix. It's not necessary to add |
||
tmplExternalURL: am.cfg.ExternalURL.URL, | ||
} | ||
strictInit := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix) | ||
santihernandezc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration. | ||
// A Grafana configuration is considered usable if it's promoted, non-default, and not empty. | ||
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" { | ||
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User) | ||
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix) | ||
return cfg, !isGrafanaTenant, nil | ||
if !strictInit { | ||
return cfg, true, nil | ||
} | ||
|
||
// If the tenant ID matches the configured Grafana suffix, only run the Alertmanager if it's receiving requests. | ||
createdAt, ok := am.receivingRequests.Load(cfgs.Mimir.User) | ||
if !ok || time.Since(time.Unix(createdAt.(int64), 0)) >= am.cfg.GrafanaAlertmanagerIdleGracePeriod { | ||
// Use the zero-value to indicate that we've skipped the tenant. | ||
am.receivingRequests.Store(cfgs.Mimir.User, zeroTimeUnix) | ||
return cfg, false, nil | ||
} | ||
|
||
level.Debug(am.logger).Log("msg", "user has no usable config but is receiving alerts, keeping Alertmanager active", "user", cfgs.Mimir.User) | ||
return cfg, true, nil | ||
} | ||
|
||
// If the Alertmanager was previously skipped but now has a usable configuration, remove it from the skipped list. | ||
if strictInit { | ||
if _, ok := am.receivingRequests.LoadAndDelete(cfgs.Mimir.User); ok { | ||
level.Debug(am.logger).Log("msg", "user has now a usable config, removing it from skipped list", "user", cfgs.Mimir.User) | ||
} | ||
} | ||
|
||
// If the Mimir configuration is either default or empty, use the Grafana configuration. | ||
|
@@ -933,7 +970,6 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi | |
Limits: am.limits, | ||
Features: am.features, | ||
GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled, | ||
GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix, | ||
}, reg) | ||
if err != nil { | ||
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) | ||
|
@@ -1002,6 +1038,27 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http | |
|
||
if ok { | ||
userAM.mux.ServeHTTP(w, req) | ||
|
||
// If needed, update the last time the Alertmanager received requests. | ||
if _, ok := am.receivingRequests.Load(userID); ok { | ||
level.Debug(am.logger).Log("msg", "updating last alert reception time", "user", userID) | ||
am.receivingRequests.Store(userID, time.Now().Unix()) | ||
} | ||
return | ||
} | ||
|
||
// If the Alertmanager initialization was skipped, start the Alertmanager. | ||
if _, ok := am.receivingRequests.Load(userID); ok { | ||
userAM, err = am.startAlertmanager(req.Context(), userID) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a more interesting metric to collect: when a skipped config was initialized because of a request. This will give us a feel for flappiness and how effective our idle timeout value is. |
||
if err != nil { | ||
level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err) | ||
http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError) | ||
return | ||
} | ||
|
||
am.receivingRequests.Store(userID, time.Now().Unix()) | ||
level.Debug(am.logger).Log("msg", "Alertmanager initialized after receiving request", "user", userID) | ||
userAM.mux.ServeHTTP(w, req) | ||
return | ||
} | ||
|
||
|
@@ -1025,6 +1082,33 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http | |
http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed) | ||
} | ||
|
||
// startAlertmanager will start the Alertmanager for a tenant, using the fallback configuration if no config is found. | ||
func (am *MultitenantAlertmanager) startAlertmanager(ctx context.Context, userID string) (*Alertmanager, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this method can be combined with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't want to change the behavior of Mimir Alertmanager tenants, at least not yet. I was planning on doing that in a future PR, when applying strict initialization to all tenants. |
||
// Avoid starting the Alertmanager for tenants not owned by this instance. | ||
if !am.isUserOwned(userID) { | ||
return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance") | ||
} | ||
|
||
cfg, err := am.store.GetAlertConfig(ctx, userID) | ||
if err != nil { | ||
if !errors.Is(err, alertspb.ErrNotFound) { | ||
return nil, errors.Wrap(err, "failed to check for existing configuration") | ||
} | ||
cfg = alertspb.ToProto("", nil, userID) | ||
} | ||
|
||
amConfig := amConfig{ | ||
AlertConfigDesc: cfg, | ||
tmplExternalURL: am.cfg.ExternalURL.URL, | ||
} | ||
if err := am.setConfig(amConfig); err != nil { | ||
return nil, err | ||
} | ||
am.alertmanagersMtx.Lock() | ||
defer am.alertmanagersMtx.Unlock() | ||
return am.alertmanagers[userID], nil | ||
} | ||
|
||
func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) { | ||
// Make sure we never create fallback instances for a user not owned by this instance. | ||
// This check is not strictly necessary as the configuration polling loop will deactivate | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated fix, this was not being used here.