Skip to content

Commit

Permalink
mixin: add federation-frontend alert
Browse files Browse the repository at this point in the history
Signed-off-by: Dimitar Dimitrov <[email protected]>
  • Loading branch information
dimitarvdimitrov committed Feb 19, 2025
1 parent a656f81 commit f9681a4
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 1 deletion.
19 changes: 19 additions & 0 deletions operations/mimir-mixin-compiled-gem/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1249,3 +1249,22 @@ groups:
sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0
labels:
severity: warning
- name: gem_alerts
  rules:
    - alert: MimirFederationFrontendRemoteClusterErrors
      annotations:
        # $value is already a percentage (expr multiplies the ratio by 100), so it is
        # printed as-is; humanizePercentage expects a 0-1 ratio and would scale it again.
        message: |
          The federation-frontend has been receiving {{ printf "%.2f" $value }}% errors from cluster {{ $labels.remote_cluster }} over the last 15 minutes.
          If partial responses are disabled (default), then clients of the federation-frontend are receiving errors.
          If partial responses are enabled, then responses are now less complete.
        runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfederationfrontendremoteclustererrors
      # Percentage of remote-cluster requests with status="server_error", per remote_cluster.
      expr: |
        100 * (
          sum by (remote_cluster) (rate(cortex_federation_frontend_cluster_remote_latency_seconds_count{status="server_error"}[1m]))
          /
          sum by (remote_cluster) (rate(cortex_federation_frontend_cluster_remote_latency_seconds_count[1m]))
        ) > 1
      for: 15m
      labels:
        service: federation-frontend
        severity: critical
Binary file modified operations/mimir-mixin-gem.zip
Binary file not shown.
3 changes: 2 additions & 1 deletion operations/mimir-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
(import 'alerts/distributor.libsonnet') +
(import 'alerts/autoscaling.libsonnet') +
(if $._config.ingest_storage_enabled then import 'alerts/ingest-storage.libsonnet' else {}) +
(import 'alerts/continuous-test.libsonnet'),
(import 'alerts/continuous-test.libsonnet') +
(if $._config.gem_enabled then import 'alerts/gem.libsonnet' else {}),
}
35 changes: 35 additions & 0 deletions operations/mimir-mixin/alerts/gem.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
(import 'alerts-utils.libsonnet') {
  // Alert groups defined by this file; merged into `groups` at the bottom.
  local alertGroups = [
    {
      name: 'gem_alerts',
      rules: [
        {
          alert: $.alertName('FederationFrontendRemoteClusterErrors'),
          // Percentage (0-100) of federation-frontend requests to each remote cluster
          // that ended with status="server_error". Fires above 1%.
          expr: |||
            100 * (
              sum by (remote_cluster) (rate(cortex_federation_frontend_cluster_remote_latency_seconds_count{status="server_error"}[%(range_interval)s]))
              /
              sum by (remote_cluster) (rate(cortex_federation_frontend_cluster_remote_latency_seconds_count[%(range_interval)s]))
            ) > 1
          ||| % {
            range_interval: $.alertRangeInterval(1),
          },
          'for': '15m',
          labels: {
            severity: 'critical',
            service: 'federation-frontend',
          },
          annotations: {
            // $value is already a percentage because the expression multiplies the ratio
            // by 100, so it is printed directly. humanizePercentage expects a 0-1 ratio and
            // would multiply by 100 a second time (5% would be rendered as 500%).
            // Percent signs are doubled because the text is formatted with `% $._config`.
            message: |||
              The federation-frontend has been receiving {{ printf "%%.2f" $value }}%% errors from cluster {{ $labels.remote_cluster }} over the last 15 minutes.
              If partial responses are disabled (default), then clients of the federation-frontend are receiving errors.
              If partial responses are enabled, then responses are now less complete.
            ||| % $._config,
          },
        },
      ],
    },
  ],

  // Extend the inherited groups with the rules above, after passing them through the
  // extra labels/annotations and runbook URL helpers.
  groups+: $.withRunbookURL('https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#%s', $.withExtraLabelsAnnotations(alertGroups)),
}

0 comments on commit f9681a4

Please sign in to comment.