From d65b32cf43febef53eb4ba15dda8c230ef87856c Mon Sep 17 00:00:00 2001 From: Nick Pillitteri <56quarters@users.noreply.github.com> Date: Fri, 7 Mar 2025 14:52:18 -0500 Subject: [PATCH] mixin: Add req/sec floor to `MimirCacheRequestErrors` (#10832) Require at least 10 req/sec to the cache to consider alerting on request errors. This avoids noisy alerts in low-traffic clusters Fixes #10831 Signed-off-by: Nick Pillitteri --- CHANGELOG.md | 1 + .../templates/metamonitoring/mixin-alerts.yaml | 2 +- operations/mimir-mixin-compiled-baremetal/alerts.yaml | 2 +- operations/mimir-mixin-compiled-gem/alerts.yaml | 2 +- operations/mimir-mixin-compiled/alerts.yaml | 2 +- operations/mimir-mixin/alerts/alerts.libsonnet | 5 +++-- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f192c6e76a..1a7f4252619 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,7 @@ ### Mixin +* [CHANGE] Alerts: Only alert on errors performing cache operations if there are more than 10 requests/sec to avoid flapping. #10832 * [FEATURE] Add compiled mixin for GEM installations in `operations/mimir-mixin-compiled-gem`. #10690 * [ENHANCEMENT] Dashboards: clarify that the ingester and store-gateway panels on the 'Reads' dashboard show data from all query requests to that component, not just requests from the main query path (ie. requests from the ruler query path are included as well). #10598 * [ENHANCEMENT] Dashboards: add ingester and store-gateway panels from the 'Reads' dashboard to the 'Remote ruler reads' dashboard as well. 
#10598 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 7bbc8d36677..f104c49f0a6 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -124,7 +124,7 @@ spec: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 774ca2d529a..f171b5c4f0d 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled-gem/alerts.yaml b/operations/mimir-mixin-compiled-gem/alerts.yaml index 8b652c93799..a162fd690e3 100644 --- a/operations/mimir-mixin-compiled-gem/alerts.yaml +++ b/operations/mimir-mixin-compiled-gem/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) + ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index dfc68b652ce..06c090c8bc8 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -112,7 +112,7 @@ groups: / sum by(cluster, namespace, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[1m]) - ) 
+ ) > 10 ) * 100 > 5 for: 5m labels: diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index 0f53637875d..03fd1a4f35c 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -204,7 +204,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; alert: $.alertName('CacheRequestErrors'), // Specifically exclude "add" and "delete" operations which are used for cache invalidation and "locking" // since they are expected to sometimes fail in normal operation (such as when a "lock" already exists or - // key being invalidated does not exist). + // key being invalidated does not exist). We also only alert when there are more than 10 req/sec to the cache + // to avoid flapping alerts in low-traffic environments. expr: ||| ( sum by(%(group_by)s, name, operation) ( @@ -213,7 +214,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; / sum by(%(group_by)s, name, operation) ( rate(thanos_cache_operations_total{operation!~"add|delete"}[%(range_interval)s]) - ) + ) > 10 ) * 100 > 5 ||| % { group_by: $._config.alert_aggregation_labels,