diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f192c6e76..1a7f425261 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -90,6 +90,7 @@
 
 ### Mixin
 
+* [CHANGE] Alerts: Only alert on errors performing cache operations if there are over 10 requests/sec to avoid flapping. #10832
 * [FEATURE] Add compiled mixin for GEM installations in `operations/mimir-mixin-compiled-gem`. #10690
 * [ENHANCEMENT] Dashboards: clarify that the ingester and store-gateway panels on the 'Reads' dashboard show data from all query requests to that component, not just requests from the main query path (ie. requests from the ruler query path are included as well). #10598
 * [ENHANCEMENT] Dashboards: add ingester and store-gateway panels from the 'Reads' dashboard to the 'Remote ruler reads' dashboard as well. #10598
diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
index 7bbc8d3667..f104c49f0a 100644
--- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
+++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml
@@ -124,7 +124,9 @@ spec:
               /
-              sum by(cluster, namespace, name, operation) (
-                rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
-              )
+              (
+                sum by(cluster, namespace, name, operation) (
+                  rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
+                ) > 10
+              )
             ) * 100 > 5
           for: 5m
           labels:
diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
index 774ca2d529..f171b5c4f0 100644
--- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml
+++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml
@@ -112,7 +112,9 @@ groups:
               /
-              sum by(cluster, namespace, name, operation) (
-                rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
-              )
+              (
+                sum by(cluster, namespace, name, operation) (
+                  rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
+                ) > 10
+              )
             ) * 100 > 5
           for: 5m
           labels:
diff --git a/operations/mimir-mixin-compiled-gem/alerts.yaml b/operations/mimir-mixin-compiled-gem/alerts.yaml
index 8b652c9379..a162fd690e 100644
--- a/operations/mimir-mixin-compiled-gem/alerts.yaml
+++ b/operations/mimir-mixin-compiled-gem/alerts.yaml
@@ -112,7 +112,9 @@ groups:
               /
-              sum by(cluster, namespace, name, operation) (
-                rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
-              )
+              (
+                sum by(cluster, namespace, name, operation) (
+                  rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
+                ) > 10
+              )
             ) * 100 > 5
           for: 5m
           labels:
diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml
index dfc68b652c..06c090c8bc 100644
--- a/operations/mimir-mixin-compiled/alerts.yaml
+++ b/operations/mimir-mixin-compiled/alerts.yaml
@@ -112,7 +112,9 @@ groups:
               /
-              sum by(cluster, namespace, name, operation) (
-                rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
-              )
+              (
+                sum by(cluster, namespace, name, operation) (
+                  rate(thanos_cache_operations_total{operation!~"add|delete"}[1m])
+                ) > 10
+              )
             ) * 100 > 5
           for: 5m
           labels:
diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet
index 0f53637875..03fd1a4f35 100644
--- a/operations/mimir-mixin/alerts/alerts.libsonnet
+++ b/operations/mimir-mixin/alerts/alerts.libsonnet
@@ -204,7 +204,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
           alert: $.alertName('CacheRequestErrors'),
           // Specifically exclude "add" and "delete" operations which are used for cache invalidation and "locking"
           // since they are expected to sometimes fail in normal operation (such as when a "lock" already exists or
-          // key being invalidated does not exist).
+          // key being invalidated does not exist). We also only alert when there are at least 10 req/sec to the cache
+          // to avoid flapping alerts in low-traffic environments. The threshold is parenthesized together with the
+          // denominator because PromQL comparison operators bind more loosely than division.
           expr: |||
             (
               sum by(%(group_by)s, name, operation) (
@@ -213,7 +215,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
               /
-              sum by(%(group_by)s, name, operation) (
-                rate(thanos_cache_operations_total{operation!~"add|delete"}[%(range_interval)s])
-              )
+              (
+                sum by(%(group_by)s, name, operation) (
+                  rate(thanos_cache_operations_total{operation!~"add|delete"}[%(range_interval)s])
+                ) > 10
+              )
             ) * 100 > 5
           ||| % {
             group_by: $._config.alert_aggregation_labels,