Skip to content

Commit

Permalink
Update CPUThrottlingHigh and KubeContainerWaiting alerts (#942)
Browse files Browse the repository at this point in the history
* CPUThrottlingHigh: change the aggregation from `by` to `without` so that external labels are kept. Add the cadvisorSelector to both queries.

* KubeContainerWaiting: remove the `sum by` aggregation so that external labels are kept. Add the waiting reason to the description.
  • Loading branch information
7840vz authored Nov 7, 2024
1 parent 3830dfd commit bdbf7f4
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions alerts/apps_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -211,13 +211,13 @@ local utils = import '../lib/utils.libsonnet';
},
{
expr: |||
sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", %(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").',
summary: 'Pod container waiting longer than 1 hour',
},
'for': '1h',
Expand Down
4 changes: 2 additions & 2 deletions alerts/resource_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@
{
alert: 'CPUThrottlingHigh',
expr: |||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
/
sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
> ( %(cpuThrottlingPercent)s / 100 )
||| % $._config,
'for': '15m',
Expand Down

0 comments on commit bdbf7f4

Please sign in to comment.