From 3a905d6a943b4e9c6e4f0a7af8530401c1865414 Mon Sep 17 00:00:00 2001
From: Evan Purkhiser <evanpurkhiser@gmail.com>
Date: Wed, 13 Nov 2024 16:06:03 -0500
Subject: [PATCH] feat(crons): Implement `make_clock_tick_decision` (#80640)

This function returns a DecisionResult which encapsulates the
TickAnomalyDecision and AnomalyTransition values for a particular clock
tick.

In the future this logic will be run at each clock tick and the result
will later be used to decide if we can process issue occurrences in the
incident_occurrences consumer for a specific clock tick.

Part of GH-79328
---
 src/sentry/monitors/system_incidents.py       | 337 +++++++++++++++++-
 src/sentry/monitors/types.py                  |  21 --
 src/sentry/options/defaults.py                |  31 +-
 .../sentry/monitors/test_system_incidents.py  | 333 +++++++++++++++++
 4 files changed, 699 insertions(+), 23 deletions(-)

diff --git a/src/sentry/monitors/system_incidents.py b/src/sentry/monitors/system_incidents.py
index 6a339098a8e67..544d3269e5da6 100644
--- a/src/sentry/monitors/system_incidents.py
+++ b/src/sentry/monitors/system_incidents.py
@@ -11,8 +11,11 @@
 import logging
 import statistics
 from collections import Counter
-from collections.abc import Sequence
+from collections.abc import Generator, Sequence
+from dataclasses import dataclass
 from datetime import datetime, timedelta
+from enum import StrEnum
+from itertools import chain
 
 from django.conf import settings
 
@@ -27,6 +30,9 @@
 # This key is used to record the metric volume metric for the tick.
 MONITOR_TICK_METRIC = "sentry.monitors.volume_metric:{ts}"
 
+# This key is used to record the anomaly decision for a tick
+MONITOR_TICK_DECISION = "sentry.monitors.tick_decision:{ts}"
+
 # When fetching historic volume data to make a decision whether we have lost
 # data this value will determine how many historic volume data-points we fetch
 # of the window of the MONITOR_VOLUME_RETENTION. It is important to consider
@@ -42,6 +48,10 @@
 # We record 30 days worth of historical data for each minute of check-ins.
 MONITOR_VOLUME_RETENTION = timedelta(days=30)
 
+# This is the number of previous ticks we will consider the tick metrics and
+# tick decisions for to determine a decision about the tick being evaluated.
+MONITOR_TICK_DECISION_WINDOW = 5
+
 
 def update_check_in_volume(ts_list: Sequence[datetime]):
     """
@@ -166,6 +176,331 @@ def get_clock_tick_volume_metric(tick: datetime) -> float | None:
         return None
 
 
+class TickAnomalyDecision(StrEnum):
+    """
+    This enum represents the system incident anomaly decision made for a
+    clock-tick. Tick transitions are represented by the AnomalyTransition.
+    """
+
+    NORMAL = "normal"
+    """
+    The tick is within expected volume levels and does not show any
+    abnormalities. The system is working as normal.
+    """
+
+    ABNORMAL = "abnormal"
+    """
+    The volume metrics have indicated that we've seen an abnormal number of
+    check-ins for this tick. We may be entering an INCIDENT state.
+
+    All abnormal tick decisions will be contiguous, and will resolve into
+    either NORMAL or INCIDENT.
+    """
+
+    INCIDENT = "incident"
+    """
+    The volume metrics have indicated that we are in a system incident, this
+    means we are not processing as many check-ins as we typically do.
+
+    Once in an incident we will transition into RECOVERING once we've detected
+    enough normal volume metrics.
+    """
+
+    RECOVERING = "recovering"
+    """
+    We are transitioning out of an incident. Volume metrics must remain below
+    abnormal levels in order for RECOVERING to transition into NORMAL.
+
+    All recovering tick decisions will be contiguous, and will resolve into
+    either NORMAL or back into INCIDENT.
+    """
+
+    @classmethod
+    def from_str(cls, st: str) -> TickAnomalyDecision:
+        return cls[st.upper()]
+
+
+class AnomalyTransition(StrEnum):
+    ABNORMALITY_STARTED = "abnormality_started"
+    """
+    An abnormality has been detected during normal operations. We may
+    transition into a complete system incident, or the abnormality may recover
+    to normal.
+    """
+
+    ABNORMALITY_RECOVERED = "abnormality_recovered"
+    """
+    An abnormality has recovered back to a normal status.
+    """
+
+    INCIDENT_STARTED = "incident_started"
+    """
+    A system incident has been detected based on the historic check-in volume.
+    We are no longer able to reliably know that we are receiving all check-ins.
+    """
+
+    INCIDENT_RECOVERING = "incident_recovering"
+    """
+    An incident has begun to recover. After this transition we will either
+    re-enter the incident via INCIDENT_RECOVERY_FAILED or fully recover via
+    INCIDENT_RECOVERED.
+    """
+
+    INCIDENT_RECOVERY_FAILED = "incident_recovery_failed"
+    """
+    An incident failed to recover and has re-entered the incident state.
+    """
+
+    INCIDENT_RECOVERED = "incident_recovered"
+    """
+    An incident has recovered back to normal.
+    """
+
+
+@dataclass
+class DecisionResult:
+    decision: TickAnomalyDecision
+    """
+    The recorded decision made for the clock tick
+    """
+
+    transition: AnomalyTransition | None = None
+    """
+    Reflects the transition status when making a tick decision results in a
+    state transition. None if the decision has not changed.
+    """
+
+
+class Metric(StrEnum):
+    """
+    A metric is similar to a tick decision, however it represents a decision
+    made on the volume metric. The metric we currently consider is percent
+    mean deviation from historic volumes.
+    """
+
+    NORMAL = "normal"
+    """
+    The metric is below the abnormal threshold.
+    """
+
+    ABNORMAL = "abnormal"
+    """
+    The metric has surpassed the abnormal threshold but is still below the
+    incident threshold.
+    """
+
+    INCIDENT = "incident"
+    """
+    The metric has surpassed the incident threshold
+    """
+
+    @staticmethod
+    def from_value(value: float | str | None) -> Metric:
+        """
+        Determine an individual decision for the percentage deviation metric of a
+        clock tick. This only considers metrics that are negative, indicating
+        there's been a drop in check-in volume.
+        """
+        # examples: -5% anomaly and -25% incident
+        anomaly_threshold = options.get("crons.system_incidents.pct_deviation_anomaly_threshold")
+        incident_threshold = options.get("crons.system_incidents.pct_deviation_incident_threshold")
+
+        # If we do not have a metric for this tick we must assume things are
+        # operating normally
+        if value is None:
+            return Metric.NORMAL
+
+        pct_deviation = float(value)
+
+        if pct_deviation <= incident_threshold:
+            return Metric.INCIDENT
+        if pct_deviation <= anomaly_threshold:
+            return Metric.ABNORMAL
+        return Metric.NORMAL
+
+
+def make_clock_tick_decision(tick: datetime) -> DecisionResult:
+    """
+    Given a clock tick timestamp determine based on the historic tick volume
+    metrics, and historic tick anomaly decisions, a DecisionResult.
+
+    This function will update previous decisions for earlier ticks detected as
+    ABNORMAL or RECOVERING to either NORMAL or INCIDENT.
+
+    The state transitions for tick decisions are as follows
+
+         ┌───D────────────────────────────┐
+    ┌────▼─┐   ┌────────┐   ┌────────┐   ┌┴─────────┐
+    │NORMAL├─A─►ABNORMAL├┬F─►INCIDENT├─C─►RECOVERING│
+    │      ◄─B─│        ││  │        ◄─E─┤          │
+    └────┬─┘   └────────┘│  └────────┘   └──────────┘
+         └───────────────┘
+
+    A: ABNORMALITY_STARTED
+    B: ABNORMALITY_RECOVERED
+    C: INCIDENT_RECOVERING
+    D: INCIDENT_RECOVERED
+    E: INCIDENT_RECOVERY_FAILED
+    F: INCIDENT_STARTED
+    """
+    # Alias TickAnomalyDecision to improve code readability
+    Decision = TickAnomalyDecision
+
+    if not options.get("crons.tick_volume_anomaly_detection"):
+        return DecisionResult(Decision.NORMAL)
+
+    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
+
+    tick_decision_window = options.get("crons.system_incidents.tick_decision_window")
+
+    # The clock has just ticked to the next minute. Look at the previous tick
+    # and decision metrics.
+    past_ts = tick - timedelta(minutes=1)
+
+    past_window_ts_keys = [
+        _make_reference_ts(past_ts - timedelta(minutes=delta))
+        for delta in range(0, tick_decision_window)
+    ]
+
+    # Fetch histories for metrics and the last decision together. Window
+    # timestamps are reversed so the most recent metric is last.
+    redis_keys = chain(
+        (MONITOR_TICK_METRIC.format(ts=ts) for ts in reversed(past_window_ts_keys)),
+        (MONITOR_TICK_DECISION.format(ts=ts) for ts in [past_window_ts_keys[0]]),
+    )
+
+    values = redis_client.mget(redis_keys)
+
+    # Tick metrics are the first tick_decision_window values
+    tick_metrics = [Metric.from_value(value) for value in values[:-1]]
+    last_metric = tick_metrics[-1]
+
+    # The last decision is the last value fetched
+    if values[-1] is not None:
+        last_decision = Decision.from_str(values[-1])
+    else:
+        # By default the previous decision is used. If there was no previous
+        # decision we can only assume things are operating normally
+        last_decision = Decision.NORMAL
+
+    def make_decision(
+        decision: TickAnomalyDecision,
+        transition: AnomalyTransition | None = None,
+    ) -> DecisionResult:
+        decision_key = MONITOR_TICK_DECISION.format(ts=_make_reference_ts(tick))
+        pipeline = redis_client.pipeline()
+        pipeline.set(decision_key, decision)
+        pipeline.expire(decision_key, MONITOR_VOLUME_RETENTION)
+        pipeline.execute()
+
+        return DecisionResult(decision, transition)
+
+    def metrics_match(metric: Metric) -> Generator[bool]:
+        return (d == metric for d in tick_metrics)
+
+    # A: NORMAL -> ABNORMAL
+    #
+    # If we've detected an anomaly and we're not already in an incident,
+    # anomalous state, or recovering, mark this tick as anomalous.
+    if last_decision == Decision.NORMAL and last_metric == Metric.ABNORMAL:
+        return make_decision(Decision.ABNORMAL, AnomalyTransition.ABNORMALITY_STARTED)
+
+    # B: ABNORMAL -> NORMAL
+    #
+    # If the previous result was anomalous, check if we have recovered and can
+    # backfill these decisions as normal.
+    if last_decision == Decision.ABNORMAL and all(metrics_match(Metric.NORMAL)):
+        _backfill_decisions(past_ts, Decision.NORMAL, Decision.ABNORMAL)
+        return make_decision(Decision.NORMAL, AnomalyTransition.ABNORMALITY_RECOVERED)
+
+    # C: INCIDENT -> RECOVERING
+    #
+    # If we are actively in an incident and the most recent metric value has
+    # recovered to normal we can de-escalate the incident to recovering.
+    if last_decision == Decision.INCIDENT and last_metric == Metric.NORMAL:
+        return make_decision(Decision.RECOVERING, AnomalyTransition.INCIDENT_RECOVERING)
+
+    # D: RECOVERING -> NORMAL
+    #
+    # If the previous result was recovering, check if we have recovered and can
+    # backfill these decisions as normal.
+    if last_decision == Decision.RECOVERING and all(metrics_match(Metric.NORMAL)):
+        _backfill_decisions(past_ts, Decision.NORMAL, Decision.RECOVERING)
+        return make_decision(Decision.NORMAL, AnomalyTransition.INCIDENT_RECOVERED)
+
+    # E: RECOVERING -> INCIDENT
+    #
+    # If an incident had begun recovering but we've detected a non-normal
+    # metric, backfill all recovery decisions to an incident decision.
+    if last_decision == Decision.RECOVERING and last_metric != Metric.NORMAL:
+        _backfill_decisions(past_ts, Decision.INCIDENT, Decision.RECOVERING)
+        return make_decision(Decision.INCIDENT, AnomalyTransition.INCIDENT_RECOVERY_FAILED)
+
+    # F: [NORMAL, ABNORMAL] -> INCIDENT
+    #
+    # If we're not already in an incident and the most recent metric value is
+    # an incident, mark this tick as an incident and backfill all abnormal
+    # decisions to an incident decision.
+    if last_decision != Decision.INCIDENT and last_metric == Metric.INCIDENT:
+        _backfill_decisions(past_ts, Decision.INCIDENT, Decision.ABNORMAL)
+        return make_decision(Decision.INCIDENT, AnomalyTransition.INCIDENT_STARTED)
+
+    # NORMAL     -> NORMAL
+    # ABNORMAL   -> ABNORMAL
+    # INCIDENT   -> INCIDENT
+    # RECOVERING -> RECOVERING
+    #
+    # No decision transition. Use the previous decision
+    return make_decision(last_decision)
+
+
+def get_clock_tick_decision(tick: datetime) -> TickAnomalyDecision | None:
+    """
+    Retrieve the TickAnomalyDecision for a specific clock tick.
+    """
+    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
+
+    if value := redis_client.get(MONITOR_TICK_DECISION.format(ts=_make_reference_ts(tick))):
+        return TickAnomalyDecision.from_str(value)
+    else:
+        return None
+
+
+def _backfill_decisions(
+    start: datetime,
+    decision: TickAnomalyDecision,
+    until_not: TickAnomalyDecision,
+) -> None:
+    """
+    Update historic tick decisions to `decision`, walking backwards one minute
+    at a time from `start` until a tick has no recorded decision or its
+    decision is no longer `until_not`.
+    """
+    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
+    pipeline = redis_client.pipeline()
+    ts = start
+
+    while True:
+        key = MONITOR_TICK_DECISION.format(ts=_make_reference_ts(ts))
+
+        # Nothing to backfill if we don't have a decision value
+        value = redis_client.get(key)
+        if value is None:
+            break
+
+        # Exit the backfill once we no longer have the until_not decision
+        if TickAnomalyDecision.from_str(value) != until_not:
+            break
+
+        # Overwriting a key clears its TTL, so re-apply the retention
+        pipeline.set(key, decision.value)
+        pipeline.expire(key, MONITOR_VOLUME_RETENTION)
+        ts = ts - timedelta(minutes=1)
+
+    # Apply the queued decision updates
+    pipeline.execute()
+
+
 def _make_reference_ts(ts: datetime):
     """
     Produce a timestamp number with the seconds and microsecond removed
diff --git a/src/sentry/monitors/types.py b/src/sentry/monitors/types.py
index f003b5b7d2fc2..afed758efa55f 100644
--- a/src/sentry/monitors/types.py
+++ b/src/sentry/monitors/types.py
@@ -2,7 +2,6 @@
 
 from dataclasses import dataclass
 from datetime import datetime
-from enum import StrEnum
 from typing import Literal, NotRequired, TypedDict, Union
 
 from django.utils.functional import cached_property
@@ -129,23 +128,3 @@ class IntervalSchedule:
 
 
 ScheduleConfig = Union[CrontabSchedule, IntervalSchedule]
-
-
-class TickVolumeAnomolyResult(StrEnum):
-    """
-    This enum represents the result of comparing the minute we ticked past
-    with it's historic volume data. This is used to determine if we may have
-    consumed an anomalous number of check-ins, indicating there is an upstream
-    incident and we care not able to reliably report misses and time-outs.
-
-    A NORMAL result means we've considered the volume to be within the expected
-    volume for that minute. A ANOMALY value indicates there was a drop in
-    volume significant enough to consider it abnormal.
-    """
-
-    NORMAL = "normal"
-    ABNORMAL = "abnormal"
-
-    @classmethod
-    def from_str(cls, st: str) -> TickVolumeAnomolyResult:
-        return cls[st.upper()]
diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py
index e474ba7f9e1ab..49c13bc991020 100644
--- a/src/sentry/options/defaults.py
+++ b/src/sentry/options/defaults.py
@@ -2010,13 +2010,42 @@
 # Killswitch for monitor check-ins
 register("crons.organization.disable-check-in", type=Sequence, default=[])
 
-# Enables anomaly detection based on the volume of check-ins being processed
+# Enables system incident anomaly detection based on the volume of check-ins
+# being processed
 register(
     "crons.tick_volume_anomaly_detection",
     default=False,
     flags=FLAG_BOOL | FLAG_AUTOMATOR_MODIFIABLE,
 )
 
+# The threshold that the tick metric must surpass for a tick to be determined
+# as anomalous. This value should be negative, since we will only determine an
+# incident based on a decrease in volume.
+#
+# See the `monitors.system_incidents` module for more details
+register(
+    "crons.system_incidents.pct_deviation_anomaly_threshold",
+    default=-10,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
+
+# The threshold that the tick metric must surpass to transition to an incident
+# state. This should be a fairly high value to avoid false positive incidents.
+register(
+    "crons.system_incidents.pct_deviation_incident_threshold",
+    default=-30,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
+
+# This is the number of previous ticks we will consider the tick metrics and
+# tick decisions for to determine a decision about the tick being evaluated.
+register(
+    "crons.system_incidents.tick_decision_window",
+    default=5,
+    flags=FLAG_AUTOMATOR_MODIFIABLE,
+)
+
+
 # Sets the timeout for webhooks
 register(
     "sentry-apps.webhook.timeout.sec",
diff --git a/tests/sentry/monitors/test_system_incidents.py b/tests/sentry/monitors/test_system_incidents.py
index 74f59f372a6d0..06f71562b970d 100644
--- a/tests/sentry/monitors/test_system_incidents.py
+++ b/tests/sentry/monitors/test_system_incidents.py
@@ -7,14 +7,21 @@
 from django.utils import timezone
 
 from sentry.monitors.system_incidents import (
+    MONITOR_TICK_METRIC,
     MONITOR_VOLUME_DECISION_STEP,
     MONITOR_VOLUME_HISTORY,
     MONITOR_VOLUME_RETENTION,
+    AnomalyTransition,
+    TickAnomalyDecision,
+    _make_reference_ts,
+    get_clock_tick_decision,
     get_clock_tick_volume_metric,
+    make_clock_tick_decision,
     record_clock_tick_volume_metric,
     update_check_in_volume,
 )
 from sentry.testutils.helpers.options import override_options
+from sentry.testutils.pytest.fixtures import django_db_all
 from sentry.utils import redis
 
 redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
@@ -50,6 +57,24 @@ def fill_historic_volume(
     update_check_in_volume(ts_list)
 
 
+def fill_historic_metrics(start: datetime, metrics: Sequence[float | None]):
+    """
+    Records historic tick metrics in redis, one per minute, beginning at the
+    `start` datetime and moving forward through the `metrics` sequence.
+
+    Entries that are None are skipped, leaving no metric recorded for that
+    minute. A 0.0 metric is a real value and is recorded.
+    """
+    values: dict[str | bytes, float] = {}
+    for index, metric in enumerate(metrics):
+        ts = _make_reference_ts(start + timedelta(minutes=index))
+        key = MONITOR_TICK_METRIC.format(ts=ts)
+        if metric is not None:
+            values[key] = metric
+
+    redis_client.mset(values)
+
+
 def test_update_check_in_volume():
     redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)
 
@@ -258,3 +283,311 @@ def test_record_clock_tiock_volume_metric_uniform(metrics, logger):
         sample_rate=1.0,
     )
     assert get_clock_tick_volume_metric(past_ts) == 0.0
+
+
+@django_db_all
+@override_options(
+    {
+        "crons.tick_volume_anomaly_detection": True,
+        "crons.system_incidents.tick_decision_window": 5,
+        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
+        "crons.system_incidents.pct_deviation_incident_threshold": -25,
+    }
+)
+def test_tick_decision_anomaly_recovery():
+    start = timezone.now().replace(second=0, microsecond=0)
+
+    test_metrics = [
+        # fmt: off
+        # Operating as normal
+        1.0, 4.0, 3.0, 2.0, 2.0, -3.0,
+        # Anomaly detected
+        -6.0, -7.0,
+        # Anomaly recovers to normal
+        -4.0, -3.0, -3.0, -4.0, -1.0
+        # fmt: on
+    ]
+
+    ts = start
+    fill_historic_metrics(ts, test_metrics)
+
+    # First 6 ticks are operating as normal
+    for _ in range(0, 6):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition is None
+
+    # Transition into anomalous state (-6)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.ABNORMAL
+        assert result.transition == AnomalyTransition.ABNORMALITY_STARTED
+
+    # Next 5 ticks (-7, -4, -3, -3, -4) stay in abnormal state
+    for _ in range(0, 5):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.ABNORMAL
+        assert result.transition is None
+
+    # Next tick recovers the abnormality after 5 ticks under the abnormality threshold
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition == AnomalyTransition.ABNORMALITY_RECOVERED
+
+    # The last 6 ABNORMAL ticks transitioned to NORMAL
+    for i in range(1, 7):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.NORMAL
+
+
+@django_db_all
+@override_options(
+    {
+        "crons.tick_volume_anomaly_detection": True,
+        "crons.system_incidents.tick_decision_window": 5,
+        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
+        "crons.system_incidents.pct_deviation_incident_threshold": -25,
+    }
+)
+def test_tick_decisions_simple_incident():
+    """
+    Tests incident detection for an incident that immediately starts and
+    immediately stops.
+    """
+    start = timezone.now().replace(second=0, microsecond=0)
+
+    test_metrics = [
+        # fmt: off
+        # Operating as normal
+        1.0, 4.0, 3.0, 2.0, 2.0, -3.0,
+        # Incident starts immediately
+        -35.0, -80.0, -100.0, -50.0,
+        # Incident quickly recovers
+        -3.0, -2.0, -4.0, -1.0, -4.0
+        # fmt: on
+    ]
+
+    ts = start
+    fill_historic_metrics(ts, test_metrics)
+
+    # First 6 ticks are operating as normal
+    for _ in range(0, 6):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition is None
+
+    # Transition into incident (-35)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition == AnomalyTransition.INCIDENT_STARTED
+
+    # Incident continues (-80, -100, -50)
+    for _ in range(0, 3):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition is None
+
+    # Incident recovers (-3)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERING
+
+    # Incident continues recovery (-2, -4, -1)
+    for _ in range(0, 3):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition is None
+
+    # Incident recovers
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERED
+
+    # The last 4 RECOVERING ticks transitioned to NORMAL
+    for i in range(1, 5):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.NORMAL
+
+
+@django_db_all
+@override_options(
+    {
+        "crons.tick_volume_anomaly_detection": True,
+        "crons.system_incidents.tick_decision_window": 5,
+        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
+        "crons.system_incidents.pct_deviation_incident_threshold": -25,
+    }
+)
+def test_tick_decisions_variable_incident():
+    """
+    Tests an incident that slowly starts and slowly recovers.
+    """
+    start = timezone.now().replace(second=0, microsecond=0)
+
+    test_metrics = [
+        # fmt: off
+        # Operating as normal
+        1.0, 4.0, 3.0, 2.0, 2.0, -3.0,
+        # Anomaly detected
+        -6.0, -7.0,
+        # Metrics below anomaly threshold, but not recovered
+        -4.0, -3.0,
+        # Metrics above anomaly threshold again, but not at incident threshold
+        -10.0,
+        # Incident threshold reached
+        -30.0, -40.0, -38.0, -42.0, -25.0, -20.0, -10.0,
+        # Incident recovering
+        -4.0, -3.0,
+        # Metrics above anomaly threshold, recovery failed
+        -6.0,
+        # Metrics back below anomaly threshold, begin recovering again
+        -2.0, -1.0,
+        # Metrics above incident threshold, recovery failed
+        -30.0,
+        # Metrics below anomaly threshold, incident will recover
+        -3.0, -2.0, -4.0, -4.0, -3.0,
+        # fmt: on
+    ]
+
+    ts = start
+    fill_historic_metrics(ts, test_metrics)
+
+    # First 6 ticks are operating as normal
+    for _ in range(0, 6):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition is None
+
+    # Transition into anomalous state (-6)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.ABNORMAL
+        assert result.transition == AnomalyTransition.ABNORMALITY_STARTED
+
+    # Next 4 ticks (-7, -4, -3, -10) stay in anomaly
+    for _ in range(0, 4):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.ABNORMAL
+        assert result.transition is None
+
+    # Incident starts (-30)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition == AnomalyTransition.INCIDENT_STARTED
+
+    # The last 5 ABNORMAL ticks transitioned to INCIDENT
+    for i in range(1, 6):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.INCIDENT
+
+    # Incident continues (-40, -38, -42, -25, -20, -10)
+    for _ in range(0, 6):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition is None
+
+    # Incident begins recovering (-4)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERING
+
+    # Incident continues to recover (-3)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition is None
+
+    # Incident has anomalous tick again (-6), not fully recovered
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERY_FAILED
+
+    # The last 2 RECOVERING ticks transitioned back to incident
+    for i in range(1, 3):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.INCIDENT
+
+    # Incident begins recovering again (-2)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERING
+
+    # Incident continues to recover (-1)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition is None
+
+    # Incident has incident tick again (-30), not fully recovered
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.INCIDENT
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERY_FAILED
+
+    # The last 2 RECOVERING ticks transitioned back to incident
+    for i in range(1, 3):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.INCIDENT
+
+    # Incident begins recovering again (-3)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERING
+
+    # Incident continues to recover for the next 3 normal ticks (-2, -4, -4)
+    for _ in range(0, 3):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.RECOVERING
+        assert result.transition is None
+
+    # Incident recovers at the final 5th tick (-3)
+    for _ in range(0, 1):
+        result = make_clock_tick_decision(ts := ts + timedelta(minutes=1))
+        assert result.decision == TickAnomalyDecision.NORMAL
+        assert result.transition == AnomalyTransition.INCIDENT_RECOVERED
+
+    # The last 4 RECOVERING ticks transitioned to NORMAL
+    for i in range(1, 5):
+        assert get_clock_tick_decision(ts - timedelta(minutes=i)) == TickAnomalyDecision.NORMAL
+
+    # The final tick decision history looks like this
+    decision_history = [
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.INCIDENT,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+        TickAnomalyDecision.NORMAL,
+    ]
+
+    ts = start + timedelta(minutes=1)
+    for i, expected in enumerate(decision_history):
+        decision = get_clock_tick_decision(ts + timedelta(minutes=i))
+        assert decision == expected