diff --git a/src/sentry/monitors/system_incidents.py b/src/sentry/monitors/system_incidents.py index 6a339098a8e67a..544d3269e5da62 100644 --- a/src/sentry/monitors/system_incidents.py +++ b/src/sentry/monitors/system_incidents.py @@ -11,8 +11,11 @@ import logging import statistics from collections import Counter -from collections.abc import Sequence +from collections.abc import Generator, Sequence +from dataclasses import dataclass from datetime import datetime, timedelta +from enum import StrEnum +from itertools import chain from django.conf import settings @@ -27,6 +30,9 @@ # This key is used to record the metric volume metric for the tick. MONITOR_TICK_METRIC = "sentry.monitors.volume_metric:{ts}" +# This key is used to record the anomaly decision for a tick +MONITOR_TICK_DECISION = "sentry.monitors.tick_decision:{ts}" + # When fetching historic volume data to make a decision whether we have lost # data this value will determine how many historic volume data-points we fetch # of the window of the MONITOR_VOLUME_RETENTION. It is important to consider @@ -42,6 +48,10 @@ # We record 30 days worth of historical data for each minute of check-ins. MONITOR_VOLUME_RETENTION = timedelta(days=30) +# This is the number of previous ticks we will consider the tick metrics and +# tick decisions for to determine a decision about the tick being evaluated. +MONITOR_TICK_DECISION_WINDOW = 5 + def update_check_in_volume(ts_list: Sequence[datetime]): """ @@ -166,6 +176,331 @@ def get_clock_tick_volume_metric(tick: datetime) -> float | None: return None +class TickAnomalyDecision(StrEnum): + """ + This enum represents the system incident anomaly decision made for a + clock-tick. Tick transitions are represented by the AnomalyTransition. + """ + + NORMAL = "normal" + """ + The tick is within expected volume levels and does not show any + abnormalities. The system is working as normal. 
+ """ + + ABNORMAL = "abnormal" + """ + The volume metrics have indicated that we've seen an abnormal number of + check-ins for this tick. We may be entering an INCIDENT state. + + All abnormal tick decisions will be contiguous, and will resolve into + either NORMAL or INCIDENT. + """ + + INCIDENT = "incident" + """ + The volume metrics have indicated that we are in a system incident, this + means we are not processing as many check-ins as we typically do. + + Once in an incident we will transition into RECOVERING once we've detected + enough normal volume metrics. + """ + + RECOVERING = "recovering" + """ + We are transitioning out of an incident. Volume metrics must remain below + abnormal levels in order for RECOVERING to transition into NORMAL. + + All recovering tick decisions will be contiguous, and will resolve into + either NORMAL or back into INCIDENT. + """ + + @classmethod + def from_str(cls, st: str) -> TickAnomalyDecision: + return cls[st.upper()] + + +class AnomalyTransition(StrEnum): + ABNORMALITY_STARTED = "abnormality_started" + """ + An abnormality has been detected during normal operations. We may + transition into a complete system incident, or the abnormality may recover + to normal. + """ + + ABNORMALITY_RECOVERED = "abnormality_recovered" + """ + An abnormality has recovered back to a normal status. + """ + + INCIDENT_STARTED = "incident_started" + """ + A system incident has been detected based on the historic check-in volume. + We are no longer able to reliably know that we are receving all check-ins. + """ + + INCIDENT_RECOVERING = "incident_recovering" + """ + An incident has begun to recover. After this transition we will either + re-enter the incident va INCIDENT_STARTED or fully recover via + INCIDENT_RECOVERED. + """ + + INCIDENT_RECOVERY_FAILED = "incident_recovery_failed" + """ + An incident failed to recover and has re-entered the incident state. 
+ """ + + INCIDENT_RECOVERED = "incident_recovered" + """ + An incident has recovered back to normal. + """ + + +@dataclass +class DecisionResult: + decision: TickAnomalyDecision + """ + The recorded decision made for the clock tick + """ + + transition: AnomalyTransition | None = None + """ + Reflects the transition status when making a tick decision results in a + state transition. None if the decision has not changed. + """ + + +class Metric(StrEnum): + """ + A metric is similar to a tick decision, however it represents a decision + made on the volume metric. The metric we current consider is percent mean + deviation from historic volumes. + """ + + NORMAL = "normal" + """ + The metric is below the abnormal threshold. + """ + + ABNORMAL = "abnormal" + """ + The metric has surpassed the normal threshold but is still below the + incident threshold. + """ + + INCIDENT = "incident" + """ + The metric has surpassed the incident threshold + """ + + @staticmethod + def from_value(value: float | str | None) -> Metric: + """ + Determine an individual decision for the percentage deviation metric of a + clock tick. This only considers metrics that are negative, indicating + there's been a drop in check-in volume. 
def make_clock_tick_decision(tick: datetime) -> DecisionResult:
    """
    Given a clock tick timestamp determine based on the historic tick volume
    metrics, and historic tick anomaly decisions, a DecisionResult.

    The decision is recorded for `tick` before it is returned. This function
    will also update previous decisions for earlier ticks detected as ABNORMAL
    or RECOVERING to either NORMAL or INCIDENT.

    When the `crons.tick_volume_anomaly_detection` option is disabled a NORMAL
    result is returned and nothing is read or recorded.

    The state transitions for tick decisions are as follows

         ┌───D────────────────────────────┐
    ┌────▼─┐   ┌────────┐   ┌────────┐   ┌┴─────────┐
    │NORMAL├─A─►ABNORMAL├┬F─►INCIDENT├─C─►RECOVERING│
    │      ◄─B─│        ││  │        ◄─E─┤          │
    └────┬─┘   └────────┘│  └────────┘   └──────────┘
         └───────────────┘

    A: ABNORMALITY_STARTED
    B: ABNORMALITY_RECOVERED
    C: INCIDENT_RECOVERING
    D: INCIDENT_RECOVERED
    E: INCIDENT_RECOVERY_FAILED
    F: INCIDENT_STARTED
    """
    # Alias TickAnomalyDecision to improve code readability
    Decision = TickAnomalyDecision

    if not options.get("crons.tick_volume_anomaly_detection"):
        return DecisionResult(Decision.NORMAL)

    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)

    # The window must contain at least the previous tick. Without this a
    # window option of 0 (or less) produces an empty key list and the
    # `past_window_ts_keys[0]` lookup below raises an IndexError.
    tick_decision_window = max(1, options.get("crons.system_incidents.tick_decision_window"))

    # The clock has just ticked to the next minute. Look at the previous tick
    # and decision metrics.
    past_ts = tick - timedelta(minutes=1)

    # Index 0 is the most recent (previous) tick, the last entry is the oldest
    past_window_ts_keys = [
        _make_reference_ts(past_ts - timedelta(minutes=delta))
        for delta in range(0, tick_decision_window)
    ]

    # Fetch histories for metrics and the last decision together. Window
    # timestamps are reversed so the metrics are ordered oldest to newest,
    # followed by the decision of the previous tick.
    redis_keys = chain(
        (MONITOR_TICK_METRIC.format(ts=ts) for ts in reversed(past_window_ts_keys)),
        (MONITOR_TICK_DECISION.format(ts=ts) for ts in [past_window_ts_keys[0]]),
    )

    values = redis_client.mget(redis_keys)

    # Tick metrics are the first tick_decision_window values. The newest
    # metric (the one for the previous tick) is the last of them.
    tick_metrics = [Metric.from_value(value) for value in values[:-1]]
    last_metric = tick_metrics[-1]

    # The last decision is the last value fetched
    if values[-1] is not None:
        last_decision = Decision.from_str(values[-1])
    else:
        # By default the previous decision is used. If there was no previous
        # decision we can only assume things are operating normally
        last_decision = Decision.NORMAL

    def make_decision(
        decision: TickAnomalyDecision,
        transition: AnomalyTransition | None = None,
    ) -> DecisionResult:
        # Record the decision for the tick being evaluated, expiring it along
        # with the rest of the volume history.
        decision_key = MONITOR_TICK_DECISION.format(ts=_make_reference_ts(tick))
        pipeline = redis_client.pipeline()
        pipeline.set(decision_key, decision)
        pipeline.expire(decision_key, MONITOR_VOLUME_RETENTION)
        pipeline.execute()

        return DecisionResult(decision, transition)

    def metrics_match(metric: Metric) -> Generator[bool]:
        # One boolean per metric in the window, for use with all() / any()
        return (d == metric for d in tick_metrics)

    # NOTE: The order of the checks below matters. D must be evaluated before
    # E, and E before F, so that F only ever sees NORMAL or ABNORMAL as the
    # previous decision.

    # A: NORMAL -> ABNORMAL
    #
    # If we've detected an anomaly and we're not already in an incident,
    # anomalous state, or recovering, mark this tick as anomalous.
    if last_decision == Decision.NORMAL and last_metric == Metric.ABNORMAL:
        return make_decision(Decision.ABNORMAL, AnomalyTransition.ABNORMALITY_STARTED)

    # B: ABNORMAL -> NORMAL
    #
    # If the previous result was anomalous, check if every metric in the
    # window is back to normal. If so we have recovered and can backfill the
    # abnormal decisions as normal.
    if last_decision == Decision.ABNORMAL and all(metrics_match(Metric.NORMAL)):
        _backfill_decisions(past_ts, Decision.NORMAL, Decision.ABNORMAL)
        return make_decision(Decision.NORMAL, AnomalyTransition.ABNORMALITY_RECOVERED)

    # C: INCIDENT -> RECOVERING
    #
    # If we are actively in an incident and the most recent metric value has
    # recovered to normal we can de-escalate the incident to recovering.
    if last_decision == Decision.INCIDENT and last_metric == Metric.NORMAL:
        return make_decision(Decision.RECOVERING, AnomalyTransition.INCIDENT_RECOVERING)

    # D: RECOVERING -> NORMAL
    #
    # If the previous result was recovering, check if every metric in the
    # window is back to normal. If so we have recovered and can backfill the
    # recovering decisions as normal.
    if last_decision == Decision.RECOVERING and all(metrics_match(Metric.NORMAL)):
        _backfill_decisions(past_ts, Decision.NORMAL, Decision.RECOVERING)
        return make_decision(Decision.NORMAL, AnomalyTransition.INCIDENT_RECOVERED)

    # E: RECOVERING -> INCIDENT
    #
    # If an incident had begun recovering but we've detected a non-normal
    # metric, backfill all recovery decisions to an incident decision.
    if last_decision == Decision.RECOVERING and last_metric != Metric.NORMAL:
        _backfill_decisions(past_ts, Decision.INCIDENT, Decision.RECOVERING)
        return make_decision(Decision.INCIDENT, AnomalyTransition.INCIDENT_RECOVERY_FAILED)

    # F: [NORMAL, ABNORMAL] -> INCIDENT
    #
    # If we're not already in an incident and the most recent metric value is
    # an incident, mark this tick as an incident and backfill all abnormal
    # decisions to an incident decision.
    if last_decision != Decision.INCIDENT and last_metric == Metric.INCIDENT:
        _backfill_decisions(past_ts, Decision.INCIDENT, Decision.ABNORMAL)
        return make_decision(Decision.INCIDENT, AnomalyTransition.INCIDENT_STARTED)

    # NORMAL     -> NORMAL
    # ABNORMAL   -> ABNORMAL
    # INCIDENT   -> INCIDENT
    # RECOVERING -> RECOVERING
    #
    # No decision transition. Use the previous decision
    return make_decision(last_decision)
def get_clock_tick_decision(tick: datetime) -> TickAnomalyDecision | None:
    """
    Retrieve the TickAnomalyDecision for a specific clock tick.

    Returns None when no decision value is stored for the tick.
    """
    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)

    if value := redis_client.get(MONITOR_TICK_DECISION.format(ts=_make_reference_ts(tick))):
        return TickAnomalyDecision.from_str(value)
    else:
        return None


def _backfill_decisions(
    start: datetime,
    decision: TickAnomalyDecision,
    until_not: TickAnomalyDecision,
) -> None:
    """
    Update historic tick decisions from `start` to `decision` until we no
    longer see the `until_not` decision.

    Ticks are visited one minute at a time moving backwards from `start`
    (inclusive). The walk stops at the first tick that has no recorded
    decision, or whose decision is not `until_not`.
    """
    redis_client = redis.redis_clusters.get(settings.SENTRY_MONITORS_REDIS_CLUSTER)

    ts = start
    keys_to_update: list[str] = []

    while True:
        key = MONITOR_TICK_DECISION.format(ts=_make_reference_ts(ts))

        # Nothing to backfill if we don't have a decision value
        value = redis_client.get(key)
        if value is None:
            break

        # Exit the backfill once we no longer have the until_not decision
        prev_decision = TickAnomalyDecision.from_str(value)
        if prev_decision != until_not:
            break

        keys_to_update.append(key)
        ts = ts - timedelta(minutes=1)

    if not keys_to_update:
        return

    # Apply decision updates. Overwriting a key discards the expiry it was
    # originally recorded with, so the expiry is re-applied alongside every
    # write (mirroring how decisions are first recorded). Without this the
    # backfilled decisions would never expire.
    pipeline = redis_client.pipeline()
    for key in keys_to_update:
        pipeline.set(key, decision.value)
        pipeline.expire(key, MONITOR_VOLUME_RETENTION)
    pipeline.execute()
- """ - - NORMAL = "normal" - ABNORMAL = "abnormal" - - @classmethod - def from_str(cls, st: str) -> TickVolumeAnomolyResult: - return cls[st.upper()] diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index e474ba7f9e1ab6..49c13bc991020d 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2010,13 +2010,42 @@ # Killswitch for monitor check-ins register("crons.organization.disable-check-in", type=Sequence, default=[]) -# Enables anomaly detection based on the volume of check-ins being processed +# Enables system incident anomaly detection based on the volume of check-ins +# being processed register( "crons.tick_volume_anomaly_detection", default=False, flags=FLAG_BOOL | FLAG_AUTOMATOR_MODIFIABLE, ) +# The threshold that the tick metric must surpass for a tick to be determined +# as anomalous. This value should be negative, since we will only determine an +# incident based on a decrease in volume. +# +# See the `monitors.system_incidents` module for more details +register( + "crons.system_incidents.pct_deviation_anomaly_threshold", + default=-10, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) + +# The threshold that the tick metric must surpass to transition to an incident +# state. This should be a fairly high value to avoid false positive incidents. +register( + "crons.system_incidents.pct_deviation_incident_threshold", + default=-30, + flags=FLAG_AUTOMATOR_MODIFIABLE, +) + +# This is the number of previous ticks we will consider the tick metrics and +# tick decisions for to determine a decision about the tick being evaluated. 
def fill_historic_metrics(start: datetime, metrics: Sequence[float | None]) -> None:
    """
    Records historic tick metrics beginning at the `start` datetime. The first
    entry of `metrics` is recorded for `start`, and each following entry is
    recorded one minute after the previous one.

    A `None` entry leaves that minute without a recorded metric. A metric of
    `0.0` is a real value and is recorded like any other.
    """
    values: dict[str | bytes, float] = {}
    for index, metric in enumerate(metrics):
        # Only `None` means "no metric". A plain truthiness check would also
        # drop a legitimate 0.0 metric.
        if metric is None:
            continue
        ts = _make_reference_ts(start + timedelta(minutes=index))
        values[MONITOR_TICK_METRIC.format(ts=ts)] = metric

    # NOTE(review): MSET needs at least one key/value pair, so skip the call
    # when there is nothing to record.
    if values:
        redis_client.mset(values)
@django_db_all
@override_options(
    {
        "crons.tick_volume_anomaly_detection": True,
        "crons.system_incidents.tick_decision_window": 5,
        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
        "crons.system_incidents.pct_deviation_incident_threshold": -25,
    }
)
def test_tick_decision_anomaly_recovery():
    """
    An abnormality is detected and then recovers back to normal without ever
    escalating into an incident.
    """
    start = timezone.now().replace(second=0, microsecond=0)

    operating_normally = [1.0, 4.0, 3.0, 2.0, 2.0, -3.0]
    anomaly_detected = [-6.0, -7.0]
    recovering_to_normal = [-4.0, -3.0, -3.0, -4.0, -1.0]

    fill_historic_metrics(
        start,
        [*operating_normally, *anomaly_detected, *recovering_to_normal],
    )

    # (number of ticks, expected decision, expected transition)
    expected_phases = [
        # First 6 ticks are operating as normal
        (6, TickAnomalyDecision.NORMAL, None),
        # Transition into anomalous state (-6)
        (1, TickAnomalyDecision.ABNORMAL, AnomalyTransition.ABNORMALITY_STARTED),
        # Next 5 ticks (-7, -4, -3, -3, -4) stay in abnormal state
        (5, TickAnomalyDecision.ABNORMAL, None),
        # Next tick recovers the abnormality after 5 ticks under the
        # abnormality threshold
        (1, TickAnomalyDecision.NORMAL, AnomalyTransition.ABNORMALITY_RECOVERED),
    ]

    ts = start
    for tick_count, expected_decision, expected_transition in expected_phases:
        for _ in range(tick_count):
            ts = ts + timedelta(minutes=1)
            result = make_clock_tick_decision(ts)
            assert result.decision == expected_decision
            assert result.transition == expected_transition

    # The last 6 ABNORMAL ticks transitioned to NORMAL
    for minutes_back in range(1, 7):
        past_decision = get_clock_tick_decision(ts - timedelta(minutes=minutes_back))
        assert past_decision == TickAnomalyDecision.NORMAL
@django_db_all
@override_options(
    {
        "crons.tick_volume_anomaly_detection": True,
        "crons.system_incidents.tick_decision_window": 5,
        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
        "crons.system_incidents.pct_deviation_incident_threshold": -25,
    }
)
def test_tick_decisions_simple_incident():
    """
    Covers incident detection when the incident begins abruptly and ends
    abruptly.
    """
    start = timezone.now().replace(second=0, microsecond=0)

    operating_normally = [1.0, 4.0, 3.0, 2.0, 2.0, -3.0]
    incident_starts_immediately = [-35.0, -80.0, -100.0, -50.0]
    incident_quickly_recovers = [-3.0, -2.0, -4.0, -1.0, -4.0]

    fill_historic_metrics(
        start,
        [*operating_normally, *incident_starts_immediately, *incident_quickly_recovers],
    )

    # (number of ticks, expected decision, expected transition)
    expected_phases = [
        # First 6 ticks are operating as normal
        (6, TickAnomalyDecision.NORMAL, None),
        # Transition into incident (-35)
        (1, TickAnomalyDecision.INCIDENT, AnomalyTransition.INCIDENT_STARTED),
        # Incident continues (-80, -100, -50)
        (3, TickAnomalyDecision.INCIDENT, None),
        # Incident recovers (-3)
        (1, TickAnomalyDecision.RECOVERING, AnomalyTransition.INCIDENT_RECOVERING),
        # Incident continues recovery (-2, -4, -1)
        (3, TickAnomalyDecision.RECOVERING, None),
        # Incident recovers
        (1, TickAnomalyDecision.NORMAL, AnomalyTransition.INCIDENT_RECOVERED),
    ]

    ts = start
    for tick_count, expected_decision, expected_transition in expected_phases:
        for _ in range(tick_count):
            ts = ts + timedelta(minutes=1)
            result = make_clock_tick_decision(ts)
            assert result.decision == expected_decision
            assert result.transition == expected_transition

    # The last 4 RECOVERING ticks transitioned to NORMAL
    for minutes_back in range(1, 5):
        past_decision = get_clock_tick_decision(ts - timedelta(minutes=minutes_back))
        assert past_decision == TickAnomalyDecision.NORMAL
@django_db_all
@override_options(
    {
        "crons.tick_volume_anomaly_detection": True,
        "crons.system_incidents.tick_decision_window": 5,
        "crons.system_incidents.pct_deviation_anomaly_threshold": -5,
        "crons.system_incidents.pct_deviation_incident_threshold": -25,
    }
)
def test_tick_decisions_variable_incident():
    """
    Covers an incident with a gradual onset and a gradual recovery, including
    recoveries that fail part way through.
    """
    Decision = TickAnomalyDecision
    Transition = AnomalyTransition
    one_minute = timedelta(minutes=1)

    start = timezone.now().replace(second=0, microsecond=0)

    test_metrics = [
        # fmt: off
        # Operating as normal
        1.0, 4.0, 3.0, 2.0, 2.0, -3.0,
        # Anomaly detected
        -6.0, -7.0,
        # Metrics below anomaly threshold, but not recovered
        -4.0, -3.0,
        # Metrics above anomaly threshold again, but not at incident threshold
        -10.0,
        # Incident threshold reached
        -30.0, -40.0, -38.0, -42.0, -25.0, -20.0, -10.0,
        # Incident recovering
        -4.0, -3.0,
        # Metrics above anomaly threshold, recovery failed
        -6.0,
        # Metrics back below anomaly threshold, begin recovering again
        -2.0, -1.0,
        # Metrics above incident threshold, recovery failed
        -30.0,
        # Metrics below anomaly threshold, incident will recover
        -3.0, -2.0, -4.0, -4.0, -3.0,
        # fmt: on
    ]
    fill_historic_metrics(start, test_metrics)

    ts = start

    def run_ticks(
        count: int,
        expected_decision: TickAnomalyDecision,
        expected_transition: AnomalyTransition | None = None,
    ) -> None:
        # Advance the clock `count` minutes, making a decision at each minute
        nonlocal ts
        for _ in range(count):
            ts = ts + one_minute
            result = make_clock_tick_decision(ts)
            assert result.decision == expected_decision
            assert result.transition == expected_transition

    def assert_previous_decisions(count: int, expected_decision: TickAnomalyDecision) -> None:
        # Check the recorded decisions of the `count` ticks before the
        # current tick
        for minutes_back in range(1, count + 1):
            assert get_clock_tick_decision(ts - minutes_back * one_minute) == expected_decision

    # First 6 ticks are operating as normal
    run_ticks(6, Decision.NORMAL)

    # Transition into anomalous state (-6)
    run_ticks(1, Decision.ABNORMAL, Transition.ABNORMALITY_STARTED)

    # Next 4 ticks (-7, -4, -3, -10) stay in anomaly
    run_ticks(4, Decision.ABNORMAL)

    # Incident starts (-30)
    run_ticks(1, Decision.INCIDENT, Transition.INCIDENT_STARTED)

    # The last 5 ABNORMAL ticks transitioned to INCIDENT
    assert_previous_decisions(5, Decision.INCIDENT)

    # Incident continues (-40, -38, -42, -25, -20, -10)
    run_ticks(6, Decision.INCIDENT)

    # Incident begins recovering (-4)
    run_ticks(1, Decision.RECOVERING, Transition.INCIDENT_RECOVERING)

    # Incident continues to recover (-3)
    run_ticks(1, Decision.RECOVERING)

    # Incident has anomalous tick again (-6), not fully recovered
    run_ticks(1, Decision.INCIDENT, Transition.INCIDENT_RECOVERY_FAILED)

    # The last 2 RECOVERING ticks transitioned back to incident
    assert_previous_decisions(2, Decision.INCIDENT)

    # Incident begins recovering again (-2)
    run_ticks(1, Decision.RECOVERING, Transition.INCIDENT_RECOVERING)

    # Incident continues to recover (-1)
    run_ticks(1, Decision.RECOVERING)

    # Incident has incident tick again (-30), not fully recovered
    run_ticks(1, Decision.INCIDENT, Transition.INCIDENT_RECOVERY_FAILED)

    # The last 2 RECOVERING ticks transitioned back to incident
    assert_previous_decisions(2, Decision.INCIDENT)

    # Incident begins recovering again (-3)
    run_ticks(1, Decision.RECOVERING, Transition.INCIDENT_RECOVERING)

    # Incident continues to recover for the next 3 normal ticks (-2, -4, -4)
    run_ticks(3, Decision.RECOVERING)

    # Incident recovers at the final 5th tick (-3)
    run_ticks(1, Decision.NORMAL, Transition.INCIDENT_RECOVERED)

    # The last 4 RECOVERING ticks transitioned to NORMAL
    assert_previous_decisions(4, Decision.NORMAL)

    # The final tick decision history: 6 normal ticks, 18 incident ticks
    # (backfilled abnormal ticks, the incident itself, and the failed
    # recoveries), then 5 normal ticks once the incident has recovered.
    decision_history = [Decision.NORMAL] * 6 + [Decision.INCIDENT] * 18 + [Decision.NORMAL] * 5

    first_tick = start + one_minute
    for offset, expected in enumerate(decision_history):
        decision = get_clock_tick_decision(first_tick + timedelta(minutes=offset))
        assert decision == expected