diff --git a/bootstrap/main.tf b/bootstrap/main.tf
index da61f70..32e285b 100644
--- a/bootstrap/main.tf
+++ b/bootstrap/main.tf
@@ -23,6 +23,7 @@ module "ogmios_v1_feature" {
   api_key_salt = var.api_key_salt
   dcu_per_frame = var.dcu_per_frame
   dns_zone = var.dns_zone
+  resources = var.operator_resources
 }
 
 module "ogmios_v1_proxy" {
@@ -76,8 +77,26 @@ module "ogmios_instances" {
   ogmios_image = each.value.ogmios_image
   node_private_dns = each.value.node_private_dns
   ogmios_version = each.value.ogmios_version
-  tolerations = each.value.tolerations
   replicas = each.value.replicas
+  tolerations = coalesce(each.value.tolerations, [
+    {
+      effect = "NoSchedule"
+      key = "demeter.run/compute-profile"
+      operator = "Exists"
+    },
+    {
+      effect = "NoSchedule"
+      key = "demeter.run/compute-arch"
+      operator = "Equal"
+      value = "arm64"
+    },
+    {
+      effect = "NoSchedule"
+      key = "demeter.run/availability-sla"
+      operator = "Equal"
+      value = "consistent"
+    }
+  ])
 }
 
 module "ogmios_services" {
@@ -90,4 +109,8 @@ module "ogmios_services" {
   network = each.value.network
 }
 
+module "ogmios_monitoring" {
+  source = "./monitoring"
+  o11y_datasource_uid = var.o11y_datasource_uid
+}
 
diff --git a/bootstrap/monitoring/main.tf b/bootstrap/monitoring/main.tf
new file mode 100644
index 0000000..4f2085b
--- /dev/null
+++ b/bootstrap/monitoring/main.tf
@@ -0,0 +1,107 @@
+terraform {
+  required_providers {
+    grafana = {
+      source  = "grafana/grafana"
+      version = ">= 1.28.2"
+    }
+  }
+}
+
+# UID of the Grafana datasource that query A of the alert rule runs against.
+variable "o11y_datasource_uid" {
+  type = string
+}
+
+resource "grafana_folder" "folder" {
+  title = "Ogmios"
+}
+
+# Alert rule group, evaluated every 60 seconds, that flags pods which reported
+# `ogmios_connected` an hour ago but are no longer reporting it.
+resource "grafana_rule_group" "instance_is_down" {
+  name             = "Ogmios is down"
+  folder_uid       = grafana_folder.folder.uid
+  interval_seconds = 60
+  org_id           = 1
+
+  rule {
+    name           = "Ogmios is down"
+    condition      = "B"
+    for            = "5m"
+    no_data_state  = "OK"
+    exec_err_state = "OK"
+    annotations = {
+      description = "We are not receiving more metrics from a particular Ogmios instance.",
+      summary     = "{{ range $k, $v := $values -}}\n{{ if (match \"A[0-9]+\" $k) -}}\nPod: {{ $v.Labels.pod }}\n{{ end }}\n{{ end }}"
+    }
+
+    # Query A: pods that had `ogmios_connected` samples one hour ago but have
+    # none in the last 10 minutes (`unless` drops pods that are still reporting).
+    data {
+      ref_id         = "A"
+      datasource_uid = var.o11y_datasource_uid
+
+      relative_time_range {
+        from = 3600
+        to   = 0
+      }
+
+      model = jsonencode({
+        editorMode    = "code",
+        expr          = "count(avg_over_time(ogmios_connected[10m] offset 1h)) by (pod) unless count(avg_over_time(ogmios_connected[10m])) by (pod)",
+        hide          = false,
+        intervalMs    = 1000,
+        legendFormat  = "__auto",
+        maxDataPoints = 43200,
+        range         = true,
+        refId         = "A"
+      })
+    }
+
+    # Expression B: classic condition over query A; true when the count of
+    # non-null values returned by A is greater than 0.
+    data {
+      ref_id         = "B"
+      datasource_uid = "-100"
+
+      relative_time_range {
+        from = 3600
+        to   = 0
+      }
+
+      model = jsonencode({
+        conditions = [
+          {
+            evaluator = {
+              params = [0]
+              type   = "gt"
+            },
+            operator = {
+              type = "and"
+            },
+            query = {
+              params = [
+                "A"
+              ]
+            },
+            reducer = {
+              params = [],
+              type   = "count_non_null"
+            },
+            type = "query"
+          }
+        ],
+        datasource = {
+          type = "__expr__",
+          uid  = "-100"
+        },
+        expression    = "A",
+        hide          = false,
+        intervalMs    = 1000,
+        maxDataPoints = 43200,
+        refId         = "B",
+        type          = "classic_conditions"
+      })
+    }
+  }
+}
diff --git a/bootstrap/variables.tf b/bootstrap/variables.tf
index 800794e..9422e23 100644
--- a/bootstrap/variables.tf
+++ b/bootstrap/variables.tf
@@ -32,6 +32,10 @@ variable "versions" {
   default = ["5", "6"]
 }
 
+variable "o11y_datasource_uid" {
+  type = string
+}
+
 // operator settings
 
 variable "operator_image_tag" {
@@ -145,7 +149,6 @@ variable "instances" {
     ogmios_image = string
     node_private_dns = string
     ogmios_version = string
-    compute_arch = string
     replicas = number
     resources = optional(object({
       limits = object({