Skip to content

Commit

Permalink
Deploy trino (#2280)
Browse files Browse the repository at this point in the history
* setup trino

* temp: update branch to watch

* Add trino to prod

* Add trino

* test

* fix

* fix: destroy trino

* fix: restore trino

* fix

* fix: increase writers by 10

* update log

* some changes for trino

* fixes

* use postgres

* further improvements

* More fixes

* redeploy trino

* fix

* more fix

* go big

* make weekly less concurrent

* restore flux branch
  • Loading branch information
ravenac95 authored Oct 2, 2024
1 parent df1ef3f commit cc78136
Show file tree
Hide file tree
Showing 14 changed files with 402 additions and 33 deletions.
109 changes: 109 additions & 0 deletions manual-work-log.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Manual work

```sql
-- External Hive table over the 2024-09-30 Parquet export of artifacts_by_project_v1.
-- Registers schema + GCS location only; no data is copied.
-- IF NOT EXISTS keeps this manual runbook step idempotent on re-run.
CREATE TABLE IF NOT EXISTS "source"."default"."artifacts_by_project_v1_source" (
    artifact_id VARCHAR,
    artifact_source_id VARCHAR,
    artifact_source VARCHAR,
    artifact_namespace VARCHAR,
    artifact_name VARCHAR,
    project_id VARCHAR,
    project_source VARCHAR,
    project_namespace VARCHAR,
    project_name VARCHAR
)
WITH (
    external_location = 'gs://oso-dataset-transfer-bucket/trino/20240930/artifacts_by_project_v1/',
    format = 'PARQUET'
);
```

```sql
-- External Hive table over the 2024-09-30 Parquet export of projects_by_collection_v1.
-- Registers schema + GCS location only; no data is copied.
-- IF NOT EXISTS keeps this manual runbook step idempotent on re-run.
CREATE TABLE IF NOT EXISTS "source"."default"."projects_by_collection_v1_source" (
    project_id VARCHAR,
    project_source VARCHAR,
    project_namespace VARCHAR,
    project_name VARCHAR,
    collection_id VARCHAR,
    collection_source VARCHAR,
    collection_namespace VARCHAR,
    collection_name VARCHAR
)
WITH (
    external_location = 'gs://oso-dataset-transfer-bucket/trino/20240930/projects_by_collection_v1/',
    format = 'PARQUET'
);
```

```sql
-- External Hive table over the 2024-09-30 Parquet export of
-- timeseries_events_by_artifact_v0. Registers schema + GCS location only.
-- IF NOT EXISTS keeps this manual runbook step idempotent on re-run.
CREATE TABLE IF NOT EXISTS "source"."default"."timeseries_events_by_artifact_v0_source" (
    time TIMESTAMP,
    to_artifact_id VARCHAR,
    from_artifact_id VARCHAR,
    event_type VARCHAR,
    event_source_id VARCHAR,
    event_source VARCHAR,
    amount DOUBLE
)
WITH (
    external_location = 'gs://oso-dataset-transfer-bucket/trino/20240930/timeseries_events_by_artifact_v0/',
    format = 'PARQUET'
);
```

```sql
-- Iceberg destination table for timeseries events in the "metrics" catalog.
-- Partitioned by event day and event_type so downstream metric queries can
-- prune both time ranges and event kinds.
CREATE TABLE IF NOT EXISTS "metrics"."default"."timeseries_events_by_artifact_v0" (
    time TIMESTAMP,
    to_artifact_id VARCHAR,
    from_artifact_id VARCHAR,
    event_type VARCHAR,
    event_source_id VARCHAR,
    event_source VARCHAR,
    amount DOUBLE
)
WITH (partitioning = ARRAY['day(time)', 'event_type']);
```

```sql
-- Iceberg destination table for projects_by_collection_v1 in the "metrics"
-- catalog. Small dimension-style table; no partitioning needed.
CREATE TABLE IF NOT EXISTS "metrics"."default"."projects_by_collection_v1" (
    project_id VARCHAR,
    project_source VARCHAR,
    project_namespace VARCHAR,
    project_name VARCHAR,
    collection_id VARCHAR,
    collection_source VARCHAR,
    collection_namespace VARCHAR,
    collection_name VARCHAR
);
```

```sql
-- Iceberg destination table for artifacts_by_project_v1 in the "metrics"
-- catalog. Small dimension-style table; no partitioning needed.
CREATE TABLE IF NOT EXISTS "metrics"."default"."artifacts_by_project_v1" (
    artifact_id VARCHAR,
    artifact_source_id VARCHAR,
    artifact_source VARCHAR,
    artifact_namespace VARCHAR,
    artifact_name VARCHAR,
    project_id VARCHAR,
    project_source VARCHAR,
    project_namespace VARCHAR,
    project_name VARCHAR
);
```

Write the data from the external Parquet source tables into the Iceberg tables:

```sql
-- Backfill the Iceberg table from the external Parquet-backed source table.
-- Explicit column lists (instead of SELECT *) fail loudly if the two schemas
-- ever drift, rather than silently mismatching columns by position.
INSERT INTO "metrics"."default"."timeseries_events_by_artifact_v0"
    (time, to_artifact_id, from_artifact_id, event_type, event_source_id, event_source, amount)
SELECT
    time,
    to_artifact_id,
    from_artifact_id,
    event_type,
    event_source_id,
    event_source,
    amount
FROM "source"."default"."timeseries_events_by_artifact_v0_source";
```

```sql
-- Backfill the Iceberg table from the external Parquet-backed source table.
-- Explicit column lists (instead of SELECT *) fail loudly on schema drift.
INSERT INTO "metrics"."default"."projects_by_collection_v1"
    (project_id, project_source, project_namespace, project_name,
     collection_id, collection_source, collection_namespace, collection_name)
SELECT
    project_id,
    project_source,
    project_namespace,
    project_name,
    collection_id,
    collection_source,
    collection_namespace,
    collection_name
FROM "source"."default"."projects_by_collection_v1_source";
```

```sql
-- Backfill the Iceberg table from the external Parquet-backed source table.
-- Explicit column lists (instead of SELECT *) fail loudly on schema drift.
INSERT INTO "metrics"."default"."artifacts_by_project_v1"
    (artifact_id, artifact_source_id, artifact_source, artifact_namespace,
     artifact_name, project_id, project_source, project_namespace, project_name)
SELECT
    artifact_id,
    artifact_source_id,
    artifact_source,
    artifact_namespace,
    artifact_name,
    project_id,
    project_source,
    project_namespace,
    project_name
FROM "source"."default"."artifacts_by_project_v1_source";
```
5 changes: 5 additions & 0 deletions ops/k8s-apps/base/trino/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Kustomize entrypoint for the base (environment-agnostic) Trino deployment.
# Environment overlays (e.g. ops/k8s-apps/production/trino) build on this
# directory and patch the HelmRelease with environment-specific values.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# All resources are placed in the base-trino namespace (created in trino.yaml).
namespace: base-trino
resources:
  - trino.yaml
120 changes: 120 additions & 0 deletions ops/k8s-apps/base/trino/trino.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Base Trino deployment: Namespace, Flux HelmRepository, and HelmRelease.
# NOTE(review): indentation reconstructed from a flattened source — verify
# against the committed file before applying.
apiVersion: v1
kind: Namespace
metadata:
  name: base-trino
  labels:
    toolkit.fluxcd.io/tenant: apps
    ops.opensource.observer/environment: base
    # Opt this namespace into the kube-secrets-init mutating webhook.
    kube-secrets-init.doit-intl.com/enable-mutation: "true"
---
# Flux source pointing at the official Trino Helm chart repository.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: trino
  namespace: base-trino
spec:
  interval: 5m
  url: https://trinodb.github.io/charts
---
# HelmRelease installing the Trino chart (pinned to 0.30.0).
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: trino
  namespace: base-trino
spec:
  chart:
    spec:
      chart: trino
      version: "0.30.0"
      sourceRef:
        kind: HelmRepository
        name: trino
  # Reconciliation interval for the release.
  interval: 50m
  install:
    remediation:
      retries: 3
  values:
    serviceAccount:
      create: true
      name: base-trino
    # Single coordinator, pinned to the dedicated trino-coordinator node pool
    # via matching toleration + nodeSelector (taints are set in Terraform).
    coordinator:
      jvm:
        # Heap sized to fit the 25600Mi memory request below.
        maxHeapSize: "25G"
      resources:
        requests:
          cpu: 2000m
          memory: 25600Mi
      tolerations:
        - key: pool_type
          operator: Equal
          value: trino-coordinator
          effect: NoSchedule
      nodeSelector:
        pool_type: trino-coordinator

    # Workers pinned to the dedicated trino-worker node pool.
    worker:
      config:
        query:
          # Per-node cap; must stay below the worker heap (40G).
          maxMemoryPerNode: 15GB
      jvm:
        maxHeapSize: "40G"
      resources:
        requests:
          cpu: 2000m
          memory: 40960Mi
      tolerations:
        - key: pool_type
          operator: Equal
          value: trino-worker
          effect: NoSchedule
      nodeSelector:
        pool_type: trino-worker

    server:
      config:
        query:
          # Cluster-wide memory cap across all workers for a single query.
          maxMemory: "500GB"
      # Baseline worker count; HPA below scales beyond this.
      workers: 2
      autoscaling:
        enabled: true
        maxReplicas: 100
        targetCPUUtilizationPercentage: 70
        behavior:
          # Scale down slowly (5 min window), scale up immediately.
          scaleDown:
            stabilizationWindowSeconds: 300
            policies:
              - type: Percent
                value: 100
                periodSeconds: 15
          scaleUp:
            stabilizationWindowSeconds: 0
            policies:
              - type: Percent
                value: 100
                periodSeconds: 15
              - type: Pods
                value: 4
                periodSeconds: 15
            selectPolicy: Max
    # Catalog property files. Each entry is a verbatim Trino properties file;
    # do not insert YAML comments inside the block scalars below.
    catalogs:
      # "metrics": Iceberg tables backed by the shared Hive metastore + GCS.
      metrics: |
        connector.name=iceberg
        iceberg.catalog.type=hive_metastore
        hive.metastore.uri=thrift://10.145.192.27:9083
        hive.metastore-cache-ttl=0s
        hive.metastore-refresh-interval=5s
        hive.metastore-timeout=10s
        iceberg.use-file-size-from-metadata=false
        fs.native-gcs.enabled=true
        gcs.project-id=opensource-observer
        iceberg.max-partitions-per-writer=1000
        # gcs.use-access-token=true
      # "source": plain Hive external tables (e.g. the Parquet exports).
      source: |
        connector.name=hive
        hive.metastore.uri=thrift://10.145.192.27:9083
        fs.native-gcs.enabled=true
        gcs.project-id=opensource-observer
        # gcs.use-access-token=true
      # bigquery: |
      #   connector.name=bigquery
      #   bigquery.project-id=opensource-observer
1 change: 1 addition & 0 deletions ops/k8s-apps/production/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ resources:
- ./dagster
- ./cloudsql-proxy
- ./redis
- ./trino
8 changes: 8 additions & 0 deletions ops/k8s-apps/production/trino/custom-helm-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Kustomize patch: production-specific overrides for the base Trino HelmRelease.
# Applied by ops/k8s-apps/production/trino/kustomization.yaml (allowNameChange
# lets this rename the release to "production-trino").
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: production-trino
spec:
  values:
    serviceAccount:
      # Production pods run as this service account instead of "base-trino".
      name: production-trino
11 changes: 11 additions & 0 deletions ops/k8s-apps/production/trino/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Production overlay for the base Trino deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base/trino
# Re-home all base resources into the production namespace.
namespace: production-trino
patches:
  # Override HelmRelease values (and name) with production settings.
  - path: ./custom-helm-values.yaml
    target:
      kind: HelmRelease
    options:
      # Required because the patch renames the HelmRelease to production-trino.
      allowNameChange: true
78 changes: 75 additions & 3 deletions ops/tf-modules/warehouse-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,49 @@ locals {
preemptible = true
initial_node_count = 0
},
# Trino coordinator pool
{
name = "${var.cluster_name}-trino-coordinator-node-pool"
machine_type = "e2-highmem-4"
node_locations = join(",", var.cluster_zones)
min_count = 0
max_count = 1
local_ssd_count = 0
spot = false
disk_size_gb = 200
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = local.node_service_account_email
preemptible = false
initial_node_count = 0
},
# Trino worker pool
{
name = "${var.cluster_name}-trino-worker-node-pool"
machine_type = "n1-highmem-8"
node_locations = join(",", var.cluster_zones)
min_count = 0
max_count = 10
local_ssd_count = 0
spot = false
disk_size_gb = 200
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = local.node_service_account_email
preemptible = false
initial_node_count = 0
},

], var.extra_node_pools)

node_pool_labels = merge({
Expand All @@ -107,6 +150,14 @@ locals {
default_node_pool = false
pool_type = "preemptible"
}
"${var.cluster_name}-trino-worker-node-pool" = {
default_node_pool = false
pool_type = "trino-worker"
}
"${var.cluster_name}-trino-coordinator-node-pool" = {
default_node_pool = false
pool_type = "trino-coordinator"
}
}, var.extra_node_labels)

node_pool_metadata = merge({
Expand Down Expand Up @@ -137,6 +188,20 @@ locals {
effect = "NO_SCHEDULE"
},
]
"${var.cluster_name}-trino-worker-node-pool" = [
{
key = "pool_type"
value = "trino-worker"
effect = "NO_SCHEDULE"
},
]
"${var.cluster_name}-trino-coordinator-node-pool" = [
{
key = "pool_type"
value = "trino-coordinator"
effect = "NO_SCHEDULE"
},
]
}, var.extra_node_taints)

node_pool_tags = merge({
Expand All @@ -152,6 +217,12 @@ locals {
"${var.cluster_name}-preemptible-node-pool" = [
"preemptible",
]
"${var.cluster_name}-trino-worker-pool" = [
"trino-worker",
]
"${var.cluster_name}-trino-coordinator-pool" = [
"trino-coordinator",
]
}, var.extra_node_tags)

node_pool_oauth_scopes = merge({
Expand All @@ -177,9 +248,10 @@ module "vpc" {

subnets = [
{
subnet_name = local.main_subnet_name
subnet_ip = var.main_subnet_cidr
subnet_region = var.cluster_region
subnet_name = local.main_subnet_name
subnet_ip = var.main_subnet_cidr
subnet_region = var.cluster_region
subnet_private_access = true
},
]

Expand Down
Loading

0 comments on commit cc78136

Please sign in to comment.