-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjobset.yaml
71 lines (71 loc) · 2.64 KB
/
jobset.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
---
# JobSet that launches multi-node training of the translation transformer with
# torchrun: one replicated Job ("workers") running two pods, each limited to
# one NVIDIA GPU.
#
# Prerequisite: a Secret in the same namespace holding the Weights & Biases
# API key, for example:
#   kubectl create secret generic wandb-credentials \
#     --from-literal=WANDB_API_KEY=<your-key>
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: transformer
spec:
  replicatedJobs:
    - name: workers
      template:
        spec:
          # One pod per training node; keep both values in sync with NNODES.
          parallelism: 2
          completions: 2
          # Do not retry failed pods at the Job level.
          backoffLimit: 0
          template:
            spec:
              containers:
                - name: pytorch
                  image: us-central1-docker.pkg.dev/danielvm-dev/dvm-container-images/translate:latest
                  env:
                    # Rendezvous endpoint: DNS name of the first worker pod,
                    # following the <jobset>-<replicatedJob>-<jobIndex>-<podIndex>.<subdomain>
                    # pattern (must match metadata.name and the replicatedJob name above).
                    - name: MASTER_ADDR
                      value: "transformer-workers-0-0.transformer"
                    - name: MASTER_PORT
                      value: "3389"
                    # Node rank, taken from the pod's Job completion index.
                    - name: RANK
                      valueFrom:
                        fieldRef:
                          fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
                    # Force python to not buffer output and write directly to stdout, so we can view
                    # training logs via `kubectl logs`. Any non-empty value enables this; "1" is used
                    # because "0" reads as "disabled" even though Python treats it the same.
                    - name: PYTHONUNBUFFERED
                      value: "1"
                    # Environment variable values must be strings, so numeric values are quoted.
                    - name: NPROC_PER_NODE
                      value: "1"
                    - name: NNODES
                      value: "2"
                    # If you're not using Weights and Biases for observability, you can delete the
                    # following 2 environment variables.
                    # Be sure to delete the corresponding command line args in the container command
                    # below as well.
                    - name: WANDB_PROJECT
                      value: dvm
                    # The API key is read from a Secret instead of being stored in this manifest.
                    # NOTE(review): a key was previously committed here in plain text; revoke it.
                    - name: WANDB_API_KEY
                      valueFrom:
                        secretKeyRef:
                          name: wandb-credentials
                          key: WANDB_API_KEY
                  # Shell tracing (`-x`) is intentionally not enabled: it would print the expanded
                  # --wandb-api-key value into the pod logs.
                  # --node_rank/--master_addr/--master_port are passed explicitly because torchrun
                  # does not take them from the RANK/MASTER_ADDR/MASTER_PORT variables on its own.
                  command:
                    - bash
                    - -c
                    - |
                      torchrun --nproc_per_node=${NPROC_PER_NODE} \
                        --nnodes=${NNODES} \
                        --node_rank=${RANK} \
                        --master_addr=${MASTER_ADDR} \
                        --master_port=${MASTER_PORT} \
                        translate_ai/train.py \
                          --dataset-file data/english-spanish.csv \
                          --device cuda \
                          --mixed-precision bf16 \
                          --epochs 10 \
                          --learning-rate .004 \
                          --batch-size 128 \
                          --num-layers 2 \
                          --embed-dim 128 \
                          --d-model 128 \
                          --ffwd-dim 512 \
                          --seq-len 128 \
                          --max-output-tokens 128 \
                          --eval-interval 200 \
                          --eval-iters 10 \
                          --checkpoint-interval 200 \
                          --save-checkpoint checkpoints/chkpt.pt \
                          --wandb-project ${WANDB_PROJECT} \
                          --wandb-api-key ${WANDB_API_KEY} \
                          --distributed
                  resources:
                    limits:
                      nvidia.com/gpu: 1