# Source: e2e-mnist.py, from a fork of kubeflow/kfp-tekton (318 lines, 293 loc, 11.7 KB).
# ibmc-file-gold is the recommended ReadWriteMany storageclass for IBM Cloud.
import kfp
import kfp.dsl as dsl
from kfp import components
from kubeflow.katib import ApiClient
from kubeflow.katib import V1beta1ExperimentSpec
from kubeflow.katib import V1beta1AlgorithmSpec
from kubeflow.katib import V1beta1ObjectiveSpec
from kubeflow.katib import V1beta1ParameterSpec
from kubeflow.katib import V1beta1FeasibleSpace
from kubeflow.katib import V1beta1TrialTemplate
from kubeflow.katib import V1beta1TrialParameterSpec
# You should define the Experiment name, namespace and number of training steps in the arguments.
def create_katib_experiment_task(experiment_name, experiment_namespace, training_steps):
    """Create the KFP task that launches a Katib Experiment for the MNIST TFJob.

    The Experiment uses the "random" algorithm to minimize the "loss" metric
    (goal 0.001) over two parameters: ``learning_rate`` (double, 0.01-0.05)
    and ``batch_size`` (int, 80-100). It runs at most 5 trials, 2 in
    parallel, and tolerates at most 3 failed trials.

    Args:
        experiment_name: Name passed to the Katib launcher component.
        experiment_namespace: Namespace passed to the Katib launcher component.
        training_steps: Number of training steps; rendered with ``str()`` into
            the ``--tf-train-steps`` flag of every trial container.

    Returns:
        The task produced by calling the Katib launcher component.
    """
    # Trial count specification.
    max_trial_count = 5
    max_failed_trial_count = 3
    parallel_trial_count = 2

    # Objective specification.
    objective = V1beta1ObjectiveSpec(
        type="minimize",
        goal=0.001,
        objective_metric_name="loss",
    )

    # Algorithm specification.
    algorithm = V1beta1AlgorithmSpec(
        algorithm_name="random",
    )

    # Experiment search space.
    # In this example we tune learning rate and batch size.
    parameters = [
        V1beta1ParameterSpec(
            name="learning_rate",
            parameter_type="double",
            feasible_space=V1beta1FeasibleSpace(
                min="0.01",
                max="0.05",
            ),
        ),
        V1beta1ParameterSpec(
            name="batch_size",
            parameter_type="int",
            feasible_space=V1beta1FeasibleSpace(
                min="80",
                max="100",
            ),
        ),
    ]

    def _trial_replica_spec():
        # Chief and Worker use the same replica definition, so it is built in
        # one place. A fresh dict is returned on every call so the two
        # replicas never share mutable state.
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {
                        "sidecar.istio.io/inject": "false"
                    }
                },
                "spec": {
                    "containers": [
                        {
                            "name": "tensorflow",
                            "image": "docker.io/liuhougangxa/tf-estimator-mnist",
                            "command": [
                                "python",
                                "/opt/model.py",
                                "--tf-train-steps=" + str(training_steps),
                                # Placeholders are named after the trial
                                # parameters declared in trial_template below.
                                "--tf-learning-rate=${trialParameters.learningRate}",
                                "--tf-batch-size=${trialParameters.batchSize}"
                            ]
                        }
                    ]
                }
            }
        }

    # Experiment Trial template.
    # TODO (andreyvelich): Use community image for the mnist example.
    trial_spec = {
        "apiVersion": "kubeflow.org/v1",
        "kind": "TFJob",
        "spec": {
            "tfReplicaSpecs": {
                "Chief": _trial_replica_spec(),
                "Worker": _trial_replica_spec()
            }
        }
    }

    # Configure parameters for the Trial template.
    # Each trial parameter maps a template placeholder name to a search-space
    # parameter through its `reference`.
    trial_template = V1beta1TrialTemplate(
        primary_container_name="tensorflow",
        primary_pod_labels={"training.kubeflow.org/job-role": "master"},
        trial_parameters=[
            V1beta1TrialParameterSpec(
                name="learningRate",
                description="Learning rate for the training model",
                reference="learning_rate",
            ),
            V1beta1TrialParameterSpec(
                name="batchSize",
                description="Batch size for the model",
                reference="batch_size",
            ),
        ],
        trial_spec=trial_spec,
    )

    # Create an Experiment from the above parameters.
    experiment_spec = V1beta1ExperimentSpec(
        max_trial_count=max_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        parallel_trial_count=parallel_trial_count,
        objective=objective,
        algorithm=algorithm,
        parameters=parameters,
        trial_template=trial_template,
    )

    # Create the KFP task for the Katib Experiment.
    # Experiment Spec should be serialized to a valid Kubernetes object.
    katib_experiment_launcher_op = components.load_component_from_url(
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/katib-launcher/component.yaml")
    op = katib_experiment_launcher_op(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=ApiClient().sanitize_for_serialization(experiment_spec),
        experiment_timeout_minutes=60,
        delete_finished_experiment=False)
    return op
# This function converts Katib Experiment HP results to args.
def convert_katib_results(katib_results) -> str:
    """Convert a Katib Experiment result into training command-line flags.

    Args:
        katib_results: JSON text describing the Experiment result. Only the
            ``currentOptimalTrial.parameterAssignments`` list is read.

    Returns:
        The flags for the tuned hyperparameters joined by single spaces, in
        the order the assignments appear, e.g.
        ``"--tf-learning-rate=0.01 --tf-batch-size=100"``. Assignments whose
        name is neither ``learning_rate`` nor ``batch_size`` are ignored.

    Raises:
        KeyError: If the expected keys are missing from the result.
    """
    # Imports are kept local: this function is handed to
    # components.func_to_container_op elsewhere in this file, which presumably
    # needs the function body to be self-contained - TODO confirm.
    import json
    import pprint

    katib_results_json = json.loads(katib_results)
    print("Katib results:")
    pprint.pprint(katib_results_json)

    # Search-space parameter name -> flag prefix understood by the trainer.
    flag_prefixes = {
        "learning_rate": "--tf-learning-rate=",
        "batch_size": "--tf-batch-size=",
    }
    assignments = katib_results_json["currentOptimalTrial"]["parameterAssignments"]
    # str() keeps the concatenation valid even if a value arrives as a JSON
    # number instead of a string.
    best_hps = [
        flag_prefixes[pa["name"]] + str(pa["value"])
        for pa in assignments
        if pa["name"] in flag_prefixes
    ]
    print("Best Hyperparameters: {}".format(best_hps))
    return " ".join(best_hps)
# You should define the TFJob name, namespace, number of training steps, output of Katib and model volume tasks in the arguments.
def create_tfjob_task(tfjob_name, tfjob_namespace, training_steps, katib_op, model_volume_op):
    """Create the KFP task that trains the model with a TFJob using the tuned hyperparameters.

    The Katib output is first converted to command-line flags by a container
    op built from ``convert_katib_results``. Those flags are appended to the
    training command of both the Chief and the Worker replica. Only the Chief
    mounts the model volume (at ``/mnt/export``).

    Args:
        tfjob_name: Name passed to the TFJob launcher component.
        tfjob_namespace: Namespace passed to the TFJob launcher component.
        training_steps: Number of training steps, formatted into the
            ``--tf-train-steps`` flag.
        katib_op: Katib Experiment task; its ``output`` is read.
        model_volume_op: Volume task; its ``outputs["name"]`` is used as the
            PersistentVolumeClaim name.

    Returns:
        The task produced by calling the TFJob launcher component.
    """
    import json

    # Get parameters from the Katib Experiment.
    # Parameters are in the format "--tf-learning-rate=0.01 --tf-batch-size=100"
    convert_katib_results_op = components.func_to_container_op(convert_katib_results)
    best_hp_op = convert_katib_results_op(katib_op.output)
    best_hps = str(best_hp_op.output)

    # Chief and Worker run the same shell command; build it once.
    train_command = "python /opt/model.py --tf-export-dir=/mnt/export --tf-train-steps={} {}".format(
        training_steps, best_hps)

    def _replica_spec(volume_mounts=None, volumes=None):
        # Build one TFJob replica spec. Keys are inserted in a fixed order
        # (optional volume entries last) so the serialized JSON is stable.
        # TODO (andreyvelich): Use community image for the mnist example.
        container = {
            "name": "tensorflow",
            "image": "docker.io/liuhougangxa/tf-estimator-mnist",
            "command": [
                "sh",
                "-c"
            ],
            "args": [
                train_command
            ],
        }
        if volume_mounts:
            container["volumeMounts"] = volume_mounts
        pod_spec = {"containers": [container]}
        if volumes:
            pod_spec["volumes"] = volumes
        return {
            "replicas": 1,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {
                        "sidecar.istio.io/inject": "false"
                    }
                },
                "spec": pod_spec
            }
        }

    # Create the TFJob Chief and Worker specification with the best Hyperparameters.
    # The Chief is the only replica that gets the model volume mounted.
    tfjob_chief_spec = _replica_spec(
        volume_mounts=[
            {
                "mountPath": "/mnt/export",
                "name": "model-volume"
            }
        ],
        volumes=[
            {
                "name": "model-volume",
                "persistentVolumeClaim": {
                    "claimName": str(model_volume_op.outputs["name"])
                }
            }
        ],
    )
    tfjob_worker_spec = _replica_spec()

    # Create the KFP task for the TFJob.
    tfjob_launcher_op = components.load_component_from_url(
        "https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/launcher/component.yaml")
    op = tfjob_launcher_op(
        name=tfjob_name,
        namespace=tfjob_namespace,
        chief_spec=json.dumps(tfjob_chief_spec),
        worker_spec=json.dumps(tfjob_worker_spec),
        tfjob_timeout_minutes=60,
        delete_finished_tfjob=False)
    return op
# You should define the model name, namespace, output of the TFJob and model volume tasks in the arguments.
def create_kserve_task(model_name, model_namespace, tfjob_op, model_volume_op):
    """Create the KFP task that deploys the trained model as a KServe InferenceService.

    Args:
        model_name: Name written into the InferenceService metadata.
        model_namespace: Namespace written into the InferenceService metadata.
        tfjob_op: Training task; the KServe task is ordered after it.
        model_volume_op: Volume task; its ``outputs["name"]`` is used in the
            ``pvc://`` storage URI of the predictor.
    """
    # The manifest lines start at column 0 on purpose: everything between the
    # triple quotes is emitted verbatim, and the nesting below (two spaces per
    # level) is what makes name/namespace/annotations children of `metadata`
    # and the tensorflow predictor a child of `spec.predictor`.
    inference_service = '''
apiVersion: "serving.kserve.io/v1beta1"
kind: "InferenceService"
metadata:
  name: {}
  namespace: {}
  annotations:
    "sidecar.istio.io/inject": "false"
spec:
  predictor:
    tensorflow:
      storageUri: "pvc://{}/"
'''.format(model_name, model_namespace, str(model_volume_op.outputs["name"]))

    kserve_launcher_op = components.load_component_from_url(
        'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kserve/component.yaml')
    # Serving must only start once training has finished.
    kserve_launcher_op(action="create", inferenceservice_yaml=inference_service).after(tfjob_op)
# Default pipeline arguments; they are used as the default values of the
# mnist_pipeline parameters with the same names.
name = "mnist-e2e"
namespace = "kubeflow-user-example-com"
# Kept as a string, not an int.
training_steps = "200"
# End-to-end MNIST pipeline: Katib hyperparameter tuning, TFJob training with
# the tuned hyperparameters, then KServe inference on the exported model.
# The same `name` and `namespace` are passed to all three tasks.
@dsl.pipeline(
    name="end-to-end-pipeline",
    description="An end to end mnist example including hyperparameter tuning, train and inference"
)
def mnist_pipeline(name=name, namespace=namespace, training_steps=training_steps):
    # Run the hyperparameter tuning with Katib.
    katib_op = create_katib_experiment_task(name, namespace, training_steps)

    # Create volume to train and serve the model.
    model_volume_op = dsl.VolumeOp(
        name="model-volume",
        resource_name="model-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWM
    )

    # Run the distributive training with TFJob.
    tfjob_op = create_tfjob_task(name, namespace, training_steps, katib_op, model_volume_op)

    # Create the KServe inference.
    create_kserve_task(name, namespace, tfjob_op, model_volume_op)
if __name__ == '__main__':
    import os

    from kfp_tekton.compiler import TektonCompiler

    # Compile the pipeline to a YAML file next to this script. Only the file
    # extension is swapped: str.replace('.py', '.yaml') would rewrite every
    # ".py" occurring anywhere in the path, and would return the script path
    # itself (so the output would overwrite the script) when there is no ".py".
    output_path = os.path.splitext(__file__)[0] + '.yaml'
    TektonCompiler().compile(mnist_pipeline, output_path)