Skip to content

Commit

Permalink
print job logs when the job failed for debugging (#591)
Browse files Browse the repository at this point in the history
* print job logs when the job failed for debugging
  • Loading branch information
wwvela authored Feb 28, 2025
1 parent ecc47a9 commit 2f68835
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 13 deletions.
11 changes: 7 additions & 4 deletions test/cases/neuron-inference/bert_inference_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,18 @@ func TestNeuronInference(t *testing.T) {
  job := &batchv1.Job{
  ObjectMeta: metav1.ObjectMeta{Name: "neuron-inference", Namespace: "default"},
  }
- err := wait.For(
+ if err := wait.For(
  fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
  wait.WithTimeout(60*time.Minute),
- )
- if err != nil {
+ ); err != nil {
+ log.Println("[ERROR] Neuron inference job failed. Gathering logs...")
+ if err := printJobLogs(ctx, cfg, "default", "neuron-inference"); err != nil {
+ t.Logf("[WARNING] Failed to retrieve neuron-inference job logs: %v", err)
+ }
  t.Fatalf("[ERROR] Neuron inference job did not succeed: %v", err)
  }
- log.Println("[INFO] Neuron inference job succeeded. Gathering logs...")

+ log.Println("[INFO] Neuron inference job succeeded. Gathering logs...")
  applyTime := ctx.Value("applyTime")
  if applyTime != nil {
  if start, ok := applyTime.(time.Time); ok {
Expand Down
11 changes: 7 additions & 4 deletions test/cases/nvidia-inference/bert_inference_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,18 @@ func TestBertInference(t *testing.T) {
  job := &batchv1.Job{
  ObjectMeta: metav1.ObjectMeta{Name: "bert-inference", Namespace: "default"},
  }
- err := wait.For(
+ if err := wait.For(
  fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
  wait.WithTimeout(20*time.Minute),
- )
- if err != nil {
+ ); err != nil {
+ log.Println("[ERROR] BERT inference job failed. Gathering logs...")
+ if err := printJobLogs(ctx, cfg, "default", "bert-inference"); err != nil {
+ t.Logf("[WARNING] Failed to retrieve bert-inference job logs: %v", err)
+ }
  t.Fatalf("[ERROR] BERT inference job did not succeed: %v", err)
  }
- log.Println("[INFO] BERT inference job succeeded. Gathering logs...")

+ log.Println("[INFO] BERT inference job succeeded. Gathering logs...")
  // Compute duration from manifest apply to job success
  startVal := ctx.Value("applyTime")
  if startVal != nil {
Expand Down
14 changes: 9 additions & 5 deletions test/cases/nvidia-training/bert_training_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,19 @@ func TestBertTraining(t *testing.T) {
  job := &batchv1.Job{
  ObjectMeta: metav1.ObjectMeta{Name: "bert-training-launcher", Namespace: "default"},
  }
- err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
+ if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
  wait.WithTimeout(time.Minute*20),
  wait.WithContext(ctx),
- )
- if err != nil {
- t.Error(err)
+ ); err != nil {
+ t.Logf("[ERROR] BERT training job failed. Gathering logs...")
+ if err = printJobLogs(ctx, cfg, "default", "bert-training-launcher"); err != nil {
+ t.Logf("Warning: failed to retrieve bert-training job logs: %v", err)
+ }
+ t.Fatalf("[ERROR] BERT training job did not succeed: %v", err)
  }
+ t.Logf("[INFO] BERT training job succeeded. Gathering logs...")

- err = printJobLogs(ctx, cfg, "default", "bert-training-launcher")
+ err := printJobLogs(ctx, cfg, "default", "bert-training-launcher")
  if err != nil {
  t.Logf("Warning: failed to retrieve bert-training job logs: %v", err)
  }
Expand Down

0 comments on commit 2f68835

Please sign in to comment.