Skip to content

Commit

Permalink
Remove dependency on mpi-operator
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermckinnon committed Jan 8, 2025
1 parent c3d2cb1 commit d8a95a2
Show file tree
Hide file tree
Showing 22 changed files with 149 additions and 44 deletions.
16 changes: 4 additions & 12 deletions Config
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,11 @@
package.Aws-k8s-tester-mirror = {
interfaces = (1.0);

deploy = {
generic = true;
};

build-environment = {
chroot = basic;
network-access = blocked;
};

build-system = archivebuild;
build-system = bgo-wrap-make;
build-tools = {
1.0 = {
ArchiveBuild = 1.0;
BrazilMakeGo = 3.0;
GoLang = 1.x;
};
};
};
};
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include ${BGO_MAKEFILE}

pre-release::
go test -c -tags=e2e ./test/... -o $(GOBIN)
3 changes: 3 additions & 0 deletions bmg.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"binary_artifacts_only": true
}
2 changes: 0 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,6 @@ require (
)

require (
github.com/kubeflow/mpi-operator v0.4.0
k8s.io/cli-runtime v0.28.3
sigs.k8s.io/e2e-framework v0.3.0
)
Expand All @@ -322,7 +321,6 @@ require (
github.com/go-errors/errors v1.4.2 // indirect
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/kubeflow/common v0.4.6 // indirect
github.com/moby/spdystream v0.4.0 // indirect
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
Expand Down
5 changes: 0 additions & 5 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -538,10 +538,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubeflow/common v0.4.6 h1:yzJf/HEdS6ginD0GlVkgbOFie0Sp66VdGjXidAGZIlk=
github.com/kubeflow/common v0.4.6/go.mod h1:43MAof/uhpJA2C0urynqatE3oKFQc7m2HLmJty7waqY=
github.com/kubeflow/mpi-operator v0.4.0 h1:PS4jLoMuRyrk/DHuYkI0D46sQQYpQt375HjOV4KVMFs=
github.com/kubeflow/mpi-operator v0.4.0/go.mod h1:/A4mTy/RYh2UIgaGUiXUaW70eThjsogu80WbbcZpuMg=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/letsencrypt/boulder v0.0.0-20240318162201-5e68cbe552b9 h1:t+GNF6j5rgBKp7duiRWnIrwPzU22SkX7ZJJkX1+jTyM=
Expand Down Expand Up @@ -823,7 +819,6 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIX
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg=
go.opentelemetry.io/otel v1.28.0 h1:/SqNcYk+idO0CxKEUOtKQClMK/MimZihKYMruSMViUo=
go.opentelemetry.io/otel v1.28.0/go.mod h1:q68ijF8Fc8CnMHKyzqL6akLO46ePnjkgfIMIjUIX9z4=
go.opentelemetry.io/otel/exporters/otlp v0.20.0 h1:PTNgq9MRmQqqJY0REVbZFvwkYOA85vbdQU/nVfxDyqg=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA=
Expand Down
8 changes: 3 additions & 5 deletions internal/e2e/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ import (
"io"
"os"

kubeflowv2beta1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -130,10 +128,10 @@ func GetJobLogs(restConfig *rest.Config, job k8s.Object) (string, error) {
return "", err
}
var jobLabel string
switch job.(type) {
case *kubeflowv2beta1.MPIJob:
switch job.GetObjectKind().GroupVersionKind().Kind {
case "MPIJob":
jobLabel = fmt.Sprintf("job-name=%s-launcher", job.GetName())
case *batchv1.Job:
case "Job":
jobLabel = fmt.Sprintf("job-name=%s", job.GetName())
default:
return "", fmt.Errorf("unsupported job type %T", job)
Expand Down
45 changes: 45 additions & 0 deletions internal/e2e/mpioperator/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package mpioperator

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)

var (
facadeSchemeBuilder = runtime.NewSchemeBuilder(addFacadeTypes)
AddFacadesToScheme = facadeSchemeBuilder.AddToScheme
)

func addFacadeTypes(s *runtime.Scheme) error {
for _, version := range []string{"v2beta1", runtime.APIVersionInternal} {
s.AddKnownTypeWithName(schema.GroupVersionKind{
Group: "kubeflow.org",
Version: version,
Kind: "MPIJob",
}, &MPIJobFacade{})
}
return nil
}

// MPIJobFacade is a utility type for working with specific fields of a kubeflow.org/v2beta1 MPIJob
// without needing a code dependency on the full external types
type MPIJobFacade struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
Status MPIJobFacadeStatus `json:"status"`
}

func (m *MPIJobFacade) DeepCopyObject() runtime.Object {
// no-op
return nil
}

type MPIJobFacadeStatus struct {
Conditions []MPIJobFacadeStatusCondition `json:"conditions"`
}

type MPIJobFacadeStatusCondition struct {
Type string `json:"type"`
Status string `json:"status"`
}
43 changes: 43 additions & 0 deletions internal/e2e/mpioperator/types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package mpioperator

import (
"testing"

"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/serializer"
)

const mpiJobYaml = `---
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: foo
status:
conditions:
- type: "Succeeded"
status: "True"
`

func Test_serialization(t *testing.T) {
scheme := runtime.NewScheme()
err := AddFacadesToScheme(scheme)
if err != nil {
t.Errorf("failed to add facades to scheme: %v", err)
return
}
codecs := serializer.NewCodecFactory(scheme)
obj, gvk, err := codecs.UniversalDecoder().Decode([]byte(mpiJobYaml), nil, nil)
if err != nil {
t.Errorf("failed to decode: %v", err)
return
}
if gvk.Kind != "MPIJob" {
t.Errorf("failed to decode %q (wrong Kind)", gvk.Kind)
}
if facade, ok := obj.(*MPIJobFacade); !ok {
t.Errorf("failed to cast MPIJob to MPIJob facade!")
} else {
assert.Equal(t, "Succeeded", facade.Status.Conditions[0].Type)
}
}
2 changes: 2 additions & 0 deletions test/cases/netpol/main_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package netpol

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/netpol/np_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package netpol

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/neuron/main_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package neuron

import (
Expand Down
17 changes: 9 additions & 8 deletions test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package neuron

import (
Expand All @@ -7,15 +9,14 @@ import (
"testing"

fwext "github.com/aws/aws-k8s-tester/internal/e2e"
kubeflowv2beta1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
"github.com/aws/aws-k8s-tester/internal/e2e/mpioperator"
"sigs.k8s.io/e2e-framework/klient/k8s"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/klient/wait/conditions"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"

batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down Expand Up @@ -123,10 +124,10 @@ func TestNeuronNodes(t *testing.T) {
}).
Assess("NCCOM test succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
rsrc := cfg.Client().Resources()
if err := kubeflowv2beta1.AddToScheme(rsrc.GetScheme()); err != nil {
if err := mpioperator.AddFacadesToScheme(rsrc.GetScheme()); err != nil {
t.Fatal(err)
}
j := kubeflowv2beta1.MPIJob{
j := mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccom-test", Namespace: "default"},
}
t.Log("Waiting for MPIJob to complete")
Expand All @@ -136,7 +137,7 @@ func TestNeuronNodes(t *testing.T) {
t.Fatal(err)
}

log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccom-test", Namespace: "default"},
})
if err != nil {
Expand All @@ -159,10 +160,10 @@ func TestNeuronNodes(t *testing.T) {
}

func mpiJobSucceeded(obj k8s.Object) bool {
j := obj.(*kubeflowv2beta1.MPIJob)
j := obj.(*mpioperator.MPIJobFacade)
for _, c := range j.Status.Conditions {
if c.Type == kubeflowv2beta1.JobSucceeded {
return c.Status == v1.ConditionTrue
if c.Type == "Succeeded" {
return c.Status == "True"
}
}
return false
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia-inference/bert_inference_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package inference

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia-inference/main_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package inference

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia-training/bert_training_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package training

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia-training/main_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package training

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia-training/vars.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package training

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package nvidia

import (
Expand Down
23 changes: 12 additions & 11 deletions test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package nvidia

import (
Expand All @@ -8,14 +10,13 @@ import (
"testing"

fwext "github.com/aws/aws-k8s-tester/internal/e2e"
kubeflowv2beta1 "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
"github.com/aws/aws-k8s-tester/internal/e2e/mpioperator"
"sigs.k8s.io/e2e-framework/klient/k8s"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/klient/wait/conditions"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/strings/slices"
)
Expand Down Expand Up @@ -57,10 +58,10 @@ func TestMPIJobPytorchTraining(t *testing.T) {
}).
Assess("MPIJob succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
rsrc := cfg.Client().Resources()
if err := kubeflowv2beta1.AddToScheme(rsrc.GetScheme()); err != nil {
if err := mpioperator.AddFacadesToScheme(rsrc.GetScheme()); err != nil {
t.Fatal(err)
}
j := kubeflowv2beta1.MPIJob{
j := mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-single-node", Namespace: "default"},
}
t.Log("Waiting for single node job to complete")
Expand All @@ -73,7 +74,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-single-node", Namespace: "default"},
})
if err != nil {
Expand Down Expand Up @@ -127,10 +128,10 @@ func TestMPIJobPytorchTraining(t *testing.T) {
}).
Assess("MPIJob succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
rsrc := cfg.Client().Resources()
if err := kubeflowv2beta1.AddToScheme(rsrc.GetScheme()); err != nil {
if err := mpioperator.AddFacadesToScheme(rsrc.GetScheme()); err != nil {
t.Fatal(err)
}
j := kubeflowv2beta1.MPIJob{
j := mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccl-test", Namespace: "default"},
}
t.Log("Waiting for multi node job to complete")
Expand All @@ -142,7 +143,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
t.Log("Multi node job completed")

// Verify GPU Direct RDMA is used on P4/P5
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &mpioperator.MPIJobFacade{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccl-test", Namespace: "default"},
})
if err != nil {
Expand Down Expand Up @@ -171,10 +172,10 @@ func TestMPIJobPytorchTraining(t *testing.T) {
}

func mpiJobSucceeded(obj k8s.Object) bool {
j := obj.(*kubeflowv2beta1.MPIJob)
j := obj.(*mpioperator.MPIJobFacade)
for _, c := range j.Status.Conditions {
if c.Type == kubeflowv2beta1.JobSucceeded {
return c.Status == v1.ConditionTrue
if c.Type == "Succeeded" {
return c.Status == "True"
}
}
return false
Expand Down
2 changes: 2 additions & 0 deletions test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package nvidia

import (
Expand Down
2 changes: 2 additions & 0 deletions test/cases/quick/limit_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build e2e

package quick

import (
Expand Down
Loading

0 comments on commit d8a95a2

Please sign in to comment.