Benchmark backprop of a te.TransformerLayer. #2956

Merged: 5 commits, Sep 20, 2024
16 changes: 10 additions & 6 deletions tests/python/multidevice.py

@@ -10,19 +10,17 @@
 class MultideviceTest:
     def __init__(self):
-        comm = MPI.COMM_WORLD
-        self._size = comm.size
-        self._rank = comm.rank
+        self._communicator = MPI.COMM_WORLD
         self._local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"])
         self._local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
 
     @property
     def size(self):
-        return self._size
+        return self._communicator.size
 
     @property
     def rank(self):
-        return self._rank
+        return self._communicator.rank
 
     @property
     def local_size(self):
@@ -32,7 +30,13 @@ def local_size(self):
     def local_rank(self):
         return self._local_rank
 
+    def barrier(self):
+        self._communicator.barrier()
+
 
 @pytest.fixture
 def multidevice_test():
-    return MultideviceTest()
+    fixture = MultideviceTest()
+    yield fixture
+    # Sync all ranks after each test for isolation.
+    fixture.barrier()
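The fixture above now yields instead of returning, so the code after `yield` runs as pytest teardown and every rank reaches the barrier before the next test begins. Below is a minimal, self-contained sketch of the same pattern, assuming mpi4py and pytest-mpi are installed; the `synced_comm` fixture and `test_slow_rank` test are illustrative names only, not part of this PR:

import time

import pytest
from mpi4py import MPI


@pytest.fixture
def synced_comm():
    comm = MPI.COMM_WORLD
    yield comm        # the test body runs while the fixture is suspended here
    comm.barrier()    # teardown: no rank starts the next test early


@pytest.mark.mpi
def test_slow_rank(synced_comm):
    # Rank 0 finishes later; the teardown barrier keeps the other ranks from
    # racing ahead, so the next test starts on all ranks together.
    if synced_comm.rank == 0:
        time.sleep(0.1)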
46 changes: 42 additions & 4 deletions tests/python/test_transformer_engine.py

@@ -6,6 +6,7 @@
 import pytest
 import torch
 import torch.distributed as dist
+from enum import auto, Enum
 
 import transformer_engine.pytorch as te
 
@@ -15,8 +16,18 @@
 multidevice_test = multidevice.multidevice_test
 
 
+class ComputeType(Enum):
+    FORWARD = auto()
+    BACKWARD = auto()
+
+
 @pytest.mark.mpi
-def test_transformer_layer(multidevice_test):
+@pytest.mark.parametrize(
+    "compute_type",
+    [ComputeType.FORWARD, ComputeType.BACKWARD],
+    ids=["forward", "backward"],
+)
+def test_transformer_layer(multidevice_test, benchmark, compute_type):
     # Hyperparameters for GPT-3
     hidden_size = 12288
     num_heads = 96
@@ -30,7 +41,9 @@ def test_transformer_layer(multidevice_test):
 
     torch.cuda.set_device(rank)
     os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "29500"
+    # nvFuser's Communicator singleton is hardcoded to use port 29500. Use a
+    # different port here to avoid a conflict.
+    os.environ["MASTER_PORT"] = "29400"
     dist.init_process_group(
         backend="nccl",
         init_method="env://",
@@ -51,7 +64,32 @@
     x = torch.randn(
         batch_size, sequence_length, hidden_size, dtype=dtype, device="cuda"
     )
-    y = transformer_layer(x)
-    assert y.size() == torch.Size([batch_size, sequence_length, hidden_size])
+
+    match compute_type:
+        case ComputeType.FORWARD:
+
+            def benchmark_fn():
+                return transformer_layer(x)
+
+            y = benchmark(benchmark_fn)
+            assert y.size() == torch.Size([batch_size, sequence_length, hidden_size])
+        case ComputeType.BACKWARD:
+            # Due to
+            # https://github.com/Lightning-AI/lightning-thunder/issues/701, a
+            # limitation in TransformerEngine, we can't repeatedly call
+            # torch.autograd.backward to benchmark just backprop. As a
+            # workaround, the code below runs forward before each backprop but
+            # only measures the backprop time.
+            def setup_fn():
+                y = transformer_layer(x)
+                dy = torch.rand_like(y)
+                return (y, dy), {}
+
+            def benchmark_fn(y, dy):
+                torch.autograd.backward(y, dy)
+
+            benchmark.pedantic(
+                benchmark_fn, setup=setup_fn, warmup_rounds=2, iterations=1, rounds=5
+            )
 
     dist.destroy_process_group()
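For reference, pytest-benchmark's `benchmark.pedantic` calls the `setup` callable before every round, outside the timed region, and passes its returned `(args, kwargs)` to the timed callable; that is what keeps the forward pass out of the backward measurement above. A small standalone sketch of the same pattern, assuming pytest-benchmark is installed and using a CPU-only `torch.nn.Linear` stand-in (the names `test_backward_only`, `layer`, and the shapes are illustrative, not part of this PR):

import torch


def test_backward_only(benchmark):
    layer = torch.nn.Linear(1024, 1024)
    x = torch.randn(8, 1024)

    def setup_fn():
        # Untimed: build a fresh autograd graph and gradient for each round.
        y = layer(x)
        dy = torch.rand_like(y)
        return (y, dy), {}

    def benchmark_fn(y, dy):
        # Timed: backprop only.
        torch.autograd.backward(y, dy)

    # Because setup feeds a fresh graph into each timed call, iterations stays
    # at 1 and repetition comes from rounds instead.
    benchmark.pedantic(
        benchmark_fn, setup=setup_fn, warmup_rounds=2, iterations=1, rounds=5
    )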