Introducing a pythonic CLI (#5)
* Introducing a pythonic CLI
* Remove commented out code
* Small fix in readme
* Adding --interactive
* Starting to support plugins
* Using RunContext to simplify things
* Fix some bugs + improved example
* Fix typo
* Make sure code in README is the same as task.py
* Fixing failing tests
* Fix linting errors
* Run fmt
* Fix some more issues
* Run formatting again
* Fix spelling issue
* Change API to create an experiment entrypoint
* Fix docstring
* Adding default_factory arg to entrypoint
* Also expose default_factory in main
* Adding default_executor
* Adding default_plugins
* Some fixes
* Fix linting issues
* List -> list
* Copy over __main__.py in slurm packaging
* Add copyright header to fdl_runner

---------

Signed-off-by: Marc Romeyn <[email protected]>
marcromeyn authored Aug 26, 2024
1 parent 20c8922 commit d614a8a
Showing 43 changed files with 4,689 additions and 1,114 deletions.
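
At its core, the change turns a plain Python function into a CLI: decorate it with run.cli.entrypoint and hand it to run.cli.main. A minimal sketch of the pattern, condensed from the example files added in this diff (the greet function is hypothetical, but the decorator and main call mirror task.py below):

import nemo_run as run

@run.cli.entrypoint
def greet(name: str = "world"):
    # Parameters become CLI-configurable; the default applies when omitted.
    print(f"Hello, {name}!")

if __name__ == "__main__":
    run.cli.main(greet)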
403 changes: 403 additions & 0 deletions examples/entrypoint/README.md


107 changes: 107 additions & 0 deletions examples/entrypoint/experiment.py
@@ -0,0 +1,107 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
"""Dummy model config"""

hidden_size: int
num_layers: int
activation: str


@dataclass
class Optimizer:
"""Dummy optimizer config"""

learning_rate: float
weight_decay: float
betas: List[float]


@run.cli.entrypoint
def train_model(model: Model, optimizer: Optimizer, epochs: int = 10, batch_size: int = 32):
"""
Train a model using the specified configuration.
Args:
model (Model): Configuration for the model.
optimizer (Optimizer): Configuration for the optimizer.
epochs (int, optional): Number of training epochs. Defaults to 10.
batch_size (int, optional): Batch size for training. Defaults to 32.
"""
print("Training model with the following configuration:")
print(f"Model: {model}")
print(f"Optimizer: {optimizer}")
print(f"Epochs: {epochs}")
print(f"Batch size: {batch_size}")

# Simulating model training
for epoch in range(epochs):
print(f"Epoch {epoch + 1}/{epochs}")

print("Training completed!")


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
"""
Create a model configuration.
"""
return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
@run.autoconvert
def my_optimizer(
learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> Optimizer:
"""
Create an optimizer configuration.
"""
return Optimizer(learning_rate=learning_rate, weight_decay=weight_decay, betas=betas)


@run.cli.factory
@run.autoconvert
def local_executor() -> run.LocalExecutor:
return run.LocalExecutor()


@run.cli.entrypoint(type="experiment")
def train_models_experiment(
ctx: run.cli.RunContext,
models: List[Model] = [my_model(), my_model(hidden_size=512)],
optimizers: List[Optimizer] = [my_optimizer(), my_optimizer(learning_rate=0.01)],
epochs: int = 10,
batch_size: int = 32,
sequential: bool = False,
):
"""
Run an experiment to train multiple models with different configurations.
Args:
        ctx (run.cli.RunContext): The run context for the experiment.
        models (List[Model]): List of model configurations to train.
        optimizers (List[Optimizer]): List of optimizer configurations to use.
        epochs (int): Number of training epochs for each model.
        batch_size (int): Batch size for training.
        sequential (bool): If True, run the training tasks one after another.
    """

with run.Experiment("train_models_experiment") as exp:
for i, (model, optimizer) in enumerate(zip(models, optimizers)):
train = run.Partial(
train_model, model=model, optimizer=optimizer, epochs=epochs, batch_size=batch_size
)

exp.add(train, name=f"train_model_{i}", executor=ctx.executor)

ctx.launch(exp, sequential=sequential)


if __name__ == "__main__":
run.cli.main(train_models_experiment)
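
Given the factories and defaults above, driving this experiment entrypoint from a shell might look like the sketch below. The override syntax is an assumption inferred from the argument names in this file and the --interactive flag mentioned in the commit message; this diff does not show actual invocations:

# Hypothetical invocations (syntax assumed, not shown in this diff):
#   python experiment.py epochs=20 sequential=True
#   python experiment.py models='[my_model(hidden_size=1024), my_model()]'
#   python experiment.py --interactive   # tweak the configuration in a REPL before launching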
Binary file added examples/entrypoint/img/experiment-2.png
Binary file added examples/entrypoint/img/experiment-3.png
Binary file added examples/entrypoint/img/experiment-4.png
Binary file added examples/entrypoint/img/experiment-5.png
Binary file added examples/entrypoint/img/experiment-6.png
Binary file added examples/entrypoint/img/experiment-help.png
Binary file added examples/entrypoint/img/task-2.png
Binary file added examples/entrypoint/img/task-3.png
Binary file added examples/entrypoint/img/task-4.png
Binary file added examples/entrypoint/img/task-5.png
Binary file added examples/entrypoint/img/task-6.png
Binary file added examples/entrypoint/img/task-7.png
Binary file added examples/entrypoint/img/task-help.png
Binary file added examples/entrypoint/img/task-repl.gif
73 changes: 73 additions & 0 deletions examples/entrypoint/task.py
@@ -0,0 +1,73 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
"""Dummy model config"""

hidden_size: int
num_layers: int
activation: str


@dataclass
class Optimizer:
"""Dummy optimizer config"""

learning_rate: float
weight_decay: float
betas: List[float]


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
"""
Create a model configuration.
"""
return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
def my_optimizer(
learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> run.Config[Optimizer]:
"""Create an optimizer configuration."""
return run.Config(
Optimizer, learning_rate=learning_rate, weight_decay=weight_decay, betas=betas
)


def train_model(
model: Model,
optimizer: Optimizer,
epochs: int = 10,
batch_size: int = 32,
):
"""
Train a model using the specified configuration.
Args:
model (Model): Configuration for the model.
optimizer (Optimizer): Configuration for the optimizer.
epochs (int, optional): Number of training epochs. Defaults to 10.
batch_size (int, optional): Batch size for training. Defaults to 32.
"""
print("Training model with the following configuration:")
print(f"Model: {model}")
print(f"Optimizer: {optimizer}")
print(f"Epochs: {epochs}")
print(f"Batch size: {batch_size}")

# Simulating model training
for epoch in range(epochs):
print(f"Epoch {epoch + 1}/{epochs}")

print("Training completed!")


if __name__ == "__main__":
run.cli.main(train_model)
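
A hedged sketch of how this task might be invoked: factory names (my_model, my_optimizer) select configurations, and dotted paths override individual fields. The syntax is assumed from the factory registrations above, not captured output:

# Hypothetical invocations (syntax assumed, not shown in this diff):
#   python task.py model=my_model optimizer=my_optimizer
#   python task.py model=my_model model.hidden_size=512 epochs=20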
98 changes: 98 additions & 0 deletions examples/entrypoint/task_with_defaults.py
@@ -0,0 +1,98 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
"""Dummy model config"""

hidden_size: int
num_layers: int
activation: str


@dataclass
class Optimizer:
"""Dummy optimizer config"""

learning_rate: float
weight_decay: float
betas: List[float]


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
"""
Create a model configuration.
"""
return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
def my_optimizer(
learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> run.Config[Optimizer]:
"""Create an optimizer configuration."""
return run.Config(
Optimizer, learning_rate=learning_rate, weight_decay=weight_decay, betas=betas
)


def train_model(
model: Model,
optimizer: Optimizer,
epochs: int = 10,
batch_size: int = 32,
):
"""
Train a model using the specified configuration.
Args:
model (Model): Configuration for the model.
optimizer (Optimizer): Configuration for the optimizer.
epochs (int, optional): Number of training epochs. Defaults to 10.
batch_size (int, optional): Batch size for training. Defaults to 32.
"""
print("Training model with the following configuration:")
print(f"Model: {model}")
print(f"Optimizer: {optimizer}")
print(f"Epochs: {epochs}")
print(f"Batch size: {batch_size}")

# Simulating model training
for epoch in range(epochs):
print(f"Epoch {epoch + 1}/{epochs}")

print("Training completed!")


def custom_defaults() -> run.Partial[train_model]:
return run.Partial(
train_model,
model=my_model(hidden_size=512),
optimizer=my_optimizer(learning_rate=0.0005),
epochs=50,
batch_size=2048,
)


@run.autoconvert
def local_executor() -> run.Executor:
return run.LocalExecutor()


class DummyPlugin(run.Plugin):
def setup(self, task: run.Partial[train_model], executor: run.Executor):
task.epochs *= 2


if __name__ == "__main__":
run.cli.main(
train_model,
default_factory=custom_defaults,
default_executor=local_executor(),
default_plugins=run.Config(DummyPlugin),
)
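
Tracing the defaults above: a bare run would start from custom_defaults, execute on the LocalExecutor returned by local_executor, and pass through DummyPlugin, whose setup doubles epochs before launch. A sketch of the resulting configuration, assuming the bare invocation form:

# python task_with_defaults.py
#   model:      Model(hidden_size=512, num_layers=3, activation="relu")
#   optimizer:  Optimizer(learning_rate=0.0005, weight_decay=1e-05, betas=[0.9, 0.999])
#   epochs:     50, doubled to 100 by DummyPlugin.setup
#   batch_size: 2048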
1 change: 0 additions & 1 deletion pyproject.toml
@@ -23,7 +23,6 @@ dependencies = [
     "catalogue>=2.0.10",
     "fabric>=3.2.2",
     "fiddle>=0.3.0",
-    "lark>=1.1.9",
     "torchx>=0.7.0",
     "typer>=0.12.3",
     "rich>=13.7.1",
4 changes: 2 additions & 2 deletions requirements-dev.lock
@@ -12,6 +12,8 @@
 -e file:.
 absl-py==2.1.0
   # via fiddle
+appnope==0.1.4
+  # via ipykernel
 asttokens==2.4.1
   # via stack-data
 attrs==24.2.0
@@ -111,8 +113,6 @@ jupyterlab-widgets==3.0.11
   # via ipywidgets
 kubernetes==30.1.0
   # via skypilot
-lark==1.2.2
-  # via nemo-run
 libcst==1.4.0
   # via fiddle
 markdown-it-py==3.0.0
2 changes: 0 additions & 2 deletions requirements.lock
@@ -77,8 +77,6 @@ jsonschema-specifications==2023.12.1
   # via jsonschema
 kubernetes==30.1.0
   # via skypilot
-lark==1.2.2
-  # via nemo-run
 libcst==1.4.0
   # via fiddle
 markdown-it-py==3.0.0
14 changes: 5 additions & 9 deletions src/nemo_run/__init__.py
@@ -13,14 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from nemo_run.api import autoconvert, list_tasks, task
+from nemo_run import cli
+from nemo_run.api import autoconvert, dryrun_fn
 from nemo_run.config import Config, Partial, Script
-from nemo_run.core.execution.base import (
-    Executor,
-    ExecutorMacros,
-    FaultTolerance,
-    Torchrun,
-)
+from nemo_run.core.execution.base import Executor, ExecutorMacros, FaultTolerance, Torchrun
 from nemo_run.core.execution.local import LocalExecutor
 from nemo_run.core.execution.skypilot import SkypilotExecutor
 from nemo_run.core.execution.slurm import SlurmExecutor
@@ -35,6 +31,8 @@

 __all__ = [
     "autoconvert",
+    "cli",
+    "dryrun_fn",
     "Config",
     "DevSpace",
     "Executor",
@@ -43,7 +41,6 @@
     "FaultTolerance",
     "GitArchivePackager",
     "help",
-    "list_tasks",
     "LocalExecutor",
     "LocalTunnel",
     "Packager",
@@ -54,7 +51,6 @@
     "SkypilotExecutor",
     "SlurmExecutor",
     "SSHTunnel",
-    "task",
     "Torchrun",
 ]
