
Commit e91f75a

Handle Dataset and Subset in train_ensemble() and results_multitask() (#42)
* handle Dataset and Subset in train_ensemble() and results_multitask()
* more concise literal types with pipe char in docstrings
1 parent 8fa857a commit e91f75a
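
Both functions now apply the same guard: accept either a full Dataset or a Subset, and when given a Subset, unwrap it via its .dataset attribute so attributes only the underlying dataset has (such as its df DataFrame) stay reachable. A minimal standalone sketch of that pattern, using a toy TensorDataset rather than aviary's own dataset classes:

    import torch
    from torch.utils.data import Dataset, Subset, TensorDataset

    # Toy stand-in for aviary's dataset classes.
    full_set = TensorDataset(torch.arange(10).float())
    train_set: Dataset | Subset = Subset(full_set, indices=[0, 2, 4])

    # The guard this commit adds: a Subset wraps its source dataset,
    # so unwrapping recovers the full dataset and its attributes.
    if isinstance(train_set, Subset):
        train_set = train_set.dataset

    assert train_set is full_set

Note that after unwrapping, downstream consumers such as DataLoader see every sample in the underlying dataset, not just the Subset's indices.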

File tree

2 files changed: +26 -20 lines


aviary/core.py (+3 -3)
@@ -44,7 +44,7 @@ def __init__(
         Args:
             task_dict (dict[str, TaskType]): Map target names to "regression" or "classification".
             robust (bool): Whether to estimate standard deviation for use in a robust loss function
-            device (type[torch.device] | Literal["cuda", "cpu"]): Device the model will run on.
+            device (torch.device | "cuda" | "cpu"): Device the model will run on.
             epoch (int, optional): Epoch model training will begin/resume from. Defaults to 1.
             best_val_scores (dict[str, float], optional): Validation score to use for early
                 stopping. Defaults to None.
@@ -228,12 +228,12 @@ def evaluate(
             optimizer (torch.optim.Optimizer): PyTorch Optimizer
             normalizer_dict (dict[str, Normalizer]): Dictionary of Normalizers to apply
                 to each task.
-            action (Literal["train", "val"], optional): Whether to track gradients depending on
+            action ("train" | "val", optional): Whether to track gradients depending on
                 whether we are carrying out a training or validation pass. Defaults to "train".
             verbose (bool, optional): Whether to print out intermediate results. Defaults to False.
 
         Returns:
-            dict[str, dict[Literal["Loss", "MAE", "RMSE", "Acc", "F1"], np.ndarray]]: nested
+            dict[str, dict["Loss" | "MAE" | "RMSE" | "Acc" | "F1", np.ndarray]]: nested
                 dictionary of metrics for each task.
         """
         if action == "val":
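
The docstring edits above are shorthand only: in the annotations themselves a fixed set of string values is still spelled with typing.Literal, since a bare "train" | "val" is not a valid type expression. A hypothetical stripped-down signature (not aviary's actual evaluate()) showing the two notations side by side:

    from typing import Literal

    def evaluate(action: Literal["train", "val"] = "train") -> bool:
        """Run one pass over the data.

        Args:
            action ("train" | "val", optional): Whether to track gradients.
                Defaults to "train".
        """
        # Gradients are tracked only during a training pass.
        return action == "train"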

aviary/utils.py (+23 -17)
@@ -19,7 +19,7 @@
 from torch.nn import CrossEntropyLoss, L1Loss, MSELoss, NLLLoss
 from torch.optim import SGD, Adam, AdamW, Optimizer
 from torch.optim.lr_scheduler import MultiStepLR, _LRScheduler
-from torch.utils.data import DataLoader, Subset
+from torch.utils.data import DataLoader, Dataset, Subset
 from torch.utils.tensorboard import SummaryWriter
 
 from aviary.core import BaseModelClass, Normalizer, TaskType, sampled_softmax
@@ -45,13 +45,13 @@ def init_model(
     Args:
         model_class (type[BaseModelClass]): Which model class to initialize.
         model_params (dict[str, Any]): Dictionary containing model specific hyperparameters.
-        device (type[torch.device] | Literal["cuda", "cpu"]): Device the model will run on.
+        device (type[torch.device] | "cuda" | "cpu"): Device the model will run on.
         resume (str, optional): Path to model checkpoint to resume. Defaults to None.
         fine_tune (str, optional): Path to model checkpoint to fine tune. Defaults to None.
         transfer (str, optional): Path to model checkpoint to transfer. Defaults to None.
 
     Returns:
-        type[BaseModelClass]: An initialised model of type model_class
+        BaseModelClass: An initialised model of type model_class.
     """
     robust = model_params["robust"]
     n_targets = model_params["n_targets"]
@@ -149,11 +149,11 @@ def init_optim(
 
     Args:
         model (type[BaseModelClass]): Model to be optimized.
-        optim (type[Optimizer] | Literal["SGD", "Adam", "AdamW"]): Which optimizer to use
-        learning_rate (float): Learning rate for optimzation
+        optim (type[Optimizer] | "SGD" | "Adam" | "AdamW"): Which optimizer to use
+        learning_rate (float): Learning rate for optimization
         weight_decay (float): Weight decay for optimizer
         momentum (float): Momentum for optimizer
-        device (type[torch.device] | Literal["cuda", "cpu"]): Device the model will run on
+        device (type[torch.device] | "cuda" | "cpu"): Device the model will run on
         milestones (Iterable, optional): When to decay learning rate. Defaults to ().
         gamma (float, optional): Multiplier for learning rate decay. Defaults to 0.3.
         resume (str, optional): Path to model checkpoint to resume. Defaults to None.
@@ -203,7 +203,7 @@ def init_losses(
 
     Args:
         task_dict (dict[str, TaskType]): Map of target names to "regression" or "classification".
-        loss_dict (dict[str, Literal["L1", "L2", "CSE"]]): Map of target names to loss functions.
+        loss_dict (dict[str, "L1" | "L2" | "CSE"]): Map of target names to loss functions.
         robust (bool, optional): Whether to use an uncertainty adjusted loss. Defaults to False.
 
     Returns:
@@ -253,7 +253,7 @@ def init_normalizers(
 
     Args:
         task_dict (dict[str, TaskType]): Map of target names to "regression" or "classification".
-        device (type[torch.device] | Literal["cuda", "cpu"]): Device the model will run on
+        device (torch.device | "cuda" | "cpu"): Device the model will run on
        resume (str, optional): Path to model checkpoint to resume. Defaults to None.
 
     Returns:
@@ -284,8 +284,8 @@ def train_ensemble(
     run_id: int,
     ensemble_folds: int,
     epochs: int,
-    train_set: Subset,
-    val_set: Subset,
+    train_set: Dataset | Subset,
+    val_set: Dataset | Subset,
     log: bool,
     data_params: dict[str, Any],
     setup_params: dict[str, Any],
@@ -310,12 +310,17 @@ def train_ensemble(
         setup_params (dict[str, Any]): Dictionary of setup parameters
         restart_params (dict[str, Any]): Dictionary of restart parameters
         model_params (dict[str, Any]): Dictionary of model parameters
-        loss_dict (dict[str, Literal["L1", "L2", "CSE"]]): Map of target names
+        loss_dict (dict[str, "L1" | "L2" | "CSE"]): Map of target names
             to loss functions.
         patience (int, optional): Maximum number of epochs without improvement
             when early stopping. Defaults to None.
         verbose (bool, optional): Whether to show progress bars for each epoch.
     """
+    if isinstance(train_set, Subset):
+        train_set = train_set.dataset
+    if isinstance(val_set, Subset):
+        val_set = val_set.dataset
+
     train_generator = DataLoader(train_set, **data_params)
     print(f"Training on {len(train_set):,} samples")

@@ -350,13 +355,11 @@ def train_ensemble(
 
     for target, normalizer in normalizer_dict.items():
         if normalizer is not None:
-            sample_target = Tensor(
-                train_set.dataset.df[target].iloc[train_set.indices].values
-            )
+            sample_target = Tensor(train_set.df[target].values)
             if not restart_params["resume"]:
                 normalizer.fit(sample_target)
             print(
-                f"Dummy MAE: {torch.mean(torch.abs(sample_target-normalizer.mean)):.4f}"
+                f"Dummy MAE: {(sample_target - normalizer.mean).abs().mean():.4f}"
             )
 
     if log:
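
The new f-string above is a pure refactor: torch.mean(torch.abs(x)) and x.abs().mean() compute the same thing, so the printed Dummy MAE is unchanged. A quick check of the equivalence, with illustrative values standing in for sample_target and normalizer.mean:

    import torch

    sample_target = torch.tensor([1.0, 2.0, 3.0, 4.0])
    mean = sample_target.mean()  # stand-in for normalizer.mean

    old_style = torch.mean(torch.abs(sample_target - mean))
    new_style = (sample_target - mean).abs().mean()

    assert torch.equal(old_style, new_style)
    print(f"Dummy MAE: {new_style:.4f}")
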
@@ -415,7 +418,7 @@ def results_multitask( # noqa: C901
     model_name: str,
     run_id: int,
     ensemble_folds: int,
-    test_set: Subset,
+    test_set: Dataset | Subset,
     data_params: dict[str, Any],
     robust: bool,
     task_dict: dict[str, TaskType],
@@ -436,7 +439,7 @@ def results_multitask( # noqa: C901
             loss function.
         task_dict (dict[str, TaskType]): Map of target names to "regression" or
             "classification".
-        device (type[torch.device] | Literal["cuda", "cpu"]): Device the model will run on
+        device (type[torch.device] | "cuda" | "cpu"): Device the model will run on
         eval_type (str, optional): Whether to use final or early-stopping checkpoints.
             Defaults to "checkpoint".
         print_results (bool, optional): Whether to print out summary metrics.
@@ -459,6 +462,9 @@ def results_multitask( # noqa: C901
         "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
     )
 
+    if isinstance(test_set, Subset):
+        test_set = test_set.dataset
+
     test_generator = DataLoader(test_set, **data_params)
     print(f"Testing on {len(test_set):,} samples")
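
With the guard in place, results_multitask() can be handed either a full dataset or a split. A sketch of both call paths on a toy TensorDataset (not aviary's API), mirroring the unwrap-then-load sequence above:

    import torch
    from torch.utils.data import DataLoader, Subset, TensorDataset

    full_set = TensorDataset(torch.randn(8, 3))

    for test_set in (full_set, Subset(full_set, indices=range(4))):
        if isinstance(test_set, Subset):  # the guard added above
            test_set = test_set.dataset
        test_generator = DataLoader(test_set, batch_size=2)
        print(f"Testing on {len(test_set):,} samples")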
