Enable vortex style fp8 as an option in evo2 #12464

Open · wants to merge 2 commits into main

Changes from 1 commit
3 changes: 3 additions & 0 deletions nemo/collections/llm/gpt/model/hyena.py
@@ -216,6 +216,9 @@ class HyenaConfig(TransformerConfig, io.IOMixin):
    use_te: bool = True
    to_upper: str = "normalized_weighted"  # choose between "weighted" and "normalized_weighted"
    use_short_conv_bias: bool = False
    # Use this if you want to turn FP8 on for the linear layer in the mixer only. When using
    # this, do not set fp8 in the mixed-precision plugin.
    vortex_style_fp8: bool = False

    def __post_init__(self):
        """
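For orientation, a minimal usage sketch (not from the PR; the field values below are illustrative, and the usual TransformerConfig required fields are assumed):

    from nemo.collections.llm.gpt.model.hyena import HyenaConfig

    config = HyenaConfig(
        num_layers=4,            # illustrative values, not defaults
        hidden_size=512,
        num_attention_heads=8,
        vortex_style_fp8=True,   # FP8 for the mixer's linear projection only
    )
    # Per the comment above: when vortex_style_fp8 is set, leave fp8 unset
    # in the mixed-precision plugin so the two paths do not both cast.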
19 changes: 17 additions & 2 deletions nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py
@@ -43,8 +43,20 @@
logger = logging.getLogger(__name__)

try:
    import transformer_engine.pytorch as te
    from transformer_engine.common.recipe import DelayedScaling, Format
except ImportError:

    def DelayedScaling(*args, **kwargs):
        raise ImportError("transformer_engine not installed. Using default recipe.")

    def Format(*args, **kwargs):
        raise ImportError("transformer_engine not installed. Using default recipe.")

    class te:
        def __getattribute__(self, name: str) -> None:
Check notice · Code scanning / CodeQL

Non-standard exception raised in special method (Note): Function always raises builtin-class ImportError; raise AttributeError instead.
Copilot Autofix (AI) · about 10 hours ago

To fix the problem, modify the __getattribute__ method to raise an AttributeError instead of an ImportError, so that it conforms to Python's standard protocol for attribute access. The rest of the functionality remains the same, and the warning about the missing transformer_engine module is still logged.

Suggested changeset 1: nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py b/nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py
--- a/nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py
+++ b/nemo/collections/llm/gpt/model/megatron/hyena/hyena_mixer.py
@@ -62,3 +62,3 @@
             """Not imported: te. An error will be raised if this is called like a module."""
-            raise ImportError("transformer_engine not installed. Using default recipe.")
+            raise AttributeError(f"'_te' object has no attribute '{name}'")
 
EOF
            """Not imported: te. An error will be raised if this is called like a module."""
            raise ImportError("transformer_engine not installed. Using default recipe.")

    logger.warning("WARNING: transformer_engine not installed. Using default recipe.")
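To see why the suggested change matters, here is a short sketch (not part of the PR) of how Python's attribute protocol treats the two exceptions: hasattr() and three-argument getattr() swallow only AttributeError, so the patched stub degrades gracefully where the original would crash.

    stub = te()  # the fallback class above; instantiation bypasses __getattribute__

    # With the patched stub (raises AttributeError):
    hasattr(stub, "fp8_autocast")         # False: hasattr catches AttributeError
    getattr(stub, "fp8_autocast", None)   # None: the default is returned

    # With the original stub (raises ImportError), both calls would
    # propagate the ImportError instead of failing softly.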


@@ -241,8 +253,11 @@
             _proj_use_cp = True
         else:
             _proj_use_cp = False
 
-        features, _ = self.dense_projection(x)
+        if self.transformer_config.vortex_style_fp8:
+            with te.fp8_autocast(enabled=True, fp8_recipe=set_format_recipe()):
+                features, _ = self.dense_projection(x)
+        else:
+            features, _ = self.dense_projection(x)
         features = rearrange(features, "l b d -> b l d").contiguous()
         features_L_last = features.permute(0, 2, 1)
         features_D_last = self.hyena_proj_conv(features_L_last, _use_cp=_proj_use_cp).permute(0, 2, 1)
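The new branch calls set_format_recipe(), which is not shown in this diff. A plausible sketch of such a helper against Transformer Engine's public recipe API (the specific recipe parameters are assumptions, not taken from the PR):

    from transformer_engine.common.recipe import DelayedScaling, Format

    def set_format_recipe():
        """Hypothetical helper: a delayed-scaling FP8 recipe.
        Format.HYBRID uses E4M3 for forward tensors and E5M2 for gradients."""
        return DelayedScaling(
            fp8_format=Format.HYBRID,  # assumed; a common TE choice
            amax_history_len=16,       # illustrative history window
            amax_compute_algo="max",
        )

Inside te.fp8_autocast(enabled=True, fp8_recipe=...), Transformer Engine modules run their GEMMs in FP8 while everything outside the context keeps its configured precision, which is why the config comment warns against also enabling fp8 in the mixed-precision plugin.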