
Commit

merge
Mogball committed Feb 7, 2025
1 parent e8a4728 commit 00bbd36
Showing 7 changed files with 75 additions and 996 deletions.
44 changes: 0 additions & 44 deletions case.mlir

This file was deleted.

25 changes: 23 additions & 2 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp
@@ -206,7 +206,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
 
   // If the loop has numStages attribute, also consider pipelining other loads
   // that are not directly used by dot ops.
-  if (pipelineWithoutDot && !seenDot) {
+  if (pipelineWithoutDot /*&& !seenDot*/) {
     for (Operation &op : forOp.getBody()->without_terminator()) {
       if (!isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp,
                tt::ExperimentalDescriptorGatherOp, tt::ConditionalLoadOp>(op))
@@ -263,6 +263,16 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
 
   DenseMap<Operation *, int> opLatency;
   for (auto forOp : loops) {
+    for (auto ifOp : forOp.getBody()->getOps<scf::IfOp>()) {
+      auto isLoad = [&](Operation &op) {
+        return isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp,
+                   tt::ExperimentalDescriptorGatherOp>(op);
+      };
+      if (llvm::any_of(*ifOp.thenBlock(), isLoad) ||
+          (ifOp.elseBlock() && llvm::any_of(*ifOp.elseBlock(), isLoad)))
+        ifOp->setAttr("ttg.conditional_load", UnitAttr::get(ifOp.getContext()));
+    }
+
     if (hasLatenciesAssigned(forOp)) {
       assignUserProvidedLatencies(forOp, opLatency);
       continue;
@@ -289,10 +299,21 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
       ++iter;
     }
 
+    int usedStages = 0;
+    for (Operation *loadOp :
+         llvm::to_vector(llvm::make_first_range(loadOpToIndLevel))) {
+      if (isa<tt::ConditionalLoadOp>(loadOp)) {
+        opLatency[loadOp] = 1;
+        usedStages = std::max(usedStages, loadOpToIndLevel[loadOp]);
+        loadOpToIndLevel.erase(loadOp);
+      }
+    }
+
     // Calculate the stage distance between applicable loads.
     auto vals = llvm::make_second_range(loadOpToIndLevel);
     int maxIndirectionLevel = vals.empty() ? 0 : *llvm::max_element(vals);
-    unsigned loadLatency = (numStages - 1) / (maxIndirectionLevel + 1);
+    unsigned loadLatency =
+        (numStages - 1 - usedStages) / (maxIndirectionLevel + 1);
 
     for (auto [loadOp, dist] : loadOpToIndLevel) {
       opLatency[loadOp] = loadLatency;
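
Worked example of the new latency budget (illustrative numbers, not taken from the commit): conditional loads are pinned to a latency of 1 and removed from loadOpToIndLevel, with their indirection level charged to usedStages. With numStages = 4 and one conditional load at indirection level 1, usedStages = 1; if the remaining loads sit at indirection level 0, each receives loadLatency = (4 - 1 - 1) / (0 + 1) = 2.
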
57 changes: 52 additions & 5 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -53,7 +53,7 @@ struct LoadInfo {
          << " sharedEncoding: " << sharedEncoding << "\n"
          << " blockedEncoding: " << blockedEncoding << "\n"
          << " isMMAv3Shared: " << isMMAv3Shared << "\n"
-         << " isMMAv3Registers: " << isMMAv3Registers << "\n"
+         << " isMMAv5Scale: " << isMMAv5Scale << "\n"
          << " distToUse: " << distToUse << "\n"
          << " usedByDot: " << usedByDot << "\n";
   }
@@ -112,8 +112,14 @@ static bool sameStageCluster(Operation *op1, Operation *op2) {
 // Return user of a loadOp with the lowest stage, if two users have the
 // same stage, return the user with lower cluster.
 static Operation *getFirstUseOfPipelinedLoad(Operation *loadOp) {
+  if (auto condOp = dyn_cast<tt::ConditionalLoadOp>(loadOp->getParentOp()))
+    loadOp = condOp;
+
   Operation *firstUser = nullptr;
   for (Operation *user : loadOp->getUsers()) {
+    if (isa<scf::YieldOp>(user))
+      continue;
+
     // Climb up to the containing op in the same block as the load.
     while (user->getBlock() != loadOp->getBlock())
       user = user->getParentOp();
@@ -522,6 +528,8 @@ assignMemoryLayouts(scf::ForOp &forOp,
     // as a pipelined load.
     auto [sLoad, _cLoad] = tt::getStageCluster(&op);
     Operation *firstUse = getFirstUseOfPipelinedLoad(&op);
+    if (!firstUse)
+      continue;
     LDBG("first use for load " << op);
     LDBG(" - use: " << *firstUse);
     auto firstUseStageCluster = tt::maybeGetStageCluster(firstUse);
@@ -535,6 +543,9 @@
     if (auto condOp = dyn_cast<tt::ConditionalLoadOp>(op)) {
       condOp->setAttr("ttg.pipelined_load", UnitAttr::get(op.getContext()));
       for (Operation *op : condOp.getLoads()) {
+        if (!isa<RankedTensorType>(op->getResultTypes().front()))
+          continue;
+
         loadsToPipeline.insert(op);
         LoadInfo &loadInfo = loadToInfo[op];
         loadInfo.distToUse = distToUse;
@@ -998,6 +1009,23 @@ static SmallVector<SplitCluster> splitIntoClusters(Block *block) {
   return result;
 }
 
+// To model an "undef" value, i.e. a value that is known to never be read on
+// live code paths, create a zero-valued constant where possible, otherwise use
+// a poison value. PTXAS appears to generate better code with zeros compared to
+// poison values.
+static Value createPoisonOrZero(ImplicitLocOpBuilder &b, Type type) {
+  Type elTy = getElementTypeOrSelf(type);
+  if (!elTy.isIntOrIndexOrFloat() ||
+      (!isa<RankedTensorType>(type) && type != elTy))
+    return b.create<ub::PoisonOp>(type);
+
+  TypedAttr attr = isa<FloatType>(elTy) ? TypedAttr(b.getFloatAttr(elTy, 0))
+                                        : b.getIntegerAttr(elTy, 0);
+  if (auto tensor = dyn_cast<RankedTensorType>(type))
+    attr = SplatElementsAttr::get(tensor, attr);
+  return b.create<arith::ConstantOp>(attr);
+}
+
 static void splitIntoClusters(scf::IfOp ifOp) {
   // First partition the regions into clusters.
   SmallVector<SplitCluster> thenClusters = splitIntoClusters(ifOp.thenBlock());
@@ -1026,7 +1054,7 @@
     b.createBlock(&otherRegion);
     SmallVector<Value> undefs;
     for (Type type : cluster.getOutputTypes())
-      undefs.push_back(b.create<ub::PoisonOp>(type));
+      undefs.push_back(createPoisonOrZero(b, type));
     b.create<scf::YieldOp>(undefs);
   };
 
@@ -1041,9 +1069,24 @@
         isThen ? clusterIf.getElseRegion() : clusterIf.getThenRegion());
   }
 
-  // Set the leftover select to the stage and cluster of the first use.
-  auto [stage, cluster] = tt::getStageCluster(getFirstUseOfPipelinedLoad(ifOp));
-  tt::setStageCluster(ifOp, stage, cluster);
+  // Break up the final if.
+  for (auto [trueVal, falseVal, result] :
+       llvm::zip(ifOp.thenYield().getOperands(), ifOp.elseYield().getOperands(),
+                 ifOp.getResults())) {
+    SmallVector<std::pair<int, int>> stageClusters;
+    if (Operation *op = trueVal.getDefiningOp())
+      stageClusters.push_back(tt::getStageCluster(op));
+    if (Operation *op = falseVal.getDefiningOp())
+      stageClusters.push_back(tt::getStageCluster(op));
+    auto [stage, cluster] = stageClusters.empty()
+                                ? tt::getStageCluster(ifOp)
+                                : *llvm::max_element(stageClusters);
+    auto select =
+        b.create<arith::SelectOp>(ifOp.getCondition(), trueVal, falseVal);
+    tt::setStageCluster(select, stage, cluster);
+    result.replaceAllUsesWith(select);
+  }
+  ifOp.erase();
 }
 
 static void processConditionalLoads(scf::ForOp forOp, int numStages) {
@@ -1071,6 +1114,9 @@ static void processConditionalLoads(scf::ForOp forOp, int numStages) {
     auto &firstUseCluster = clusters[clusterForFirstUse];
 
     for (Operation *loadOp : condOp.getLoads()) {
+      if (!isa<RankedTensorType>(loadOp->getResultTypes().front()))
+        continue;
+
       nestedSchedule.insert(loadOp, stage, loadCluster);
       nestedSchedule.insertDepsOfOp(loadOp, stage, loadCluster,
                                     /*includeArg=*/false);
@@ -1284,6 +1330,7 @@ createAsyncOps(scf::ForOp &forOp,
     if (condOp->removeAttr("ttg.pipelined_load"))
       splitIntoClusters(condOp);
   }
+  assert(succeeded(mlir::verify(forOp)) && "splitting produced invalid IR");
 
   tt::CoarseSchedule coarseSchedule(numStages);
   coarseSchedule.deSerialize(forOp);
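
A minimal sketch of the new "break up the final if" step (hypothetical IR: value names and types are invented, and the stage/cluster attributes attached to each select are omitted). The per-cluster producers of the yielded values have already been hoisted into their own scf.if ops by the earlier part of splitIntoClusters, so only the leftover multi-result if remains:

// Before: the leftover scf.if still yields the conditionally produced values.
%a, %b = scf.if %cond -> (tensor<128x64xf16>, i32) {
  scf.yield %a_then, %b_then : tensor<128x64xf16>, i32
} else {
  scf.yield %a_else, %b_else : tensor<128x64xf16>, i32
}

// After: the if is erased and each result becomes a standalone arith.select,
// which the pipeliner can place in a stage and cluster independently.
%a = arith.select %cond, %a_then, %a_else : tensor<128x64xf16>
%b = arith.select %cond, %b_then, %b_else : i32
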
86 changes: 0 additions & 86 deletions manual.mlir

This file was deleted.

