
Commit

merge
Mogball committed Feb 7, 2025
1 parent e8a4728 commit 00bbd36
Showing 7 changed files with 75 additions and 996 deletions.
44 changes: 0 additions & 44 deletions case.mlir

This file was deleted.

25 changes: 23 additions & 2 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp
@@ -206,7 +206,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
 
   // If the loop has numStages attribute, also consider pipelining other loads
   // that are not directly used by dot ops.
-  if (pipelineWithoutDot && !seenDot) {
+  if (pipelineWithoutDot /*&& !seenDot*/) {
     for (Operation &op : forOp.getBody()->without_terminator()) {
       if (!isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp,
                tt::ExperimentalDescriptorGatherOp, tt::ConditionalLoadOp>(op))
@@ -263,6 +263,16 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
 
   DenseMap<Operation *, int> opLatency;
   for (auto forOp : loops) {
+    for (auto ifOp : forOp.getBody()->getOps<scf::IfOp>()) {
+      auto isLoad = [&](Operation &op) {
+        return isa<tt::LoadOp, tt::ExperimentalDescriptorLoadOp,
+                   tt::ExperimentalDescriptorGatherOp>(op);
+      };
+      if (llvm::any_of(*ifOp.thenBlock(), isLoad) ||
+          (ifOp.elseBlock() && llvm::any_of(*ifOp.elseBlock(), isLoad)))
+        ifOp->setAttr("ttg.conditional_load", UnitAttr::get(ifOp.getContext()));
+    }
+
     if (hasLatenciesAssigned(forOp)) {
       assignUserProvidedLatencies(forOp, opLatency);
       continue;
@@ -289,10 +299,21 @@ DenseMap<Operation *, int> assignLatencies(ModuleOp moduleOp,
       ++iter;
     }
 
+    int usedStages = 0;
+    for (Operation *loadOp :
+         llvm::to_vector(llvm::make_first_range(loadOpToIndLevel))) {
+      if (isa<tt::ConditionalLoadOp>(loadOp)) {
+        opLatency[loadOp] = 1;
+        usedStages = std::max(usedStages, loadOpToIndLevel[loadOp]);
+        loadOpToIndLevel.erase(loadOp);
+      }
+    }
+
     // Calculate the stage distance between applicable loads.
     auto vals = llvm::make_second_range(loadOpToIndLevel);
     int maxIndirectionLevel = vals.empty() ? 0 : *llvm::max_element(vals);
-    unsigned loadLatency = (numStages - 1) / (maxIndirectionLevel + 1);
+    unsigned loadLatency =
+        (numStages - 1 - usedStages) / (maxIndirectionLevel + 1);
 
     for (auto [loadOp, dist] : loadOpToIndLevel) {
       opLatency[loadOp] = loadLatency;
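
Worked example of the new latency budget (illustrative numbers, not taken from the commit): conditional loads are pinned to a latency of 1 and removed from loadOpToIndLevel, with their indirection level charged to usedStages. With numStages = 4 and one conditional load at indirection level 1, usedStages = 1; if the remaining loads sit at indirection level 0, each receives loadLatency = (4 - 1 - 1) / (0 + 1) = 2.
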
57 changes: 52 additions & 5 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -53,7 +53,7 @@ struct LoadInfo {
          << " sharedEncoding: " << sharedEncoding << "\n"
          << " blockedEncoding: " << blockedEncoding << "\n"
          << " isMMAv3Shared: " << isMMAv3Shared << "\n"
-         << " isMMAv3Registers: " << isMMAv3Registers << "\n"
+         << " isMMAv5Scale: " << isMMAv5Scale << "\n"
          << " distToUse: " << distToUse << "\n"
          << " usedByDot: " << usedByDot << "\n";
   }
@@ -112,8 +112,14 @@ static bool sameStageCluster(Operation *op1, Operation *op2) {
 // Return user of a loadOp with the lowest stage, if two users have the
 // same stage, return the user with lower cluster.
 static Operation *getFirstUseOfPipelinedLoad(Operation *loadOp) {
+  if (auto condOp = dyn_cast<tt::ConditionalLoadOp>(loadOp->getParentOp()))
+    loadOp = condOp;
+
   Operation *firstUser = nullptr;
   for (Operation *user : loadOp->getUsers()) {
+    if (isa<scf::YieldOp>(user))
+      continue;
+
     // Climb up to the containing op in the same block as the load.
     while (user->getBlock() != loadOp->getBlock())
       user = user->getParentOp();
@@ -522,6 +528,8 @@ assignMemoryLayouts(scf::ForOp &forOp,
     // as a pipelined load.
     auto [sLoad, _cLoad] = tt::getStageCluster(&op);
     Operation *firstUse = getFirstUseOfPipelinedLoad(&op);
+    if (!firstUse)
+      continue;
     LDBG("first use for load " << op);
     LDBG(" - use: " << *firstUse);
     auto firstUseStageCluster = tt::maybeGetStageCluster(firstUse);
@@ -535,6 +543,9 @@
     if (auto condOp = dyn_cast<tt::ConditionalLoadOp>(op)) {
       condOp->setAttr("ttg.pipelined_load", UnitAttr::get(op.getContext()));
       for (Operation *op : condOp.getLoads()) {
+        if (!isa<RankedTensorType>(op->getResultTypes().front()))
+          continue;
+
         loadsToPipeline.insert(op);
         LoadInfo &loadInfo = loadToInfo[op];
         loadInfo.distToUse = distToUse;
@@ -998,6 +1009,23 @@ static SmallVector<SplitCluster> splitIntoClusters(Block *block) {
   return result;
 }
 
+// To model an "undef" value, i.e. a value that is known to never be read on
+// live code paths, create a zero-valued constant where possible, otherwise use
+// a poison value. PTXAS appears to generate better code with zeros compared to
+// poison values.
+static Value createPoisonOrZero(ImplicitLocOpBuilder &b, Type type) {
+  Type elTy = getElementTypeOrSelf(type);
+  if (!elTy.isIntOrIndexOrFloat() ||
+      (!isa<RankedTensorType>(type) && type != elTy))
+    return b.create<ub::PoisonOp>(type);
+
+  TypedAttr attr = isa<FloatType>(elTy) ? TypedAttr(b.getFloatAttr(elTy, 0))
+                                        : b.getIntegerAttr(elTy, 0);
+  if (auto tensor = dyn_cast<RankedTensorType>(type))
+    attr = SplatElementsAttr::get(tensor, attr);
+  return b.create<arith::ConstantOp>(attr);
+}
+
 static void splitIntoClusters(scf::IfOp ifOp) {
   // First partition the regions into clusters.
   SmallVector<SplitCluster> thenClusters = splitIntoClusters(ifOp.thenBlock());
@@ -1026,7 +1054,7 @@
     b.createBlock(&otherRegion);
     SmallVector<Value> undefs;
     for (Type type : cluster.getOutputTypes())
-      undefs.push_back(b.create<ub::PoisonOp>(type));
+      undefs.push_back(createPoisonOrZero(b, type));
     b.create<scf::YieldOp>(undefs);
   };
 
@@ -1041,9 +1069,24 @@
         isThen ? clusterIf.getElseRegion() : clusterIf.getThenRegion());
   }
 
-  // Set the leftover select to the stage and cluster of the first use.
-  auto [stage, cluster] = tt::getStageCluster(getFirstUseOfPipelinedLoad(ifOp));
-  tt::setStageCluster(ifOp, stage, cluster);
+  // Break up the final if.
+  for (auto [trueVal, falseVal, result] :
+       llvm::zip(ifOp.thenYield().getOperands(), ifOp.elseYield().getOperands(),
+                 ifOp.getResults())) {
+    SmallVector<std::pair<int, int>> stageClusters;
+    if (Operation *op = trueVal.getDefiningOp())
+      stageClusters.push_back(tt::getStageCluster(op));
+    if (Operation *op = falseVal.getDefiningOp())
+      stageClusters.push_back(tt::getStageCluster(op));
+    auto [stage, cluster] = stageClusters.empty()
+                                ? tt::getStageCluster(ifOp)
+                                : *llvm::max_element(stageClusters);
+    auto select =
+        b.create<arith::SelectOp>(ifOp.getCondition(), trueVal, falseVal);
+    tt::setStageCluster(select, stage, cluster);
+    result.replaceAllUsesWith(select);
+  }
+  ifOp.erase();
 }
 
 static void processConditionalLoads(scf::ForOp forOp, int numStages) {
@@ -1071,6 +1114,9 @@ static void processConditionalLoads(scf::ForOp forOp, int numStages) {
     auto &firstUseCluster = clusters[clusterForFirstUse];
 
     for (Operation *loadOp : condOp.getLoads()) {
+      if (!isa<RankedTensorType>(loadOp->getResultTypes().front()))
+        continue;
+
       nestedSchedule.insert(loadOp, stage, loadCluster);
       nestedSchedule.insertDepsOfOp(loadOp, stage, loadCluster,
                                     /*includeArg=*/false);
@@ -1284,6 +1330,7 @@ createAsyncOps(scf::ForOp &forOp,
     if (condOp->removeAttr("ttg.pipelined_load"))
       splitIntoClusters(condOp);
   }
+  assert(succeeded(mlir::verify(forOp)) && "splitting produced invalid IR");
 
   tt::CoarseSchedule coarseSchedule(numStages);
   coarseSchedule.deSerialize(forOp);
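
A minimal sketch of the new "break up the final if" step (hypothetical IR: value names and types are invented, and the stage/cluster attributes attached to each select are omitted). The per-cluster producers of the yielded values have already been hoisted into their own scf.if ops by the earlier part of splitIntoClusters, so only the leftover multi-result if remains:

// Before: the leftover scf.if still yields the conditionally produced values.
%a, %b = scf.if %cond -> (tensor<128x64xf16>, i32) {
  scf.yield %a_then, %b_then : tensor<128x64xf16>, i32
} else {
  scf.yield %a_else, %b_else : tensor<128x64xf16>, i32
}

// After: the if is erased and each result becomes a standalone arith.select,
// which the pipeliner can place in a stage and cluster independently.
%a = arith.select %cond, %a_then, %a_else : tensor<128x64xf16>
%b = arith.select %cond, %b_then, %b_else : i32
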
86 changes: 0 additions & 86 deletions manual.mlir

This file was deleted.

