AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 #128494

arsenm · 2025-02-24T11:10:15Z

We should eventually handle this fold for all of the readlane and DPP ops that are already handled.

arsenm · 2025-02-24T11:10:32Z

Warning

This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-02-24T11:12:05Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

We should handle this for all the handled readlane and dpp ops.

Full diff: https://github.com/llvm/llvm-project/pull/128494.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+16)
(modified) llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll (+25-27)
(modified) llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll (+3-3)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index bac3bb5fde7b0..1f56b0bfc86dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1128,9 +1128,25 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
+    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
+    if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
+      Value *BCSrc = BC->getOperand(0);
+
+      // TODO: Handle this for update_dpp, mov_dpp8, and all permlane variants.
+      if (isTypeLegal(BCSrc->getType())) {
+        SmallVector<Value *, 2> Args(II.args());
+        Args[0] = BCSrc;
+        CallInst *NewCall = IC.Builder.CreateIntrinsic(
+            II.getIntrinsicID(), {BCSrc->getType()}, Args);
+        NewCall->takeName(&II);
+        return new BitCastInst(NewCall, II.getType());
+      }
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {
+    // TODO: Fold bitcast like readlane.
     if (simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
     return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index 13347fd1a280e..6d6da3b5b8fb2 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -4,8 +4,8 @@
 define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -16,9 +16,9 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane(float %val) {
 define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(float %val, ptr %use.ptr) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_store(
 ; CHECK-SAME: float [[VAL:%.*]], ptr [[USE_PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
 ; CHECK-NEXT:    store float [[VAL]], ptr [[USE_PTR]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[RESULT]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %bitcast = bitcast float %val to i32
@@ -46,9 +46,7 @@ define i32 @test_bitcast_f32_to_i32_readfirstlane_multi_use_call(float %val) {
 define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 ; CHECK-LABEL: define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(
 ; CHECK-SAME: float [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
-; CHECK-NEXT:    [[RESULT:%.*]] = bitcast i32 [[CALL]] to float
+; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[VAL]])
 ; CHECK-NEXT:    ret float [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
@@ -60,8 +58,8 @@ define float @test_bitcast_f32_to_i32_readfirstlane_bitcast(float %val) {
 define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2f16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x half> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <2 x half> [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <2 x half> [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x half> %val to i32
@@ -72,8 +70,8 @@ define i32 @test_bitcast_v2f16_to_i32_readfirstlane(<2 x half> %val) {
 define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 ; CHECK-LABEL: define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(
 ; CHECK-SAME: <2 x bfloat> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <2 x bfloat> [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <2 x bfloat> @llvm.amdgcn.readfirstlane.v2bf16(<2 x bfloat> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <2 x bfloat> [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast <2 x bfloat> %val to i32
@@ -84,8 +82,8 @@ define i32 @test_bitcast_v2bf16_to_i32_readfirstlane(<2 x bfloat> %val) {
 define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_f64_to_i64_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast double [[VAL]] to i64
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast double [[RESULT1]] to i64
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %bitcast = bitcast double %val to i64
@@ -96,8 +94,8 @@ define i64 @test_bitcast_f64_to_i64_readfirstlane(double %val) {
 define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 ; CHECK-LABEL: define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(
 ; CHECK-SAME: double [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast double [[VAL]] to <2 x i32>
-; CHECK-NEXT:    [[RESULT:%.*]] = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call double @llvm.amdgcn.readfirstlane.f64(double [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast double [[RESULT1]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[RESULT]]
 ;
   %bitcast = bitcast double %val to <2 x i32>
@@ -108,8 +106,8 @@ define <2 x i32> @test_bitcast_f64_to_v2i32_readfirstlane(double %val) {
 define i64 @test_bitcast_v4i16_to_i64_readfirstlane(<4 x i16> %val) {
 ; CHECK-LABEL: define i64 @test_bitcast_v4i16_to_i64_readfirstlane(
 ; CHECK-SAME: <4 x i16> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <4 x i16> [[VAL]] to i64
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.amdgcn.readfirstlane.i64(i64 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call <4 x i16> @llvm.amdgcn.readfirstlane.v4i16(<4 x i16> [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast <4 x i16> [[RESULT1]] to i64
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %bitcast = bitcast <4 x i16> %val to i64
@@ -145,8 +143,8 @@ define i32 @test_bitcast_v8i4_to_i32_readfirstlane(<8 x i4> %val) {
 define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 ; CHECK-LABEL: define float @test_bitcast_i32_to_f32_readfirstlane(
 ; CHECK-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32 [[VAL]] to float
-; CHECK-NEXT:    [[RESULT:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast i32 [[RESULT1]] to float
 ; CHECK-NEXT:    ret float [[RESULT]]
 ;
   %bitcast = bitcast i32 %val to float
@@ -157,8 +155,8 @@ define float @test_bitcast_i32_to_f32_readfirstlane(i32 %val) {
 define i16 @test_bitcast_f16_to_i16_readfirstlane(half %val) {
 ; CHECK-LABEL: define i16 @test_bitcast_f16_to_i16_readfirstlane(
 ; CHECK-SAME: half [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast half [[VAL]] to i16
-; CHECK-NEXT:    [[RESULT:%.*]] = call i16 @llvm.amdgcn.readfirstlane.i16(i16 [[BITCAST]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call half @llvm.amdgcn.readfirstlane.f16(half [[VAL]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast half [[RESULT1]] to i16
 ; CHECK-NEXT:    ret i16 [[RESULT]]
 ;
   %bitcast = bitcast half %val to i16
@@ -181,8 +179,8 @@ define i16 @test_bitcast_v2i8_to_i16_readfirstlane(<2 x i8> %val) {
 define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <16 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <16 x i32>
@@ -193,8 +191,8 @@ define <16 x i32> @test_bitcast_v16f32_to_v16i32_readfirstlane(<16 x float> %val
 define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val) {
 ; CHECK-LABEL: define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(
 ; CHECK-SAME: <16 x float> [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <16 x float> [[VAL]] to <8 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> [[BITCAST]])
+; CHECK-NEXT:    [[RESULT:%.*]] = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> [[VAL]])
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x float> [[RESULT]] to <8 x i64>
 ; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
   %bitcast = bitcast <16 x float> %val to <8 x i64>
@@ -205,8 +203,8 @@ define <8 x i64> @test_bitcast_v16f32_to_v8i64_readfirstlane(<16 x float> %val)
 define i32 @test_bitcast_f32_to_i32_readlane(float %val, i32 inreg %lane.index) {
 ; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_readlane(
 ; CHECK-SAME: float [[VAL:%.*]], i32 inreg [[LANE_INDEX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[BITCAST]], i32 [[LANE_INDEX]])
+; CHECK-NEXT:    [[RESULT1:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[VAL]], i32 [[LANE_INDEX]])
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast float [[RESULT1]] to i32
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %bitcast = bitcast float %val to i32
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
index 5eaab6107192e..88d98b21c0f7d 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -12,9 +12,9 @@ define i32 @test_constant() {
 
 define i32 @test_bitcast_f32_to_i32_permlane64(float %val) {
 ; CHECK-LABEL: @test_bitcast_f32_to_i32_permlane64(
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL:%.*]] to i32
-; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[BITCAST]])
-; CHECK-NEXT:    ret i32 [[RESULT]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL1:%.*]])
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    ret i32 [[BITCAST]]
 ;
   %bitcast = bitcast float %val to i32
   %result = call i32 @llvm.amdgcn.permlane64.i32(i32 %bitcast)

pravinjagtap

LGTM

We should handle this for all the handled readlane and dpp ops.

This reverts commit bf17987.

arsenm · 2025-02-25T02:24:06Z

New version fixes dropping convergence bundles. @sameerds is there a less awkward way to do this?

arsenm mentioned this pull request Feb 24, 2025

TargetTransformInfo: Add missing consts to a couple of methods #128492

Merged

arsenm mentioned this pull request Feb 24, 2025

AMDGPU: Add baseline tests for bitcast + readlane intrinsics #128493

Open

arsenm added the backend:AMDGPU label Feb 24, 2025 — with Graphite App

arsenm requested review from jayfoad, Pierre-vh, shiltian, cdevadas, pravinjagtap and vikramRH February 24, 2025 11:11

arsenm marked this pull request as ready for review February 24, 2025 11:11

llvmbot added llvm:instcombine llvm:transforms labels Feb 24, 2025

arsenm force-pushed the users/arsenm/amdgpu/instcombine-baseline-bitcast-readlane-tests branch from 5b2d253 to d5797aa Compare February 24, 2025 12:09

arsenm force-pushed the users/arsenm/amdgpu/fold-bitcasts-readfirstlane branch from 521bdde to 3f0988d Compare February 24, 2025 12:10

pravinjagtap approved these changes Feb 24, 2025

View reviewed changes

arsenm added 5 commits February 25, 2025 09:01

AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64

8568d7b

We should handle this for all the handled readlane and dpp ops.

Make sure convergence tokens are preserved

bf17987

Revert "Make sure convergence tokens are preserved"

87e44f3

This reverts commit bf17987.

Use bundle guard

92bd22e

Using CallInst directly actually works

a231960

arsenm force-pushed the users/arsenm/amdgpu/fold-bitcasts-readfirstlane branch from 3f0988d to a231960 Compare February 25, 2025 02:23

arsenm force-pushed the users/arsenm/amdgpu/instcombine-baseline-bitcast-readlane-tests branch from d5797aa to 37c1083 Compare February 25, 2025 02:23

arsenm requested a review from ssahasra February 25, 2025 02:23

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 #128494

AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 #128494

arsenm commented Feb 24, 2025

arsenm commented Feb 24, 2025

llvmbot commented Feb 24, 2025 •

edited

Loading

pravinjagtap left a comment •

edited

Loading

arsenm commented Feb 25, 2025

AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 #128494

Are you sure you want to change the base?

AMDGPU: Fold bitcasts into readfirstlane, readlane, and permlane64 #128494

Conversation

arsenm commented Feb 24, 2025

arsenm commented Feb 24, 2025

llvmbot commented Feb 24, 2025 • edited Loading

pravinjagtap left a comment • edited Loading

Choose a reason for hiding this comment

arsenm commented Feb 25, 2025

llvmbot commented Feb 24, 2025 •

edited

Loading

pravinjagtap left a comment •

edited

Loading