
Commit 51a0aae

jwfromm authored and facebook-github-bot committed
F8I4 Grouped Gemm Optimization for Sparse M (pytorch#3854)
Summary:

X-link: facebookresearch/FBGEMM#945

In cases where there are many groups but only a few have a non-zero number of routed tokens, it turns out we pay a high overhead. For example, if a single token is routed to one of 128 experts, the runtime is much higher even though the compute is the same as routing one token to a single expert. Presumably there are kernel inefficiencies involved in looping over the empty groups.

This diff changes how kernel arguments are set up so that the grouped gemm runs over min(total_M, groups) problems. This allows us to ignore the many groups where no compute is required and improves performance in those cases considerably. As an example of the effect of this diff, when total_M is 1 and there are 128 groups, latency is 3X lower thanks to this change.

Reviewed By: jiawenliu64

Differential Revision: D71510967
1 parent 8f1ee28 commit 51a0aae
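
To make the headline change concrete: every routed token lands in exactly one group, so at most total_M groups can have a non-zero M, and min(total_M, groups) is therefore a safe upper bound on the number of problems the grouped gemm has to visit. A minimal Python sketch of that bound (editor's illustration with a hypothetical count_kernel_groups helper; not code from this commit):

import torch

def count_kernel_groups(M_sizes: torch.Tensor) -> int:
    # Total number of routed tokens across all groups.
    total_M = int(M_sizes.sum().item())
    G = M_sizes.numel()
    # Each token belongs to exactly one group, so at most total_M groups
    # can be non-empty; min(total_M, G) problems always suffice.
    return min(total_M, G)

# Sparse-M example: a single token routed to one of 128 experts.
M_sizes = torch.zeros(128, dtype=torch.int64)
M_sizes[42] = 1
print(count_kernel_groups(M_sizes))  # 1 problem instead of 128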

File tree

2 files changed, +108 -46 lines


fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py (+58 -17)

@@ -32,6 +32,29 @@ def __init__(self, *args, **kwargs):
 from .quantize_ops import get_quantize_ops, QuantizeOpBase
 
 
+def generate_group_tensor(G, M):
+    """
+    Generate a tensor with G elements whose integer elements sum to M.
+
+    Args:
+        G (int): Number of elements in the tensor.
+        M (int): Sum of the elements in the tensor.
+
+    Returns:
+        List[int]: A list with G integer elements that sum to M.
+    """
+
+    # First, we generate a random tensor with G elements
+    random_tensor = torch.rand(G)
+    # Then, we normalize this tensor so it sums up to 1
+    normalized_tensor = random_tensor / random_tensor.sum()
+    # Finally, we multiply this tensor by M and round to the nearest integer
+    output_tensor = torch.round(normalized_tensor * M).to(torch.int64)
+    # Adjust the last element to ensure the sum is exactly M
+    output_tensor[-1] += M - output_tensor.sum()
+    return output_tensor.tolist()
+
+
 def set_amd_env_vars() -> None:
     print("Setting environment variables for AMD GPU performance")
     os.environ["DISABLE_ADDMM_HIP_LT"] = "0"
@@ -177,9 +200,10 @@ def benchmark_grouped(
         output = [o[: m[i]] for i, o in enumerate(output)]
     # Compare the quantize op output to reference as a sanity check.
     for i in range(num_groups):
-        metrics.sim += float(
-            torch.mean(torch.pow(output[i] - out_ref[i], 2)).item()
-        )
+        if m[i] > 0:
+            metrics.sim += float(
+                torch.mean(torch.pow(output[i] - out_ref[i], 2)).item()
+            )
     for _ in range(num_iters):
         # Now perform benchmark.
         if bench_quantize:
@@ -205,17 +229,16 @@ def benchmark_grouped(
             metrics.tflops += (
                 2 * b[i] * m[i] * n[i] * k[i] / (ms_runtime / 1e3) / 1e12
             )
-            metrics.gbps += (
-                (
-                    quantized_vals[0][i][: m[i]].numel()
-                    * quantized_vals[0][i][: m[i]].element_size()
-                    + quantized_vals[1][i].numel()
-                    * quantized_vals[1][i].element_size()
-                    + output[i].numel() * output[i].element_size()
+            if m[i] > 0:
+                metrics.gbps += (
+                    (
+                        b[i] * m[i] * k[i] * quantized_vals[0][0].element_size()
+                        + b[i] * n[i] * k[i] * quantized_vals[1][0].element_size()
+                        + b[i] * m[i] * n[i] * output[0].element_size()
+                    )
+                    / (ms_runtime / 1e3)
+                    / 1e9
                 )
-                / (ms_runtime / 1e3)
-                / 1e9
-            )
             metrics.ms += ms_runtime
     metrics.ms /= num_iters
     metrics.tflops /= num_iters
@@ -411,10 +434,22 @@ def main(args: Any):
     # When groups is provided transform shapes into grouped format.
     if args.groups:
         groups = int(args.groups)
-        MNK = [
-            [[b] * groups, [m] * groups, [n] * groups, [k] * groups]
-            for b, m, n, k in MNK
-        ]
+        if args.total_M:
+            M = generate_group_tensor(groups, int(args.total_M))
+            MNK = [
+                [
+                    [b] * groups,
+                    generate_group_tensor(groups, int(args.total_M)),
+                    [n] * groups,
+                    [k] * groups,
+                ]
+                for b, _, n, k in MNK
+            ]
+        else:
+            MNK = [
+                [[b] * groups, [m] * groups, [n] * groups, [k] * groups]
+                for b, m, n, k in MNK
+            ]
 
     # Iterate over shapes and benchmark.
     benchmark_results = []
@@ -512,6 +547,12 @@ def invoke_main() -> None:
         default=None,
         help="If set with grouped mode, repeat input shapes this many times.",
     )
+    parser.add_argument(
+        "--total_M",
+        default=None,
+        help="If set, adjusts the M values to sum to this number. "
+        "This can help simulate real grouped workloads.",
+    )
     parser.add_argument(
         "--no_cuda_graph",
         default=False,
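
As a quick illustration of the new benchmark helper above (editor's note; the split is random by construction, so the exact values vary), generate_group_tensor distributes total_M tokens across the groups and can naturally leave some groups empty, which is exactly the sparse-M pattern this commit targets:

# Split 8 tokens across 4 groups; the exact split varies from run to run.
sizes = generate_group_tensor(4, 8)
print(sizes)       # e.g. [3, 1, 2, 2]
print(sum(sizes))  # always 8, enforced by the last-element adjustment
# With many groups and a small total_M, e.g. generate_group_tensor(128, 1),
# most entries are 0, reproducing the sparse routing case benchmarked here.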

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8i4bf16_shuffled_grouped.cu (+50 -29)

@@ -73,36 +73,55 @@ __global__ void set_kernel_args(
   auto group_index = blockIdx.x * blockDim.x + threadIdx.x;
   // If this is a valid group, write kernel args to device.
   if (group_index < G) {
-    // First get the M value for this group.
+    // Since we are only writing a subset of the groups to kernel args,
+    // we need to start by initializing a counter and setting other groups
+    // to empty problems.
+    __shared__ int non_zero_counter;
+    // Initialize counter and problem memory for this group.
+    if (group_index == 0) {
+      non_zero_counter = 0;
+    }
+    // We set the problem shapes to empty by default to skip over
+    // these groups.
+    problem_shape_ptr[group_index] = ProblemShape(0, 0, 0);
+    // Sync threads to make sure state is shared across the block.
+    __syncthreads();
+
+    // Now check if this is a non-zero group.
     int M = M_sizes[group_index];
-    // Compute offset into tensor where this group begins.
-    int offset_M = 0;
-    // Compute cumulative sum of prior groups to find offset.
-    for (int i = 0; i < group_index; i++) {
-      offset_M += M_sizes[i];
+    // Only proceed if so.
+    if (M > 0) {
+      // Get the non-zero index for this group atomically.
+      int non_zero_idx = atomicAdd(&non_zero_counter, 1);
+      // Compute offset into tensor where this group begins.
+      int offset_M = 0;
+      // Compute cumulative sum of prior groups to find offset.
+      for (int i = 0; i < group_index; i++) {
+        offset_M += M_sizes[i];
+      }
+      // Set the problem shape for this group.
+      problem_shape_ptr[non_zero_idx] = ProblemShape(N, M, K);
+      // Set pointer to xq.
+      xq_ptr[non_zero_idx] = xq + (offset_M * K);
+      // Set pointer to wq, dividing by two as wq is packed into bytes.
+      wq_ptr[non_zero_idx] = wq + (group_index * N * K / 2);
+      // Set scale pointers.
+      x_scale_ptr[non_zero_idx] = x_scale + offset_M;
+      w_scale_ptr[non_zero_idx] = w_scale + (group_index * N);
+      w_scale_group_ptr[non_zero_idx] =
+          w_scale_group + (group_index * N * num_scale_groups);
+      // Set output pointer.
+      output_ptr[non_zero_idx] = output + (offset_M * N);
+      // Set stride pointers.
+      stride_a_ptr[non_zero_idx] = cutlass::make_cute_packed_stride(
+          StrideA{}, cute::make_shape(M, K, 1));
+      stride_b_ptr[non_zero_idx] = cute::tile_to_shape(
+          LayoutAtomQuant{}, cute::make_shape(N, K, cute::Int<1>{}));
+      stride_c_ptr[non_zero_idx] = cutlass::make_cute_packed_stride(
+          StrideC{}, cute::make_shape(N, M, 1));
+      stride_s_ptr[non_zero_idx] = cutlass::make_cute_packed_stride(
+          StrideS{}, cute::make_shape(N, num_scale_groups, 1));
     }
-    // Set the problem shape for this group.
-    problem_shape_ptr[group_index] = ProblemShape(N, M, K);
-    // Set pointer to xq.
-    xq_ptr[group_index] = xq + (offset_M * K);
-    // Set pointer to wq, dividing by two as wq is packed into bytes.
-    wq_ptr[group_index] = wq + (group_index * N * K / 2);
-    // Set scale pointers.
-    x_scale_ptr[group_index] = x_scale + offset_M;
-    w_scale_ptr[group_index] = w_scale + (group_index * N);
-    w_scale_group_ptr[group_index] =
-        w_scale_group + (group_index * N * num_scale_groups);
-    // Set output pointer.
-    output_ptr[group_index] = output + (offset_M * N);
-    // Set stride pointers.
-    stride_a_ptr[group_index] =
-        cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, 1));
-    stride_b_ptr[group_index] = cute::tile_to_shape(
-        LayoutAtomQuant{}, cute::make_shape(N, K, cute::Int<1>{}));
-    stride_c_ptr[group_index] =
-        cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(N, M, 1));
-    stride_s_ptr[group_index] = cutlass::make_cute_packed_stride(
-        StrideS{}, cute::make_shape(N, num_scale_groups, 1));
   }
 }
 
@@ -118,6 +137,8 @@ void _f8i4bf16_shuffled_grouped(
   // Get basic shape information.
   int G = M_sizes.size(0);
   // XQ is shape [total_M, K]
+  int total_M = XQ.size(0);
+  int kernel_groups = std::min(G, total_M);
   int K = XQ.size(-1);
   // WQ is shape [G, N, K/2]
   int N = WQ.size(1);
@@ -394,7 +415,7 @@ void _f8i4bf16_shuffled_grouped(
   // Define GEMM arguments.
   typename GemmShuffled::Arguments arguments{
       cutlass::gemm::GemmUniversalMode::kGrouped,
-      {G, problem_shape_ptr, nullptr},
+      {kernel_groups, problem_shape_ptr, nullptr},
       {wq_ptr,
        stride_b_ptr,
        xq_ptr,
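
For the host-side intuition of the kernel change above, here is a small Python sketch (editor's illustration with hypothetical names; the real logic lives in the CUDA set_kernel_args kernel): non-empty groups are compacted into the leading argument slots, empty groups keep a (0, 0, 0) problem shape, and the GEMM is launched over only kernel_groups = min(G, total_M) problems.

def compact_problem_shapes(M_sizes, N, K):
    # Mimics set_kernel_args on the host: non-empty groups are written to
    # compact slots at the front; the rest stay as empty (0, 0, 0) problems.
    G = len(M_sizes)
    problem_shapes = [(0, 0, 0)] * G
    xq_row_offsets = []
    non_zero_idx = 0  # stands in for the kernel's atomic counter
    offset_M = 0      # cumulative sum of the previous groups' M values
    for g in range(G):
        M = M_sizes[g]
        if M > 0:
            problem_shapes[non_zero_idx] = (N, M, K)
            xq_row_offsets.append(offset_M)  # where this group's rows start in XQ
            non_zero_idx += 1
        offset_M += M
    # The GEMM only needs to be launched over min(G, total_M) problems.
    kernel_groups = min(G, sum(M_sizes))
    return problem_shapes[:kernel_groups], xq_row_offsets

# Sparse-M example: one token routed to the second of four experts.
shapes, offsets = compact_problem_shapes([0, 1, 0, 0], N=256, K=512)
print(shapes)   # [(256, 1, 512)] -- one problem instead of four
print(offsets)  # [0]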
