|
| 1 | +""" |
| 2 | +Copyright (C) 2024, Amazon.com. All Rights Reserved |
| 3 | +
|
| 4 | +NKI implementation for SPMD tensor addition NKI tutorial. |
| 5 | +
|
| 6 | +""" |
| 7 | +import numpy as np |
| 8 | +# NKI_EXAMPLE_27_BEGIN |
| 9 | +import neuronxcc.nki as nki |
| 10 | +import neuronxcc.nki.language as nl |
| 11 | + |
| 12 | + |
@nki.jit
def nki_tensor_add_kernel_(a_input, b_input):
    """Add one [128, 512] tile of two input tensors per SPMD program instance.

    Each program instance, identified by ``nl.program_id(0)`` and
    ``nl.program_id(1)``, handles the tile whose top-left corner is at
    ``(program_id(0) * 128, program_id(1) * 512)``. The kernel does no
    bounds handling, so the inputs must split evenly into [128, 512] tiles.

    Args:
        a_input: first input tensor
        b_input: second input tensor, indexed with the same tile indices as
            ``a_input``

    Returns:
        c_output: a tensor with the shape and dtype of ``a_input`` holding
        ``a_input + b_input``
    """
    # Result tensor, allocated in shared HBM so that every SPMD instance
    # writes its own tile into the same output.
    c_output = nl.ndarray(a_input.shape, dtype=a_input.dtype, buffer=nl.shared_hbm)

    # Row indices form a [128, 1] column and column indices a [1, 512] row;
    # each is shifted by this program's tile offset.
    tile_rows = nl.program_id(0) * 128 + nl.arange(128)[:, None]
    tile_cols = nl.program_id(1) * 512 + nl.arange(512)[None, :]

    # Bring this program's tile of each operand from device memory (HBM)
    # into on-chip memory (SBUF).
    lhs_tile = nl.load(a_input[tile_rows, tile_cols])
    rhs_tile = nl.load(b_input[tile_rows, tile_cols])

    # Element-wise sum, written straight back to the matching tile of the
    # output in device memory.
    nl.store(c_output[tile_rows, tile_cols], value=lhs_tile + rhs_tile)

    # Hand ownership of `c_output` to the caller.
    return c_output
| 50 | + # NKI_EXAMPLE_27_END |
| 51 | + |
| 52 | + |
| 53 | +# NKI_EXAMPLE_28_BEGIN |
def nki_tensor_add(a_input, b_input):
    """NKI kernel caller to compute element-wise addition of two input tensors

    Launches ``nki_tensor_add_kernel_`` over a 2D SPMD grid with one kernel
    instance per [128, 512] tile, so inputs larger than a single tile can be
    added.

    The shapes are validated before launch. The grid size is computed with
    integer division, so without this check any rows or columns beyond the
    last full tile would never be written by the kernel.

    Args:
        a_input: a first input tensor, of shape [N*128, M*512]
        b_input: a second input tensor, of shape [N*128, M*512]

    Returns:
        a tensor of shape [N*128, M*512], the result of a_input + b_input

    Raises:
        ValueError: if the inputs are not 2D, their shapes differ, or a
            dimension is not a multiple of the tile size (128 rows,
            512 columns).
    """
    # Must match the tile shape hard-coded in nki_tensor_add_kernel_.
    tile_rows = 128
    tile_cols = 512

    a_shape = tuple(a_input.shape)
    b_shape = tuple(b_input.shape)

    if a_shape != b_shape:
        raise ValueError(
            f"a_input and b_input must have the same shape, got {a_shape} and {b_shape}"
        )
    if len(a_shape) != 2:
        raise ValueError(f"inputs must be 2D, got shape {a_shape}")

    n_rows, n_cols = a_shape
    if n_rows % tile_rows != 0 or n_cols % tile_cols != 0:
        raise ValueError(
            f"input shape {a_shape} must be a multiple of the "
            f"[{tile_rows}, {tile_cols}] tile in each dimension"
        )

    # The SPMD launch grid denotes the number of kernel instances.
    # In this case, we use a 2D grid where the size of each invocation is 128x512
    grid_x = n_rows // tile_rows
    grid_y = n_cols // tile_cols

    return nki_tensor_add_kernel_[grid_x, grid_y](a_input, b_input)
| 73 | + # NKI_EXAMPLE_28_END |
| 74 | + |
if __name__ == "__main__":
    # Smoke test: add two random float16 matrices with the NKI caller and
    # compare against NumPy. 256x1024 corresponds to a 2x2 grid of
    # [128, 512] tiles.
    shape = (256, 1024)
    lhs = np.random.rand(*shape).astype(np.float16)
    rhs = np.random.rand(*shape).astype(np.float16)

    # Result from the NKI path.
    nki_result = nki_tensor_add(lhs, rhs)
    print(f"output_nki={nki_result}")

    # Reference result from NumPy.
    numpy_result = lhs + rhs
    print(f"output_np={numpy_result}")

    # Tolerances are loose because the data is float16.
    results_match = np.allclose(numpy_result, nki_result, atol=1e-4, rtol=1e-2)
    print("NKI and NumPy match" if results_match else "NKI and NumPy differ")

    # Fail the script when the two results disagree.
    assert results_match
0 commit comments