From 3262edcaf1613a6b677522d6bcdc9ef9477ff6a6 Mon Sep 17 00:00:00 2001
From: Kimish Patel
Date: Thu, 6 Feb 2025 20:43:39 -0800
Subject: [PATCH] [ExecuTorch] Add broadcasting support to optimized op_div

Summary:
Similar to the broadcast support in op_mul.

Test Plan:
Tests added

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: c9d05c8405401e10a45a08c3e160aa04eaf1be86
Pull Request resolved: https://github.com/pytorch/executorch/pull/8257
---
 kernels/optimized/cpu/op_div.cpp | 65 +++++++++++++-------------------
 kernels/test/op_div_test.cpp     | 54 ++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 39 deletions(-)

diff --git a/kernels/optimized/cpu/op_div.cpp b/kernels/optimized/cpu/op_div.cpp
index 4d7b8efe9e..12eb53d356 100644
--- a/kernels/optimized/cpu/op_div.cpp
+++ b/kernels/optimized/cpu/op_div.cpp
@@ -120,48 +120,35 @@ Tensor& opt_div_out(
           out.numel());
     });
   } else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
-    const Tensor* lhs;
-    const Tensor* rhs;
+    // The lambdas below take an alpha argument because
+    // handle_broadcast_elementwise is used for add and sub as well:
+    static constexpr const char op_name[] = "div.out";
     if (selected_optimized_path ==
-        ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
-      lhs = &b;
-      rhs = &a;
+            ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments ||
+        selected_optimized_path ==
+            ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments ||
+        selected_optimized_path ==
+            ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments) {
+      // This behavior is a bit confusing. The reason we swap the operands in
+      // the lambda here is that handle_broadcast_elementwise handles the
+      // ReverseArguments variants of selected_optimized_path a bit
+      // differently. This should really be resolved in
+      // handle_broadcast_elementwise, but the current blocker is that it
+      // tries to be agnostic of the op. That should be fixed, likely by
+      // moving lambda creation into handle_broadcast_elementwise and making
+      // it aware of which op is being executed.
+      auto div_lambda = [](auto x, auto y, [[maybe_unused]] auto alpha) {
+        return y / x;
+      };
+      return torch::executor::handle_broadcast_elementwise<op_name>(
+          ctx, div_lambda, a, b, out, selected_optimized_path);
     } else {
-      // Catch failure to update logic when subing new broadcasting possibility.
-      ET_DCHECK(
-          selected_optimized_path ==
-          ElementwiseOptimizedPath::kBroadcast2dBy1d);
-      lhs = &a;
-      rhs = &b;
+      auto div_lambda = [](auto x, auto y, [[maybe_unused]] auto alpha) {
+        return x / y;
+      };
+      return torch::executor::handle_broadcast_elementwise<op_name>(
+          ctx, div_lambda, a, b, out, selected_optimized_path);
     }
-    auto error = resize_tensor(out, lhs->sizes());
-    ET_KERNEL_CHECK_MSG(
-        ctx,
-        error == Error::Ok,
-        InvalidArgument,
-        out,
-        "Failed to resize output tensor.");
-    ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
-      using Vec = executorch::vec::Vectorized<CTYPE>;
-      if (selected_optimized_path ==
-          ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
-        executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
-            [](Vec x, Vec y) { return y / x; },
-            out.mutable_data_ptr<CTYPE>(),
-            lhs->const_data_ptr<CTYPE>(),
-            rhs->const_data_ptr<CTYPE>(),
-            lhs->sizes()[lhs->dim() - 2],
-            lhs->sizes()[lhs->dim() - 1]);
-      } else {
-        executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
-            [](Vec x, Vec y) { return x / y; },
-            out.mutable_data_ptr<CTYPE>(),
-            lhs->const_data_ptr<CTYPE>(),
-            rhs->const_data_ptr<CTYPE>(),
-            lhs->sizes()[lhs->dim() - 2],
-            lhs->sizes()[lhs->dim() - 1]);
-      }
-    });
   } else {
     ScalarType common_type = get_compute_type(a_type, b_type);
     ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp
index 97d538971c..8f41419a8e 100644
--- a/kernels/test/op_div_test.cpp
+++ b/kernels/test/op_div_test.cpp
@@ -83,6 +83,52 @@ class OpDivOutTest : public OperatorTest {
     ET_EXPECT_KERNEL_FAILURE(context_, op_div_out(a, b, out));
   }
 
+  template <ScalarType DTYPE>
+  void test_broadcast_3D() {
+    TensorFactory<DTYPE> tf_a;
+
+    Tensor a =
+        tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    Tensor b = tf_a.make({2, 1, 3}, /*data=*/{2, 3, 4, 5, 6, 7});
+
+    // Destination for the output of div.
+    Tensor out =
+        tf_a.make({2, 2, 3}, /*data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    Tensor expected = tf_a.make(
+        {2, 2, 3},
+        /*data=*/
+        {0.5000,
+         0.6667,
+         0.7500,
+         2.0000,
+         1.6667,
+         1.5000,
+         1.4000,
+         1.3333,
+         1.2857,
+         2.0000,
+         1.8333,
+         1.7143});
+    // Check that it matches the expected output.
+    EXPECT_TENSOR_CLOSE_WITH_TOL(op_div_out(a, b, out), expected, 1e-4, 1e-4);
+    expected = tf_a.make(
+        {2, 2, 3},
+        /*data=*/
+        {2.0000,
+         1.5000,
+         1.3333,
+         0.5000,
+         0.6000,
+         0.6667,
+         0.7143,
+         0.7500,
+         0.7778,
+         0.5000,
+         0.5455,
+         0.5833});
+    EXPECT_TENSOR_CLOSE_WITH_TOL(op_div_out(b, a, out), expected, 1e-4, 1e-4);
+  }
+
   /**
    * Common testing for div operator, for float output types
    */
@@ -457,6 +503,14 @@ TEST_F(OpDivOutTest, DynamicShapeUpperBoundLargerThanExpected) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpDivOutTest, BroadcastNDTest) {
+  // Test 3D tensors
+  test_broadcast_3D<ScalarType::Float>();
+  // Half and BFloat16 are not supported for div quite yet:
+  // test_broadcast_3D<ScalarType::Half>();
+  // test_broadcast_3D<ScalarType::BFloat16>();
+}
+
 TEST_F(OpDivOutTest, DynamicShapeUnbound) {
   GTEST_SKIP() << "Dynamic shape not supported";
   TensorFactory<ScalarType::Float> tf;
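A note on the ReverseArguments paths the patch handles above: the broadcast helper walks the output in terms of the larger, non-broadcast operand and always passes that operand to the lambda first. When the larger operand arrives as the second tensor argument, the operands reach the lambda swapped, so a non-commutative op like division must invert its lambda to `y / x`. The standalone sketch below illustrates that idea outside of ExecuTorch; `broadcast_rows` is a hypothetical stand-in for `handle_broadcast_elementwise`, not part of the ExecuTorch API.

```cpp
#include <array>
#include <cstdio>

// Hypothetical stand-in for handle_broadcast_elementwise: applies `op`
// element-wise, broadcasting the 1-D `row` across each row of the 2-D `mat`.
// Note the fixed argument order: the 2-D operand is always passed as `x`.
template <typename Op>
void broadcast_rows(Op op, const float* mat, const float* row,
                    float* out, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[r * cols + c] = op(mat[r * cols + c], row[c]);
    }
  }
}

int main() {
  std::array<float, 6> a = {2, 4, 6, 8, 10, 12}; // shape (2, 3)
  std::array<float, 3> b = {1, 2, 3};            // shape (3,)
  std::array<float, 6> out{};

  // a / b: the 2-D tensor is already the dividend, so x / y is correct.
  // Prints: a/b: 2 2 2 8 5 4
  broadcast_rows([](float x, float y) { return x / y; },
                 a.data(), b.data(), out.data(), 2, 3);
  std::printf("a/b: %g %g %g %g %g %g\n",
              out[0], out[1], out[2], out[3], out[4], out[5]);

  // b / a: the helper still iterates over the 2-D tensor and passes it as
  // `x`, so the operands reach the lambda swapped; it must compute y / x.
  // Prints: b/a: 0.5 0.5 0.5 0.125 0.2 0.25
  broadcast_rows([](float x, float y) { return y / x; },
                 a.data(), b.data(), out.data(), 2, 3);
  std::printf("b/a: %g %g %g %g %g %g\n",
              out[0], out[1], out[2], out[3], out[4], out[5]);
  return 0;
}
```

This is the same trade-off the patch makes: one broadcast entry point serves both argument orders, at the cost of an op-specific lambda pair per kernel.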