BroadcastIndexesRange: leading 1s don't require true broadcasting

swolchok · swolchok · commit 6eaf791dcc2a · 2025-03-19T19:57:06.000-07:00
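Moved the mechanism we use to detect broadcasting (comparing sizes while ignoring leading 1s) from kernels/optimized/cpu/binary_ops.h to kernels/portable/cpu/util/broadcast_indexes_range.h, and used it in BroadcastIndexesIterator. ghstack-source-id: 3af3983e3474e86e20267f755def81d74893b9b6 ghstack-comment-id: 2738665656 Pull Request resolved: #9431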
diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h
@@ -10,34 +10,11 @@
 
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
 namespace executor {
-namespace internal {
-// NOTE: we bake ArrayRef iterators being pointers into the return
-// type here because we assume that iterators are portable across
-// ArrayRef copies.
-inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> arr) {
-  return std::find_if(
-      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
-}
-
-inline bool sizes_match_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> lhs,
-    ArrayRef<Tensor::SizesType> rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
-  auto lhs_end = lhs.end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
-  auto rhs_end = rhs.end();
-
-  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
-      std::equal(lhs_begin, lhs_end, rhs_begin);
-}
-} // namespace internal
-
 enum class ElementwiseOptimizedPath {
   kNone,
   kTreatAs1d,
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
@@ -130,7 +130,10 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["op_add_sub_impl.h"],
         visibility = ["//executorch/kernels/optimized/cpu/..."],
-        exported_deps = ["//executorch/runtime/core:core"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
+        ],
     )
 
     runtime.cxx_library(
diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -21,6 +21,28 @@
 namespace torch::executor {
 
 namespace internal {
+// Returns the first entry of `arr` that is not 1, or arr.end() if all are 1.
+// NOTE: the return type bakes in that ArrayRef iterators are pointers; we
+// assume they stay usable across ArrayRef copies (`arr` is a by-value copy).
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> arr) {
+  const auto is_not_one = [](Tensor::SizesType dim) { return dim != 1; };
+  return std::find_if(arr.begin(), arr.end(), is_not_one);
+}
+
+// True when `lhs` and `rhs` have equal sizes once leading 1s are skipped.
+inline bool sizes_match_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> lhs,
+    ArrayRef<Tensor::SizesType> rhs) {
+  const auto* lhs_first = arrayref_begin_ignoring_leading_1s(lhs);
+  const auto* rhs_first = arrayref_begin_ignoring_leading_1s(rhs);
+  // Check the remaining lengths first so std::equal stays within `rhs`.
+  if ((lhs.end() - lhs_first) != (rhs.end() - rhs_first)) {
+    return false;
+  }
+  return std::equal(lhs_first, lhs.end(), rhs_first);
+}
+
 template <std::size_t kNumInputs>
 class BroadcastIndexesIterator {
  public:
@@ -35,7 +57,10 @@ class BroadcastIndexesIterator {
   template <typename... Args>
   explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
       : output_dim_or_zero_if_no_broadcasting_(
-            ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()),
+            (sizes_match_ignoring_leading_1s(args.sizes(), output.sizes()) &&
+             ...)
+                ? 0
+                : output.dim()),
         output_shape_(output.sizes()) {
     static_assert(
         sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),