diff --git a/.lintrunner.toml b/.lintrunner.toml
index c2bbc05ae12..95d26a2627b 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -264,6 +264,10 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/elementwise_util.h',
+    'kernels/portable/cpu/util/math_util.h',
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',  # Want to be able to keep c10 in sync with PyTorch core.
diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp
index 33d66cf2ad7..5390eb52820 100644
--- a/kernels/portable/cpu/op_atan2.cpp
+++ b/kernels/portable/cpu/op_atan2.cpp
@@ -60,7 +60,7 @@ Tensor& atan2_out(
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
       [](const auto val_a, const auto val_b) {
-        return std::atan2(val_a, val_b);
+        return executorch::math::atan2(val_a, val_b);
       },
       ctx,
       a,
diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp
index d6533642860..d7477717a3a 100644
--- a/kernels/portable/cpu/op_elu.cpp
+++ b/kernels/portable/cpu/op_elu.cpp
@@ -48,8 +48,7 @@ Tensor& elu_out(
       CTYPE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [negcoef, math_scale, math_input_scale](const auto x) {
-        // TODO: rewrite this to be vectorization-capable.
+      [negcoef, math_scale, math_input_scale](const CTYPE x) {
         return MathT(x) <= MathT(0)
             ? std::expm1(MathT(x) * math_input_scale) * negcoef
             : MathT(x) * math_scale;
diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp
index 96a971b166a..40bb4a5e94c 100644
--- a/kernels/portable/cpu/op_fmod.cpp
+++ b/kernels/portable/cpu/op_fmod.cpp
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
       utils::SupportedTensorDtypes::REALHBF16>(
       [&div_by_zero_error](
           const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+        // TODO: rewrite this to be vectorization-capable?
         CTYPE_COMPUTE value = 0;
         if (is_integral_type::value) {
           if (val_b == 0) {
@@ -138,10 +138,8 @@ Tensor& fmod_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [val_b](const CTYPE_COMPUTE val_a) {
-        // TODO: rewrite this to be vectorization-capable.
-        CTYPE_COMPUTE value = std::fmod(val_a, val_b);
-        return value;
+      [val_b](const auto val_a) {
+        return executorch::math::fmod(val_a, (decltype(val_a))val_b);
       },
       ctx,
       a,
diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp
index 3a84095a4df..c7979e40d7c 100644
--- a/kernels/portable/cpu/op_maximum.cpp
+++ b/kernels/portable/cpu/op_maximum.cpp
@@ -49,7 +49,7 @@ Tensor& maximum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+      [](const auto val_a, const auto val_b) {
         return utils::max_override(val_a, val_b);
       },
       ctx,
diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp
index 5c0e79eb9bb..1bac23187d8 100644
--- a/kernels/portable/cpu/op_minimum.cpp
+++ b/kernels/portable/cpu/op_minimum.cpp
@@ -49,8 +49,7 @@ Tensor& minimum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+      [](const auto val_a, const auto val_b) {
         return utils::min_override(val_a, val_b);
       },
       ctx,
diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp
index 6156227732d..8b94ad54078 100644
--- a/kernels/portable/cpu/op_mul.cpp
+++ b/kernels/portable/cpu/op_mul.cpp
@@ -56,9 +56,7 @@ Tensor& mul_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        return val_a * val_b;
-      },
+      [](const auto val_a, const auto val_b) { return val_a * val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp
index 4d2673cb72d..171d33d393f 100644
--- a/kernels/portable/cpu/op_pow.cpp
+++ b/kernels/portable/cpu/op_pow.cpp
@@ -57,9 +57,9 @@ Tensor& pow_Tensor_Tensor_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+      [](const auto val_a, const auto val_b) {
         // TODO: rewrite this to be vectorization-capable.
-        return std::pow(val_a, val_b);
+        return executorch::math::pow(val_a, val_b);
       },
       ctx,
       a,
diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp
index acb743a2db6..caf2daf1ba5 100644
--- a/kernels/portable/cpu/op_sigmoid.cpp
+++ b/kernels/portable/cpu/op_sigmoid.cpp
@@ -49,8 +49,9 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
-      [](const auto val_in) -> CTYPE_COMPUTE {
-        // TODO: rewrite this to be vectorization-capable
+      [](const CTYPE_COMPUTE val_in) {
+        // TODO: rewrite this to be vectorization-capable; need
+        // unary - overload for Vectorized.
         CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
             (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
         return out_val;
diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp
index 692e296ee00..7210b6fffc9 100644
--- a/kernels/portable/cpu/op_where.cpp
+++ b/kernels/portable/cpu/op_where.cpp
@@ -47,7 +47,7 @@ Tensor& where_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [](const auto val_a, const auto val_b, const auto val_c) {
+      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b, const CTYPE_COMPUTE val_c) {
         return val_c ? val_a : val_b;
       },
       ctx,
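The pattern behind the op changes above: lambdas that are meant to vectorize now take `auto` parameters and call overload sets (executorch::math::*, utils::min_override / utils::max_override) that accept either the scalar compute type or at::vec::Vectorized, while ops whose bodies are not yet vectorization-ready (op_elu, op_sigmoid, op_where) pin their parameter types to the scalar type so the vectorized path is never instantiated for them. A minimal self-contained sketch of the idea; ToyVec and mymath are stand-ins invented here for illustration, not ExecuTorch types:

// Standalone illustration (not ExecuTorch code). ToyVec plays the role of
// at::vec::Vectorized<float>; mymath plays the role of executorch::math.
#include <cmath>
#include <cstdio>

struct ToyVec {
  float lane[4];
};

namespace mymath {
using std::atan2;  // scalar overloads come from <cmath>
// Vector overload: apply the scalar function lane by lane.
inline ToyVec atan2(ToyVec a, ToyVec b) {
  ToyVec r;
  for (int i = 0; i < 4; ++i) {
    r.lane[i] = std::atan2(a.lane[i], b.lane[i]);
  }
  return r;
}
}  // namespace mymath

int main() {
  // One generic lambda, instantiable with both scalars and vectors -- this is
  // what "vectorization-capable" means for the kernels above.
  auto fn = [](const auto val_a, const auto val_b) {
    return mymath::atan2(val_a, val_b);
  };

  float s = fn(1.0f, 2.0f);  // scalar instantiation
  ToyVec v = fn(ToyVec{{1, 2, 3, 4}}, ToyVec{{4, 3, 2, 1}});  // vector instantiation
  std::printf("%f %f\n", s, v.lane[0]);
  return 0;
}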
diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index e30b8af7d89..4dc19e3bb1c 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -12,9 +12,14 @@
 #include
 #include
 #include
+#include  // Make vectorization support easy for clients.
 #include
 #include
+
+#ifdef ET_USE_PYTORCH_HEADERS
+#include
+#endif // ET_USE_PYTORCH_HEADERS
+
 #include
 #include
@@ -51,6 +56,34 @@ inline int64_t scalar_to(const Scalar& s) {
 }
 
 namespace internal {
+template <typename Ignore, typename T>
+using ignore_first_yield_second = T;
+
+#ifdef ET_USE_PYTORCH_HEADERS
+// Can I call a function of type Op with sizeof...(Args) arguments of type
+// at::vec::Vectorized?
+//
+// See [NOTE: Generic lambdas] below for requirements on Op.
+template <typename CTYPE_COMPUTE, typename Op, typename... Args>
+constexpr bool can_use_vectorized() {
+  using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
+  if constexpr (std::is_invocable_v<
+                    Op,
+                    ignore_first_yield_second<Args, Vec>...>) {
+    // For bool, we will get a false positive if we rely on only the
+    // is_invocable_v check above because at::vec::Vectorized is
+    // implicitly convertible to a pointer, which makes it implicitly
+    // convertible to bool (which was 15 minutes of fun to debug). Also
+    // just seems like good hygiene to make sure we get the Vectorized
+    // we're expecting.
+    return std::is_same_v<
+        std::invoke_result_t<Op, ignore_first_yield_second<Args, Vec>...>,
+        Vec>;
+  }
+  return false;
+}
+#endif // ET_USE_PYTORCH_HEADERS
+
 template <
     typename CTYPE_COMPUTE,
     typename CTYPE_OUT,
@@ -61,8 +94,71 @@ inline void dtype_specialized_elementwise_fn_impl(
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
+  static_assert(
+      (std::is_same_v> &&
+       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...));
+  // All inputs must be of type CTYPE_COMPUTE.
+  ET_DCHECK(
+      ((inputs.first->scalar_type() ==
+        CppTypeToScalarType<CTYPE_COMPUTE>::value) &&
+       ...));
+
+#ifdef ET_USE_PYTORCH_HEADERS
+  if constexpr (can_use_vectorized<CTYPE_COMPUTE, Op, Args...>()) {
+    const bool any_is_broadcasted =
+        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
+              inputs.first->sizes(), out.sizes()) &&
+          ...);
+    if (!any_is_broadcasted) {
+      using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
+      ::executorch::extension::parallel_for(
+          0,
+          out.numel(),
+          ::executorch::extension::internal::GRAIN_SIZE,
+          [&](const auto begin, const auto end) {
+            std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
+                inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
+
+            CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
+
+            const auto vectorized_begin =
+                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+            const auto vectorized_end = end - (end % Vec::size());
+            // Scalar prologue.
+            for (const auto idx : c10::irange(begin, vectorized_begin)) {
+              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+
+            // Main vectorized loop.
+            for (auto idx = vectorized_begin; idx < vectorized_end;
+                 idx += Vec::size()) {
+              std::array<Vec, kNumInputs> loaded_vec_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_vec_inputs[input_idx] =
+                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
+              }
+              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
+              result_vec.store(&data_out[idx]);
+            }
+
+            // Scalar epilogue.
+            for (const auto idx : c10::irange(vectorized_end, end)) {
+              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+          });
+      return;
+    }
+  }
+#endif
 
   ::executorch::extension::parallel_for(
       0,
@@ -240,6 +336,19 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+/**
+ * Useful for unary elementwise operators. For each element of the
+ * input, call Op and write to the corresponding element of the
+ * output. Tensor broadcasting is applied wherever it is required.
+ *
+ * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
+ * parameters; normal lambdas are fine), it must fulfill one of the
+ * following conditions. Either:
+ * 1) It must in fact compile when passed at::vec::Vectorized, or
+ * 2) It must be actively SFINAE-friendly, as per the C++17 examples in
+ *    https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable
+ *    .
+ */
 template <
     typename CTYPE_COMPUTE,
     const char* op_name,
@@ -281,6 +390,8 @@ inline void apply_bitensor_elementwise_fn(
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
+ * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
@@ -347,6 +458,9 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn.
+ *
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
+ * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
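The can_use_vectorized() helper above is the compile-time gate for the new fast path: the lambda must be invocable with Vectorized arguments and must actually return a Vectorized, which filters out lambdas that are only "invocable" through Vectorized's implicit pointer (and hence bool) conversion. A reduced, self-contained sketch of the same idiom, with ToyVec again standing in for at::vec::Vectorized:

#include <type_traits>

struct ToyVec {
  float lane[8];
  ToyVec operator*(ToyVec) const { return *this; }
  operator const float*() const { return lane; }  // mimics Vectorized's pointer conversion
};

template <typename Ignore, typename T>
using ignore_first_yield_second = T;  // maps each argument slot to ToyVec

// Does Op accept sizeof...(Args) ToyVec arguments *and* return a ToyVec?
template <typename Op, typename... Args>
constexpr bool can_use_vectorized() {
  if constexpr (std::is_invocable_v<
                    Op,
                    ignore_first_yield_second<Args, ToyVec>...>) {
    return std::is_same_v<
        std::invoke_result_t<Op, ignore_first_yield_second<Args, ToyVec>...>,
        ToyVec>;
  }
  return false;
}

auto mul = [](const auto a, const auto b) { return a * b; };
// "Invocable" only via the pointer/bool conversion, and it returns bool, so
// the invoke_result_t check rejects it.
auto land = [](const auto a, const auto b) { return a && b; };

static_assert(can_use_vectorized<decltype(mul), int, int>());
static_assert(!can_use_vectorized<decltype(land), int, int>());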
diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h
index 2ba068da18e..bd2cb40090a 100644
--- a/kernels/portable/cpu/util/math_util.h
+++ b/kernels/portable/cpu/util/math_util.h
@@ -8,6 +8,10 @@
 
 #pragma once
 
+#ifdef ET_USE_PYTORCH_HEADERS
+#include
+#endif
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -138,6 +142,21 @@ T max_override(T a, T b) {
   return b;
 }
 
+#ifdef ET_USE_PYTORCH_HEADERS
+template <typename T>
+at::vec::Vectorized<T> min_override(
+    at::vec::Vectorized<T> a,
+    at::vec::Vectorized<T> b) {
+  return at::vec::minimum(a, b);
+}
+
+template <typename T>
+at::vec::Vectorized<T> max_override(
+    at::vec::Vectorized<T> a,
+    at::vec::Vectorized<T> b) {
+  return at::vec::maximum(a, b);
+}
+#endif
 /**
  * There is a slight difference in how std::fmod works compared to how ATen
  * determines remainders:
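With the overloads above, utils::min_override and utils::max_override become overload sets that resolve to the existing scalar templates for scalar arguments and to at::vec::minimum / at::vec::maximum for Vectorized arguments, which is what lets op_minimum.cpp and op_maximum.cpp pass generic lambdas. A usage sketch, assuming a build with ET_USE_PYTORCH_HEADERS defined and ATen's vec headers on the include path; the namespace alias is an assumption here, mirroring the utils:: qualification used by the op files:

#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/portable/cpu/util/math_util.h>

// Assumed alias; the op files reach these helpers as utils::min_override.
namespace utils = torch::executor::native::utils;

// Scalar arguments resolve to the pre-existing template <typename T> overload.
float scalar_min(float a, float b) {
  return utils::min_override(a, b);
}

// Vectorized arguments resolve to the overload added here, i.e. at::vec::minimum.
at::vec::Vectorized<float> vector_min(
    at::vec::Vectorized<float> a,
    at::vec::Vectorized<float> b) {
  return utils::min_override(a, b);
}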
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index a623b9d4d7a..f60ec7419dc 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -32,6 +32,7 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:slice_util",
             "//executorch/kernels/portable/cpu/util:elementwise_util",
             "//executorch/kernels/portable/cpu/util:upsample_util",
+            "//executorch/kernels/portable/cpu/util:vectorized_math",
            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
         visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"],
@@ -110,6 +111,8 @@ def define_common_targets():
             ":broadcast_indexes_range",
             ":broadcast_util",
             ":dtype_util",
+            ":vectorized_math",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
             "//executorch/runtime/kernel:kernel_runtime_context",
             "//executorch/runtime/kernel:thread_parallel_interface",
         ],
@@ -260,6 +263,9 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["math_util.h"],
         visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."],
+        exported_deps = [
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
+        ],
     )
@@ -307,6 +313,15 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_library(
+        name = "vectorized_math",
+        exported_headers = ["vectorized_math.h"],
+        visibility = ["//executorch/..."],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h
new file mode 100644
index 00000000000..e1c6e84db13
--- /dev/null
+++ b/kernels/portable/cpu/util/vectorized_math.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#ifdef ET_USE_PYTORCH_HEADERS
+#include
+#endif // ET_USE_PYTORCH_HEADERS
+
+#define _ET_INTERNAL_STD_MATH_FUNC(name) \
+  namespace executorch {                 \
+  inline namespace math {                \
+  using std::name;                       \
+  }                                      \
+  } // namespace executorch
+
+#ifdef ET_USE_PYTORCH_HEADERS
+/**
+ * Internal-usage macro for making a vectorized variant of a unary
+ * function available in the executorch::math namespace.
+ */
+#define ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name)               \
+  namespace executorch {                                                 \
+  inline namespace math {                                                \
+  template <typename T>                                                  \
+  auto func_name(at::vec::Vectorized<T> vec) {                           \
+    if constexpr (!::executorch::runtime::is_floating_point<T>::value) { \
+      return at::vec::convert<float>(vec).func_name();                   \
+    } else {                                                             \
+      return vec.func_name();                                            \
+    }                                                                    \
+  }                                                                      \
+  }                                                                      \
+  }
+
+#define ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name)                 \
+  namespace executorch {                                                    \
+  inline namespace math {                                                   \
+  template <typename T>                                                     \
+  auto func_name(at::vec::Vectorized<T> vec0, at::vec::Vectorized<T> vec1) { \
+    if constexpr (!::executorch::runtime::is_floating_point<T>::value) {    \
+      return at::vec::convert<float>(vec0).func_name(                       \
+          at::vec::convert<float>(vec1));                                    \
+    } else {                                                                 \
+      return vec0.func_name(vec1);                                           \
+    }                                                                        \
+  }                                                                          \
+  }                                                                          \
+  }
+
+/**
+ * Internal-usage macro for making a C++ standard library
+ * floating-point function and a vectorized variant of it available in
+ * the executorch::math namespace. Should be used with functions where the
+ * corresponding operator is a "float op" in TensorIterator parlance
+ * (i.e., uses something like build_borrowing_binary_float_op()),
+ * because it converts non-floating-point arguments to floating point.
+ */
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(func_name) \
+  _ET_INTERNAL_STD_MATH_FUNC(func_name)                        \
+  ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name)
+
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(func_name) \
+  _ET_INTERNAL_STD_MATH_FUNC(func_name)                         \
+  ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name)
+
+#else // ET_USE_PYTORCH_HEADERS
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(name) \
+  _ET_INTERNAL_STD_MATH_FUNC(name)
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(name) \
+  _ET_INTERNAL_STD_MATH_FUNC(name)
+#endif // ET_USE_PYTORCH_HEADERS
+
+// To simplify client code, we provide coverage for a bunch of float ops (the
+// same ones listed in ATen vml.h) here.
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erf)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erfc)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(exp)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(expm1)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(floor)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log10)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log1p)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log2)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sin)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sinh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sqrt)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(round)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(rsqrt)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tan)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tanh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma)
+
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(atan2)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(fmod)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(pow)
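A usage sketch for the new header, assuming ET_USE_PYTORCH_HEADERS is defined and ATen's vec headers are available (without the define, the #else branch of the macros still provides the plain std:: overloads, so the scalar call below keeps compiling):

#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/portable/cpu/util/vectorized_math.h>

#include <cstdio>

int main() {
  // Scalar call: resolves to std::exp via _ET_INTERNAL_STD_MATH_FUNC(exp).
  const double s = executorch::math::exp(1.0);

  // Vectorized call: resolves to the template overload, which forwards to
  // Vectorized<float>::exp().
  const auto vf = at::vec::Vectorized<float>(1.0f);
  const auto ef = executorch::math::exp(vf);

  // Integral lanes are converted to float first ("float op" promotion), so
  // the result holds float lanes.
  const auto vi = at::vec::Vectorized<int32_t>(2);
  const auto ei = executorch::math::exp(vi);

  float out_f[at::vec::Vectorized<float>::size()];
  float out_i[at::vec::Vectorized<float>::size()];
  ef.store(out_f);
  ei.store(out_i);
  std::printf("exp(1.0)=%f  vec<float>[0]=%f  vec<int>[0]=%f\n",
              s, out_f[0], out_i[0]);
  return 0;
}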
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index d9d72b5be3f..375558e7b51 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -49,7 +49,10 @@ def define_common_targets():
     runtime.cxx_library(
         name = "aten_headers_for_executorch",
         srcs = [],
-        visibility = ["//executorch/kernels/optimized/..."],
+        visibility = [
+            "//executorch/kernels/optimized/...",
+            "//executorch/kernels/portable/cpu/util/...",
+        ],
         exported_deps = select({
             "DEFAULT": [],
             "ovr_config//cpu:arm64": [