Add out-variants to support ET export
Differential Revision: D62385428

Pull Request resolved: #859
metascroy committed Sep 16, 2024
1 parent a584e24 commit b2e1d49
Showing 25 changed files with 743 additions and 387 deletions.
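
Context for the title: "ET" is ExecuTorch, whose export flow plans all tensor memory ahead of time, so every operator in an exported graph must expose an out-variant, an overload that writes its result into a caller-provided tensor rather than allocating a fresh one. A minimal sketch of the distinction, with made-up names (the commit's real low-bit linear ops are not shown on this page):

// Minimal sketch of functional vs. out-variant; names are illustrative,
// not the operators this commit actually adds.
#include <torch/torch.h>
#include <iostream>

// Functional variant: allocates and returns a fresh result tensor.
torch::Tensor add_op(const torch::Tensor& a, const torch::Tensor& b) {
  return a + b;
}

// Out-variant (the `Tensor(a!) out` overload in schema terms): identical
// math, but the result is written into a buffer the caller owns, which is
// what ExecuTorch's ahead-of-time memory planner requires.
torch::Tensor& add_op_out(const torch::Tensor& a, const torch::Tensor& b,
                          torch::Tensor& out) {
  return torch::add_out(out, a, b);
}

int main() {
  auto a = torch::ones({4});
  auto b = torch::full({4}, 2.0);
  auto out = torch::empty({4});  // pre-planned, caller-owned buffer
  add_op_out(a, b, out);         // no allocation inside the op
  std::cout << out << std::endl;
}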
==========
@@ -5,15 +5,17 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+#include <stdint.h>
 #include <torchao/experimental/kernels/cpu/macro.h>
 #include <torchao/experimental/kernels/cpu/parallel.h>
 #include <algorithm>
 #include <cassert>
+#include <cstdlib>
 
 namespace torchao::operators::cpu::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight {
 
-PackWeightDataTilingParams get_default_pack_weight_data_tiling_params(
+inline PackWeightDataTilingParams get_default_pack_weight_data_tiling_params(
     const UKernelConfig& ukernel_config,
     int n,
     int target_panels_per_thread) {
@@ -38,7 +40,7 @@ PackWeightDataTilingParams get_default_pack_weight_data_tiling_params(
   return tiling_params;
 }
 
-void pack_weight_data_operator(
+inline void pack_weight_data_operator(
     const UKernelConfig& ukernel_config,
     const PackWeightDataTilingParams& tiling_params,
     // Outputs
@@ -79,7 +81,7 @@ void pack_weight_data_operator(
 }
 
 // This default mimics XNNPACK behavior if target_tiles_per_thread = 5
-LinearTilingParams get_default_linear_tiling_params(
+inline LinearTilingParams get_default_linear_tiling_params(
     const UKernelConfig& ukernel_config,
     int m,
     int n,
@@ -137,12 +139,12 @@ get_activation_data_buffer_size_with_tile_schedule_policy_parallel_mc_parallel_nc(
   return ukernel_config.activation_data_size_fn(m, k, group_size);
 }
 
-void linear_operator_with_tile_schedule_policy_single_mc_parallel_nc(
+inline void linear_operator_with_tile_schedule_policy_single_mc_parallel_nc(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
     char* activation_data_buffer,
     // Outputs
-    float32_t* output,
+    float* output,
     // Inputs
     int m,
     int n,
@@ -199,12 +201,12 @@ void linear_operator_with_tile_schedule_policy_single_mc_parallel_nc(
   }
 }
 
-void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
+inline void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
     char* activation_data_buffer,
     // Outputs
-    float32_t* output,
+    float* output,
     // Inputs
     int m,
     int n,
@@ -271,7 +273,7 @@ void linear_operator_with_tile_schedule_policy_parallel_mc_parallel_nc(
   }
 } // namespace internal
 
-void linear_operator(
+inline void linear_operator(
     const UKernelConfig& ukernel_config,
     const LinearTilingParams& tiling_params,
     LinearTileSchedulingPolicy scheduling_policy,
@@ -363,7 +365,7 @@ namespace torchao::operators::cpu::linear::
     channelwise_8bit_activation_groupwise_lowbit_weight {
 template <int weight_nbit, bool has_weight_zeros, bool has_bias, bool has_clamp>
 
-UKernelConfig get_ukernel_config() {
+inline UKernelConfig get_ukernel_config() {
   UKernelConfig config;
 
   namespace ukernel = torchao::kernels::cpu::aarch64::linear::
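
Two changes repeat throughout this header. First, every function defined here gains `inline`: with the new ATen and ExecuTorCH wrappers, the header is included from several translation units, and a non-inline definition in a header is emitted once per .cpp file and fails to link with "multiple definition" errors (an ODR violation). Second, `float32_t* output` becomes `float* output`, since `float32_t` is not a type standard C++ defines, whereas the fixed-width integer types come from the newly included <stdint.h>. A minimal illustration of the `inline` rule (not code from this commit):

// tiling_util.h -- illustrative only. Defining a function in a header is
// safe across many translation units only if it is marked `inline`.
#pragma once

// Without `inline`: a.cpp and b.cpp both include this header, both emit a
// definition of panels_per_thread, and linking them together fails.
// With `inline`: the repeated identical definitions are folded into one.
inline int panels_per_thread(int n, int nr) {
  return (n + nr - 1) / nr;  // ceil-divide, as the tiling helpers do
}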
==========
@@ -5,6 +5,7 @@
 // LICENSE file in the root directory of this source tree.
 
 #pragma once
+#include <stdint.h>
 
 // TODO: maybe move to operator directory
 namespace torchao::operators::cpu::linear::
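
This second header (its path is not captured on this page) gains the same <stdint.h> include, presumably because its declarations use fixed-width integer types that are only guaranteed after that include, e.g. (illustrative only):

#include <stdint.h>

// <stdint.h> guarantees fixed-width types like these; the surrounding
// operators work with 8-bit quantized activations and low-bit weights
// packed into bytes.
int8_t quantized_activation = -3;
uint8_t packed_weight_byte = 0x5A;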
==========
@@ -18,13 +18,41 @@ include_directories(${TORCHAO_LIBRARIES})

 add_subdirectory(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64 ${CMAKE_CURRENT_BINARY_DIR}/kernel_aarch64)
 
-find_package(Torch REQUIRED)
-include_directories("${TORCH_INCLUDE_DIRS}")
-
-add_library(torch_custom_op SHARED torch_custom_op.cpp)
-target_link_libraries(torch_custom_op PRIVATE "${TORCH_LIBRARIES}")
-target_link_libraries(torch_custom_op PRIVATE kernel_aarch64)
-
-include(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/Utils.cmake)
-set(TORCHAO_PARALLEL_BACKEND "ATEN_OPENMP" CACHE STRING "Choose parallel backend to use for torchao parallelism (aten_openmp, openmp, pthreadpool, single_threaded)")
-target_link_torchao_parallel_backend(torch_custom_op "${TORCHAO_PARALLEL_BACKEND}")
+include(${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/Utils.cmake)
+
+set(PLATFORM "ATEN" CACHE STRING "Choose platform surface: ATEN, EXECUTORCH")
+string(TOUPPER ${PLATFORM} PLATFORM_TO_UPPER)
+
+if(PLATFORM_TO_UPPER STREQUAL "ATEN")
+    message(STATUS "Building with PLATFORM=ATEN")
+
+    find_package(Torch REQUIRED)
+    add_library(lowbit_op_aten SHARED lowbit_op_aten.cpp)
+    target_link_libraries(lowbit_op_aten PRIVATE kernel_aarch64)
+    target_include_directories(lowbit_op_aten PRIVATE "${TORCH_INCLUDE_DIRS}")
+    target_link_libraries(lowbit_op_aten PRIVATE "${TORCH_LIBRARIES}")
+    target_compile_definitions(lowbit_op_aten PRIVATE USE_ATEN=1)
+    target_link_torchao_parallel_backend(lowbit_op_aten "ATEN_OPENMP")
+
+elseif(PLATFORM_TO_UPPER STREQUAL "EXECUTORCH")
+    message(STATUS "Building with PLATFORM=EXECUTORCH")
+
+    add_library(lowbit_op_executorch SHARED
+        lowbit_op_executorch/w2s.cpp
+        lowbit_op_executorch/w2sz.cpp
+        lowbit_op_executorch/w3s.cpp
+        lowbit_op_executorch/w3sz.cpp
+        lowbit_op_executorch/w4s.cpp
+        lowbit_op_executorch/w4sz.cpp
+        lowbit_op_executorch/w5s.cpp
+        lowbit_op_executorch/w5sz.cpp
+    )
+    target_include_directories(lowbit_op_executorch PRIVATE ${EXECUTORCH_INCLUDE_DIRS})
+    target_compile_definitions(lowbit_op_executorch PRIVATE USE_EXECUTORCH=1)
+    target_link_torchao_parallel_backend(lowbit_op_executorch "SINGLE_THREADED")
+    target_link_libraries(lowbit_op_executorch PRIVATE ${EXECUTORCH_LIBRARIES})
+    target_link_libraries(lowbit_op_executorch PRIVATE kernel_aarch64)
+
+else()
+    message(FATAL_ERROR "Unknown PLATFORM: ${PLATFORM}. Please choose one of: ATEN, EXECUTORCH.")
+endif()
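
The two branches build the same kernels against different surfaces: the ATen target compiles with USE_ATEN=1 and OpenMP parallelism, while the ExecuTorch target compiles one .cpp per weight variant (w2s through w5sz, plausibly n-bit weights with s = scale-only and sz = scale-plus-zeros, mirroring the weight_nbit/has_weight_zeros template parameters seen earlier) with USE_EXECUTORCH=1 and a single-threaded backend. A hypothetical sketch of how such compile definitions are typically consumed; names and include paths are illustrative, not the commit's actual glue code:

#if defined(USE_ATEN)
#include <ATen/ATen.h>
using TensorArg = at::Tensor;  // full ATen tensor for the PyTorch build
#elif defined(USE_EXECUTORCH)
#include <executorch/runtime/core/exec_aten/exec_aten.h>
using TensorArg = exec_aten::Tensor;  // ExecuTorch's lightweight tensor
#else
#error "Build with USE_ATEN=1 or USE_EXECUTORCH=1 (see the CMake above)."
#endif

// A single operator body written against TensorArg can then be compiled
// once per platform, which is what the two add_library() targets above do.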
==========
@@ -13,7 +13,7 @@ echo "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}"
 export CMAKE_OUT=/tmp/cmake-out/torch_ao/examples/torch_custom_op
 cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \
     -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
-    -DTORCHAO_PARALLEL_BACKEND="aten_openmp" \
+    -DPLATFORM="ATEN" \
     -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op \
     -B ${CMAKE_OUT}
 cmake --build ${CMAKE_OUT}