Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORT 1.19.0 Release: Cherry-Pick Round 1 #21619

Merged
merged 22 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
9fc5876
Mlas int4 int8 with avx2/512 (#20687)
liqunfu Aug 2, 2024
188e453
Utilize ext data location to reduce qd matmul memory usage (#21451)
fajin-corp Jul 30, 2024
7cea166
[CUDA] Fix MultiHeadAttention thread safe and bias support (#21498)
tianleiwu Jul 31, 2024
866630f
Fix quant_format argument for 4bit quantizer (#21581)
fajin-corp Jul 31, 2024
4366daa
[QNN EP] Update QNN SDK to 2.25 (#21623)
adrianlizarraga Aug 6, 2024
9a82bd7
Fix typos so to call correct vnni functions under vnni condition (#21…
liqunfu Aug 6, 2024
083d053
Add CUDA custom op header files to Linux tarball (#21551)
snnn Aug 1, 2024
f8c3c75
Extend Pad Fusion for AveragePool (#21556)
sumitsays Jul 30, 2024
d17b3d5
Unblock migraphx and linux GPU training ci pipelines (#21662)
tianleiwu Aug 9, 2024
44bcc20
Restructure MacOS framework package to fix malformed Framework errors…
vraspar Aug 4, 2024
10032f2
Updates to apple packaging (#21611)
skottmckay Aug 5, 2024
5db42fa
Fix usability checker CoreML config file path. (#21626)
edgchen1 Aug 6, 2024
0ad313f
Use zipped xcframework in nuget package (#21663)
skottmckay Aug 9, 2024
b67af53
Pin transformer and optimum version (#21650)
mszhanyi Aug 7, 2024
ac38c44
[CUDA] Fix MHA mask (#21655)
tianleiwu Aug 9, 2024
2792f23
Security Fuzz Test Fixes (#21608)
jingyanwangms Aug 11, 2024
7c27955
Fix docker image layer caching to avoid redundant docker building and…
mszhanyi Aug 6, 2024
00daa26
Fuse Pad even if Cast is present in-between (#21640)
sumitsays Aug 9, 2024
cd2e3e0
[TensorRT EP] No workspace size limit to TRT memory pool (#21643)
chilo-ms Aug 10, 2024
00cd5b4
fix handling of multiple QuantizeLinear nodes (#21675)
saurabhkale17 Aug 9, 2024
a3192b8
When quantize 4bit matmul, force upgrade onnx domain opset to 21 (#21…
fajin-corp Aug 9, 2024
eeeee21
[DML EP] Update DML to 1.15.1 (#21695)
sumitsays Aug 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions cmake/onnxruntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,14 @@ function(get_c_cxx_api_headers HEADERS_VAR)

# need to add header files for enabled EPs
foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
file(GLOB _provider_headers CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
)
list(APPEND _headers ${_provider_headers})
# The header files in the include/onnxruntime/core/providers/cuda directory cannot be flattened into the same
# directory as onnxruntime_c_api.h. Most other EPs probably do not work this way either.
if((NOT f STREQUAL cuda) AND (NOT f STREQUAL rocm))
file(GLOB _provider_headers CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
)
list(APPEND _headers ${_provider_headers})
endif()
endforeach()

set(${HEADERS_VAR} ${_headers} PARENT_SCOPE)
Expand Down
13 changes: 11 additions & 2 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -555,8 +555,17 @@ else()
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")

message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")

# Pick the compile flags for the AVX2 MLAS sources.
# -mavx2 -mfma is always used. -mavxvnni is added whenever the compiler is expected to accept it:
#   * GCC only recognizes -mavxvnni starting with GCC 11, so any older GCC (including every 10.x
#     point release such as 10.2.0) must stay on the baseline flags. Comparing with
#     VERSION_GREATER "10" would wrongly let 10.x through, because 10.2.0 > 10.
#   * Non-GNU compilers keep the previous behavior and always receive -mavxvnni.
set(mlas_avx2_compile_flags "-mavx2 -mfma")
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "11")
  string(APPEND mlas_avx2_compile_flags " -mavxvnni")
endif()
message(STATUS "Using ${mlas_avx2_compile_flags} flags")
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "${mlas_avx2_compile_flags}")
set(mlas_platform_srcs_avx512f
${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
Expand All @@ -575,7 +584,7 @@ else()
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")

set(mlas_platform_srcs_avx512vnni
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
Expand Down
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ if (onnxruntime_ENABLE_TRAINING)
endif()

# Install the CPU provider factory header into the top-level onnxruntime include directory.
install(
  FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cpu/cpu_provider_factory.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/
)
# Install resource.h and custom_op_context.h under core/providers, keeping their relative layout.
install(
  FILES
    ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/resource.h
    ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/custom_op_context.h
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers
)
# Link onnxruntime_providers as C++ and group it under the "ONNXRuntime" folder.
set_target_properties(onnxruntime_providers PROPERTIES
  LINKER_LANGUAGE CXX
  FOLDER "ONNXRuntime"
)

Expand Down
9 changes: 8 additions & 1 deletion cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,15 @@
config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj)
endif()
config_cuda_provider_shared_module(onnxruntime_providers_cuda)

# The public CUDA provider headers are listed by hand instead of globbed, because
# cuda_provider_options.h must not be exposed.
set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS
  "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h"
  "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_resource.h"
)
# Attach the list to the target so install(TARGETS ... PUBLIC_HEADER) copies these files.
set_property(TARGET onnxruntime_providers_cuda
  PROPERTY PUBLIC_HEADER "${ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS}")
install(TARGETS onnxruntime_providers_cuda
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/cuda
)
7 changes: 6 additions & 1 deletion cmake/onnxruntime_providers_rocm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,13 @@
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_providers_rocm PRIVATE ENABLE_ATEN)
endif()

# Every *.h file directly under include/onnxruntime/core/providers/rocm is collected as a
# public header of the ROCm provider target; CONFIGURE_DEPENDS re-checks the glob at build time.
file(GLOB ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS CONFIGURE_DEPENDS
  "${REPO_ROOT}/include/onnxruntime/core/providers/rocm/*.h")
# Attach the list to the target so install(TARGETS ... PUBLIC_HEADER) copies these files.
set_property(TARGET onnxruntime_providers_rocm
  PROPERTY PUBLIC_HEADER "${ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS}")
install(TARGETS onnxruntime_providers_rocm
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/rocm
)
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Condition="('$(OutputType)'!='Library' OR '$(IsAppExtension)'=='True')">
<NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework">
<NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework.zip">
<Kind>Static</Kind>
<IsCxx>True</IsCxx>
<SmartLink>True</SmartLink>
Expand All @@ -10,4 +10,4 @@
<WeakFrameworks>CoreML</WeakFrameworks>
</NativeReference>
</ItemGroup>
</Project>
</Project>
9 changes: 7 additions & 2 deletions include/onnxruntime/core/optimizer/graph_transformer_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@

#pragma once

#include <string>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "core/common/inlined_containers.h"
#include "core/framework/session_options.h"
#include "core/framework/tensor.h"
#include "core/optimizer/graph_transformer.h"
#include "core/platform/threadpool.h"

Expand Down Expand Up @@ -51,7 +54,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const SessionOptions& session_options,
const IExecutionProvider& execution_provider /*required by constant folding*/,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD)

Expand Down Expand Up @@ -81,7 +85,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
const SatApplyContextVariant& apply_context,
const IExecutionProvider& cpu_execution_provider,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
// can be updated using: UpdateTensorRTProviderOptionsWithValue
int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability
int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs
size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT.
size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size

Check warning on line 22 in include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Lines should be <= 120 characters long [whitespace/line_length] [2] Raw Output: include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h:22: Lines should be <= 120 characters long [whitespace/line_length] [2]
int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name.
Expand Down
1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cpu/bert/attention_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,6 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape,
output_parameters->scale = scale_;
output_parameters->mask_type = mask_type;
output_parameters->broadcast_res_pos_bias = broadcast_res_pos_bias;
output_parameters->pass_past_in_kv = false;
output_parameters->qkv_format = Q_K_V_BNSH;
}

Expand Down
13 changes: 10 additions & 3 deletions onnxruntime/contrib_ops/cpu/bert/attention_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
namespace onnxruntime {
namespace contrib {

// Tag naming an attention operator kind. Enumerator values are implicit (0, 1, 2 in declaration
// order), so reordering the entries changes their numeric values.
// NOTE(review): presumably passed by each operator kernel to the shared input-checking code so it
// can tell the callers apart (MultiHeadAttention passes kMultiHeadAttention) — confirm against callers.
enum AttentionType {
kAttention,
kMultiHeadAttention,
kDecoderMaskedMultiHeadAttention,
};

enum AttentionMaskType {
MASK_NONE, // No mask
MASK_1D_KEY_SEQ_LEN, // [batch_size], key sequence length
Expand All @@ -24,10 +30,12 @@ enum AttentionQkvFormat {
UNKNOWN, // enum value not set, or depends on qkv projection implementation details
Q_K_V_BNSH, // for non-packed qkv, permuted
Q_K_V_BSNH, // for non-packed qkv, not permuted, used by memory efficient attention or MultiHeadAttention
QKV_BSN3H, // for TRT fused attention, qkv are packed
Q_K_V_BSNH_BNSH_BNSH, // for cross attention, k and v are permuted
Q_K_V_BNSH_QKV_BS3NH, // for TRT fused causal attention, data has two formats (qkv is 3BNSH, gemm_buffer is BS3NH)
Q_KV_BSNH_BSN2H, // for TRT fused cross attention, kv are packed
Q_K_V_TNH, // for memory efficient attention, qkv are not packed, and paddings are removed.
Q_KV_BSNH_BSN2H, // for TRT fused cross attention, kv are packed
QKV_BSN3H, // for TRT fused attention, qkv are packed
QKV_BS3NH, // for DecoderMaskedMultiHeadAttention, qkv are packed
QKV_TN3H, // for TRT fused attention, qkv are packed and paddings are removed
};

Expand Down Expand Up @@ -61,7 +69,6 @@ struct AttentionParameters {
bool past_present_share_buffer;
bool do_rotary;
bool broadcast_res_pos_bias;
bool pass_past_in_kv;
float mask_filter_value;
float scale;
bool use_tf32;
Expand Down
15 changes: 4 additions & 11 deletions onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
scale_,
is_unidirectional_,
past_present_share_buffer,
false));
kMultiHeadAttention));

const int batch_size = parameters.batch_size;
const int q_sequence_length = parameters.sequence_length;
Expand Down Expand Up @@ -121,20 +121,13 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
AllocatorPtr allocator;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator));

// For each of Q/K/V, there are multiple scenarios:
// 1) Combined QKV bias is null
// a) Q/K/V is (B, S, D)
// b) Q/K/V is (B, S, N, H)
// 2) No packed QKV in Q
// a) Q/K/V has seq_len = 1
// b) Q/K/V has seq_len > 1

OrtValue Q;
ORT_RETURN_IF_ERROR(MaybeTransposeToBNSHAndAddBias<T>(
context, allocator, batch_size, num_heads_, q_sequence_length, qk_head_size, query, bias, q_bias_offset, Q));

if (parameters.pass_past_in_kv) { // key and value in BNSH format
assert(bias == nullptr);
if (parameters.qkv_format == Q_K_V_BSNH_BNSH_BNSH) {
// For cross attention with k and v in BNSH format, we assume that bias for key and value are zeros.
// So we don't need to add bias for key and value here.
assert(past_key == nullptr);
assert(past_value == nullptr);
return ApplyAttention(Q.GetMutable<Tensor>()->MutableData<T>(),
Expand Down
Loading
Loading