ORT 1.19.0 Release: Cherry-Pick Round 1 #21619

Merged · 22 commits · Aug 12, 2024

Commits
9fc5876
Mlas int4 int8 with avx2/512 (#20687)
liqunfu Aug 2, 2024
188e453
Utilize ext data location to reduce qd matmul memory usage (#21451)
fajin-corp Jul 30, 2024
7cea166
[CUDA] Fix MultiHeadAttention thread safe and bias support (#21498)
tianleiwu Jul 31, 2024
866630f
Fix quant_format argument for 4bit quantizer (#21581)
fajin-corp Jul 31, 2024
4366daa
[QNN EP] Update QNN SDK to 2.25 (#21623)
adrianlizarraga Aug 6, 2024
9a82bd7
Fix typos so to call correct vnni functions under vnni condition (#21…
liqunfu Aug 6, 2024
083d053
Add CUDA custom op header files to Linux tarball (#21551)
snnn Aug 1, 2024
f8c3c75
Extend Pad Fusion for AveragePool (#21556)
sumitsays Jul 30, 2024
d17b3d5
Unblock migraphx and linux GPU training ci pipelines (#21662)
tianleiwu Aug 9, 2024
44bcc20
Restructure MacOS framework package to fix malformed Framework errors…
vraspar Aug 4, 2024
10032f2
Updates to apple packaging (#21611)
skottmckay Aug 5, 2024
5db42fa
Fix usability checker CoreML config file path. (#21626)
edgchen1 Aug 6, 2024
0ad313f
Use zipped xcframework in nuget package (#21663)
skottmckay Aug 9, 2024
b67af53
Pin transformer and optimum version (#21650)
mszhanyi Aug 7, 2024
ac38c44
[CUDA] Fix MHA mask (#21655)
tianleiwu Aug 9, 2024
2792f23
Security Fuzz Test Fixes (#21608)
jingyanwangms Aug 11, 2024
7c27955
Fix docker image layer caching to avoid redundant docker building and…
mszhanyi Aug 6, 2024
00daa26
Fuse Pad even if Cast is present in-between (#21640)
sumitsays Aug 9, 2024
cd2e3e0
[TensorRT EP] No workspace size limit to TRT memory pool (#21643)
chilo-ms Aug 10, 2024
00cd5b4
fix handling of multiple QuantizeLinear nodes (#21675)
saurabhkale17 Aug 9, 2024
a3192b8
When quantize 4bit mamtul, force upgrade onnx domain opset to 21 (#21…
fajin-corp Aug 9, 2024
eeeee21
[DML EP] Update DML to 1.15.1 (#21695)
sumitsays Aug 12, 2024
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="python" version="3.9.7" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.15.0" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
<package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
</packages>
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="pythonx86" version="3.9.7" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.15.0" targetFramework="native" />
<package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
<package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
</packages>
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.0)
set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.1)

# Restore nuget packages, which will pull down the DirectML redist package.
add_custom_command(
12 changes: 8 additions & 4 deletions cmake/onnxruntime.cmake
@@ -38,10 +38,14 @@ function(get_c_cxx_api_headers HEADERS_VAR)

# need to add header files for enabled EPs
foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
file(GLOB _provider_headers CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
)
list(APPEND _headers ${_provider_headers})
# The header files in include/onnxruntime/core/providers/cuda directory cannot be flattened to the same directory
# with onnxruntime_c_api.h . Most other EPs probably also do not work in this way.
if((NOT f STREQUAL cuda) AND (NOT f STREQUAL rocm))
file(GLOB _provider_headers CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
)
list(APPEND _headers ${_provider_headers})
endif()
endforeach()

set(${HEADERS_VAR} ${_headers} PARENT_SCOPE)
13 changes: 11 additions & 2 deletions cmake/onnxruntime_mlas.cmake
@@ -555,8 +555,17 @@ else()
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")

message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")

if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "10")
message(STATUS "Using -mavx2 -mfma -mavxvnni flags")
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mavxvnni")
else()
message(STATUS "Using -mavx2 -mfma flags")
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
endif()
set(mlas_platform_srcs_avx512f
${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
@@ -575,7 +584,7 @@
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
)
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")

set(mlas_platform_srcs_avx512vnni
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cpu.cmake
@@ -219,6 +219,7 @@ if (onnxruntime_ENABLE_TRAINING)
endif()

install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cpu/cpu_provider_factory.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/resource.h ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/custom_op_context.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
set_target_properties(onnxruntime_providers PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(onnxruntime_providers PROPERTIES FOLDER "ONNXRuntime")

9 changes: 8 additions & 1 deletion cmake/onnxruntime_providers_cuda.cmake
@@ -289,8 +289,15 @@
config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj)
endif()
config_cuda_provider_shared_module(onnxruntime_providers_cuda)

# Cannot use glob because the file cuda_provider_options.h should not be exposed out.
set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS
"${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h"
"${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_resource.h"
)
set_target_properties(onnxruntime_providers_cuda PROPERTIES
PUBLIC_HEADER "${ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS}")
install(TARGETS onnxruntime_providers_cuda
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/cuda
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
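
Related commit: "Add CUDA custom op header files to Linux tarball (#21551)". With cuda_context.h and cuda_resource.h now installed under `<prefix>/include/onnxruntime/core/providers/cuda`, a CUDA custom op built outside the ORT source tree can read the provider's stream and handles. The sketch below is illustrative only and not part of this PR: it assumes `<prefix>/include/onnxruntime` is on the include path, that the "lite" custom-op API is used as documented, and the file and function names (`my_cuda_custom_op.cc`, `MyCudaIdentity`) are made up.

```cpp
// Hypothetical my_cuda_custom_op.cc, compiled against the installed public headers.
#include <cuda_runtime.h>

#include "core/providers/cuda/cuda_context.h"   // Ort::Custom::CudaContext
#include "core/providers/cuda/cuda_resource.h"  // resource IDs (CUDA stream, cublas/cudnn handles)
#include "onnxruntime_lite_custom_op.h"         // Ort::Custom::Tensor (flattened install location assumed)

// Lite custom-op kernel: ORT fills cuda_ctx from the kernel context before the call.
void MyCudaIdentity(const Ort::Custom::CudaContext& cuda_ctx,
                    const Ort::Custom::Tensor<float>& input,
                    Ort::Custom::Tensor<float>& output) {
  float* out = output.Allocate(input.Shape());
  // Launch on the stream ORT uses for this node so execution stays ordered with the graph.
  cudaMemcpyAsync(out, input.Data(), input.NumberOfElement() * sizeof(float),
                  cudaMemcpyDeviceToDevice, cuda_ctx.cuda_stream);
}
```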
7 changes: 6 additions & 1 deletion cmake/onnxruntime_providers_rocm.cmake
@@ -223,8 +223,13 @@
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_providers_rocm PRIVATE ENABLE_ATEN)
endif()

file(GLOB ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS CONFIGURE_DEPENDS
"${REPO_ROOT}/include/onnxruntime/core/providers/rocm/*.h"
)
set_target_properties(onnxruntime_providers_rocm PROPERTIES
PUBLIC_HEADER "${ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS}")
install(TARGETS onnxruntime_providers_rocm
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/rocm
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Condition="('$(OutputType)'!='Library' OR '$(IsAppExtension)'=='True')">
<NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework">
<NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework.zip">
<Kind>Static</Kind>
<IsCxx>True</IsCxx>
<SmartLink>True</SmartLink>
@@ -10,4 +10,4 @@
<WeakFrameworks>CoreML</WeakFrameworks>
</NativeReference>
</ItemGroup>
</Project>
</Project>
9 changes: 7 additions & 2 deletions include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -3,12 +3,15 @@

#pragma once

#include <string>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "core/common/inlined_containers.h"
#include "core/framework/session_options.h"
#include "core/framework/tensor.h"
#include "core/optimizer/graph_transformer.h"
#include "core/platform/threadpool.h"

@@ -51,7 +54,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const SessionOptions& session_options,
const IExecutionProvider& execution_provider /*required by constant folding*/,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD)

@@ -81,7 +85,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
const SatApplyContextVariant& apply_context,
const IExecutionProvider& cpu_execution_provider,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
@@ -19,7 +19,7 @@
// can be updated using: UpdateTensorRTProviderOptionsWithValue
int trt_max_partition_iterations{1000}; // maximum iterations for TensorRT parser to get capability
int trt_min_subgraph_size{1}; // minimum size of TensorRT subgraphs
size_t trt_max_workspace_size{1 << 30}; // maximum workspace size for TensorRT.
size_t trt_max_workspace_size{0}; // maximum workspace size for TensorRT. Default is 0 means max device memory size

[GitHub Actions / Optional Lint C++: cpplint flagged line 22 of include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h: lines should be <= 120 characters long [whitespace/line_length] [2]]
int trt_fp16_enable{0}; // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
int trt_int8_enable{0}; // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
const char* trt_int8_calibration_table_name{nullptr}; // TensorRT INT8 calibration table name.
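
Related commit: "[TensorRT EP] No workspace size limit to TRT memory pool (#21643)". Since trt_max_workspace_size now defaults to 0 (no explicit cap), callers who relied on the previous 1 GiB limit can still set it per session. A minimal sketch, not part of this PR, using the string-keyed OrtTensorRTProviderOptionsV2 API; the model path is hypothetical and error handling is abbreviated.

```cpp
// Restore an explicit TensorRT workspace cap now that the default is 0 (no limit).
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "trt_workspace_demo"};
  Ort::SessionOptions session_options;

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&trt_options));

  // Keys/values are strings; 1073741824 bytes (1 GiB) mirrors the pre-change default.
  const char* keys[] = {"trt_max_workspace_size"};
  const char* values[] = {"1073741824"};
  Ort::ThrowOnError(Ort::GetApi().UpdateTensorRTProviderOptions(trt_options, keys, values, 1));

  Ort::ThrowOnError(Ort::GetApi().SessionOptionsAppendExecutionProvider_TensorRT_V2(
      session_options, trt_options));
  Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);

  // Ort::Session session{env, "model.onnx", session_options};  // hypothetical model path
  return 0;
}
```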
1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cpu/bert/attention_base.cc
@@ -258,7 +258,6 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape,
output_parameters->scale = scale_;
output_parameters->mask_type = mask_type;
output_parameters->broadcast_res_pos_bias = broadcast_res_pos_bias;
output_parameters->pass_past_in_kv = false;
output_parameters->qkv_format = Q_K_V_BNSH;
}

13 changes: 10 additions & 3 deletions onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -6,6 +6,12 @@
namespace onnxruntime {
namespace contrib {

enum AttentionType {
kAttention,
kMultiHeadAttention,
kDecoderMaskedMultiHeadAttention,
};

enum AttentionMaskType {
MASK_NONE, // No mask
MASK_1D_KEY_SEQ_LEN, // [batch_size], key sequence length
@@ -24,10 +30,12 @@ enum AttentionQkvFormat {
UNKNOWN, // enum value not set, or depends on qkv projection implementation details
Q_K_V_BNSH, // for non-packed qkv, permuted
Q_K_V_BSNH, // for non-packed qkv, not permuted, used by memory efficient attention or MultiHeadAttention
QKV_BSN3H, // for TRT fused attention, qkv are packed
Q_K_V_BSNH_BNSH_BNSH, // for cross attention, k and v are permuted
Q_K_V_BNSH_QKV_BS3NH, // for TRT fused causal attention, data has two formats (qkv is 3BNSH, gemm_buffer is BS3NH)
Q_KV_BSNH_BSN2H, // for TRT fused cross attention, kv are packed
Q_K_V_TNH, // for memory efficient attention, qkv are not packed, and paddings are removed.
Q_KV_BSNH_BSN2H, // for TRT fused cross attention, kv are packed
QKV_BSN3H, // for TRT fused attention, qkv are packed
QKV_BS3NH, // for DecoderMaskedMultiHeadAttention, qkv are packed
QKV_TN3H, // for TRT fused attention, qkv are packed and paddings are removed
};

@@ -61,7 +69,6 @@ struct AttentionParameters {
bool past_present_share_buffer;
bool do_rotary;
bool broadcast_res_pos_bias;
bool pass_past_in_kv;
float mask_filter_value;
float scale;
bool use_tf32;
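
For reference on the new Q_K_V_BSNH_BNSH_BNSH entry that replaces the pass_past_in_kv flag: in the cross-attention path the query keeps its BSNH layout while key and value arrive already permuted to BNSH. A standalone sketch spelling out the dimension orders; the sizes are arbitrary examples, not values from this PR.

```cpp
// Illustration only (not ORT source): dimension order implied by Q_K_V_BSNH_BNSH_BNSH.
// B = batch, S = sequence length, N = number of heads, H = head size.
#include <array>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t B = 2, S_q = 8, S_kv = 16, N = 12, H = 64;
  std::array<int64_t, 4> q_bsnh{B, S_q, N, H};   // query: not permuted
  std::array<int64_t, 4> k_bnsh{B, N, S_kv, H};  // key:   permuted to BNSH
  std::array<int64_t, 4> v_bnsh{B, N, S_kv, H};  // value: permuted to BNSH
  std::cout << "q elements:   " << B * S_q * N * H << "\n"
            << "k/v elements: " << B * N * S_kv * H << "\n";
  return 0;
}
```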
15 changes: 4 additions & 11 deletions onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
@@ -85,7 +85,7 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
scale_,
is_unidirectional_,
past_present_share_buffer,
false));
kMultiHeadAttention));

const int batch_size = parameters.batch_size;
const int q_sequence_length = parameters.sequence_length;
@@ -121,20 +121,13 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
AllocatorPtr allocator;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator));

// For each of Q/K/V, there are multiple scenarios:
// 1) Combined QKV bias is null
// a) Q/K/V is (B, S, D)
// b) Q/K/V is (B, S, N, H)
// 2) No packed QKV in Q
// a) Q/K/V has seq_len = 1
// b) Q/K/V has seq_len > 1

OrtValue Q;
ORT_RETURN_IF_ERROR(MaybeTransposeToBNSHAndAddBias<T>(
context, allocator, batch_size, num_heads_, q_sequence_length, qk_head_size, query, bias, q_bias_offset, Q));

if (parameters.pass_past_in_kv) { // key and value in BNSH format
assert(bias == nullptr);
if (parameters.qkv_format == Q_K_V_BSNH_BNSH_BNSH) {
// For cross attention with k and v in BNSH format, we assume that bias for key and value are zeros.
// So we don't need to add bias for key and value here.
assert(past_key == nullptr);
assert(past_value == nullptr);
return ApplyAttention(Q.GetMutable<Tensor>()->MutableData<T>(),