【MetaX】Merge Metax's modifications to mxmaca/2.6 branch #68534

Merged

Changes from all commits (101 commits)
1b696a1
fix windows bug for common lib (#60308)
risemeup1 Dec 26, 2023
a4cd847
[Dy2St] Disable `test_bert` on CPU (#60173) (#60324)
SigureMo Dec 26, 2023
20d3558
[Cherry-pick] fix weight quant kernel bug when n div 64 != 0 (#60184)
wwbitejotunn Dec 26, 2023
203754e
tile (#60261)
lizexu123 Dec 26, 2023
b065877
add chunk allocator posix_memalign return value check (#60208) (#60495)
danleifeng Jan 2, 2024
83ce809
update 2023 security advisory, test=document_fix (#60532)
VigiZhang Jan 3, 2024
ae2e588
fix fleetutil get_online_pass_interval bug2; test=develop (#60545)
danleifeng Jan 4, 2024
97b65c7
fix fused_rope diff (#60217) (#60593)
tianhaodongbd Jan 8, 2024
bbc13eb
[cherry-pick]fix fleetutil get_online_pass_interval bug3 (#60620)
danleifeng Jan 9, 2024
ccdf528
[cherry-pick]update pdsa-2023-019 (#60649)
VigiZhang Jan 9, 2024
e50f43e
[Dy2St][2.6] Disable `test_grad` on release/2.6 (#60662)
SigureMo Jan 9, 2024
7b0d2e9
fix bug of ci (#59926) (#60785)
zyfncg Jan 15, 2024
e738f49
[Dy2St][2.6] Disable `test_transformer` on `release/2.6` and update R…
SigureMo Jan 15, 2024
d788e9b
[Dy2St][2.6] Increase `test_transformer` and `test_mobile_net` ut tim…
SigureMo Jan 17, 2024
1aa5f4b
[Cherry-pick] fix set_value with scalar grad (#60930)
zoooo0820 Jan 19, 2024
0f732a5
[cherry-pick] This PR enable offset of generator for custom device. (…
zhaohaixu Jan 23, 2024
ac1702b
fix core dump when fallback gather_nd_grad and MemoryAllocateHost (#6…
raining-dark Jan 25, 2024
ff119d0
fix qat tests (#61211) (#61284)
zzjjay Jan 30, 2024
aeaa0ca
[Security] fix draw security problem (#61161) (#61338)
wanghuancoder Jan 31, 2024
0227a0d
fix _decompress security problem (#61294) (#61337)
wanghuancoder Jan 31, 2024
f99d4f2
Fix CVE-2024-0521 (#61032) (#61287)
wanghuancoder Jan 31, 2024
af9b8c5
[Security] fix security problem for prune_by_memory_estimation (#61382)
vn-ncvinh Feb 1, 2024
9cd0c91
[Security] fix security problem for run_cmd (#61285) (#61398)
wanghuancoder Feb 1, 2024
5f3bbeb
[Security] fix download security problem (#61162) (#61388)
wanghuancoder Feb 1, 2024
60325a1
check eval for security (#61389)
wanghuancoder Feb 1, 2024
0ccb9cb
[cherry-pick] adapt c_embedding to phi namespace for custom devices (…
BeingGod Feb 2, 2024
f025385
[CherryPick] Fix issue 60092 (#61427)
yuanlehome Feb 5, 2024
3452e61
Fix unique (#60840) (#61044)
warrentdrew Feb 5, 2024
a37f6fb
cinn(py-dsl): skip eval string in python-dsl (#61380) (#61586)
6clc Feb 5, 2024
9250f66
remove _wget (#61356) (#61569)
lyuwenyu Feb 5, 2024
e5a85b6
fix layer_norm decompose dtyte bugs, polish codes (#61631)
lxd-cumt Feb 6, 2024
12e5c97
fix doc style (#61688)
GreatV Feb 19, 2024
39010bf
merge (#61866)
wanghuancoder Feb 21, 2024
b6a38d0
[security] refine _get_program_cache_key (#61827) (#61896)
wanghuancoder Feb 21, 2024
96c2aaf
repeat_interleave support bf16 dtype (#61854) (#61899)
tianhaodongbd Feb 21, 2024
2175de0
Support Fake GroupWise Quant (#61900)
RachelXu7 Feb 21, 2024
f09d9d8
fix launch when elastic run (#61847) (#61878)
hitywt Feb 21, 2024
df0155f
[Paddle-TRT] fix solve (#61806)
zhink Feb 22, 2024
04ac1c0
[Cherry-Pick] Fix CacheKV Quant Bug (#61966)
ZhangHandi Feb 22, 2024
5f048c9
Sychronized the paddle2.4 adaptation changes
Mar 15, 2024
50c9e9a
clear third_part dependencies
Mar 15, 2024
2a21139
change submodules to right commits
Mar 18, 2024
a6a5259
build pass with cpu only
Mar 18, 2024
6ea0936
build success with maca
Mar 21, 2024
a400805
build success with cutlass and fused kernels
Mar 25, 2024
c08de70
build with flash_attn and mccl
Mar 28, 2024
633780b
build with test, fix some bugs
Apr 29, 2024
c6b7c36
fix some bugs
May 21, 2024
0c0d79c
fixed some compilation bugs
May 23, 2024
126ca25
fix bug in previous commit
May 24, 2024
8a5bf30
fix bug with split when col_size biger than 256
May 27, 2024
85f37b2
add row_limit to show full kernel name
Jun 11, 2024
f46156d
add env.sh
Jun 19, 2024
4616eaf
add shape record
Jun 24, 2024
4adb9f9
modify paddle version
Jun 25, 2024
e47f023
wuzhao optimized the performance of elementwise kernel.
Jul 8, 2024
2ccdb35
fix split when dtype is fp16
Jul 11, 2024
2eaad07
fix bug in previous commit
Jul 17, 2024
fd69368
adapt flash_attn new capi
Jul 24, 2024
e9afe2c
change eigen path
Jul 26, 2024
b86cf6d
modify mcname -> replaced_name
Aug 1, 2024
2aa1e91
fix some build bugs
Aug 9, 2024
edfb3ee
Merge "fix some build bugs" into mx/dev-2.6.0
Aug 9, 2024
fcdeee1
add PADDLE_ENABLE_SAME_RAND_A100
Aug 18, 2024
e13e1bd
Merge "add PADDLE_ENABLE_SAME_RAND_A100" into mx/dev-2.6.0
Aug 19, 2024
0b844ea
remove redundant warning, add patch from 2.6.1
Aug 27, 2024
ffef707
improve VectorizedBroadcastKernel
Jul 8, 2024
70b9ec9
fix bugs
Jul 14, 2024
b322680
split ElementwiseDivGrad
Jul 14, 2024
2902791
in VectorizedElementwiseKernel, it can now use vecSize = 8
Jul 18, 2024
508a177
improve ModulatedDeformableCol2imCoordGpuKernel:1.block size 512->64;…
Jul 19, 2024
393df39
Optimize depthwise_conv2d_grad compute (InputGrad):
Jul 22, 2024
c461333
improve VectorizedBroadcastKernel with LoadType =
Jul 25, 2024
3a47751
fix ElementwiseDivGrad
Jul 25, 2024
cefdffc
Revert "Optimize depthwise_conv2d_grad compute (InputGrad):"
Jul 25, 2024
4543355
improve ElementwiseDivGrad and ElementwiseMulGrad
Jul 26, 2024
df38d3c
improve FilterBBoxes
Jul 27, 2024
cb28f43
improve deformable_conv_grad op:1.adaptive block size;2.FastDivMod;3.…
Jul 28, 2024
bdf6be4
improve ModulatedDeformableIm2colGpuKernel:1.adaptive block size;2.Fa…
Jul 28, 2024
3d67920
improve KeBNBackwardData:replace 1.0/sqrt with rsqrt
Jul 28, 2024
2e72a0c
Improve KeBNBackwardData, FilterGradAddupGpuKernel kernels. Improve n…
Jul 29, 2024
8079046
Optimize depthwise_conv2d:
Jul 29, 2024
2f6170b
Improve CheckFiniteAndUnscaleKernel by splitting the kernel into mult…
Jul 30, 2024
8ceabc0
Revert "Improve CheckFiniteAndUnscaleKernel by splitting the kernel i…
Jul 31, 2024
add2987
improve ScatterInitCUDAKernel and ScatterCUDAKernel
Aug 1, 2024
7b7b153
fix bugs and make the code easier to read
Aug 1, 2024
2800d94
Optimize FilterGard and InputGradSpL
Aug 6, 2024
f6a6421
Improve CheckFiniteAndUnscaleKernel by putting address access to shar…
Aug 8, 2024
824a4cd
Optimize SwinTransformer
Aug 9, 2024
b99e439
Optimize LayerNormBackwardComputeGradInputWithSmallFeatureSize
Aug 15, 2024
b756d63
improve KeMatrixTopK:1.fix private memory;2.modify max grid size;3.ch…
Aug 16, 2024
37d4165
Modify LayerNorm Optimization
Aug 21, 2024
29999eb
improve roi_align op:1.adaptive block size;2.FastDivMod.
Aug 28, 2024
b37090c
add workaround for parameters dislocation when calling BatchedGEMM<fl…
Sep 2, 2024
531f3c7
fix McFlashAttn string
Sep 2, 2024
d180d30
[C500-27046] fix wb issue
Sep 3, 2024
5235601
Support compiling external ops
Sep 10, 2024
cd89fd8
Merge "Support compiling external ops" into mx/dev-2.6.0
Sep 10, 2024
f7eb629
support flash attn varlen api and support arm build
Sep 19, 2024
ee123e0
Add a copyright notice
Sep 27, 2024
898a221
Modify some third-party dependency addresses to public network addresses
Sep 29, 2024
40 changes: 13 additions & 27 deletions .gitmodules
@@ -1,6 +1,7 @@
[submodule "third_party/protobuf"]
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
tag = paddle
ignore = dirty
[submodule "third_party/pocketfft"]
path = third_party/pocketfft
@@ -21,10 +22,11 @@
[submodule "third_party/utf8proc"]
path = third_party/utf8proc
url = https://github.com/JuliaStrings/utf8proc.git
tag = v2.6.1
ignore = dirty
[submodule "third_party/warpctc"]
path = third_party/warpctc
url = https://github.com/baidu-research/warp-ctc.git
url = http://pdegit.metax-internal.com/pde-ai/warp-ctc.git
ignore = dirty
[submodule "third_party/warprnnt"]
path = third_party/warprnnt
@@ -33,10 +35,12 @@
[submodule "third_party/xxhash"]
path = third_party/xxhash
url = https://github.com/Cyan4973/xxHash.git
tag = v0.6.5
ignore = dirty
[submodule "third_party/pybind"]
path = third_party/pybind
url = https://github.com/pybind/pybind11.git
tag = v2.4.3
ignore = dirty
[submodule "third_party/threadpool"]
path = third_party/threadpool
@@ -45,39 +49,25 @@
[submodule "third_party/zlib"]
path = third_party/zlib
url = https://github.com/madler/zlib.git
tag = v1.2.8
ignore = dirty
[submodule "third_party/glog"]
path = third_party/glog
url = https://github.com/google/glog.git
ignore = dirty
[submodule "third_party/eigen3"]
path = third_party/eigen3
url = https://gitlab.com/libeigen/eigen.git
ignore = dirty
[submodule "third_party/snappy"]
path = third_party/snappy
url = https://github.com/google/snappy.git
ignore = dirty
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVIDIA/cub.git
ignore = dirty
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = https://github.com/NVIDIA/cutlass.git
ignore = dirty
[submodule "third_party/xbyak"]
path = third_party/xbyak
url = https://github.com/herumi/xbyak.git
tag = v5.81
ignore = dirty
[submodule "third_party/mkldnn"]
path = third_party/mkldnn
url = https://github.com/oneapi-src/oneDNN.git
ignore = dirty
[submodule "third_party/flashattn"]
path = third_party/flashattn
url = https://github.com/PaddlePaddle/flash-attention.git
ignore = dirty
[submodule "third_party/gtest"]
path = third_party/gtest
url = https://github.com/google/googletest.git
@@ -98,15 +88,11 @@
path = third_party/rocksdb
url = https://github.com/Thunderbrook/rocksdb
ignore = dirty
[submodule "third_party/absl"]
path = third_party/absl
url = https://github.com/abseil/abseil-cpp.git
ignore = dirty
[submodule "third_party/jitify"]
path = third_party/jitify
url = https://github.com/NVIDIA/jitify.git
[submodule "third_party/cutlass"]
path = third_party/cutlass
url = http://pdegit.metax-internal.com/pde-ai/cutlass.git
ignore = dirty
[submodule "third_party/cccl"]
path = third_party/cccl
url = https://github.com/NVIDIA/cccl.git
[submodule "third_party/eigen3"]
path = third_party/eigen3
url = ssh://gerrit.metax-internal.com:29418/MACA/library/mcEigen
ignore = dirty
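Because the rendered diff above drops the added/removed markers, old and new lines are interleaved. What is clearest from the hunks is that a few submodules appear to be retargeted from public upstreams to MetaX-internal hosts (warpctc and cutlass to pdegit.metax-internal.com, eigen3 to an internal Gerrit). As an illustrative reconstruction — my reading of the flattened hunk, not an authoritative copy of the final file — the warpctc stanza would end up as:

    [submodule "third_party/warpctc"]
        path = third_party/warpctc
        url = http://pdegit.metax-internal.com/pde-ai/warp-ctc.git
        ignore = dirty

Existing checkouts would typically need git submodule sync after a URL change like this so their configured remotes pick up the new addresses.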
23 changes: 20 additions & 3 deletions CMakeLists.txt
@@ -1,3 +1,4 @@
# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,7 +25,7 @@ endif()
# https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026
cmake_policy(SET CMP0026 OLD)
cmake_policy(SET CMP0079 NEW)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake" $ENV{CMAKE_MODULE_PATH})
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})

@@ -92,6 +93,7 @@ endif()

if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
set(CMAKE_CUDA_COMPILER_VERSION 11.6)
message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: "
"${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}")
endif()
@@ -255,7 +257,7 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization and inference-lib generation"
ON)
OFF)
option(WITH_CPP_DIST "Install PaddlePaddle C++ distribution" OFF)
option(WITH_GFLAGS "Compile PaddlePaddle with gflags support" OFF)
################################ Internal Configurations #######################################
@@ -283,7 +285,7 @@ option(
OFF)
option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
option(WITH_NCCL "Compile PaddlePaddle with NCCL support" OFF)
option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
@@ -474,6 +476,21 @@ if(WITH_GPU)
# so include(cudnn) needs to be in front of include(third_party/lite)
include(cudnn) # set cudnn libraries, must before configure
include(tensorrt)

include_directories("$ENV{MACA_PATH}/tools/cu-bridge/include")
include_directories("$ENV{MACA_PATH}/include")
include_directories("$ENV{MACA_PATH}/include/mcblas")
include_directories("$ENV{MACA_PATH}/include/mcr")
include_directories("$ENV{MACA_PATH}/include/mcdnn")
include_directories("$ENV{MACA_PATH}/include/mcsim")
include_directories("$ENV{MACA_PATH}/include/mcsparse")
include_directories("$ENV{MACA_PATH}/include/mcfft")
include_directories("$ENV{MACA_PATH}/include/mcrand")
include_directories("$ENV{MACA_PATH}/include/common")
include_directories("$ENV{MACA_PATH}/include/mcsolver")
include_directories("$ENV{MACA_PATH}/include/mctx")
include_directories("$ENV{MACA_PATH}/include/mcpti")
include_directories("$ENV{MACA_PATH}/mxgpu_llvm/include")
# there is no official support of nccl, cupti in windows
if(NOT WIN32)
include(cupti)
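The hunk above registers each MACA SDK header directory with its own include_directories call, all rooted at $ENV{MACA_PATH} (presumably exported by the env.sh added elsewhere in this PR). A minimal sketch of the same idea, consolidated into a loop with a guard for an unset MACA_PATH — a hypothetical alternative, not the code in this PR:

    # Hypothetical sketch, not part of the PR: the MACA include setup expressed as a loop.
    # Assumes MACA_PATH points at a MetaX SDK with the directory layout shown in the diff above.
    if(WITH_GPU AND NOT APPLE)
      if(NOT DEFINED ENV{MACA_PATH})
        message(FATAL_ERROR "MACA_PATH is not set; export it (e.g. via env.sh) before configuring.")
      endif()
      set(MACA_INCLUDE_SUBDIRS
          tools/cu-bridge/include include include/mcblas include/mcr include/mcdnn
          include/mcsim include/mcsparse include/mcfft include/mcrand include/common
          include/mcsolver include/mctx include/mcpti mxgpu_llvm/include)
      foreach(_subdir IN LISTS MACA_INCLUDE_SUBDIRS)
        include_directories("$ENV{MACA_PATH}/${_subdir}")
      endforeach()
    endif()

The CMAKE_MODULE_PATH change earlier in the same file, which appends $ENV{CMAKE_MODULE_PATH}, presumably serves a similar purpose: it lets the MACA toolchain supply its own CMake modules from the environment without further patches to the tree.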
183 changes: 183 additions & 0 deletions NOTICE
@@ -0,0 +1,183 @@
The following files may have been modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. in 2024.

.gitmodules
CMakeLists.txt
cmake/cuda.cmake
cmake/cudnn.cmake
cmake/cupti.cmake
cmake/external/brpc.cmake
cmake/external/cryptopp.cmake
cmake/external/cutlass.cmake
cmake/external/dgc.cmake
cmake/external/dlpack.cmake
cmake/external/eigen.cmake
cmake/external/flashattn.cmake
cmake/external/jemalloc.cmake
cmake/external/lapack.cmake
cmake/external/libmct.cmake
cmake/external/mklml.cmake
cmake/external/protobuf.cmake
cmake/external/pybind11.cmake
cmake/external/utf8proc.cmake
cmake/flags.cmake
cmake/generic.cmake
cmake/inference_lib.cmake
cmake/nccl.cmake
cmake/third_party.cmake
env.sh
paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc
paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
paddle/fluid/eager/auto_code_generator/eager_generator.cc
paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/inference/api/resource_manager.cc
paddle/fluid/inference/api/resource_manager.h
paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu
paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h
paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu
paddle/fluid/memory/allocation/CMakeLists.txt
paddle/fluid/memory/allocation/allocator_facade.cc
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/correlation_op.cu
paddle/fluid/operators/elementwise/elementwise_op_function.h
paddle/fluid/operators/fused/CMakeLists.txt
paddle/fluid/operators/fused/attn_gemm_int8.h
paddle/fluid/operators/fused/cublaslt.h
paddle/fluid/operators/fused/fused_gate_attention.h
paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h
paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu
paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
paddle/fluid/operators/fused/fused_softmax_mask.cu.h
paddle/fluid/operators/math/inclusive_scan.h
paddle/fluid/operators/matmul_op.cc
paddle/fluid/operators/row_conv_op.cu
paddle/fluid/operators/sparse_attention_op.cu
paddle/fluid/platform/cuda_graph_with_memory_pool.cc
paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
paddle/fluid/platform/device/gpu/cuda_helper_test.cu
paddle/fluid/platform/device/gpu/gpu_types.h
paddle/fluid/platform/device_context.h
paddle/fluid/platform/dynload/CMakeLists.txt
paddle/fluid/platform/dynload/cublas.h
paddle/fluid/platform/dynload/cublasLt.cc
paddle/fluid/platform/dynload/cublasLt.h
paddle/fluid/platform/dynload/cusparseLt.h
paddle/fluid/platform/init.cc
paddle/fluid/platform/init_phi_test.cc
paddle/fluid/pybind/eager_legacy_op_function_generator.cc
paddle/fluid/pybind/fleet_py.cc
paddle/fluid/pybind/pybind.cc
paddle/phi/api/profiler/profiler.cc
paddle/phi/backends/dynload/CMakeLists.txt
paddle/phi/backends/dynload/cublas.h
paddle/phi/backends/dynload/cublasLt.cc
paddle/phi/backends/dynload/cublasLt.h
paddle/phi/backends/dynload/cuda_driver.h
paddle/phi/backends/dynload/cudnn.h
paddle/phi/backends/dynload/cufft.h
paddle/phi/backends/dynload/cupti.h
paddle/phi/backends/dynload/curand.h
paddle/phi/backends/dynload/cusolver.h
paddle/phi/backends/dynload/cusparse.h
paddle/phi/backends/dynload/cusparseLt.h
paddle/phi/backends/dynload/dynamic_loader.cc
paddle/phi/backends/dynload/flashattn.h
paddle/phi/backends/dynload/nccl.h
paddle/phi/backends/dynload/nvjpeg.h
paddle/phi/backends/dynload/nvrtc.h
paddle/phi/backends/dynload/nvtx.h
paddle/phi/backends/gpu/cuda/cuda_device_function.h
paddle/phi/backends/gpu/cuda/cuda_helper.h
paddle/phi/backends/gpu/forwards.h
paddle/phi/backends/gpu/gpu_context.cc
paddle/phi/backends/gpu/gpu_context.h
paddle/phi/backends/gpu/gpu_decls.h
paddle/phi/backends/gpu/gpu_resources.cc
paddle/phi/backends/gpu/gpu_resources.h
paddle/phi/backends/gpu/rocm/rocm_device_function.h
paddle/phi/core/custom_kernel.cc
paddle/phi/core/distributed/check/nccl_dynamic_check.h
paddle/phi/core/distributed/comm_context_manager.h
paddle/phi/core/enforce.h
paddle/phi/core/flags.cc
paddle/phi/core/visit_type.h
paddle/phi/kernels/funcs/aligned_vector.h
paddle/phi/kernels/funcs/blas/blas_impl.cu.h
paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h
paddle/phi/kernels/funcs/broadcast_function.h
paddle/phi/kernels/funcs/concat_and_split_functor.cu
paddle/phi/kernels/funcs/cublaslt.h
paddle/phi/kernels/funcs/deformable_conv_functor.cu
paddle/phi/kernels/funcs/distribution_helper.h
paddle/phi/kernels/funcs/dropout_impl.cu.h
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_grad_base.h
paddle/phi/kernels/funcs/fused_gemm_epilogue.h
paddle/phi/kernels/funcs/gemm_int8_helper.h
paddle/phi/kernels/funcs/inclusive_scan.h
paddle/phi/kernels/funcs/layer_norm_impl.cu.h
paddle/phi/kernels/funcs/math_cuda_utils.h
paddle/phi/kernels/funcs/reduce_function.h
paddle/phi/kernels/funcs/scatter.cu.h
paddle/phi/kernels/funcs/top_k_function_cuda.h
paddle/phi/kernels/funcs/weight_only_gemv.cu
paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h
paddle/phi/kernels/fusion/gpu/attn_gemm.h
paddle/phi/kernels/fusion/gpu/fused_dropout_add_utils.h
paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h
paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h
paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu
paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_utils.h
paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h
paddle/phi/kernels/fusion/gpu/mmha_util.cu.h
paddle/phi/kernels/gpu/accuracy_kernel.cu
paddle/phi/kernels/gpu/amp_kernel.cu
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
paddle/phi/kernels/gpu/contiguous_kernel.cu
paddle/phi/kernels/gpu/decode_jpeg_kernel.cu
paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
paddle/phi/kernels/gpu/depthwise_conv.h
paddle/phi/kernels/gpu/dist_kernel.cu
paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
paddle/phi/kernels/gpu/flash_attn_kernel.cu
paddle/phi/kernels/gpu/flash_attn_utils.h
paddle/phi/kernels/gpu/gelu_funcs.h
paddle/phi/kernels/gpu/generate_proposals_kernel.cu
paddle/phi/kernels/gpu/group_norm_kernel.cu
paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
paddle/phi/kernels/gpu/kthvalue_kernel.cu
paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu
paddle/phi/kernels/gpu/masked_select_kernel.cu
paddle/phi/kernels/gpu/nonzero_kernel.cu
paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
paddle/phi/kernels/gpu/roi_align_kernel.cu
paddle/phi/kernels/gpu/strided_copy_kernel.cu
paddle/phi/kernels/gpu/top_k_kernel.cu
paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
paddle/phi/kernels/gpu/unique_consecutive_functor.h
paddle/phi/kernels/gpu/unique_kernel.cu
paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
paddle/phi/kernels/gpudnn/softmax_gpudnn.h
paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h
paddle/phi/kernels/impl/matmul_kernel_impl.h
paddle/phi/kernels/impl/multi_dot_kernel_impl.h
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/kernel_primitives.h
paddle/phi/tools/CMakeLists.txt
paddle/utils/flat_hash_map.h
patches/eigen/TensorReductionGpu.h
python/paddle/base/framework.py
python/paddle/distributed/launch/controllers/watcher.py
python/paddle/profiler/profiler_statistic.py
python/paddle/utils/cpp_extension/cpp_extension.py
python/paddle/utils/cpp_extension/extension_utils.py
test/CMakeLists.txt
test/cpp/CMakeLists.txt
test/cpp/jit/CMakeLists.txt
test/cpp/new_executor/CMakeLists.txt
test/legacy_test/test_flash_attention.py
tools/ci_op_benchmark.sh
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm

## Installation

### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
4 changes: 2 additions & 2 deletions README_cn.md
@@ -18,9 +18,9 @@

## 安装

### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)

### 安装最新稳定版本:
```
2 changes: 1 addition & 1 deletion README_ja.md
@@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的

## インストール

### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。
PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。