Skip to content

Commit

Permalink
resolve conflict with develop
Browse files Browse the repository at this point in the history
  • Loading branch information
chenwhql committed Sep 28, 2022
2 parents 6cf126d + 844d985 commit b384e8b
Show file tree
Hide file tree
Showing 59 changed files with 2,512 additions and 1,237 deletions.
71 changes: 70 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ignore =
E701,E711,E712,E713,E714,E721,E722,E731,E741,

# F, see https://flake8.pycqa.org/en/latest/user/error-codes.html
F401,F402,F403,F405,
F402,F403,F405,
F522,F524,F541,
F601,F631,F632,
F811,F821,F841,
Expand All @@ -28,3 +28,72 @@ per-file-ignores =
# These files need tabs for testing.
python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py:E101,W191
python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py:E101,W191
# Ignore unused imports in __init__.py
__init__.py: F401
# These files will be fixed later
r/*:F401
cmake/*:F401
paddle/*:F401
tools/*:F401
python/paddle/signal.py:F401
python/paddle/common_ops_import.py:F401
python/paddle/check_import_scipy.py:F401
python/paddle/fft.py:F401
python/paddle/_C_ops.py:F401
python/paddle/framework/*:F401
python/paddle/reader/*:F401
python/paddle/nn/*:F401
python/paddle/distributed/*:F401
python/paddle/proto/*:F401
python/paddle/onnx/*:F401
python/paddle/optimizer/*:F401
python/paddle/hapi/*:F401
python/paddle/autograd/*:F401
python/paddle/dataset/*:F401
python/paddle/io/*:F401
python/paddle/cost_model/*:F401
python/paddle/tests/*:F401
python/paddle/distribution/*:F401
python/paddle/geometric/*:F401
python/paddle/utils/*:F401
python/paddle/vision/*:F401
python/paddle/quantization/*:F401
python/paddle/libs/*:F401
python/paddle/audio/*:F401
python/paddle/incubate/*:F401
python/paddle/amp/*:F401
python/paddle/jit/*:F401
python/paddle/static/*:F401
python/paddle/inference/*:F401
python/paddle/device/*:F401
python/paddle/profiler/*:F401
python/paddle/tensor/*:F401
python/paddle/text/*:F401
python/paddle/metric/*:F401
python/paddle/fluid/tests/custom_kernel/*:F401
python/paddle/fluid/tests/custom_runtime/*:F401
python/paddle/fluid/tests/unittests/interpreter/*:F401
python/paddle/fluid/tests/unittests/asp/*:F401
python/paddle/fluid/tests/unittests/autograd/*:F401
python/paddle/fluid/tests/unittests/ir/*:F401
python/paddle/fluid/tests/unittests/collective/*:F401
python/paddle/fluid/tests/unittests/tokenizer/*:F401
python/paddle/fluid/tests/unittests/xpu/*:F401
python/paddle/fluid/tests/unittests/distribution/*:F401
python/paddle/fluid/tests/unittests/mlu/*:F401
python/paddle/fluid/tests/unittests/npu/*:F401
python/paddle/fluid/tests/unittests/ipu/*:F401
python/paddle/fluid/tests/unittests/distributed_passes/*:F401
python/paddle/fluid/tests/unittests/auto_parallel/*:F401
python/paddle/fluid/tests/unittests/dygraph_to_static/*:F401
python/paddle/fluid/tests/unittests/ps/*:F401
python/paddle/fluid/tests/unittests/fft/*:F401
python/paddle/fluid/tests/unittests/white_list/*:F401
python/paddle/fluid/tests/unittests/sequence/*:F401
python/paddle/fluid/tests/unittests/mkldnn/*:F401
python/paddle/fluid/tests/unittests/rnn/*:F401
python/paddle/fluid/tests/book/*:F401
python/paddle/fluid/tests/custom_op/*:F401
python/paddle/fluid/tests/unittests/test_*:F401
python/paddle/fluid/tests/test_*:F401
python/paddle/fluid/tests/*:F401
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ repos:
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
exclude: |
(?x)^(
python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py|
python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
)$
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
Expand Down
49 changes: 17 additions & 32 deletions cmake/phi.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ function(kernel_declare TARGET_LIST)
string(
REGEX
MATCH
"(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*"
"(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
first_registry
"${kernel_impl}")
if(NOT first_registry STREQUAL "")
Expand All @@ -89,38 +89,23 @@ function(kernel_declare TARGET_LIST)
continue()
endif()
endif()
# parse the first kernel name
string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name
"${kernel_name}")
string(REPLACE "," "" kernel_name "${kernel_name}")
string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}")
# parse the registerd kernel message
string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_msg "${first_registry}")
string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_msg
"${kernel_msg}")
string(REPLACE "," ";" kernel_msg "${kernel_msg}")
string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}")
string(REGEX REPLACE "//cuda_only" "" kernel_msg "${kernel_msg}")

list(GET kernel_msg 0 kernel_name)
list(GET kernel_msg 1 kernel_backend)
list(GET kernel_msg 2 kernel_layout)

# append kernel declare into declarations.h
# TODO(chenweihang): default declare ALL_LAYOUT for each kernel
if(${kernel_path} MATCHES "./cpu\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
elseif(${kernel_path} MATCHES "./gpu\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
elseif(${kernel_path} MATCHES "./xpu\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
elseif(${kernel_path} MATCHES "./gpudnn\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
elseif(${kernel_path} MATCHES "./kps\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
elseif(${kernel_path} MATCHES "./onednn\/")
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, OneDNN, ALL_LAYOUT);\n")
else()
# deal with device independent kernel, now we use CPU temporaary
file(APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
endif()
file(
APPEND ${kernel_declare_file}
"PD_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ${kernel_layout});\n"
)
endif()
endforeach()
endfunction()
Expand Down
11 changes: 11 additions & 0 deletions paddle/fluid/framework/new_executor/data_transfer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,22 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
op_with_kernel->Type())) {
auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;
VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n";
VLOG(6) << "phi_kernel_name " << phi_kernel_name << "\n";

if (op_with_kernel->PhiKernel()->IsValid()) {
run_phi_kernel = true;
}

// For data transfer ops, they should not fallback to cpu.
// Though they're device-independent operations,
// their implementations are device-related.
// For example, consider changing the layout of a gpu tensor
// while the gpu kernel of transfer_layout op does not exist.
// To use the cpu kernel, you must insert memcpy_d2h/mepcpy_h2d op
// in addition. But such operation should not be done here.
// Maybe in future we will support this.
}

// 3. Execute transfer op and construct OpFuncNode
Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/framework/phi_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key,
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(expected_kernel_key.place_)) {
PADDLE_THROW(platform::errors::Unavailable(
"For GPU kernel, they must not fallback into CPU kernel."));
}
#endif

return phi::KernelKey();
}

Expand Down
92 changes: 47 additions & 45 deletions paddle/fluid/operators/fake_quantize_op.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -590,33 +590,29 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int n,
const int c,
const int wh_size,
const int num,
const int cout,
T *out) {
int tid = threadIdx.x;

int channel_size = n / c;
const T *in_c = in + blockIdx.x * channel_size;
T *out_c = out + blockIdx.x * channel_size;

T s = scale[blockIdx.x];
T inv_s = inverse(s);
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;

for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / wh_size) % cout];
T inv_s = inverse(s);
T x = in[i];
if (round_type == 0) {
x = bin_cnt * inv_s * x;
x = roundWithTiesToEven(x);
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
out[i] = (x * s) / bin_cnt;
} else {
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
out[i] = round(v) * s / bin_cnt;
}
}
}
Expand All @@ -627,32 +623,29 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int n,
const int cin,
const int wh_size,
const int num,
const int cout,
T *out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);

int wh_size = n / (cin * cout);
const T *in_c = in + blockIdx.x * wh_size;
T *out_c = out + blockIdx.x * wh_size;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;

for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / wh_size) % cout];
T inv_s = inverse(s);
T x = in[i];
if (round_type == 0) {
x = bin_cnt * inv_s * x;
x = roundWithTiesToEven(x);
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
out[i] = (x * s) / bin_cnt;
} else {
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
out[i] = round(v) * s / bin_cnt;
}
}
}
Expand Down Expand Up @@ -682,30 +675,39 @@ struct ChannelClipFakeQuantDequantFunctor<phi::GPUContext, T> {
const T *scale_data = scale.data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());

int64_t block_size =
std::min(static_cast<int64_t>(num),
static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));

int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);

if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
const int window_size = num / in_dims[0];
ChannelClipAndQuantDequantKernelQuantAxis0<T>
<<<grid, block, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
num,
in_dims[0],
out_data);
<<<grid_size, block_size, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
window_size,
num,
in_dims[0],
out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
const int window_size = num / (in_dims[0] * in_dims[1]);

ChannelClipAndQuantDequantKernelQuantAxis1<T>
<<<grid, block, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
num,
in_dims[0],
in_dims[1],
out_data);
<<<grid_size, block_size, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
window_size,
num,
in_dims[1],
out_data);
}
}
};
Expand Down
Loading

0 comments on commit b384e8b

Please sign in to comment.