
Commit

temporary save
pkuzyc committed Nov 9, 2023
1 parent 70ed633 commit b165424
Showing 3 changed files with 268 additions and 10 deletions.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include <stdio.h>

// Headers used by the PrintValue/DebugString helpers below.
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"

@@ -44,6 +45,64 @@ static inline int64_t NumBlocks(const int64_t N) {
kNumMaxinumNumBlocks);
}

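// Debug helper added in this temporary commit: formats a single scalar with
// full precision; integral values are widened first so int8/uint8 print as
// numbers instead of characters.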
template <typename T>
std::string PrintValue(const T& value) {
std::stringstream ss;
if (std::is_floating_point<T>::value) {
ss << std::showpoint;
}
ss << std::setprecision(std::numeric_limits<T>::max_digits10);

if (std::is_integral<T>::value) {
if (std::is_unsigned<T>::value) {
ss << static_cast<uint64_t>(value);
} else {
ss << static_cast<int64_t>(value);
}
} else {
ss << value;
}
return ss.str();
}

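// Debug helper added in this temporary commit: copies `tensor` to CPU and
// returns a string with its first and last `print_num` elements (all of them
// when the tensor has at most 2 * print_num entries).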
template <typename T>
std::string DebugString(const phi::DenseTensor& tensor) {
// auto* src = tensor->data<T>();
phi::DenseTensor tmp;
framework::TensorCopySync(tensor, platform::CPUPlace(), &tmp);

std::stringstream ss;
ss << "pir print data=[";
size_t numel = tmp.numel();
const T* data = tmp.data<T>();
size_t print_num = 20L;

if (numel <= 2 * print_num) {
for (size_t i = 0; i < numel; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
} else {
for (size_t i = 0; i < print_num; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
ss << ", ... , ";
for (size_t i = numel - print_num; i < numel; ++i) {
ss << PrintValue(data[i]);
if (i != numel - 1) {
ss << ", ";
}
}
}
ss << "]";
return ss.str();
}

template <typename T, typename IndexT>
__global__ void MaskLabelByIndex(T* predicted_logits,
const T* logit,
@@ -54,6 +113,8 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
const int64_t N,
const int64_t D,
const int nranks) {
// printf("start_index:%lld end_index:%lld N:%lld D:%lld\n", start_index,
// end_index, N, D);
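// Each thread handles one sample: when its label falls inside this rank's
// shard [start_index, end_index), the matching logit is copied into
// predicted_logits.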
CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
auto real_label = label[i];
PADDLE_ENFORCE(((real_label < D * nranks) && (real_label >= 0)) ||
@@ -66,7 +127,11 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
static_cast<int64_t>(ignore_index),
static_cast<int64_t>(real_label));

// printf("i:%d start_index:%lld end_index:%lld N:%lld D:%lld\n", i,
// start_index, end_index, N, D);
if (real_label >= start_index && real_label < end_index) {
// printf("in_if i:%d label:%u start_index:%lld end_index:%lld N:%lld
// D:%lld\n", i, real_label, start_index, end_index, N, D);
predicted_logits[i] = logit[i * D + real_label - start_index];
}
}
@@ -103,13 +168,16 @@ __global__ void MaskLabelByIndexGrad(T* logits_grad,
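// Per-element gradient: zero for ignored labels; on the label column owned by
// this rank the stored value has 1 subtracted before scaling by dloss; every
// other column is simply scaled by dloss.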
auto row = i / D;
auto col = i % D;
auto lbl = static_cast<int64_t>(labels[row]);
// printf("i:%lld label:%lld logits_grad_before:%f loss_grad:%f\n", i, lbl,
// logits_grad[i], loss_grad[row]);
if (lbl == ignore_index) {
logits_grad[i] = static_cast<T>(0.0);
} else if ((col + start_index) == labels[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
} else {
logits_grad[i] *= loss_grad[row];
}
// printf("i:%lld label:%lld logits_grad:%f\n", i, lbl, logits_grad[i]);
}
}

@@ -141,6 +209,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
const int rid = ctx.Attr<int>("ring_id");
const int nranks = ctx.Attr<int>("nranks");
const int rank = ctx.Attr<int>("rank");
// std::cout << "****** logits *******" << std::endl;
// std::cout << DebugString<T>(*logits) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(*labels) << std::endl;

const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
@@ -225,6 +297,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
stream));
}

// std::cout << "****** logits_max *******" << std::endl;
// std::cout << DebugString<T>(logits_max) << std::endl;

// step 2, obtain logit - logit_max
Eigen::DSizes<int, 2> batch_by_one(N, 1);
Eigen::DSizes<int, 2> one_by_class(1, D);
@@ -249,6 +324,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
int threads = kNumCUDAThreads;
const auto& label_type = framework::TransToProtoVarType(labels->dtype());

// printf("N:%d D:%d start:%d end:%d\n", N, D, start_index, end_index);
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(),
@@ -334,10 +410,18 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
ignore_index,
N);
}
// std::cout << "****** loss_2d *******" << std::endl;
// std::cout << DebugString<T>(loss_2d) << std::endl;
// std::cout << "****** predicted_logits *******" << std::endl;
// std::cout << DebugString<T>(predicted_logits) << std::endl;
// std::cout << "****** sun_exp *******" << std::endl;
// std::cout << DebugString<T>(sum_exp_logits) << std::endl;

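// Normalize: divide the shifted-exponential values held in eigen_softmax by
// the summed exponentials to obtain the final softmax.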
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_softmax *
eigen_sum_exp_logits.inverse().broadcast(one_by_class));
// std::cout << "****** softmax *******" << std::endl;
// std::cout << DebugString<T>(*softmax) << std::endl;
}
};

@@ -517,6 +601,15 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
const int64_t start_index = rank * D;
const int64_t end_index = start_index + D;

// std::cout << "****** logits_grad *******" << std::endl;
// std::cout << DebugString<T>(*logit_grad) << std::endl;
// std::cout << "****** softmax *******" << std::endl;
// std::cout << DebugString<T>(*softmax) << std::endl;
// std::cout << "****** loss_grad *******" << std::endl;
// std::cout << DebugString<T>(*loss_grad) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(*labels) << std::endl;

if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndexGrad<T, int32_t>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logit_grad_2d.data<T>(),
@@ -538,6 +631,8 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
D,
ignore_index);
}
// std::cout << "****** logits_grad_final *******" << std::endl;
// std::cout << DebugString<T>(*logit_grad) << std::endl;
}
};

84 changes: 80 additions & 4 deletions paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
@@ -36,6 +36,63 @@ namespace cub = hipcub;

namespace phi {

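// Same scalar-formatting debug helper as the one added in the collective
// softmax-with-cross-entropy op above.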
template <typename T>
std::string PrintValue(const T& value) {
std::stringstream ss;
if (std::is_floating_point<T>::value) {
ss << std::showpoint;
}
ss << std::setprecision(std::numeric_limits<T>::max_digits10);

if (std::is_integral<T>::value) {
if (std::is_unsigned<T>::value) {
ss << static_cast<uint64_t>(value);
} else {
ss << static_cast<int64_t>(value);
}
} else {
ss << value;
}
return ss.str();
}

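// Debug helper: blocking-copies `tensor` to the CPU with phi::Copy and prints
// its first and last `print_num` elements.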
template <typename T, typename Context>
std::string DebugString(const DenseTensor& tensor, const Context& dev_ctx) {
DenseTensor tmp;
phi::Copy(dev_ctx, tensor, CPUPlace(), true, &tmp);

std::stringstream ss;
ss << "adamw print data=[";
size_t numel = tmp.numel();
const T* data = tmp.data<T>();
size_t print_num = 20L;

if (numel <= 2 * print_num) {
for (size_t i = 0; i < numel; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
} else {
for (size_t i = 0; i < print_num; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
ss << ", ... , ";
for (size_t i = numel - print_num; i < numel; ++i) {
ss << PrintValue(data[i]);
if (i != numel - 1) {
ss << ", ";
}
}
}
ss << "]";
return ss.str();
}

template <typename T>
__global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad,
const T* loss_grad,
@@ -121,10 +178,10 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel(T* logits_grad,
const int64_t d,
const int ignore_index) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
int64_t idx_n = idx / (d * dim);
int64_t idx_dim = (idx / d) % dim;
int64_t idx_d = idx % d;
int64_t ids = idx_n * d + idx_d;
int64_t idx_n = idx / (d * dim);    // sample (row) index
int64_t idx_dim = (idx / d) % dim;  // class index along the softmax axis
int64_t idx_d = idx % d;            // remainder index (0 when the softmax axis is last)
int64_t ids = idx_n * d + idx_d;    // index into labels / loss_grad

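// Hard-label backward: in the common case logits_grad = softmax * dloss; the
// label-class and ignore_index special cases are handled in the collapsed
// part of this hunk.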
if (idx < n * dim * d) {
auto lbl = static_cast<int64_t>(labels[ids]);
@@ -135,6 +192,12 @@
} else {
logits_grad[idx] = softmax[idx] * loss_grad[ids];
}
printf("idx: %d, softmax:%f logits_grad:%f loss_grad:%f\n",
idx,
softmax[idx],
logits_grad[idx],
loss_grad[ids]);
// printf("%f %f\n", softmax_val, logit_grad);
}
}

@@ -215,9 +278,19 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
logit_grad_data, loss_grad_data, label_data, n, d, remain);
} else {
// std::cout << "****** logits_grad *******" << std::endl;
// std::cout << DebugString<T>(*logits_grad, dev_ctx) << std::endl;
std::cout << "****** softmax *******" << std::endl;
std::cout << DebugString<T>(softmax, dev_ctx) << std::endl;
std::cout << "****** loss_grad *******" << std::endl;
std::cout << "loss_grad numel: " << loss_grad.numel() << std::endl;
std::cout << DebugString<T>(loss_grad, dev_ctx) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(label, dev_ctx) << std::endl;
const T* softmax_data = softmax.data<T>();
const auto* label_data = label.data<LabelT>();
int grid = (n * d + block - 1) / block;
// cudaDeviceSynchronize();
SoftmaxWithCrossEntropyGradHardLabel<T>
<<<grid, block, 0, stream>>>(logit_grad_data,
loss_grad_data,
Expand All @@ -227,6 +300,9 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
d / remain,
remain,
ignore_index);
// cudaDeviceSynchronize();
// std::cout << "****** logits_grad_final *******" << std::endl;
// std::cout << DebugString<T>(*logits_grad, dev_ctx) << std::endl;
}
}

