
Commit

temporary save
pkuzyc committed Nov 9, 2023
1 parent 70ed633 commit b165424
Showing 3 changed files with 268 additions and 10 deletions.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include <stdio.h>

// Headers used by the PrintValue/DebugString helpers below.
#include <iomanip>
#include <limits>
#include <sstream>
#include <string>
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"

@@ -44,6 +45,64 @@ static inline int64_t NumBlocks(const int64_t N) {
kNumMaxinumNumBlocks);
}

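// Debug helper added in this temporary commit: formats a single scalar with
// full precision; integral values are widened first so int8/uint8 print as
// numbers instead of characters.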
template <typename T>
std::string PrintValue(const T& value) {
std::stringstream ss;
if (std::is_floating_point<T>::value) {
ss << std::showpoint;
}
ss << std::setprecision(std::numeric_limits<T>::max_digits10);

if (std::is_integral<T>::value) {
if (std::is_unsigned<T>::value) {
ss << static_cast<uint64_t>(value);
} else {
ss << static_cast<int64_t>(value);
}
} else {
ss << value;
}
return ss.str();
}

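// Debug helper added in this temporary commit: copies `tensor` to CPU and
// returns a string with its first and last `print_num` elements (all of them
// when the tensor has at most 2 * print_num entries).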
template <typename T>
std::string DebugString(const phi::DenseTensor& tensor) {
// auto* src = tensor->data<T>();
phi::DenseTensor tmp;
framework::TensorCopySync(tensor, platform::CPUPlace(), &tmp);

std::stringstream ss;
ss << "pir print data=[";
size_t numel = tmp.numel();
const T* data = tmp.data<T>();
size_t print_num = 20L;

if (numel <= 2 * print_num) {
for (size_t i = 0; i < numel; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
} else {
for (size_t i = 0; i < print_num; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
ss << ", ... , ";
for (size_t i = numel - print_num; i < numel; ++i) {
ss << PrintValue(data[i]);
if (i != numel - 1) {
ss << ", ";
}
}
}
ss << "]";
return ss.str();
}

template <typename T, typename IndexT>
__global__ void MaskLabelByIndex(T* predicted_logits,
const T* logit,
@@ -54,6 +113,8 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
const int64_t N,
const int64_t D,
const int nranks) {
// printf("start_index:%lld end_index:%lld N:%lld D:%lld\n", start_index,
// end_index, N, D);
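// Each thread handles one sample: when its label falls inside this rank's
// shard [start_index, end_index), the matching logit is copied into
// predicted_logits.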
CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
auto real_label = label[i];
PADDLE_ENFORCE(((real_label < D * nranks) && (real_label >= 0)) ||
@@ -66,7 +127,11 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
static_cast<int64_t>(ignore_index),
static_cast<int64_t>(real_label));

// printf("i:%d start_index:%lld end_index:%lld N:%lld D:%lld\n", i,
// start_index, end_index, N, D);
if (real_label >= start_index && real_label < end_index) {
// printf("in_if i:%d label:%u start_index:%lld end_index:%lld N:%lld
// D:%lld\n", i, real_label, start_index, end_index, N, D);
predicted_logits[i] = logit[i * D + real_label - start_index];
}
}
@@ -103,13 +168,16 @@ __global__ void MaskLabelByIndexGrad(T* logits_grad,
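// Per-element gradient: zero for ignored labels; on the label column owned by
// this rank the stored value has 1 subtracted before scaling by dloss; every
// other column is simply scaled by dloss.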
auto row = i / D;
auto col = i % D;
auto lbl = static_cast<int64_t>(labels[row]);
// printf("i:%lld label:%lld logits_grad_before:%f loss_grad:%f\n", i, lbl,
// logits_grad[i], loss_grad[row]);
if (lbl == ignore_index) {
logits_grad[i] = static_cast<T>(0.0);
} else if ((col + start_index) == labels[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
} else {
logits_grad[i] *= loss_grad[row];
}
// printf("i:%lld label:%lld logits_grad:%f\n", i, lbl, logits_grad[i]);
}
}

@@ -141,6 +209,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
const int rid = ctx.Attr<int>("ring_id");
const int nranks = ctx.Attr<int>("nranks");
const int rank = ctx.Attr<int>("rank");
// std::cout << "****** logits *******" << std::endl;
// std::cout << DebugString<T>(*logits) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(*labels) << std::endl;

const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
@@ -225,6 +297,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
stream));
}

// std::cout << "****** logits_max *******" << std::endl;
// std::cout << DebugString<T>(logits_max) << std::endl;

// step 2, obtain logit - logit_max
Eigen::DSizes<int, 2> batch_by_one(N, 1);
Eigen::DSizes<int, 2> one_by_class(1, D);
@@ -249,6 +324,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
int threads = kNumCUDAThreads;
const auto& label_type = framework::TransToProtoVarType(labels->dtype());

// printf("N:%d D:%d start:%d end:%d\n", N, D, start_index, end_index);
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(),
@@ -334,10 +410,18 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
ignore_index,
N);
}
// std::cout << "****** loss_2d *******" << std::endl;
// std::cout << DebugString<T>(loss_2d) << std::endl;
// std::cout << "****** predicted_logits *******" << std::endl;
// std::cout << DebugString<T>(predicted_logits) << std::endl;
// std::cout << "****** sun_exp *******" << std::endl;
// std::cout << DebugString<T>(sum_exp_logits) << std::endl;

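// Normalize: divide the shifted-exponential values held in eigen_softmax by
// the summed exponentials to obtain the final softmax.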
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_softmax *
eigen_sum_exp_logits.inverse().broadcast(one_by_class));
// std::cout << "****** softmax *******" << std::endl;
// std::cout << DebugString<T>(*softmax) << std::endl;
}
};

@@ -517,6 +601,15 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
const int64_t start_index = rank * D;
const int64_t end_index = start_index + D;

// std::cout << "****** logits_grad *******" << std::endl;
// std::cout << DebugString<T>(*logit_grad) << std::endl;
// std::cout << "****** softmax *******" << std::endl;
// std::cout << DebugString<T>(*softmax) << std::endl;
// std::cout << "****** loss_grad *******" << std::endl;
// std::cout << DebugString<T>(*loss_grad) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(*labels) << std::endl;

if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndexGrad<T, int32_t>
<<<blocks, threads, 0, dev_ctx.stream()>>>(logit_grad_2d.data<T>(),
@@ -538,6 +631,8 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
D,
ignore_index);
}
// std::cout << "****** logits_grad_final *******" << std::endl;
// std::cout << DebugString<T>(*logit_grad) << std::endl;
}
};

84 changes: 80 additions & 4 deletions paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
@@ -36,6 +36,63 @@ namespace cub = hipcub;

namespace phi {

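// Same scalar-formatting debug helper as the one added in the collective
// softmax-with-cross-entropy op above.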
template <typename T>
std::string PrintValue(const T& value) {
std::stringstream ss;
if (std::is_floating_point<T>::value) {
ss << std::showpoint;
}
ss << std::setprecision(std::numeric_limits<T>::max_digits10);

if (std::is_integral<T>::value) {
if (std::is_unsigned<T>::value) {
ss << static_cast<uint64_t>(value);
} else {
ss << static_cast<int64_t>(value);
}
} else {
ss << value;
}
return ss.str();
}

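// Debug helper: blocking-copies `tensor` to the CPU with phi::Copy and prints
// its first and last `print_num` elements.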
template <typename T, typename Context>
std::string DebugString(const DenseTensor& tensor, const Context& dev_ctx) {
DenseTensor tmp;
phi::Copy(dev_ctx, tensor, CPUPlace(), true, &tmp);

std::stringstream ss;
ss << "adamw print data=[";
size_t numel = tmp.numel();
const T* data = tmp.data<T>();
size_t print_num = 20L;

if (numel <= 2 * print_num) {
for (size_t i = 0; i < numel; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
} else {
for (size_t i = 0; i < print_num; ++i) {
if (i > 0) {
ss << ", ";
}
ss << PrintValue(data[i]);
}
ss << ", ... , ";
for (size_t i = numel - print_num; i < numel; ++i) {
ss << PrintValue(data[i]);
if (i != numel - 1) {
ss << ", ";
}
}
}
ss << "]";
return ss.str();
}

template <typename T>
__global__ void SoftLabelCrossEntropyGradientKernel(T* logit_grad,
const T* loss_grad,
@@ -121,10 +178,10 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel(T* logits_grad,
const int64_t d,
const int ignore_index) {
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
int64_t idx_n = idx / (d * dim);
int64_t idx_dim = (idx / d) % dim;
int64_t idx_d = idx % d;
int64_t ids = idx_n * d + idx_d;
int64_t idx_n = idx / (d * dim);    // sample (row) index
int64_t idx_dim = (idx / d) % dim;  // class index along the softmax axis
int64_t idx_d = idx % d;            // remainder index (0 when the softmax axis is last)
int64_t ids = idx_n * d + idx_d;    // index into labels / loss_grad

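// Hard-label backward: in the common case logits_grad = softmax * dloss; the
// label-class and ignore_index special cases are handled in the collapsed
// part of this hunk.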
if (idx < n * dim * d) {
auto lbl = static_cast<int64_t>(labels[ids]);
@@ -135,6 +192,12 @@
} else {
logits_grad[idx] = softmax[idx] * loss_grad[ids];
}
printf("idx: %d, softmax:%f logits_grad:%f loss_grad:%f\n",
idx,
softmax[idx],
logits_grad[idx],
loss_grad[ids]);
// printf("%f %f\n", softmax_val, logit_grad);
}
}

@@ -215,9 +278,19 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
logit_grad_data, loss_grad_data, label_data, n, d, remain);
} else {
// std::cout << "****** logits_grad *******" << std::endl;
// std::cout << DebugString<T>(*logits_grad, dev_ctx) << std::endl;
std::cout << "****** softmax *******" << std::endl;
std::cout << DebugString<T>(softmax, dev_ctx) << std::endl;
std::cout << "****** loss_grad *******" << std::endl;
std::cout << "loss_grad numel: " << loss_grad.numel() << std::endl;
std::cout << DebugString<T>(loss_grad, dev_ctx) << std::endl;
// std::cout << "****** labels *******" << std::endl;
// std::cout << DebugString<int64_t>(label, dev_ctx) << std::endl;
const T* softmax_data = softmax.data<T>();
const auto* label_data = label.data<LabelT>();
int grid = (n * d + block - 1) / block;
// cudaDeviceSynchronize();
SoftmaxWithCrossEntropyGradHardLabel<T>
<<<grid, block, 0, stream>>>(logit_grad_data,
loss_grad_data,
Expand All @@ -227,6 +300,9 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
d / remain,
remain,
ignore_index);
// cudaDeviceSynchronize();
// std::cout << "****** logits_grad_final *******" << std::endl;
// std::cout << DebugString<T>(*logits_grad, dev_ctx) << std::endl;
}
}

