PaddlePaddle · reyoung · Sep 19, 2017 · Sep 19, 2017 · Sep 19, 2017
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
@@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
@@ -14,48 +14,69 @@
 
 #pragma once
 
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 #include "paddle/platform/place.h"
 
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
+#include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/platform/details/device_ptr_cast.h"
 #endif
 
 namespace paddle {
 namespace platform {
 // Transform on host or device. It provides the same API in std library.
-template <typename Place, typename InputIter, typename OutputIter,
-          typename UnaryOperation>
-void Transform(Place place, InputIter first, InputIter last, OutputIter result,
-               UnaryOperation op) {
+//
+// Unary form: applies `op` to every element of [first, last) and stores the
+// outputs starting at `result`. The backend is picked at run time from
+// `context.GetPlace()`: a CPU place runs std::transform; any other place runs
+// thrust::transform on the context's CUDA stream when this header is compiled
+// by NVCC, and hits PADDLE_THROW otherwise.
+template <typename InputIter, typename OutputIter, typename UnaryOperation>
+void Transform(const DeviceContext& context, InputIter first, InputIter last,
+               OutputIter result, UnaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first, last, result, op);
   } else {
 #ifdef __NVCC__
+    // A non-CPU place is taken to mean `context` is a CUDADeviceContext; use
+    // static_cast for the downcast so the conversion is checked against the
+    // class hierarchy instead of reinterpreting the object's bytes.
+    auto& ctx = static_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
-                      op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
+                      DevPtrCast(last), DevPtrCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
   }
 }
 
-template <typename Place, typename InputIter1, typename InputIter2,
-          typename OutputIter, typename BinaryOperation>
-void Transform(Place place, InputIter1 first1, InputIter1 last1,
-               InputIter2 first2, OutputIter result, BinaryOperation op) {
+// Binary form: combines each element of [first1, last1) with the element at
+// the same offset from `first2` using `op`, storing the outputs starting at
+// `result`. Backend selection follows the same rule as the unary form above.
+template <typename InputIter1, typename InputIter2, typename OutputIter,
+          typename BinaryOperation>
+void Transform(const DeviceContext& context, InputIter1 first1,
+               InputIter1 last1, InputIter2 first2, OutputIter result,
+               BinaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first1, last1, first2, result, op);
   } else {
 #ifdef __NVCC__
+    // Same hierarchy-aware downcast as in the unary overload: a non-CPU place
+    // is taken to mean `context` is a CUDADeviceContext.
+    auto& ctx = static_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
-                      DevPtrCast(result), op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1),
+                      DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result),
+                      op);
 #else
     PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif

diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
@@ -36,8 +36,9 @@ class Multiply {
 
 TEST(Transform, CPUUnary) {
   using namespace paddle::platform;
+  CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform(CPUPlace(), buf, buf + 4, buf, Scale<float>(10));
+  Transform(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
   }
@@ -47,10 +48,12 @@ TEST(Transform, GPUUnary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
@@ -62,7 +65,7 @@ TEST(Transform, CPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform(CPUPlace(), buf, buf + 4, buf, buf, Multiply<int>());
+  Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
   }
@@ -73,9 +76,11 @@ TEST(Transform, GPUBinary) {
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {