Merge branch 'master' into nnvm-tf-pack

apache · Aug 10, 2018 · db2705b · db2705b
2 parents 1f467c7 + 48fc410
commit db2705b
Show file tree

Hide file tree

Showing 29 changed files with 484 additions and 127 deletions.
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -18,6 +18,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Yuwei Hu](https://github.com/Huyuwei) TOPI
 - [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
 - [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
 - [Masahiro Masuda](https://github.com/masahi)
@@ -27,7 +28,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
-- [Lianmin Zheng](https://github.com/merrymercy)
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)

diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
 python android_rpc_test.py
 ```
 
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = False'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in  [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute.
+On my test device, it gives following results.
 
 ```bash
-TVM: Initializing cython mode...
-[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple'
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500
-[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-Run GPU test ...
-0.000155807 secs/op
 Run CPU test ...
-0.00139824 secs/op
+0.000962932 secs/op
+
+Run GPU(OpenCL Flavor) test ...
+0.000155807 secs/op
+
+[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER
+
+Stack trace returned 10 entries:
+[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3]
+.........
+
+You can still compile vulkan module but cannot run locally
+Run GPU(Vulkan Flavor) test ...
+0.000225198 secs/op
 ```
 
 You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -1,9 +1,9 @@
 ifndef config
-	ifneq ("$(wildcard ./config.mk)","")
-	  config ?= config.mk
-	else
-	  config ?= make/config.mk
-	endif
+    ifneq ("$(wildcard ./config.mk)","")
+        config ?= config.mk
+    else
+        config ?= make/config.mk
+    endif
 endif
 
 include $(config)
@@ -16,10 +16,10 @@ APP_STL := c++_static
 
 APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
-	APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+    APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
 
 ifeq ($(USE_VULKAN), 1)
-	APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
-	APP_LDFLAGS += -lvulkan
+    APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+    APP_LDFLAGS += -lvulkan
 endif
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
@@ -21,59 +21,92 @@
 arch = "arm64"
 target = "llvm -target=%s-linux-android" % arch
 
+# whether enable to execute test on OpenCL target
+test_opencl = False
+# whether enable to execute test on Vulkan target
+test_vulkan = False
+
 def test_rpc_module():
     # graph
     n = tvm.convert(1024)
     A = tvm.placeholder((n,), name='A')
     B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    a_np = np.random.uniform(size=1024).astype(A.dtype)
     temp = util.tempdir()
-    s = tvm.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
-    # Build the dynamic lib.
-    # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
-    path_dso1 = temp.relpath("dev_lib2.so")
-    f.export_library(path_dso1, ndk.create_shared)
 
+    # Establish remote connection with target hardware
+    tracker = rpc.connect_tracker(tracker_host, tracker_port)
+    remote = tracker.request(key, priority=0,
+                             session_timeout=60)
+
+    # Compile the Graph for CPU target
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=64)
     s[B].parallel(xi)
     s[B].pragma(xo, "parallel_launch_point")
     s[B].pragma(xi, "parallel_barrier_when_finish")
     f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso2, ndk.create_shared)
-
-    tracker = rpc.connect_tracker(tracker_host, tracker_port)
-    remote = tracker.request(key, priority=0,
-                             session_timeout=60)
+    path_dso_cpu = temp.relpath("cpu_lib.so")
+    f.export_library(path_dso_cpu, ndk.create_shared)
 
+    # Execute the portable graph on cpu target
     print('Run CPU test ...')
     ctx = remote.cpu(0)
-    remote.upload(path_dso2)
+    remote.upload(path_dso_cpu)
     f2 = remote.load_module("cpu_lib.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
     cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
+    print('%g secs/op\n' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    # Compile the Graph for OpenCL target
+    if test_opencl:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
+        path_dso_cl = temp.relpath("dev_lib_cl.so")
+        f.export_library(path_dso_cl, ndk.create_shared)
+
+        print('Run GPU(OpenCL Flavor) test ...')
+        ctx = remote.cl(0)
+        remote.upload(path_dso_cl)
+        f1 = remote.load_module("dev_lib_cl.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+    # Compile the Graph for Vulkan target
+    if test_vulkan:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
+        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
+        f.export_library(path_dso_vulkan, ndk.create_shared)
+
+        print('Run GPU(Vulkan Flavor) test ...')
+        ctx = remote.vulkan(0)
+        remote.upload(path_dso_vulkan)
+        f1 = remote.load_module("dev_lib_vulkan.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    print('Run GPU test ...')
-    ctx = remote.cl(0)
-    remote.upload(path_dso1)
-    f1 = remote.load_module("dev_lib2.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
-    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
-    cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
@@ -116,3 +116,60 @@ int main()
     return 0;
 }
 ```
+
+## Deploy as System Module
+C++ additionally support deployment as system module.
+This process need few additional options as given below to NNVM build.
+
+- For target llvm append --system-lib as ```target=llvm --system-lib```
+- For a GPU build (or non llvm) the additional option should be given to targat_host as ```target_host=llvm --system-lib```
+
+Module export require additional options for not to compile but save as ```lib.export_library (path, fcompile=False)```
+
+The output of above API is a tar compressed file containing object file ```(lib.o)``` and cpp source file ```(devc.cc)``` which embeds device blob. Thease two files should be compiled along with other files or objects while building c++ application.
+Please refer to [Makefile](https://github.com/dmlc/tvm/tree/master/apps/howto_deploy/Makefile#L32) for a reference.
+
+The c++ code to load this system module require the below change.
+
+```cpp
+    // tvm module for compiled functions
+    tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))();
+```
+
+Based on the build environment the system object, device blob source should be included in the final executable. An example with bazel build is given below.
+```bash
+cc_library(
+    name = "host_module",
+    srcs = ["lib.o"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "device_module",
+    srcs = ["devc.cc"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "tvm_runtime",
+    srcs = ["libtvm_runtime_pack.cc"],
+)
+
+cc_binary(
+    name = "bazel_deploy",
+    srcs = ["cpp_deploy.cc"],
+    deps = [
+        ":tvm_runtime", ":host_module", ":device_module"
+    ],
+    linkopts = [ "-lpthread -ldl" ]
+)
+
+```
+
+This build directive creates
+- new library ```host_module``` out of ```lib.o```
+- new library ```device_module``` out of ```devc.cc```
+
+These intermediate modules can be used as a dependency to final deploy application.
+
+In bazel ```alwayslink=1``` enforce embedding entire lib into application (even though it doesn't call any API from this module).
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
@@ -30,6 +30,7 @@ public class TVMContext {
     MASK2STR.put(1, "cpu");
     MASK2STR.put(2, "gpu");
     MASK2STR.put(4, "opencl");
+    MASK2STR.put(7, "vulkan");
     MASK2STR.put(8, "metal");
     MASK2STR.put(9, "vpi");
 
@@ -38,6 +39,7 @@ public class TVMContext {
     STR2MASK.put("cuda", 2);
     STR2MASK.put("cl", 4);
     STR2MASK.put("opencl", 4);
+    STR2MASK.put("vulkan", 7);
     STR2MASK.put("metal", 8);
     STR2MASK.put("vpi", 9);
   }
@@ -81,6 +83,19 @@ public static TVMContext opencl() {
     return opencl(0);
   }
 
+  /**
+   * Construct a Vulkan device.
+   * @param devId The device id
+   * @return The created context
+   */
+  public static TVMContext vulkan(int devId) {
+    return new TVMContext(7, devId);
+  }
+
+  public static TVMContext vulkan() {
+    return vulkan(0);
+  }
+
   /**
    * Construct a metal device.
    * @param devId The device id

diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
@@ -143,6 +143,24 @@ public TVMContext cl() {
     return cl(0);
   }
 
+  /**
+   * Construct remote OpenCL device.
+   * @param devId device id.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan(int devId) {
+    return context(7, devId);
+  }
+
+  /**
+   * Construct remote OpenCL device.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan() {
+    return vulkan(0);
+  }
+
+
   /**
    * Construct remote Metal device.
    * @param devId device id.

diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
   int groups;
   std::string layout;
   std::string kernel_layout;
+  int out_dtype;
   bool use_bias;
 
   DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
       .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
                 "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                 "dimensions respectively.");
+    DMLC_DECLARE_DTYPE_FIELD(out_dtype)
+        .add_enum("same", -1)
+        .set_default(-1)
+        .describe("Output data type, set to explicit type under mixed precision setting");
     DMLC_DECLARE_FIELD(use_bias).set_default(true)
       .describe("Whether the layer uses a bias vector.");
   }

diff --git a/nnvm/python/nnvm/testing/dcgan.py b/nnvm/python/nnvm/testing/dcgan.py
@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
 
 def get_symbol(oshape, ngf=128, code=None):
     """get symbol of dcgan generator"""
-    assert oshape[-1] == 32, "Only support 32x32 image"
-    assert oshape[-2] == 32, "Only support 32x32 image"
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
 
     code = sym.Variable("data") if code is None else code
-    net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False)
+    net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False)
     net = sym.relu(net)
     # 4 x 4
-    net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
+    net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
     # 8 x 8
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
     # 16x16
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
     # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
     net = deconv2d(
-        net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
     net = sym.tanh(net)
     return net
 
 
-def get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"):
+def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
     """Get benchmark workload for a DCGAN generator
 
     Parameters

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _):
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
+    out_dtype = attrs.get_string("out_dtype")
     layout = attrs["layout"]
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+
     assert layout == "NCHW", "only support nchw for now"
     assert dilation == (1, 1), "not support dilate now"
     assert groups == 1, "only support groups == 1 for now"
-    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding)
+
+    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
         bias = topi.expand_dims(bias, axis=1, num_newaxis=2)