Merge branch 'meta-project:main' into multi-comm
Tonny-Gu authored Feb 27, 2022
2 parents efea2ff + df572d2 commit c924e80
Showing 40 changed files with 587 additions and 455 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci_lint.yml
@@ -21,7 +21,7 @@ jobs:
if: github.repository == 'meta-project/meta'
runs-on: self-hosted
container:
-image: metaprojdev/meta:ci_cpu-v0.18
+image: metaprojdev/raf:ci_cpu-v0.18
steps:
- name: Checkout repository
uses: actions/checkout@v2
36 changes: 34 additions & 2 deletions .github/workflows/ci_unit_test.yml
@@ -20,8 +20,8 @@ jobs:
if: github.repository == 'meta-project/meta'
runs-on: self-hosted
outputs:
cpu_image: "metaprojdev/meta:ci_cpu-v0.18"
gpu_image: "metaprojdev/meta:ci_gpu-v0.20"
cpu_image: "metaprojdev/raf:ci_cpu-v0.18"
gpu_image: "metaprojdev/raf:ci_gpu-v0.20"
skip_ci: ${{ steps.job_info.outputs.skip_ci }}
ref: ${{ steps.job_info.outputs.ref }}
repo: ${{ steps.job_info.outputs.repo }}
@@ -205,3 +205,35 @@ jobs:
--command "bash ./ci/batch/cli.sh config_cmake GPU 75 &&
bash ./ci/batch/cli.sh compile build multi-GPU ${{ needs.check_status.outputs.job_tag }} &&
bash ./ci/batch/cli.sh unit_test multi-GPU"
+update_ci_badge:
+needs: [test_on_cpu, test_on_gpu, test_on_multi_gpu]
+if: github.repository == 'meta-project/meta'
+runs-on: self-hosted
+steps:
+- uses: haya14busa/action-workflow_run-status@v1
+- name: Checkout repository
+# No need to checkout submodules because we only need to get the HEAD commit hash.
+uses: actions/checkout@v2
+- name: Generate CI badge
+id: badge
+run: |
+# env vars are unavailable in job.if so we have to implement it here.
+if [ "${{ needs.check_status.outputs.pr }}" != "" ]; then
+echo "No need to update badge for PR CI. Skip."
+exit 0
+fi
+head_commit=$(git rev-parse --short HEAD)
+echo "::set-output name=gist_id::630a36600930c8d68e6b15f16333b532"
+echo "::set-output name=message::${head_commit}"
+- name: Update CI badge
+# Intentionally fail this step with empty gist_id.
+uses: schneegans/dynamic-badges-action@v1.1.0
+continue-on-error: true
+with:
+auth: ${{ secrets.DEPLOY_ACCESS_TOKEN }}
+gistID: ${{ steps.badge.outputs.gist_id }}
+filename: raf-ci-badge-last-pass.json
+label: CI-Last-Success
+message: ${{ steps.badge.outputs.message }}
+color: blue
2 changes: 1 addition & 1 deletion .github/workflows/deploy_docker.yml
@@ -35,7 +35,7 @@ jobs:
with:
context: docker
file: docker/Dockerfile.${{ github.event.inputs.type }}
-tags: metaprojdev/meta:${{ github.event.inputs.type }}-${{ github.event.inputs.tag }}
+tags: metaprojdev/raf:${{ github.event.inputs.type }}-${{ github.event.inputs.tag }}
push: true
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
10 changes: 3 additions & 7 deletions README.md
@@ -1,14 +1,10 @@
<!--- Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -->
<!--- SPDX-License-Identifier: Apache-2.0 -->

-RAF
-===

-[![CI-Lint](https://github.com/meta-project/meta/actions/workflows/ci_lint.yml/badge.svg)](https://github.com/meta-project/meta/actions/workflows/ci_lint.yml)
-[![CI-UnitTest](https://github.com/meta-project/meta/actions/workflows/ci_unit_test.yml/badge.svg)](https://github.com/meta-project/meta/actions/workflows/ci_unit_test.yml)
+RAF: RAF Accelerates deep learning Frameworks
+=============================================

![CI-Lass-Pass](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/aire-meta-bot/630a36600930c8d68e6b15f16333b532/raw/raf-ci-badge-last-pass.json)


Please refer to our [wiki](docs/wiki) for more information.


6 changes: 3 additions & 3 deletions ci/batch/README.md
@@ -167,7 +167,7 @@ AWS Batch has to be properly configured to make the above flow working as expected
"type": "container",
"parameters": {},
"containerProperties": {
"image": "metaprojdev/meta:ci_gpu-v0.20",
"image": "metaprojdev/raf:ci_gpu-v0.20",
"command": [],
"jobRoleArn": ***,
"executionRoleArn": ***,
@@ -210,7 +210,7 @@ AWS Batch has to be properly configured to make the above flow working as expected
"type": "container",
"parameters": {},
"containerProperties": {
"image": "metaprojdev/meta:ci_cpu-v0.18",
"image": "metaprojdev/raf:ci_cpu-v0.18",
"command": [],
"jobRoleArn": ***,
"executionRoleArn": ***,
@@ -249,7 +249,7 @@ AWS Batch has to be properly configured to make the above flow working as expected
"type": "container",
"parameters": {},
"containerProperties": {
"image": "metaprojdev/meta:ci_gpu-v0.20",
"image": "metaprojdev/raf:ci_gpu-v0.20",
"command": [],
"jobRoleArn": ***,
"executionRoleArn": ***,
2 changes: 1 addition & 1 deletion ci/batch/backup-ccache.sh
@@ -10,7 +10,7 @@ MODE=$1 # upload or download
PLATFORM=$2 # CPU, GPU, or multi-GPU
TAG=$3 # e.g., refs/heads/main, pr-7

S3_BUCKET="ci-meta"
S3_BUCKET="ci-raf"
S3_FOLDER=`echo cache-${TAG} | sed 's/\//_/g'`
S3_PATH="s3://$S3_BUCKET/$S3_FOLDER"

2 changes: 1 addition & 1 deletion docker/batch/entry.sh
@@ -13,7 +13,7 @@ SOURCE_REF=$1
REPO=$2
COMMAND=$3
SAVE_OUTPUT=$4
-REMOTE_FOLDER=$5 # e.g., s3://ci-meta/pr-7
+REMOTE_FOLDER=$5 # e.g., s3://ci-raf/pr-7

echo "Job Info"
echo "-------------------------------------"
2 changes: 1 addition & 1 deletion docker/push.sh
@@ -31,7 +31,7 @@ PASSWORD="$1"
shift 1

LOCAL_IMAGE_NAME=raf.${CONTAINER_TYPE}:latest
-REMOTE_IMAGE_NAME=${DOCKER_HUB_ACCOUNT}/meta:${CONTAINER_TYPE}-${VERSION}
+REMOTE_IMAGE_NAME=${DOCKER_HUB_ACCOUNT}/raf:${CONTAINER_TYPE}-${VERSION}

echo "Login docker hub"
docker login -u ${DOCKER_HUB_ACCOUNT} -p ${PASSWORD}
4 changes: 2 additions & 2 deletions docs/wiki/3_dev_guide/Memory-Pool.md
@@ -7,7 +7,7 @@ This document introduces the Memory Pool of RAF.

## Strategies

-Currently, there are two types of memory pool in meta: (1) no_pool, (2) page_unit_pool.
+Currently, there are two types of memory pool in RAF: (1) no_pool, (2) page_unit_pool.
By default, we choose page_unit_pool as our memory pool, which could bring down the running time by almost 50% for rn50/vgg/etc compared with no_pool.

The memory usage of these two strategies are similar. Here is an experiment on ResNet50 with Tesla T4 (15109MB)
@@ -115,4 +115,4 @@ Then you can create the Pool Class that derived from `raf::memory_pool::MemoryPool`
Remember to register your pool in the cpp file you created, the code should be like:
`RAF_REGISTER_GLOBAL("raf.memory_pool._make.your_pool").set_body_typed(YourPool::make);`

-After re-make meta, you can enable your pool by calling `InitPool(contxt, pool_name)`.
+After re-make RAF, you can enable your pool by calling `InitPool(contxt, pool_name)`.
7 changes: 4 additions & 3 deletions include/raf/op_utils.h
@@ -121,7 +121,7 @@ inline bool IsInOpSet(const Expr& op, const OpSet& op_set) {
inline bool IsReshapeOp(const Op& op) {
static std::unordered_set<Op, ObjectPtrHash, ObjectPtrEqual> reshape_ops{
Op::Get("raf.op.reshape"), Op::Get("raf.op.expand_dims"), Op::Get("raf.op.squeeze"),
Op::Get("raf.op.batch_flatten")};
Op::Get("raf.op.batch_flatten"), Op::Get("raf.op.reshape_like")};
return IsInOpSet(op, reshape_ops);
}

@@ -179,8 +179,9 @@ inline Array<tvm::PrimExpr> GetShapeExprFromValue(const Value& value) {
ICHECK(value.defined());
Array<tvm::PrimExpr> shape;
if (auto ttv = value.as<TensorTypeValueObj>()) {
-auto ndim = ttv->type->shape.size();
-for (size_t i = 0; i < ndim; ++i) {
+auto ndim = ttv->type->shape[0].as<ir::IntImmNode>();
+ICHECK(ndim) << "Expected IntImm, but got " << ttv->type->shape[0]->GetTypeKey();
+for (size_t i = 0; i < ndim->value; ++i) {
shape.push_back(Any());
}
} else {
1 change: 1 addition & 0 deletions python/raf/_tvm_op/transform.py
@@ -129,6 +129,7 @@ def fcompute(*args):
_reg.register_injective_schedule("raf.op.tvm.batch_flatten")
_reg.register_injective_schedule("raf.op.tvm.arange")
_reg.register_injective_schedule("raf.op.tvm.strided_slice")
+_reg.register_reduce_schedule("raf.op.tvm.collapse_sum_like")


@register_compute("raf.op.tvm.take_dx")
1 change: 1 addition & 0 deletions python/raf/amp/type_hints.py
@@ -203,6 +203,7 @@ def _gen(args, ret_type, amp_dtype):
register_op_cast_rule("raf.op.trunc", infer_cast(1))
register_op_cast_rule("raf.op.mesh_grid", infer_cast(2))
register_op_cast_rule("raf.op.reshape", infer_cast(1))
register_op_cast_rule("raf.op.reshape_like", infer_cast(1))
register_op_cast_rule("raf.op.resize2d", infer_cast(1))
register_op_cast_rule("raf.op.ndarray_size", infer_cast(1))
register_op_cast_rule("raf.op.transpose", infer_cast(1))
112 changes: 112 additions & 0 deletions python/raf/testing/mlp.py
@@ -0,0 +1,112 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""MLP model"""
# pylint: disable=protected-access, attribute-defined-outside-init, too-many-locals
# pylint: disable=missing-class-docstring, too-many-arguments, missing-function-docstring
import torch.nn as nn
import torch.nn.functional as F

import raf
from raf.model import Linear
from .common import check, randn_torch, t2m_param, one_hot_torch
from .utils import get_param, set_param


class TorchMlp(nn.Module): # pylint: disable=abstract-method
def __init__(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2):
super(TorchMlp, self).__init__()
self.fc1 = nn.Linear(num_inputs, num_hiddens1)
self.fc2 = nn.Linear(num_hiddens1, num_hiddens2)
self.fc3 = nn.Linear(num_hiddens2, num_outputs)

def forward_infer(self, x):
y = self.fc1(x)
y = F.relu(y)
y = self.fc2(y)
y = F.relu(y)
y = self.fc3(y)
return y

def forward(self, x, y_true=None): # pylint: disable=arguments-differ
y = self.forward_infer(x)
if self.training:
y_pred = F.log_softmax(y, dim=-1)
loss = F.nll_loss(y_pred, y_true)
return loss
return y


class RAFMlp(raf.Model):
# pylint: disable=attribute-defined-outside-init
def build(self, num_inputs, num_outputs, num_hiddens1, num_hiddens2):
self.fc1 = Linear(num_inputs, num_hiddens1)
self.fc2 = Linear(num_hiddens1, num_hiddens2)
self.fc3 = Linear(num_hiddens2, num_outputs)

@raf.model.trace
def forward_infer(self, x):
y = self.fc1(x)
y = raf.relu(y)
y = self.fc2(y)
y = raf.relu(y)
y = self.fc3(y)
return y

@raf.model.trace
def forward(self, x, y_true):
y = self.forward_infer(x)
y_pred = raf.log_softmax(y)
loss = raf.nll_loss(y_true, y_pred)
return loss


def _param_map(t_model):
"""maps from m_model parameter name to t_model parameter value"""
res = {
"fc1.w": t_model.fc1.weight,
"fc1.b": t_model.fc1.bias,
"fc2.w": t_model.fc2.weight,
"fc2.b": t_model.fc2.bias,
"fc3.w": t_model.fc3.weight,
"fc3.b": t_model.fc3.bias,
}
return res


def _init(m_model, t_model, device="cpu"):
"""initialize meta model with parameters of torch model"""
# pylint: disable=no-member, line-too-long, too-many-statements
for m_name, t_w in _param_map(t_model).items():
set_param(m_model, m_name, t2m_param(t_w, device=device))


def check_params(m_model, t_model, atol=1e-4, rtol=1e-4):
"""check the parameters of m_model and t_model"""
# pylint: disable=no-member, line-too-long, too-many-statements
for m_name, t_w in _param_map(t_model).items():
m_w = get_param(m_model, m_name)
check(m_w, t_w, atol=atol, rtol=rtol)


def get_model(config, train=True):
"""get MLP model"""
m_model = RAFMlp(*config)
t_model = TorchMlp(*config)
_init(m_model, t_model)
if train:
m_model.train_mode()
t_model.train()
else:
m_model.infer_mode()
t_model.eval()
return m_model, t_model


def get_input(config, batch_size=1, device="cpu", train=True):
"""get MLP input"""
m_x, t_x = randn_torch([batch_size, config[0]], device=device, requires_grad=True)
if not train:
return [(m_x,), (t_x,)]
m_y, t_y = one_hot_torch(batch_size, num_classes=config[1], device=device)
return [(m_x, m_y), (t_x, t_y)]
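
The new `python/raf/testing/mlp.py` module is a test utility: it builds a RAF MLP and a matching PyTorch MLP, copies the PyTorch weights into the RAF model, and hands back paired inputs for parity tests. A minimal usage sketch follows (not part of this commit; the layer sizes, batch size, and test flow are illustrative assumptions):

```python
# Illustrative sketch of how the helpers above are meant to be used in a parity test.
# The config values and batch size are arbitrary; device defaults to "cpu".
from raf.testing.mlp import get_model, get_input, check_params

config = (784, 10, 256, 128)  # num_inputs, num_outputs, num_hiddens1, num_hiddens2
m_model, t_model = get_model(config, train=True)  # RAF model initialized from the PyTorch weights
m_args, t_args = get_input(config, batch_size=4)  # (m_x, m_y) for RAF, (t_x, t_y) for PyTorch

m_loss = m_model(*m_args)       # RAF forward returns the NLL loss in train mode
t_loss = t_model(*t_args)       # PyTorch forward returns the NLL loss in train mode
check_params(m_model, t_model)  # weights should still agree before any optimizer step
```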
11 changes: 6 additions & 5 deletions scripts/src_codegen/def_op.py
@@ -96,12 +96,12 @@
Op(name="cross_entropy_dpred", schema_name="loss"),
Op(name="cross_entropy_dtrue", schema_name="loss"),
Op(name="reshape", schema_name="reshape"),
Op(name="reshape_like", schema_name="binary_like"),
Op(name="resize2d", schema_name="resize2d"),
Op(name="resize2d_dx", schema_name="resize2d_dx"),
Op(name="ndarray_size", schema_name="unary"),
Op(name="transpose", schema_name="transpose"),
Op(name="transpose_dx", schema_name="transpose_dx"),
Op(name="collapse_sum_like", schema_name="collapse_like"),
Op(name="transpose_dx", schema_name="transpose"),
Op(name="sum", schema_name="sum"),
Op(name="sum_dx", schema_name="sum_dx"),
Op(name="cumsum", schema_name="cumsum"),
@@ -135,8 +135,9 @@
Op(name="sequence_mask", schema_name="sequence_mask"),
Op(name="reverse_sequence", schema_name="reverse_sequence"),
Op(name="reverse", schema_name="reverse"),
Op(name="broadcast_to", schema_name="broadcast_to"),
Op(name="broadcast_to_like", schema_name="broadcast_to_like"),
Op(name="broadcast_to", schema_name="binary_to"),
Op(name="broadcast_to_like", schema_name="binary_like"),
Op(name="collapse_sum_like", schema_name="binary_like"),
Op(name="concatenate", schema_name="concatenate"),
Op(name="squeeze", schema_name="squeeze"),
Op(name="stack", schema_name="stack"),
@@ -159,7 +160,7 @@
Op(name="fuse_tensor", schema_name="fuse_tensor"),
Op(name="defuse_tensor", schema_name="defuse_tensor"),
Op(name="cast", schema_name="cast"),
Op(name="cast_like", schema_name="cast_like"),
Op(name="cast_like", schema_name="binary_like"),
Op(name="gather", schema_name="gather"),
Op(name="gather_dx", schema_name="gather_dx"),
Op(name="gather_nd", schema_name="gather_nd"),
(The remaining changed files are not shown in this view.)