From ad810e96c492437708394ce1e842a3c5ac683485 Mon Sep 17 00:00:00 2001 From: Thierry Moreau Date: Fri, 22 Jun 2018 13:40:22 -0700 Subject: [PATCH] [UTILS, DOC] Use TVM file downloading utility, conv2d tutorial (#48) --- vta/examples/resnet18/pynq/README.md | 14 +++- .../resnet18/pynq/imagenet_predict.py | 24 +++--- vta/python/vta/bitstream.py | 39 +++++++--- vta/python/vta/testing/util.py | 47 +++++++----- .../python/integration/test_benchmark_gemm.py | 17 +++-- .../integration/test_benchmark_topi_conv2d.py | 3 +- .../python/pynq/test_benchmark_conv2d.py | 74 ++++++++++--------- vta/tutorials/matrix_multiply.py | 3 +- vta/tutorials/matrix_multiply_opt.py | 43 +++++------ 9 files changed, 156 insertions(+), 108 deletions(-) diff --git a/vta/examples/resnet18/pynq/README.md b/vta/examples/resnet18/pynq/README.md index 1213d94ec6b1f..e3bd3d85b1a2c 100644 --- a/vta/examples/resnet18/pynq/README.md +++ b/vta/examples/resnet18/pynq/README.md @@ -2,12 +2,20 @@ Follow the first two parts of the [Installation Guide](../../../docs/how_to/install.md) to make sure that the VTA python libraries are installed, and that the RPC server is running on the Pynq FPGA dev board. -Simply run the following python script: +We recommend leaving the `config.json` to its default parameterization (of course you can change the target between "sim" and "pynq"). + +Simply run the example program. We rely on pickle to store parameters which now only works with python2. ```bash -python imagenet_predict.py +python2 imagenet_predict.py ``` -This will run imagenet classification using the ResNet18 architecture on a VTA design that performs 8-bit integer inference, to perform classification on a cat image `cat.jpg`. 
+The script will first download the following files into the `_data/` directory: +* `cat.jpg` which provides a test sample for the ImageNet classifier +* `quantize_graph.json` which describes the NNVM graph of the 8-bit ResNet-18 +* `quantize_params.pkl` which contains the network parameters +* `synset.txt` which contains the ImageNet categories + +Next, it will run imagenet classification using the ResNet18 architecture on a VTA design that performs 8-bit integer inference, to perform classification on a cat image `cat.jpg`. The script reports runtime measured on the Pynq board (in seconds), and the top-1 result category: ``` diff --git a/vta/examples/resnet18/pynq/imagenet_predict.py b/vta/examples/resnet18/pynq/imagenet_predict.py index e5b4a1131461c..5ff6cc626d695 100644 --- a/vta/examples/resnet18/pynq/imagenet_predict.py +++ b/vta/examples/resnet18/pynq/imagenet_predict.py @@ -1,17 +1,18 @@ # some standard imports import nnvm import tvm -from nnvm.compiler import graph_attr import vta import vta.testing import os import numpy as np -from PIL import Image import pickle import json import logging -import wget + +from PIL import Image +from nnvm.compiler import graph_attr from tvm.contrib import graph_runtime, rpc, util +from tvm.contrib.download import download bfactor = 1 cfactor = 16 @@ -20,15 +21,20 @@ debug_fpga_only = False # Obtain model and hardware files (they're too large to check-in) +# Download them into _data dir +data_dir = "_data/" url = "https://homes.cs.washington.edu/~moreau/media/vta/" TEST_FILE = 'cat.jpg' CATEG_FILE = 'synset.txt' RESNET_GRAPH_FILE = 'quantize_graph.json' RESNET_PARAMS_FILE = 'quantize_params.pkl' +# Create data dir +if not os.path.exists(data_dir): + os.makedirs(data_dir) +# Download files for file in [TEST_FILE, CATEG_FILE, RESNET_GRAPH_FILE, RESNET_PARAMS_FILE]: if not os.path.isfile(file): - print ("Downloading {}".format(file)) - wget.download(url+file) + download(os.path.join(url, file), os.path.join(data_dir, file)) if 
verbose: logging.basicConfig(level=logging.DEBUG) @@ -40,8 +46,8 @@ if vta.get_env().TARGET == "sim": target_host = "llvm" -synset = eval(open(os.path.join(CATEG_FILE)).read()) -image = Image.open(os.path.join(TEST_FILE)).resize((224, 224)) +synset = eval(open(os.path.join(data_dir, CATEG_FILE)).read()) +image = Image.open(os.path.join(data_dir, TEST_FILE)).resize((224, 224)) def transform_image(image): image = np.array(image) - np.array([123., 117., 104.]) @@ -88,9 +94,9 @@ def mark_nop(graph, conv_layer=-1, skip_conv_layer=()): import nnvm.compiler np.random.seed(0) sym = nnvm.graph.load_json( - open(os.path.join(RESNET_GRAPH_FILE)).read()) + open(os.path.join(data_dir, RESNET_GRAPH_FILE)).read()) params = pickle.load( - open(os.path.join(RESNET_PARAMS_FILE))) + open(os.path.join(data_dir, RESNET_PARAMS_FILE), 'rb')) shape_dict = {"data": x.shape} dtype_dict = {"data": 'float32'} diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py index dd0a9afdb6406..329e741f7d1ca 100644 --- a/vta/python/vta/bitstream.py +++ b/vta/python/vta/bitstream.py @@ -2,9 +2,16 @@ from __future__ import absolute_import as _abs import os -import urllib +import sys + +from tvm.contrib.download import download from .environment import get_env +if sys.version_info >= (3,): + import urllib.error as urllib2 +else: + import urllib2 + # bitstream repo BITSTREAM_URL = "https://github.com/uwsaml/vta-distro/raw/master/bitstreams/" @@ -41,15 +48,25 @@ def download_bitstream(): url = os.path.join(BITSTREAM_URL, env.TARGET) url = os.path.join(url, env.HW_VER) url = os.path.join(url, env.BITSTREAM) - # Check that the bitstream is accessible from the server - if urllib.urlopen(url).getcode() == 404: - # Raise error - the solution when this happens it to build your own bitstream and add it - # to your VTA_CACHE_PATH - raise RuntimeError( - "Error: {} is not available. It appears that this configuration has not been built." 
- .format(url)) - else: - urllib.urlretrieve(url, bit) - success = True + + try: + download(url, bit) + except urllib2.HTTPError as err: + if err.code == 404: + raise RuntimeError( + # Raise error - the solution when this happens it to build your + # own bitstream and add it to your $VTA_CACHE_PATH + "{} is not available. It appears that this configuration \ +bistream has not been cached. Please compile your own bitstream (see hardware \ +compilation guide to get Xilinx toolchains setup) and add it to your \ +$VTA_CACHE_PATH. Alternatively edit your config.json back to its default \ +settings. You can see the list of available bitstreams under {}" + .format(url, BITSTREAM_URL)) + else: + raise RuntimeError( + # This could happen when trying to access the URL behind a proxy + "Something went wrong when trying to access {}. Check your \ +internet connection or proxy settings." + .format(url)) return success diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index 18701320ffef0..67df2bfc2c431 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -15,23 +15,34 @@ def run(run_func): """ env = get_env() - # Run on local sim rpc if necessary - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - env.TARGET = "sim" - remote = rpc.connect("localhost", local_rpc) - run_func(env, remote) - else: - # run on simulator - if simulator.enabled(): - env.TARGET = "sim" + if env.TARGET == "sim": + + # Talk to local RPC if necessary to debug RPC server. + # Compile vta on your host with make at the root. + # Make sure TARGET is set to "sim" in the config.json file. + # Then launch the RPC server on the host machine + # with ./apps/pynq_rpc/start_rpc_server.sh + # Set your VTA_LOCAL_SIM_RPC environment variable to + # the port it's listening to, e.g. 
9090 + local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) + if local_rpc: + remote = rpc.connect("localhost", local_rpc) + run_func(env, remote) + else: + # Make sure simulation library exists + # If this fails, build vta on host (make) + # with TARGET="sim" in the json.config file. + assert simulator.enabled() run_func(env, rpc.LocalSession()) - # Run on PYNQ if env variable exists - host = os.environ.get("VTA_PYNQ_RPC_HOST", None) - if host: - env.TARGET = "pynq" - port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - port = int(port) - remote = rpc.connect(host, port) - run_func(env, remote) + elif env.TARGET == "pynq": + + # Run on PYNQ if env variable exists + host = os.environ.get("VTA_PYNQ_RPC_HOST", None) + port = int(os.environ.get("VTA_PYNQ_RPC_PORT", None)) + if host and port: + remote = rpc.connect(host, port) + run_func(env, remote) + else: + raise RuntimeError( + "Please set the VTA_PYNQ_RPC_HOST and VTA_PYNQ_RPC_PORT environment variables") diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index 4668acffc45a1..7201038b7be0f 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -18,7 +18,8 @@ def run_gemm_packed(env, remote, batch_size, channel, block): channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) - num_ops = channel * channel * batch_size + # To compute number of ops, use a x2 factor for FMA + num_ops = 2 * channel * channel * batch_size ko = tvm.reduce_axis((0, channel // env.BLOCK_IN), name='ko') ki = tvm.reduce_axis((0, env.BLOCK_IN), name='ki') @@ -157,14 +158,14 @@ def run_schedule(load_inp, def gemm_normal(print_ir): mock = env.mock - print("----- GEMM GFLOPS End-to-End Test-------") + print("----- GEMM GOPS End-to-End Test-------") def run_test(header, print_ir, check_correctness): cost = run_schedule( env.dma_copy, env.dma_copy, env.gemm, env.alu, env.dma_copy, print_ir, check_correctness) 
gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir, True) @@ -177,7 +178,7 @@ def run_test(header, print_ir): print_ir, False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) @@ -190,7 +191,7 @@ def run_test(header, print_ir): print_ir, False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) print("") @@ -204,7 +205,7 @@ def run_test(header, print_ir): gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (batch_size * channel * env.INP_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwidth=%g Gbits" % ( + print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" % ( cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", print_ir) @@ -219,7 +220,7 @@ def run_test(header, print_ir): gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (channel * channel * env.WGT_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwidth=%g Gbits" % ( + print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" % ( cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", print_ir) @@ -235,7 +236,7 @@ def run_test(header, print_ir): gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (batch_size * channel * env.OUT_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwidth=%g Gbits" % ( + print("\tTime cost = %g sec/op, %g 
GOPS, bandwidth=%g Gbits" % ( cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", print_ir) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 9721487ab4e81..b70fc9cfbf853 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -42,6 +42,7 @@ def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): res = my_clip(res, 0, 127) res = topi.cast(res, "int8") + # To compute number of ops, use a x2 factor for FMA num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter a_shape = (batch_size, wl.in_filter, wl.height, wl.width) @@ -118,7 +119,7 @@ def conv_normal(print_ir): print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) cost = verify(s, True) gops = (num_ops / cost.mean) / float(10 ** 9) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) conv_normal(False) diff --git a/vta/tests/python/pynq/test_benchmark_conv2d.py b/vta/tests/python/pynq/test_benchmark_conv2d.py index e02448a8d9939..5a26acb8b432d 100644 --- a/vta/tests/python/pynq/test_benchmark_conv2d.py +++ b/vta/tests/python/pynq/test_benchmark_conv2d.py @@ -46,10 +46,10 @@ def get_insn_count(layer, sched): env = vta.get_env() b, h, w, ci, co = sched b_factor = b - h_factor = layer.height / h - w_factor = layer.width / w - ci_factor = int(np.ceil(float(layer.in_filter) / (ci * env.BLOCK_IN))) - co_factor = int(np.ceil(float(layer.out_filter) / (co * env.BLOCK_OUT))) + h_factor = layer.height // h + w_factor = layer.width // w + ci_factor = layer.in_filter // (ci * env.BLOCK_IN) + co_factor = layer.out_filter // (co * env.BLOCK_OUT) input_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor weight_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor 
output_xfers = b_factor * h_factor * w_factor * co_factor @@ -69,11 +69,11 @@ def find_factors(n): factors.append(i) return factors # Scheduling exploration - batch_factors = find_factors(int(np.ceil(float(layer.batch) / env.BATCH))) - height_factors = find_factors(layer.height / layer.hstride) - width_factors = find_factors(layer.width / layer.wstride) - cin_factors = find_factors(int(np.ceil(float(layer.in_filter) / env.BLOCK_IN))) - cout_factors = find_factors(int(np.ceil(float(layer.out_filter) / env.BLOCK_OUT))) + batch_factors = find_factors(layer.batch // env.BATCH) + height_factors = find_factors(layer.height // layer.hstride) + width_factors = find_factors(layer.width // layer.wstride) + cin_factors = find_factors(layer.in_filter // env.BLOCK_IN) + cout_factors = find_factors(layer.out_filter // env.BLOCK_OUT) ht_factors = [1, 2] cot_factors = [1, 2] # Explore schedules @@ -124,7 +124,7 @@ def find_factors(n): if input_tile_elems*input_elem_size_b <= input_brams_capacity_b/(cot*ht) and \ weight_tile_elems*weight_elem_size_b <= weight_brams_capacity_b and \ output_tile_elems*output_elem_size_b <= output_brams_capacity_b/(cot*ht) and \ - insn_count <= env.MAX_XFER / 16 and \ + insn_count <= env.MAX_XFER // 16 and \ h > 2 and w > 2: schedule = Schedule(oc_factor=co, ko_factor=ci, h_factor=h, w_factor=w, oc_nthread=cot, h_nthread=ht) @@ -154,19 +154,19 @@ def get_data_movementB(sched, layer): weight_tile_elems = layer.hkernel * layer.wkernel * ci output_tile_elems = b * h * w * co # Derive factors - b_factor = int(np.ceil(float(layer.batch) / (b * env.BATCH))) - h_factor = (layer.height / layer.hstride) / h - w_factor = (layer.width / layer.wstride) / w - ci_factor = int(np.ceil(float(layer.in_filter) / (ci * env.BLOCK_IN))) - co_factor = int(np.ceil(float(layer.out_filter) / (co * env.BLOCK_OUT))) + b_factor = layer.batch // (b * env.BATCH) + h_factor = (layer.height // layer.hstride) // h + w_factor = (layer.width // layer.wstride) // w + ci_factor = 
int(np.ceil(float(layer.in_filter) // (ci * env.BLOCK_IN))) + co_factor = int(np.ceil(float(layer.out_filter) // (co * env.BLOCK_OUT))) # Derive transfers input_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor weight_xfers = b_factor * h_factor * w_factor * co_factor * ci_factor output_xfers = b_factor * h_factor * w_factor * co_factor # Compute total transfer sizes - input_xfer_B = input_tile_elems * input_xfers * input_elem_size_b / 8 - weight_xfer_B = weight_tile_elems * weight_xfers * weight_elem_size_b / 8 - output_xfer_B = output_tile_elems * output_xfers * output_elem_size_b / 8 + input_xfer_B = input_tile_elems * input_xfers * input_elem_size_b // 8 + weight_xfer_B = weight_tile_elems * weight_xfers * weight_elem_size_b // 8 + output_xfer_B = output_tile_elems * output_xfers * output_elem_size_b // 8 total_xfer_B = input_xfer_B + weight_xfer_B + output_xfer_B return total_xfer_B @@ -175,13 +175,13 @@ def test_conv2d_chwv(layer, key, batch_size, wl, sched, log_frame, profile=True) assert batch_size % env.BATCH == 0 assert wl.in_filter % env.BLOCK_IN == 0 assert wl.out_filter % env.BLOCK_OUT == 0 - data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN, + data_shape = (batch_size // env.BATCH, wl.in_filter // env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) - kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, + kernel_shape = (wl.out_filter // env.BLOCK_OUT, wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - res_shape = (batch_size//env.BATCH, wl.out_filter//env.BLOCK_OUT, + res_shape = (batch_size // env.BATCH, wl.out_filter // env.BLOCK_OUT, fout_height, fout_width, env.BATCH, env.BLOCK_OUT) data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", 
dtype=env.wgt_dtype) @@ -201,7 +201,8 @@ def test_conv2d_chwv(layer, key, batch_size, wl, sched, log_frame, profile=True) kernel_buf[co, ko, di, dj, ci, ki].astype(env.acc_dtype), axis=[ko, di, dj, ki]), name="res_cnv") - res_shf = tvm.compute(res_shape, lambda *i: res_cnv(*i) >> 8, name="res_shf") + # res_shf = tvm.compute(res_shape, lambda *i: res_cnv(*i) >> 8, name="res_shf") + res_shf = topi.right_shift(res_cnv, 8) res = tvm.compute(res_shape, lambda *i: res_shf(*i).astype(env.inp_dtype), name="res") num_ops = batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter total_xfer_B = get_data_movementB(sched, wl) @@ -310,7 +311,7 @@ def run_test(header, print_ir, check_correctness): print_ir, check_correctness) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) log_frame["key"].append(key) log_frame["layer"].append(layer) log_frame["total-data"].append(total_xfer_B) @@ -347,7 +348,7 @@ def run_test(header, print_ir): print_ir, False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) log_frame["skip-alu-gops"].append(gops) log_frame["skip-alu-cost"].append(cost.mean) @@ -365,7 +366,7 @@ def run_test(header, print_ir): print_ir, False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) log_frame["gemm-gops"].append(gops) log_frame["gemm-cost"].append(cost.mean) with vta.build_config(): @@ -382,7 +383,7 @@ def run_test(header, print_ir): print_ir, False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % (cost.mean, gops)) + print("\tTime cost = %g sec/op, %g GOPS" 
% (cost.mean, gops)) log_frame["alu-gops"].append(gops) log_frame["alu-cost"].append(cost.mean) with vta.build_config(): @@ -401,7 +402,7 @@ def run_test(header, print_ir): bandwith = (batch_size * wl.in_filter * wl.height * wl.width * env.INP_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwith=%g gbits" % ( + print("\tTime cost = %g sec/op, %g GOPS, bandwith=%g gbits" % ( cost.mean, gops, bandwith)) log_frame["ld-inp-gbits"].append(bandwith) log_frame["ld-inp-cost"].append(cost.mean) @@ -421,7 +422,7 @@ def run_test(header, print_ir): bandwith = (wl.out_filter * wl.in_filter * wl.hkernel * wl.wkernel * env.WGT_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwith=%g gbits" % ( + print("\tTime cost = %g sec/op, %g GOPS, bandwith=%g gbits" % ( cost.mean, gops, bandwith)) log_frame["ld-wgt-gbits"].append(bandwith) log_frame["ld-wgt-cost"].append(cost.mean) @@ -441,7 +442,7 @@ def run_test(header, print_ir): bandwith = (batch_size * wl.out_filter * fout_height * fout_width * env.OUT_WIDTH / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS, bandwith=%g gbits" % ( + print("\tTime cost = %g sec/op, %g GOPS, bandwith=%g gbits" % ( cost.mean, gops, bandwith)) log_frame["st-out-gbits"].append(bandwith) log_frame["st-out-cost"].append(cost.mean) @@ -460,7 +461,7 @@ def run_test(header, print_ir): False) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) - print("\tTime cost = %g sec/op, %g GFLOPS" % ( + print("\tTime cost = %g sec/op, %g GOPS" % ( cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) @@ -532,10 +533,11 @@ def run_test(header, print_ir): key = "resnet-cfg[{}-{}]".format(l, plan) test_conv2d_chwv(l, key, batch_size, resnet[l], plan, log_frame, profile) -pd.set_option('expand_frame_repr', False) -log_df = pd.DataFrame() -for k in keys: - log_df[k] = log_frame[k] -print(log_df) 
-log_df.to_csv("conv2d.csv") +if profile: + pd.set_option('expand_frame_repr', False) + log_df = pd.DataFrame() + for k in keys: + log_df[k] = log_frame[k] + print(log_df) + log_df.to_csv("conv2d.csv") diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py index f9d4d9fc7094b..0d1872170a4f7 100644 --- a/vta/tutorials/matrix_multiply.py +++ b/vta/tutorials/matrix_multiply.py @@ -91,6 +91,7 @@ # # .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/tensor_core.png # :align: center +# :width: 480px # # The dimensions of that matrix-matrix multiplication are specified in # the :code:`config.json` configuration file. @@ -109,6 +110,7 @@ # # .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/data_tiling.png # :align: center +# :width: 480px # # We first define the variables :code:`m`, :code:`n`, :code:`o` to represent # the shape of the matrix multiplication. These variables are multiplicative @@ -119,7 +121,6 @@ # 1 implies that our compute building block is vector-matrix multiply). # - ###################################################################### # .. note:: # diff --git a/vta/tutorials/matrix_multiply_opt.py b/vta/tutorials/matrix_multiply_opt.py index 9b62504a70078..4c916c7cddbca 100644 --- a/vta/tutorials/matrix_multiply_opt.py +++ b/vta/tutorials/matrix_multiply_opt.py @@ -66,7 +66,7 @@ # :code:`BATCH`, :code:`BLOCK_IN`, and :code:`BLOCK_OUT` respectively. # # We've added extra operators to the matrix multiplication that apply -# shifting and clipping to the output in order to mimic the a fixed-point +# shifting and clipping to the output in order to mimic a fixed-point # matrix multiplication followed by a rectified linear activation. 
# We describe the TVM dataflow graph of the fully connected layer below: # @@ -152,7 +152,7 @@ # Those include: # # - Computation blocking -# - Computation lowering to VTA hardware intrinsics +# - Lowering to VTA hardware intrinsics # Create TVM schedule @@ -161,8 +161,8 @@ print(tvm.lower(s, [data, weight, res], simple_mode=True)) ###################################################################### -# Tiling the Computation -# ~~~~~~~~~~~~~~~~~~~~~~ +# Blocking the Computation +# ~~~~~~~~~~~~~~~~~~~~~~~~ # The matrix multiplication is by default too large for activations or weights # to fit on VTA's on-chip buffers all at once. # We block the (1, 1024) by (1024, 1024) matrix multiplication into @@ -180,8 +180,7 @@ # # .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/blocking.png # :align: center -# :height: 367px -# :width: 387px +# :width: 480px # # .. note:: # @@ -236,7 +235,7 @@ s[res_max].compute_at(s[res], oc_out) s[res_min].compute_at(s[res], oc_out) -# Apply additional loop split along input channel axis +# Apply additional loop split along reduction axis (input channel) b_inn, oc_inn, b_tns, oc_tns = s[res_gemm].op.axis ic_out, ic_inn = s[res_gemm].split(ic, i_block) @@ -273,6 +272,8 @@ s[weight_buf].pragma(s[weight_buf].op.axis[0], env.dma_copy) # Use DMA copy pragma on SRAM->DRAM operation +# (this implies that these copies should be performed along b_inn, +# or result axis 2) s[res].pragma(s[res].op.axis[2], env.dma_copy) ###################################################################### @@ -313,21 +314,21 @@ # Get the remote device context ctx = remote.ext_dev(0) -# Initialize the A and B arrays randomly in the int range of (-128, 128] -data = np.random.randint( +# Initialize the data and weight arrays randomly in the int range of (-128, 128] +data_np = np.random.randint( -128, 128, size=(batch_size, in_channels)).astype(data.dtype) -weight = np.random.randint( +weight_np = np.random.randint( -128, 128, 
size=(out_channels, in_channels)).astype(weight.dtype) -# Apply packing to the A and B arrays from a 2D to a 4D packed layout -data_packed = data.reshape(batch_size // env.BATCH, - env.BATCH, - in_channels // env.BLOCK_IN, - env.BLOCK_IN).transpose((0, 2, 1, 3)) -weight_packed = weight.reshape(out_channels // env.BLOCK_OUT, - env.BLOCK_OUT, - in_channels // env.BLOCK_IN, - env.BLOCK_IN).transpose((0, 2, 1, 3)) +# Apply packing to the data and weight arrays from a 2D to a 4D packed layout +data_packed = data_np.reshape(batch_size // env.BATCH, + env.BATCH, + in_channels // env.BLOCK_IN, + env.BLOCK_IN).transpose((0, 2, 1, 3)) +weight_packed = weight_np.reshape(out_channels // env.BLOCK_OUT, + env.BLOCK_OUT, + in_channels // env.BLOCK_IN, + env.BLOCK_IN).transpose((0, 2, 1, 3)) # Format the input/output arrays with tvm.nd.array to the DLPack standard data_nd = tvm.nd.array(data_packed, ctx) @@ -338,8 +339,8 @@ f(data_nd, weight_nd, res_nd) # Verify against numpy implementation -res_ref = np.dot(data.astype(env.acc_dtype), - weight.T.astype(env.acc_dtype)) +res_ref = np.dot(data_np.astype(env.acc_dtype), + weight_np.T.astype(env.acc_dtype)) res_ref = res_ref >> env.INP_WIDTH res_ref = np.clip(res_ref, 0, inp_max) res_ref = res_ref.astype(res.dtype)