[Hardware] Multiplier integration, new vta conf format (#3)

jroesch · Nov 25, 2018 · 4a8e105 · 4a8e105
1 parent adc588f
commit 4a8e105
Show file tree

Hide file tree

Showing 16 changed files with 532 additions and 427 deletions.
diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json
@@ -1,17 +1,20 @@
 {
   "TARGET" : "pynq",
+  "HW_VER" : "0.0.1",
   "HW_FREQ" : 100,
   "HW_CLK_TARGET" : 8,
-  "HW_VER" : "0.0.0",
+  "ALU" : true,
+  "GEMM_II" : 2,
+  "TALU_II" : 4,
   "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
+  "LOG_WGT_WIDTH" : 1,
   "LOG_ACC_WIDTH" : 5,
   "LOG_OUT_WIDTH" : 3,
   "LOG_BATCH" : 0,
-  "LOG_BLOCK_IN" : 4,
-  "LOG_BLOCK_OUT" : 4,
+  "LOG_BLOCK_IN" : 5,
+  "LOG_BLOCK_OUT" : 5,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
+  "LOG_INP_BUFF_SIZE" : 17,
+  "LOG_WGT_BUFF_SIZE" : 17,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json
@@ -1,9 +1,11 @@
 {
-  "TARGET" : "sim",
+  "TARGET" : "pynq",
+  "HW_VER" : "0.0.2",
   "HW_FREQ" : 100,
-  "HW_CLK_TARGET" : 8,
-  "HW_VER" : "0.0.0",
-  "GEMM_II" : 2,
+  "HW_CLK_TARGET" : 7,
+  "ALU_EN" : true,
+  "MUL_EN" : true,
+  "GEMM_II" : 1,
   "TALU_II" : 2,
   "LOG_INP_WIDTH" : 3,
   "LOG_WGT_WIDTH" : 1,
@@ -13,7 +15,7 @@
   "LOG_BLOCK_IN" : 5,
   "LOG_BLOCK_OUT" : 5,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 17,
-  "LOG_WGT_BUFF_SIZE" : 17,
+  "LOG_INP_BUFF_SIZE" : 16,
+  "LOG_WGT_BUFF_SIZE" : 18,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py
@@ -38,6 +38,10 @@ def main():
                         help="print the target")
     parser.add_argument("--cfg-str", action="store_true",
                         help="print the configuration string")
+    parser.add_argument("--get-aluen", action="store_true",
+                        help="returns whether ALU is enabled")
+    parser.add_argument("--get-mulen", action="store_true",
+                        help="returns whether mul in ALU is enabled")
     parser.add_argument("--get-gemmii", action="store_true",
                         help="returns the GEMM core II")
     parser.add_argument("--get-taluii", action="store_true",
@@ -90,7 +94,28 @@ def main():
     if not ok_path_list:
         raise RuntimeError("Cannot find config in %s" % str(path_list))
     cfg = json.load(open(ok_path_list[0]))
-    cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
+    cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"]
+    # Generate bitstream config string.
+    # Needs to match the BITSTREAM string in python/vta/environment.py
+    cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
+        cfg["HW_VER"].replace('.', '_'),
+        (1 << cfg["LOG_BATCH"]),
+        (1 << cfg["LOG_BLOCK_IN"]),
+        (1 << cfg["LOG_BLOCK_OUT"]),
+        (1 << cfg["LOG_INP_WIDTH"]),
+        (1 << cfg["LOG_WGT_WIDTH"]),
+        (1 << cfg["LOG_OUT_WIDTH"]),
+        cfg["LOG_UOP_BUFF_SIZE"],
+        cfg["LOG_INP_BUFF_SIZE"],
+        cfg["LOG_WGT_BUFF_SIZE"],
+        cfg["LOG_ACC_BUFF_SIZE"],
+        cfg["HW_FREQ"],
+        cfg["HW_CLK_TARGET"],
+        cfg["GEMM_II"])
+    if cfg["ALU_EN"]:
+        cfg["BITSTREAM"] += "_aii{}".format(cfg["TALU_II"])
+    if cfg["MUL_EN"] and cfg["ALU_EN"]:
+        cfg["BITSTREAM"] += "_mul"
     pkg = get_pkg_config(cfg)
 
     if args.target:
@@ -119,23 +144,13 @@ def main():
             fo.write(pkg.cfg_json)
 
     if args.cfg_str:
-        # Needs to match the BITSTREAM string in python/vta/environment.py
-        cfg_str = "{}x{}x{}_g{}_a{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK_IN"]),
-            (1 << cfg["LOG_BLOCK_OUT"]),
-            cfg["GEMM_II"],
-            cfg["TALU_II"],
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"],
-            cfg["HW_FREQ"],
-            cfg["HW_CLK_TARGET"],
-            cfg["HW_VER"].replace('.', '_'))
-        print(cfg_str)
+        print(cfg["BITSTREAM"])
+
+    if args.get_aluen:
+        print(cfg["ALU_EN"])
+
+    if args.get_mulen:
+        print(cfg["MUL_EN"])
 
     if args.get_gemmii:
         print(cfg["GEMM_II"])

diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile
@@ -16,13 +16,9 @@ HSI = hsi
 # HLS mode
 MODE = all
 # Debug flag
-DEBUG = false
+DEBUG = False
 # SLURM
-SLURM = false
-# Prevent generation of DSP
-NO_DSP = false
-# Prevent generation of ALU
-NO_ALU = true
+SLURM = False
 
 # Process VTA JSON config
 VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
@@ -44,6 +40,8 @@ VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
 VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
 VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
 VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
+VTA_ALU_EN := $(shell ${VTA_CONFIG} --get-aluen)
+VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen)
 
 #---------------------
 # FPGA Parameters
@@ -87,7 +85,8 @@ $(IP_PATH): $(SRC_DIR)/*
 	cd $(IP_BUILD_PATH) && \
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
 		-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
-		$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
+		$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
+		$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
 		$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
 		$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
 		$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \

diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl
@@ -11,8 +11,8 @@
 # Arg 4: path to include sources
 # Arg 5: mode
 # Arg 6: debug
-# Arg 7: no_dsp
-# Arg 8: no_alu
+# Arg 7: alu_ena
+# Arg 8: mul_ena
 # Arg 9: target clock period
 # Arg 10: target II for GEMM
 # Arg 11: target II for tensor ALU
@@ -36,8 +36,8 @@ if { [llength $argv] eq 25 } {
 	set include_dir [lindex $argv 5]
 	set mode [lindex $argv 6]
 	set debug [lindex $argv 7]
-	set no_dsp [lindex $argv 8]
-	set no_alu [lindex $argv 9]
+	set alu_ena [lindex $argv 8]
+	set mul_ena [lindex $argv 9]
 	set target_period [lindex $argv 10]
 	set target_gemm_ii [lindex $argv 11]
 	set target_alu_ii [lindex $argv 12]
@@ -59,9 +59,9 @@ if { [llength $argv] eq 25 } {
 	set test_dir "../../src/test"
 	set include_dir "../../include"
 	set mode "all"
-	set debug "false"
-	set no_dsp "true"
-	set no_alu "false"
+	set debug "False"
+	set alu_ena "True"
+	set mul_ena "True"
 	set target_period 8
 	set target_gemm_ii 10
 	set target_alu_ii 16
@@ -83,7 +83,7 @@ if { [llength $argv] eq 25 } {
 # Initializes the HLS design and sets HLS pragmas for memory partitioning.
 # This is necessary because of a Vivado restriction that doesn't allow for
 # buses wider than 1024 bits.
-proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in block_out no_alu} {
+proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
 
 	# Set device number
 	set_part {xc7z020clg484-1}
@@ -98,14 +98,14 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 	create_clock -period $per -name default
 
 	# Set pipeline directive
-	set_directive_pipeline -II $ii "compute/READ_GEMM_UOP"
+	set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP"
 
-	if {$no_alu=="false"} {
-		set_directive_pipeline -II $ii "compute/READ_ALU_UOP"
+	if {$alu_ena=="True"} {
+		set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP"
 	}
 
-	# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*ii)
-	set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $ii}]
+	# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii))
+	set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $g_ii}]
 	set inp_partition_factor [expr {$inp_bus_width / $max_width}]
 	if {$inp_partition_factor == 0} {
 		set inp_reshape_factor [expr {$inp_bus_width / $axi_width}]
@@ -118,8 +118,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
 		set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
 	}
-	# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*ii))
-	set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $ii}]
+	# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*g_ii))
+	set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $g_ii}]
 	set wgt_partition_factor [expr {$wgt_bus_width / $max_width}]
 	if {$wgt_partition_factor == 0} {
 		set wgt_reshape_factor [expr {$wgt_bus_width / $axi_width}]
@@ -132,8 +132,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
 		set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
 	}
-	# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*ii))
-	set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $ii}]
+	# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*g_ii))
+	set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $g_ii}]
 	set out_partition_factor [expr {$out_bus_width / $max_width}]
 	if {$out_partition_factor == 0} {
 		set out_reshape_factor [expr {$out_bus_width / $axi_width}]
@@ -147,9 +147,9 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
 	}
 	# Set accumulator partition factor
-	# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $ii}]
+	# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $g_ii}]
 	# set acc_reshape_factor [expr {$acc_bus_width / $axi_width}]
-	# set_directive_array_reshape -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
+	# set_directive_array_partition -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
 }
 
 # C define flags to pass to compiler
@@ -160,14 +160,14 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \
 	-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
 	-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
 	-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
-if {$debug=="true"} {
+if {$debug=="True"} {
 	append cflags " -DVTA_DEBUG=1"
 }
-if {$no_dsp=="true"} {
-	append cflags " -DNO_DSP"
+if {$alu_ena=="True"} {
+	append cflags " -DALU_EN"
 }
-if {$no_alu=="true"} {
-	append cflags " -DNO_ALU"
+if {$mul_ena=="True"} {
+	append cflags " -DMUL_EN"
 }
 
 # HLS behavioral sim
@@ -178,7 +178,7 @@ if {$mode=="all" || $mode=="sim"} {
 	add_files -tb $sim_dir/vta_test.cc -cflags $cflags
 	add_files -tb $test_dir/test_lib.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csim_design -clean
 	close_project
 }
@@ -189,7 +189,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
 	set_top fetch
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -203,7 +203,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
 	set_top load
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -217,7 +217,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
 	set_top compute
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -231,7 +231,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
 	set_top store
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog

diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc
@@ -29,29 +29,41 @@ int main(void) {
 
     int status = 0;
 
-    // // Run ALU test (vector-scalar operators)
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
-
-    // // Run ALU test (vector-vector operators)
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
-
-    // // Run blocked GEMM test
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
+#ifdef ALU_EN
+    // Run ALU test (vector-scalar operators)
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
+
+    // Run ALU test (vector-vector operators)
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
+
+#ifdef MUL_EN
+    status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, false);
+#endif // MUL_EN
+
+#endif // ALU_EN
+
+    // Run blocked GEMM test
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
 
     // Simple GEMM unit test
     status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, true);