diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json
index 5c37108e6b12f..fd6190caa9ed8 100644
--- a/vta/config/pynq_sample.json
+++ b/vta/config/pynq_sample.json
@@ -1,17 +1,20 @@
 {
   "TARGET" : "pynq",
+  "HW_VER" : "0.0.1",
   "HW_FREQ" : 100,
   "HW_CLK_TARGET" : 8,
-  "HW_VER" : "0.0.0",
+  "ALU" : true,
+  "GEMM_II" : 2,
+  "TALU_II" : 4,
   "LOG_INP_WIDTH" : 3,
-  "LOG_WGT_WIDTH" : 3,
+  "LOG_WGT_WIDTH" : 1,
   "LOG_ACC_WIDTH" : 5,
   "LOG_OUT_WIDTH" : 3,
   "LOG_BATCH" : 0,
-  "LOG_BLOCK_IN" : 4,
-  "LOG_BLOCK_OUT" : 4,
+  "LOG_BLOCK_IN" : 5,
+  "LOG_BLOCK_OUT" : 5,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 15,
-  "LOG_WGT_BUFF_SIZE" : 18,
+  "LOG_INP_BUFF_SIZE" : 17,
+  "LOG_WGT_BUFF_SIZE" : 17,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json
index e9cc88a5479dc..8a7bcc01af62b 100644
--- a/vta/config/vta_config.json
+++ b/vta/config/vta_config.json
@@ -1,9 +1,11 @@
 {
-  "TARGET" : "sim",
+  "TARGET" : "pynq",
+  "HW_VER" : "0.0.2",
   "HW_FREQ" : 100,
-  "HW_CLK_TARGET" : 8,
-  "HW_VER" : "0.0.0",
-  "GEMM_II" : 2,
+  "HW_CLK_TARGET" : 7,
+  "ALU_EN" : true,
+  "MUL_EN" : true,
+  "GEMM_II" : 1,
   "TALU_II" : 2,
   "LOG_INP_WIDTH" : 3,
   "LOG_WGT_WIDTH" : 1,
@@ -13,7 +15,7 @@
   "LOG_BLOCK_IN" : 5,
   "LOG_BLOCK_OUT" : 5,
   "LOG_UOP_BUFF_SIZE" : 15,
-  "LOG_INP_BUFF_SIZE" : 17,
-  "LOG_WGT_BUFF_SIZE" : 17,
+  "LOG_INP_BUFF_SIZE" : 16,
+  "LOG_WGT_BUFF_SIZE" : 18,
   "LOG_ACC_BUFF_SIZE" : 17
 }
diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py
index bc86b6887cc25..664b04db0b7a2 100644
--- a/vta/config/vta_config.py
+++ b/vta/config/vta_config.py
@@ -38,6 +38,10 @@ def main():
                         help="print the target")
     parser.add_argument("--cfg-str", action="store_true",
                         help="print the configuration string")
+    parser.add_argument("--get-aluen", action="store_true",
+                        help="returns whether ALU is enabled")
+    parser.add_argument("--get-mulen", action="store_true",
+                        help="returns whether mul in ALU is enabled")
     parser.add_argument("--get-gemmii", action="store_true",
                         help="returns the GEMM core II")
     parser.add_argument("--get-taluii", action="store_true",
@@ -90,7 +94,28 @@ def main():
     if not ok_path_list:
         raise RuntimeError("Cannot find config in %s" % str(path_list))
     cfg = json.load(open(ok_path_list[0]))
-    cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
+    cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"]
+    # Generate bitstream config string.
+    # Needs to match the BITSTREAM string in python/vta/environment.py
+    cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
+        cfg["HW_VER"].replace('.', '_'),
+        (1 << cfg["LOG_BATCH"]),
+        (1 << cfg["LOG_BLOCK_IN"]),
+        (1 << cfg["LOG_BLOCK_OUT"]),
+        (1 << cfg["LOG_INP_WIDTH"]),
+        (1 << cfg["LOG_WGT_WIDTH"]),
+        (1 << cfg["LOG_OUT_WIDTH"]),
+        cfg["LOG_UOP_BUFF_SIZE"],
+        cfg["LOG_INP_BUFF_SIZE"],
+        cfg["LOG_WGT_BUFF_SIZE"],
+        cfg["LOG_ACC_BUFF_SIZE"],
+        cfg["HW_FREQ"],
+        cfg["HW_CLK_TARGET"],
+        cfg["GEMM_II"])
+    if cfg["ALU_EN"]:
+        cfg["BITSTREAM"] += "_aii{}".format(cfg["TALU_II"])
+    if cfg["MUL_EN"] and cfg["ALU_EN"]:
+        cfg["BITSTREAM"] += "_mul"
     pkg = get_pkg_config(cfg)
 
     if args.target:
@@ -119,23 +144,13 @@ def main():
             fo.write(pkg.cfg_json)
 
     if args.cfg_str:
-        # Needs to match the BITSTREAM string in python/vta/environment.py
-        cfg_str = "{}x{}x{}_g{}_a{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK_IN"]),
-            (1 << cfg["LOG_BLOCK_OUT"]),
-            cfg["GEMM_II"],
-            cfg["TALU_II"],
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"],
-            cfg["HW_FREQ"],
-            cfg["HW_CLK_TARGET"],
-            cfg["HW_VER"].replace('.', '_'))
-        print(cfg_str)
+        print(cfg["BITSTREAM"])
+
+    if args.get_aluen:
+        print(cfg["ALU_EN"])
+
+    if args.get_mulen:
+        print(cfg["MUL_EN"])
 
     if args.get_gemmii:
         print(cfg["GEMM_II"])
diff --git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile
index 508709c6c5edb..fcd5b71d7788f 100644
--- a/vta/hardware/xilinx/Makefile
+++ b/vta/hardware/xilinx/Makefile
@@ -16,13 +16,9 @@ HSI = hsi
 # HLS mode
 MODE = all
 # Debug flag
-DEBUG = false
+DEBUG = False
 # SLURM
-SLURM = false
-# Prevent generation of DSP
-NO_DSP = false
-# Prevent generation of ALU
-NO_ALU = true
+SLURM = False
 
 # Process VTA JSON config
 VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
@@ -44,6 +40,8 @@ VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
 VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
 VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
 VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
+VTA_ALU_EN := $(shell ${VTA_CONFIG} --get-aluen)
+VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen)
 
 #---------------------
 # FPGA Parameters
@@ -87,7 +85,8 @@ $(IP_PATH): $(SRC_DIR)/*
 	cd $(IP_BUILD_PATH) && \
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
 		-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
-		$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
+		$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
+		$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
 		$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
 		$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
 		$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl
index 4d593e26c2e38..3bddcc3ad1e60 100644
--- a/vta/hardware/xilinx/scripts/hls.tcl
+++ b/vta/hardware/xilinx/scripts/hls.tcl
@@ -11,8 +11,8 @@
 # Arg 4: path to include sources
 # Arg 5: mode
 # Arg 6: debug
-# Arg 7: no_dsp
-# Arg 8: no_alu
+# Arg 7: alu_ena
+# Arg 8: mul_ena
 # Arg 9: target clock period
 # Arg 10: target II for GEMM
 # Arg 11: target II for tensor ALU
@@ -36,8 +36,8 @@ if { [llength $argv] eq 25 } {
 	set include_dir [lindex $argv 5]
 	set mode [lindex $argv 6]
 	set debug [lindex $argv 7]
-	set no_dsp [lindex $argv 8]
-	set no_alu [lindex $argv 9]
+	set alu_ena [lindex $argv 8]
+	set mul_ena [lindex $argv 9]
 	set target_period [lindex $argv 10]
 	set target_gemm_ii [lindex $argv 11]
 	set target_alu_ii [lindex $argv 12]
@@ -59,9 +59,9 @@ if { [llength $argv] eq 25 } {
 	set test_dir "../../src/test"
 	set include_dir "../../include"
 	set mode "all"
-	set debug "false"
-	set no_dsp "true"
-	set no_alu "false"
+	set debug "False"
+	set alu_ena "True"
+	set mul_ena "True"
 	set target_period 8
 	set target_gemm_ii 10
 	set target_alu_ii 16
@@ -83,7 +83,7 @@ if { [llength $argv] eq 25 } {
 # Initializes the HLS design and sets HLS pragmas for memory partitioning.
 # This is necessary because of a Vivado restriction that doesn't allow for
 # buses wider than 1024 bits.
-proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in block_out no_alu} {
+proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
 
 	# Set device number
 	set_part {xc7z020clg484-1}
@@ -98,14 +98,14 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 	create_clock -period $per -name default
 
 	# Set pipeline directive
-	set_directive_pipeline -II $ii "compute/READ_GEMM_UOP"
+	set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP"
 
-	if {$no_alu=="false"} {
-		set_directive_pipeline -II $ii "compute/READ_ALU_UOP"
+	if {$alu_ena=="True"} {
+		set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP"
 	}
 
-	# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*ii)
-	set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $ii}]
+	# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii))
+	set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $g_ii}]
 	set inp_partition_factor [expr {$inp_bus_width / $max_width}]
 	if {$inp_partition_factor == 0} {
 		set inp_reshape_factor [expr {$inp_bus_width / $axi_width}]
@@ -118,8 +118,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
 		set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
 	}
-	# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*ii))
-	set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $ii}]
+	# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*g_ii))
+	set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $g_ii}]
 	set wgt_partition_factor [expr {$wgt_bus_width / $max_width}]
 	if {$wgt_partition_factor == 0} {
 		set wgt_reshape_factor [expr {$wgt_bus_width / $axi_width}]
@@ -132,8 +132,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
 		set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
 	}
-	# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*ii))
-	set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $ii}]
+	# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*g_ii))
+	set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $g_ii}]
 	set out_partition_factor [expr {$out_bus_width / $max_width}]
 	if {$out_partition_factor == 0} {
 		set out_reshape_factor [expr {$out_bus_width / $axi_width}]
@@ -147,9 +147,9 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
 		set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
 	}
 	# Set accumulator partition factor
-	# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $ii}]
+	# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $g_ii}]
 	# set acc_reshape_factor [expr {$acc_bus_width / $axi_width}]
-	# set_directive_array_reshape -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
+	# set_directive_array_partition -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
 }
 
 # C define flags to pass to compiler
@@ -160,14 +160,14 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \
 	-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
 	-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
 	-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
-if {$debug=="true"} {
+if {$debug=="True"} {
 	append cflags " -DVTA_DEBUG=1"
 }
-if {$no_dsp=="true"} {
-	append cflags " -DNO_DSP"
+if {$alu_ena=="True"} {
+	append cflags " -DALU_EN"
 }
-if {$no_alu=="true"} {
-	append cflags " -DNO_ALU"
+if {$mul_ena=="True"} {
+	append cflags " -DMUL_EN"
 }
 
 # HLS behavioral sim
@@ -178,7 +178,7 @@ if {$mode=="all" || $mode=="sim"} {
 	add_files -tb $sim_dir/vta_test.cc -cflags $cflags
 	add_files -tb $test_dir/test_lib.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csim_design -clean
 	close_project
 }
@@ -189,7 +189,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
 	set_top fetch
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -203,7 +203,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
 	set_top load
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -217,7 +217,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
 	set_top compute
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
@@ -231,7 +231,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
 	set_top store
 	add_files $src_dir/vta.cc -cflags $cflags
 	open_solution "solution0"
-	init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
+	init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
 	csynth_design
 	if {$mode=="all" || $mode=="skip_sim"} {
 		export_design -format ip_catalog
diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc
index cd7449bfc8294..266eeaae2f9b8 100644
--- a/vta/hardware/xilinx/sim/vta_test.cc
+++ b/vta/hardware/xilinx/sim/vta_test.cc
@@ -29,29 +29,41 @@ int main(void) {
 
     int status = 0;
 
-    // // Run ALU test (vector-scalar operators)
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
-
-    // // Run ALU test (vector-vector operators)
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
-    // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
-
-    // // Run blocked GEMM test
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
-    // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
+#ifdef ALU_EN
+    // Run ALU test (vector-scalar operators)
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);
+
+    // Run ALU test (vector-vector operators)
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);
+
+#ifdef MUL_EN
+    status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, false);
+#endif // MUL_EN
+
+#endif // ALU_EN
+
+    // Run blocked GEMM test
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
 
     // Simple GEMM unit test
     status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, true);
diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index ca384b07635eb..f59f2a53b5a00 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -212,10 +212,17 @@ void compute(
 
   // Accumulator storage
   static axi_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_TENSOR_ELEMS];
-#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2
+#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2
 // This is necessary to obtain II=1
 #pragma HLS DEPENDENCE variable = acc_mem inter false
 
+
+#ifdef MUL_EN
+// Cap the number of multiplier instances (limits DSP utilization) when multipliers are enabled in the ALU
+#pragma HLS allocation instances=mul limit=220 operation
+#endif  // MUL_EN
+
+
   // Pop GEMM instruction
   insn_T insn = gemm_queue.read();
 
@@ -350,13 +357,13 @@ void compute(
             }
 
             // Read in accum tensor
-            acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+            reg_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
             for (int b = 0; b < VTA_BATCH; b++) {
               for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) {
                 axi_T packet = acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p];
                 for (int w = 0; w < AXI_ACC_RATIO; w++) {
                   a_tensor[b][p * AXI_ACC_RATIO + w] =
-                      packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH);
+                      packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH);
                 }
               }
             }
@@ -368,7 +375,7 @@ void compute(
             for (int b = 0; b < VTA_BATCH; b++) {
               for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
                 // Initialize the accumulator values
-                acc_T accum = a_tensor[b][oc];
+                reg_T accum = a_tensor[b][oc];
                 // Dot product sum
                 sum_T tmp = 0;
                 // Inner matrix multiplication loop (input channel/feature)
@@ -382,9 +389,9 @@ void compute(
                   tmp += (sum_T) prod;
                 }
                 // Update summation
-                accum += (acc_T) tmp;
+                accum += (reg_T) tmp;
                 // Write back result acc_mem
-                a_tensor[b][oc] = reset_out ? (acc_T) 0 : accum;
+                a_tensor[b][oc] = reset_out ? (reg_T) 0 : accum;
                 // And output vector
                 o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
               }
@@ -395,7 +402,7 @@ void compute(
               for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) {
                 axi_T packet = 0;
                 for (int w = 0; w < AXI_ACC_RATIO; w++) {
-                  packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = a_tensor[b][p * AXI_ACC_RATIO + w];
+                  packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = (acc_T) a_tensor[b][p * AXI_ACC_RATIO + w];
                 }
                 acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p] = packet;
               }
@@ -413,7 +420,7 @@ void compute(
             }
           }
         }
-#ifndef NO_ALU
+#ifdef ALU_EN
         else if (opcode == VTA_OPCODE_ALU) {
           // Iterate over micro op
           READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
@@ -427,25 +434,25 @@ void compute(
                 uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;
 
             // Read in src tensor
-            acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+            reg_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
             for (int b = 0; b < VTA_BATCH; b++) {
               for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) {
                 axi_T packet = acc_mem[src_idx][b * ACC_VEC_AXI_RATIO + p];
                 for (int w = 0; w < AXI_ACC_RATIO; w++) {
                   src_tensor[b][p * AXI_ACC_RATIO + w] =
-                      packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH);
+                      packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH);
                 }
               }
             }
 
             // Read in dst tensor
-            acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
+            reg_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
             for (int b = 0; b < VTA_BATCH; b++) {
               for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) {
                 axi_T packet = acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p];
                 for (int w = 0; w < AXI_ACC_RATIO; w++) {
                   dst_tensor[b][p * AXI_ACC_RATIO + w] =
-                      packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH);
+                      packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH);
                 }
               }
             }
@@ -457,23 +464,40 @@ void compute(
             for (int i = 0; i < VTA_BATCH; i++) {
               for (int b = 0; b < VTA_BLOCK_OUT; b++) {
                 // Read in operands
-                acc_T src_0 = dst_tensor[i][b];
-                acc_T src_1 = use_imm ? (acc_T) imm : src_tensor[i][b];
-                // Compute Min/Max
-                acc_T mix_val = src_0 < src_1 ?
-                    (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
-                    (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
-                dst_tensor[i][b] = mix_val;
-                o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
-                // Compute Sum
-                acc_T add_val =
-                    src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
-                dst_tensor[i][b] = add_val;
-                o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
-                // Compute Shift Right
-                acc_T shr_val = src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0);
-                dst_tensor[i][b] = shr_val;
-                o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0);
+                reg_T src_0 = dst_tensor[i][b];
+                reg_T src_1 = use_imm ? (reg_T) imm : src_tensor[i][b];
+                aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
+                aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0);
+                if (alu_opcode == VTA_ALU_OPCODE_MIN || alu_opcode == VTA_ALU_OPCODE_MAX) {
+                  // Compute Min/Max
+                  reg_T mix_val = src_0 < src_1 ?
+                      (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
+                      (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
+                  dst_tensor[i][b] = mix_val;
+                  o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
+                } else if (alu_opcode == VTA_ALU_OPCODE_ADD) {
+                  // Compute Sum
+                  reg_T add_val =
+                      src_0.range(VTA_REG_WIDTH - 1, 0) + src_1.range(VTA_REG_WIDTH - 1, 0);
+                  dst_tensor[i][b] = add_val;
+                  o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
+                } else if (alu_opcode == VTA_ALU_OPCODE_SHR) {
+                  // Compute Shift Right
+                  reg_T shr_val = src_0 >> shft_by;
+                  dst_tensor[i][b] = shr_val;
+                  o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0);
+                }
+#ifdef MUL_EN
+                else if (alu_opcode == VTA_ALU_OPCODE_MUL) {
+                  // Compute Multiply
+                  reg_T mul_val = src_0 * mul_by;
+#ifdef NO_DSP
+#pragma HLS RESOURCE variable = mul_val core = Mul_LUT
+#endif //  NO_DSP
+                  dst_tensor[i][b] = mul_val;
+                  o_tensor[i][b] = (out_T) mul_val.range(VTA_OUT_WIDTH-1, 0);
+                }
+#endif  // MUL_EN
               }
             }
 
@@ -482,7 +506,7 @@ void compute(
               for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) {
                 axi_T packet = 0;
                 for (int w = 0; w < AXI_ACC_RATIO; w++) {
-                  packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = dst_tensor[b][p * AXI_ACC_RATIO + w];
+                  packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = (acc_T) dst_tensor[b][p * AXI_ACC_RATIO + w];
                 }
                 acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p] = packet;
               }
@@ -493,14 +517,14 @@ void compute(
               for (int p = 0; p < OUT_VEC_AXI_RATIO; p++) {
                 axi_T packet = 0;
                 for (int w = 0; w < AXI_OUT_RATIO; w++) {
-                  packet.range((w + 1) * VTA_OUT_WIDTH - 1, w * VTA_OUT_WIDTH) = o_tensor[b][p * AXI_OUT_RATIO + w];
+                  packet.range((w + 1) * VTA_OUT_WIDTH - 1, w * VTA_OUT_WIDTH) = (acc_T) o_tensor[b][p * AXI_OUT_RATIO + w];
                 }
                 out_mem[dst_idx][b * OUT_VEC_AXI_RATIO + p] = packet;
               }
             }
           }
         }
-#endif  // NO_ALU
+#endif  // ALU_EN
 
         // Update offsets
         dst_offset_in += dst_factor_in;
diff --git a/vta/hardware/xilinx/src/vta.h b/vta/hardware/xilinx/src/vta.h
index f4e1999814ec0..ee3aadeb071ec 100644
--- a/vta/hardware/xilinx/src/vta.h
+++ b/vta/hardware/xilinx/src/vta.h
@@ -35,22 +35,25 @@ typedef ap_uint<VTA_AXI_WIDTH> axi_T;
 typedef ap_uint<VTA_UOP_WIDTH> uop_T;
 
 /* \typedef inp_T Input datatype*/
-typedef ap_uint<VTA_INP_WIDTH> inp_T;
+typedef ap_int<VTA_INP_WIDTH> inp_T;
 
 /* \typedef wgt_T Weight datatype*/
-typedef ap_uint<VTA_WGT_WIDTH> wgt_T;
+typedef ap_int<VTA_WGT_WIDTH> wgt_T;
 
 /* \typedef out_T Output datatype*/
-typedef ap_uint<VTA_OUT_WIDTH> out_T;
+typedef ap_int<VTA_OUT_WIDTH> out_T;
 
 /* \typedef acc_T Accumulator datatype*/
-typedef ap_uint<VTA_ACC_WIDTH> acc_T;
+typedef ap_int<VTA_ACC_WIDTH> acc_T;
+
+/* \typedef reg_T Register file datatype*/
+typedef ap_int<VTA_REG_WIDTH> reg_T;
 
 /* \typedef mul_T Multiplier output datatype*/
-typedef ap_uint<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
+typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
 
 /* \typedef sum_T GEMM accumulator datatype*/
-typedef ap_uint<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
+typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
 
 /* \typedef uop_idx_T Micro-op SRAM index datatype*/
 typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
@@ -94,11 +97,14 @@ typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
 /* \typedef aluop_opcode_T ALU operation opcode datatype*/
 typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
 
-/* \typedef aluop_opcode_T ALU operation immediate datatype*/
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
 typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
 
-/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
-typedef ap_int<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
+/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
+typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
+
+/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
+typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
 
 /*!
 * \brief Fetch module.
diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h
index d21e8233aeb41..5d105ab6d940e 100644
--- a/vta/include/vta/hw_spec.h
+++ b/vta/include/vta/hw_spec.h
@@ -16,6 +16,9 @@ extern "C" {
 /*! AXI bus width */
 #define VTA_AXI_WIDTH 64
 
+/*! Register file width */
+#define VTA_REG_WIDTH 24
+
 /*! log2 of instruction data type width */
 #define VTA_LOG_INS_WIDTH 7
 /*! Instruction data type width */
@@ -50,6 +53,8 @@ extern "C" {
 #define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN)
 /*! Accumulator vector width */
 #define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT)
+/*! Register file vector width */
+#define VTA_REG_VECTOR_WIDTH (VTA_REG_WIDTH * VTA_BLOCK_OUT)
 /*! Output vector width */
 #define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT)
 
@@ -98,7 +103,7 @@ extern "C" {
 /*! Instruction opcode field bitwidth */
 #define VTA_OPCODE_BIT_WIDTH 3
 /*! ALU opcode field bitwidth */
-#define VTA_ALU_OPCODE_BIT_WIDTH 2
+#define VTA_ALU_OPCODE_BIT_WIDTH 3
 
 /*! Opcode: load encoding */
 #define VTA_OPCODE_LOAD 0
@@ -119,6 +124,8 @@ extern "C" {
 #define VTA_ALU_OPCODE_ADD 2
 /*! ALU opcode: shift right by immediate op */
 #define VTA_ALU_OPCODE_SHR 3
+/*! ALU opcode: multiply op */
+#define VTA_ALU_OPCODE_MUL 4
 
 /*! Memory type field bitwidth */
 #define VTA_MEMOP_ID_BIT_WIDTH 2
@@ -134,10 +141,14 @@ extern "C" {
 #define VTA_MEMOP_PAD_BIT_WIDTH 4
 /*! Load/Store Instruction: padding value encoding width*/
 #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
-/*! ALU Instruction: immediate bitwidth*/
-#define VTA_ALUOP_IMM_BIT_WIDTH 16
 /*! GEMM/ALU Instruction: loop max iter bits */
 #define VTA_LOOP_ITER_WIDTH 14
+/*! ALU Instruction: immediate bitwidth*/
+#define VTA_ALUOP_IMM_BIT_WIDTH 16
+/*! ALU Instruction: shift arg bitwidth*/
+#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
+/*! ALU Instruction: multiply arg bitwidth*/
+#define VTA_MUL_ARG_BIT_WIDTH 8
 
 /*! Mem ID constant: uop memory */
 #define VTA_MEM_ID_UOP 0
diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py
index 329e741f7d1ca..73ce062cb62c9 100644
--- a/vta/python/vta/bitstream.py
+++ b/vta/python/vta/bitstream.py
@@ -43,6 +43,10 @@ def download_bitstream():
 
     env = get_env()
 
+    if env.TARGET == "sim":
+        print("Skipping programming phase in sim mode")
+        return True
+
     success = False
     bit = get_bitstream_path()
     url = os.path.join(BITSTREAM_URL, env.TARGET)
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index a77e29ac3a52e..c270585860bd2 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -37,6 +37,7 @@ class DevContext(object):
     ALU_OPCODE_MAX = 1
     ALU_OPCODE_ADD = 2
     ALU_OPCODE_SHR = 3
+    ALU_OPCODE_MUL = 4
     # Task queue id (pipeline stage)
     QID_LOAD_INP = 1
     QID_LOAD_WGT = 1
@@ -138,20 +139,6 @@ def __init__(self, cfg):
         self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8
         self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8
         self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8
-        # Configuration bitstream name
-        self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format(
-            (1 << cfg["LOG_BATCH"]),
-            (1 << cfg["LOG_BLOCK_IN"]),
-            (1 << cfg["LOG_BLOCK_OUT"]),
-            (1 << cfg["LOG_INP_WIDTH"]),
-            (1 << cfg["LOG_WGT_WIDTH"]),
-            cfg["LOG_UOP_BUFF_SIZE"],
-            cfg["LOG_INP_BUFF_SIZE"],
-            cfg["LOG_WGT_BUFF_SIZE"],
-            cfg["LOG_ACC_BUFF_SIZE"],
-            cfg["HW_FREQ"],
-            cfg["HW_CLK_TARGET"],
-            cfg["HW_VER"].replace('.', '_'))
         # dtypes
         self.acc_dtype = "int%d" % self.ACC_WIDTH
         self.inp_dtype = "int%d" % self.INP_WIDTH
@@ -162,6 +149,27 @@ def __init__(self, cfg):
         self._mock_env = None
         self._dev_ctx = None
         self._last_env = None
+        #  derive bitstream name
+        self.BITSTREAM = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
+            self.HW_VER.replace('.', '_'),
+            self.BATCH,
+            self.BLOCK_IN,
+            self.BLOCK_OUT,
+            self.INP_WIDTH,
+            self.WGT_WIDTH,
+            self.OUT_WIDTH,
+            self.LOG_UOP_BUFF_SIZE,
+            self.LOG_INP_BUFF_SIZE,
+            self.LOG_WGT_BUFF_SIZE,
+            self.LOG_ACC_BUFF_SIZE,
+            self.HW_FREQ,
+            self.HW_CLK_TARGET,
+            self.GEMM_II)
+        if self.ALU_EN:
+            self.BITSTREAM += "_aii{}".format(self.TALU_II)
+        if self.MUL_EN and self.ALU_EN:
+            self.BITSTREAM += "_mul"
+        self.BITSTREAM += ".bit"
 
     def __enter__(self):
         self._last_env = Environment.current
diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py
index 30b4808f5e2d8..2a589c546e799 100644
--- a/vta/python/vta/pkg_config.py
+++ b/vta/python/vta/pkg_config.py
@@ -22,9 +22,13 @@ class PkgConfig(object):
     """
     cfg_keys = [
         "TARGET",
+        "HW_VER",
         "HW_FREQ",
         "HW_CLK_TARGET",
-        "HW_VER",
+        "ALU_EN",
+        "MUL_EN",
+        "GEMM_II",
+        "TALU_II",
         "LOG_INP_WIDTH",
         "LOG_WGT_WIDTH",
         "LOG_ACC_WIDTH",
@@ -35,7 +39,7 @@ class PkgConfig(object):
         "LOG_UOP_BUFF_SIZE",
         "LOG_INP_BUFF_SIZE",
         "LOG_WGT_BUFF_SIZE",
-        "LOG_ACC_BUFF_SIZE",
+        "LOG_ACC_BUFF_SIZE"
     ]
     def __init__(self, cfg, proj_root):
         # include path
diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py
index 549d7144d3214..7a73b58278052 100644
--- a/vta/python/vta/top/vta_conv2d.py
+++ b/vta/python/vta/top/vta_conv2d.py
@@ -330,7 +330,9 @@ def _get_workload(data, pad_data, kernel, output):
     w_str = (i_w + w_pad*2 - k_w) // (o_w - 1)
     return Workload(i_b, i_h, i_w, i_c, o_c, k_h, k_w, h_pad, w_pad, h_str, w_str)
 
-def schedule_packed_conv2d(outs, plan=None):
+def schedule_packed_conv2d(outs, plan=None, skip_load_inp=False, skip_load_wgt=False,
+                           skip_load_acc=False, skip_store_out=False, skip_alu=False,
+                           skip_gemm=False):
     """ Schedule the packed conv2d.
     """
     assert len(outs) == 1
@@ -369,10 +371,14 @@ def _traverse(op):
         plan = find_schedules(wrkld, vt_only=True, best_only=True)[0]
         logging.info("Trying to find plan for %s", wrkld)
     env = get_env()
+    mock = env.mock
 
-    load_inp = load_wgt = load_out = store_out = env.dma_copy
-    alu = env.alu
-    gemm = env.gemm
+    load_inp = mock.dma_copy if skip_load_inp else env.dma_copy
+    load_wgt = mock.dma_copy if skip_load_wgt else env.dma_copy
+    load_acc = mock.dma_copy if skip_load_acc else env.dma_copy
+    store_out = mock.dma_copy if skip_store_out else env.dma_copy
+    alu = mock.alu if skip_alu else env.alu
+    gemm = mock.gemm if skip_gemm else env.gemm
 
     # schedule1
     oshape = topi.util.get_const_tuple(output.shape)
@@ -418,7 +424,7 @@ def _traverse(op):
 
     for tensor in cache_read_ewise:
         s[tensor].compute_at(s[output], store_pt)
-        s[tensor].pragma(s[tensor].op.axis[0], load_out)
+        s[tensor].pragma(s[tensor].op.axis[0], load_acc)
 
     # virtual threading along output channel axes
     if plan.oc_nthread > 1:
diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc
index 60645818757c5..8123d73c0fb58 100644
--- a/vta/src/sim/sim_driver.cc
+++ b/vta/src/sim/sim_driver.cc
@@ -406,7 +406,7 @@ class Device {
 
   void RunGEMM(const VTAGemInsn* op) {
     if (!op->reset_reg) {
-      prof_->gemm_counter += op->iter_out * op->iter_in;
+      prof_->gemm_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn);
       for (uint32_t y = 0; y < op->iter_out; ++y) {
         for (uint32_t x = 0; x < op->iter_in; ++x) {
           for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) {
@@ -458,7 +458,6 @@ class Device {
   }
 
   void RunALU(const VTAAluInsn* op) {
-    prof_->alu_counter += op->iter_out * op->iter_in;
     if (op->use_imm) {
       RunALU_<true>(op);
     } else {
@@ -501,6 +500,7 @@ class Device {
 
   template<bool use_imm, typename F>
   void RunALULoop(const VTAAluInsn* op, F func) {
+    prof_->alu_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn);
     for (int y = 0; y < op->iter_out; ++y) {
       for (int x = 0; x < op->iter_in; ++x) {
         for (int k = op->uop_bgn; k < op->uop_end; ++k) {
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index ca42aea8b9051..f6a810cff7e9f 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -133,6 +133,8 @@ const char* getOpcodeString(int opcode, bool use_imm) {
     }
   } else if (opcode == VTA_ALU_OPCODE_SHR) {
     return "shr";
+  } else if (opcode == VTA_ALU_OPCODE_MUL) {
+    return "mul";
   }
   return "unknown op";
 }
@@ -737,230 +739,246 @@ void printMicroOp(int num_uop, VTAUop *uops) {
   }
 }
 
-// int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
-//   // Some assertions
-//   assert(batch % VTA_BATCH == 0);
-//   assert(vector_size % VTA_BLOCK_OUT == 0);
-//   assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm));
-//   printf("=====================================================================================\n");
-//   printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
-//     getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
-
-//   // Instruction count
-//   int ins_size = 3 * batch / VTA_BATCH + 2;
-//   // Micro op count
-//   int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
-//   // Input/output elements in each transfer
-//   int tx_size = vector_size / VTA_BLOCK_OUT;
-//   // Number of input sets to be generated
-//   int input_sets = (use_imm) ? 1 : 2;
-//   // Make sure we don't exceed buffer bounds
-//   assert(uop_size <= VTA_UOP_BUFF_DEPTH);
-//   assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
-
-//   // Immediate values
-//   acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
-//   for (int b = 0; b < batch / VTA_BATCH; b++) {
-//     if (opcode == VTA_ALU_OPCODE_MIN) {
-//       immediate[b] = static_cast<acc_T>(
-//           rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
-//     } else if (opcode == VTA_ALU_OPCODE_MAX) {
-//       immediate[b] = static_cast<acc_T>(
-//           rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
-//     } else if (opcode == VTA_ALU_OPCODE_ADD) {
-//       immediate[b] = static_cast<acc_T>(
-//           rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
-//     } else if (opcode == VTA_ALU_OPCODE_SHR) {
-//       immediate[b] = static_cast<acc_T>(
-//           rand_r(&globalSeed) % VTA_ACC_WIDTH - VTA_ACC_WIDTH/2);
-//     }
-//   }
-
-//   // Initialize instructions
-//   VTAGenericInsn *insn_buf =
-//       static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
-//   int insn_idx = 0;
-//   insn_buf[insn_idx++] =
-//       get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
-//   for (int b = 0; b < batch; b += VTA_BATCH) {
-//     insn_buf[insn_idx++] = get2DLoadStoreInsn(
-//         VTA_OPCODE_LOAD,                   // opcode
-//         VTA_MEM_ID_ACC,                    // vector size
-//         0,                                 // sram offset
-//         b / VTA_BATCH * tx_size * input_sets,  // dram offset
-//         1,                                 // y size
-//         tx_size * input_sets,              // x size
-//         tx_size * input_sets,              // x stride
-//         0,                                 // y pad
-//         0,                                 // x pad
-//         0,                                 // pop prev dep
-//         b > 0,                             // pop next dep
-//         0,                                 // push prev dep
-//         0);                                // push next dep
-//     insn_buf[insn_idx++] = getALUInsn(
-//         opcode,                            // opcode
-//         tx_size,                           // vector size
-//         use_imm,                           // use imm
-//         immediate[b / VTA_BATCH],          // imm
-//         uop_compression,                   // uop compression
-//         0,                                 // pop prev dep
-//         0,                                 // pop next dep
-//         0,                                 // push prev dep
-//         1);                                // push next dep
-//     insn_buf[insn_idx++] = get2DLoadStoreInsn(
-//         VTA_OPCODE_STORE,                  // opcode
-//         VTA_MEM_ID_OUT,                    // vector size
-//         0,                                 // sram offset
-//         b / VTA_BATCH * tx_size,           // dram offset
-//         1,                                 // y size
-//         tx_size,                           // x size
-//         tx_size,                           // x stride
-//         0,                                 // y pad
-//         0,                                 // x pad
-//         1,                                 // pop prev dep
-//         0,                                 // pop next dep
-//         1,                                 // push prev dep
-//         0);                                // push next dep
-//   }
-//   // Finish
-//   insn_buf[insn_idx++] = getFinishInsn(0, 1);
-//   // Prepare the uop buffer
-//   VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
-
-// #if VTA_DEBUG == 1
-//   printInstruction(ins_size, insn_buf);
-//   printMicroOp(uop_size, uop_buf);
-// #endif
-
-//   // Initialize the input/output data
-//   acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
-//   for (int i = 0; i < batch; i++) {
-//     for (int j = 0; j < vector_size * input_sets; j++) {
-//       if (opcode == VTA_ALU_OPCODE_MIN) {
-//         inputs[i][j] = static_cast<acc_T>(
-//             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
-//       } else if (opcode == VTA_ALU_OPCODE_MAX) {
-//         inputs[i][j] = static_cast<acc_T>(
-//             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
-//       } else if (opcode == VTA_ALU_OPCODE_ADD) {
-//         inputs[i][j] = static_cast<acc_T>(
-//             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
-//       }
-//     }
-//   }
-
-//   // Compute reference output
-//   out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
-//   for (int i = 0; i < batch; i++) {
-//     for (int j = 0; j < vector_size; j++) {
-//       acc_T tmp = 0;
-//       if (opcode == VTA_ALU_OPCODE_MIN) {
-//         if (!use_imm) {
-//           tmp = inputs[i][j] < inputs[i][j + vector_size] ?
-//                     inputs[i][j] :
-//                     inputs[i][j + vector_size];
-//         } else {
-//           tmp = inputs[i][j] < immediate[i / VTA_BATCH] ?
-//                     inputs[i][j] :
-//                     immediate[i / VTA_BATCH];
-//         }
-//       } else if (opcode == VTA_ALU_OPCODE_MAX) {
-//         if (!use_imm) {
-//           tmp = inputs[i][j] > inputs[i][j + vector_size] ?
-//                     inputs[i][j] :
-//                     inputs[i][j + vector_size];
-//         } else {
-//           tmp = inputs[i][j] > immediate[i / VTA_BATCH] ?
-//                     inputs[i][j] :
-//                     immediate[i / VTA_BATCH];
-//         }
-//       } else if (opcode == VTA_ALU_OPCODE_ADD) {
-//         if (!use_imm) {
-//           tmp = inputs[i][j] + inputs[i][j + vector_size];
-//         } else {
-//           tmp = inputs[i][j] + immediate[i / VTA_BATCH];
-//         }
-//       } else if (opcode == VTA_ALU_OPCODE_SHR) {
-//         if (immediate[i / VTA_BATCH] >= 0) {
-//           tmp = inputs[i][j] >> immediate[i / VTA_BATCH];
-//         } else {
-//           tmp = inputs[i][j] << (0 - immediate[i / VTA_BATCH]);
-//         }
-//       }
-//       // Set
-//       outputs_ref[i][j] = (out_T) tmp;
-//     }
-//   }
-
-//   // Pack input buffer
-//   axi_T *bias_buf =
-//       static_cast<axi_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
-//   packBuffer<axi_T, VTA_AXI_WIDTH, acc_T, VTA_ACC_WIDTH>(
-//       bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
-
-//   // Prepare output buffer
-//   axi_T *output_buf =
-//       static_cast<axi_T *>(allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
-
-// #ifdef NO_SIM
-//   // Invoke the VTA
-//   uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
-//   // Report on timining
-//   printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
-//   printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
-// #else
-//   // Invoke the VTA
-//   vta(ins_size,
-//       (volatile insn_T *) insn_buf,
-//       (volatile uop_T *) uop_buf,
-//       (volatile axi_T *) NULL,
-//       (volatile axi_T *) NULL,
-//       (volatile axi_T *) bias_buf,
-//       (volatile axi_T *) output_buf);
-// #endif
-
-//   // Unpack output buffer
-//   out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
-//   unpackBuffer<out_T, VTA_OUT_WIDTH, axi_T, VTA_AXI_WIDTH>(outputs,
-//                                                            output_buf,
-//                                                            batch,
-//                                                            vector_size,
-//                                                            VTA_BATCH,
-//                                                            VTA_BLOCK_OUT);
-
-//   // Correctness checks
-//   int err = 0;
-//   for (int i = 0; i < batch; i++) {
-//     for (int j = 0; j < vector_size; j++) {
-//       if (outputs_ref[i][j] != outputs[i][j]) {
-//         err++;
-// #if VTA_DEBUG == 1
-//         printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
-//                static_cast<int>(outputs_ref[i][j]),
-//                static_cast<int>(outputs[i][j]));
-// #endif
-//       }
-//     }
-//   }
-
-//   // Free all allocated arrays
-//   free(immediate);
-//   free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
-//   free2dArray<out_T>(outputs_ref, batch, vector_size);
-//   free2dArray<out_T>(outputs, batch, vector_size);
-//   freeBuffer(insn_buf);
-//   freeBuffer(uop_buf);
-//   freeBuffer(bias_buf);
-//   freeBuffer(output_buf);
-
-//   if (err == 0) {
-//     printf("INFO - ALU test successful!\n");
-//     return 0;
-//   } else {
-//     printf("INFO - ALU test failed, got %d errors!\n", err);
-//     return -1;
-//   }
-// }
+// Runs a single ALU opcode (min/max/add/shr/mul) over random data through vta() and
+// checks the result against a software reference model. The second operand comes from
+// the immediate field when use_imm is set, otherwise from a second set of inputs.
+// Returns 0 when every element matches, -1 otherwise.
+int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
+  // Some assertions
+  assert(batch % VTA_BATCH == 0);
+  assert(vector_size % VTA_BLOCK_OUT == 0);
+  printf("=====================================================================================\n");
+  printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
+    getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
+
+  // Instruction count
+  int ins_size = 3 * batch / VTA_BATCH + 2;
+  // Micro op count
+  int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
+  // Input/output elements in each transfer
+  int tx_size = vector_size / VTA_BLOCK_OUT;
+  // Number of input sets to be generated
+  int input_sets = (use_imm) ? 1 : 2;
+  // Make sure we don't exceed buffer bounds
+  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
+  assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
+
+  // Immediate values
+  acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
+  for (int b = 0; b < batch / VTA_BATCH; b++) {
+    if (opcode == VTA_ALU_OPCODE_MIN || opcode == VTA_ALU_OPCODE_MAX ||
+        opcode == VTA_ALU_OPCODE_ADD) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
+    } else if (opcode == VTA_ALU_OPCODE_SHR) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
+    } else if (opcode == VTA_ALU_OPCODE_MUL) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
+    }
+  }
+
+  // Initialize instructions
+  VTAGenericInsn *insn_buf =
+      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
+  int insn_idx = 0;
+  insn_buf[insn_idx++] =
+      get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
+  for (int b = 0; b < batch; b += VTA_BATCH) {
+    insn_buf[insn_idx++] = get2DLoadStoreInsn(
+        VTA_OPCODE_LOAD,                   // opcode
+        VTA_MEM_ID_ACC,                    // memory type
+        0,                                 // sram offset
+        b / VTA_BATCH * tx_size * input_sets,  // dram offset
+        1,                                 // y size
+        tx_size * input_sets,              // x size
+        tx_size * input_sets,              // x stride
+        0,                                 // y pad
+        0,                                 // x pad
+        0,                                 // pop prev dep
+        b > 0,                             // pop next dep
+        0,                                 // push prev dep
+        0);                                // push next dep
+    insn_buf[insn_idx++] = getALUInsn(
+        opcode,                            // opcode
+        tx_size,                           // vector size
+        use_imm,                           // use imm
+        immediate[b / VTA_BATCH],          // imm
+        uop_compression,                   // uop compression
+        0,                                 // pop prev dep
+        0,                                 // pop next dep
+        0,                                 // push prev dep
+        1);                                // push next dep
+    insn_buf[insn_idx++] = get2DLoadStoreInsn(
+        VTA_OPCODE_STORE,                  // opcode
+        VTA_MEM_ID_OUT,                    // memory type
+        0,                                 // sram offset
+        b / VTA_BATCH * tx_size,           // dram offset
+        1,                                 // y size
+        tx_size,                           // x size
+        tx_size,                           // x stride
+        0,                                 // y pad
+        0,                                 // x pad
+        1,                                 // pop prev dep
+        0,                                 // pop next dep
+        1,                                 // push prev dep
+        0);                                // push next dep
+  }
+  // Finish
+  insn_buf[insn_idx++] = getFinishInsn(0, 1);
+  // Prepare the uop buffer
+  VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
+
+#if VTA_DEBUG == 1
+  printInstruction(ins_size, insn_buf);
+  printMicroOp(uop_size, uop_buf);
+#endif
+
+  // Initialize the input/output data
+  acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size * input_sets; j++) {
+      if (opcode == VTA_ALU_OPCODE_MIN) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_MAX) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_ADD) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
+      } else if (opcode == VTA_ALU_OPCODE_SHR) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_MUL) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
+      }
+    }
+  }
+
+  // Compute reference output
+  out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size; j++) {
+      acc_T out_val = 0;
+      acc_T imm_val = immediate[i / VTA_BATCH];
+      // Rows hold a second operand set only when !use_imm; stay in bounds otherwise (value unused)
+      acc_T src_val = inputs[i][use_imm ? j : j + vector_size];
+      if (opcode == VTA_ALU_OPCODE_MIN) {
+        if (!use_imm) {
+          out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
+        } else {
+          out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
+        }
+      } else if (opcode == VTA_ALU_OPCODE_MAX) {
+        if (!use_imm) {
+          out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
+        } else {
+          out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
+        }
+      } else if (opcode == VTA_ALU_OPCODE_ADD) {
+        if (!use_imm) {
+          out_val = inputs[i][j] + src_val;
+        } else {
+          out_val = inputs[i][j] + imm_val;
+        }
+      } else if (opcode == VTA_ALU_OPCODE_SHR) {
+        if (!use_imm) {
+          if (src_val >= 0) {
+            out_val = inputs[i][j] >> src_val;
+          } else {
+            out_val = inputs[i][j] << (0 - src_val);
+          }
+        } else {
+          if (imm_val >= 0) {
+            out_val = inputs[i][j] >> imm_val;
+          } else {
+            out_val = inputs[i][j] << (0 - imm_val);
+          }
+        }
+      } else if (opcode == VTA_ALU_OPCODE_MUL) {
+        if (!use_imm) {
+          out_val = inputs[i][j] * src_val;
+        } else {
+          out_val = inputs[i][j] * imm_val;
+        }
+      }
+      // Set
+      outputs_ref[i][j] = (out_T) out_val;
+    }
+  }
+
+  // Pack input buffer
+  axi_T *bias_buf =
+      static_cast<axi_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
+  packBuffer<axi_T, VTA_AXI_WIDTH, acc_T, VTA_ACC_WIDTH>(
+      bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
+
+  // Prepare output buffer
+  axi_T *output_buf =
+      static_cast<axi_T *>(allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
+
+#ifdef NO_SIM
+  // Invoke the VTA
+  uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
+  // Report on timing
+  printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
+  printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
+#else
+  // Invoke the VTA
+  vta(ins_size,
+      (volatile insn_T *) insn_buf,
+      (volatile uop_T *) uop_buf,
+      (volatile axi_T *) NULL,
+      (volatile axi_T *) NULL,
+      (volatile axi_T *) bias_buf,
+      (volatile axi_T *) output_buf);
+#endif
+
+  // Unpack output buffer
+  out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
+  unpackBuffer<out_T, VTA_OUT_WIDTH, axi_T, VTA_AXI_WIDTH>(outputs,
+                                                           output_buf,
+                                                           batch,
+                                                           vector_size,
+                                                           VTA_BATCH,
+                                                           VTA_BLOCK_OUT);
+
+  // Correctness checks
+  int err = 0;
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size; j++) {
+      if (outputs_ref[i][j] != outputs[i][j]) {
+        err++;
+#if VTA_DEBUG == 1
+        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
+               static_cast<int>(outputs_ref[i][j]),
+               static_cast<int>(outputs[i][j]));
+#endif
+      }
+    }
+  }
+
+  // Free all allocated arrays
+  free(immediate);
+  free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
+  free2dArray<out_T>(outputs_ref, batch, vector_size);
+  free2dArray<out_T>(outputs, batch, vector_size);
+  freeBuffer(insn_buf);
+  freeBuffer(uop_buf);
+  freeBuffer(bias_buf);
+  freeBuffer(output_buf);
+
+  if (err == 0) {
+    printf("INFO - ALU test successful!\n");
+    return 0;
+  } else {
+    printf("INFO - ALU test failed, got %d errors!\n", err);
+    return -1;
+  }
+}
 
 int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
     int virtual_threads) {
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index b95103be182e2..5b7ceb7404e60 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -176,21 +176,42 @@ def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True):
                         wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
         bias_shape = (batch_size//env.BATCH, wl.out_filter//env.BLOCK_OUT,
                       1, 1, env.BATCH, env.BLOCK_OUT)
-
-        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
-        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
         data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
-        kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
         bias = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype)
+        coeff = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype)
+
+        # Handle quantized inputs (less than 8 bits)
+        # x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH)
+        # data_shape_pack = data_shape[:-1] + (data_shape[-1]//x_pack_factor,)
+        # data_arg = tvm.placeholder(
+        #     data_shape_pack,
+        #     dtype="int8", name="data_arg")
+        # data = vta.reinterpret(data_arg, data_shape, dtype=env.inp_dtype)
+
+        # Handle quantized kernels (less than 8 bits)
+        w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH)
+        kernel_shape_pack = kernel_shape[:-1] + (kernel_shape[-1]//w_pack_factor,)
+        kernel_arg = tvm.placeholder(
+            kernel_shape_pack,
+            dtype="int8", name="kernel_arg")
+        kernel = vta.reinterpret(kernel_arg, kernel_shape, dtype=env.wgt_dtype)
 
         res_conv = vta.top.packed_conv2d(
             data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride))
         res = topi.right_shift(res_conv, 8)
         res = topi.add(res, bias)
+        res = topi.multiply(res, coeff)
         res = my_clip(res, 0, (1 << env.OUT_WIDTH-1)-1)
-        res = topi.cast(res, "int8")
+
+        # Handle quantized outputs (less than 8 bits)
+        # o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH)
+        res_shape = topi.util.get_const_tuple(res.shape)
+        # res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,)
+        # res_arg = vta.reinterpret(res, res_shape_pack, dtype="int8")
 
         # To compute number of ops, use a x2 factor for FMA
+        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
+        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
         num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter
 
         a_shape = (batch_size, wl.in_filter, wl.height, wl.width)
@@ -202,57 +223,22 @@ def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True):
         assert wl.hpad == wl.wpad
         padding = wl.hpad
 
-        # Handle packing for quantized activations (less than 8bits)
-        x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH)
-        data_shape_pack = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN,
-                           wl.height, wl.width, env.BATCH, env.BLOCK_IN//x_pack_factor)
-        data_arg_buffer = tvm.decl_buffer(
-            data_shape_pack,
-            dtype="int8", name="data_arg")
-        data_bind_buffer = tvm.decl_buffer(
-            data.shape, data.dtype, name=data.op.name,
-            data=data_arg_buffer.data)
-
-        # Handle packing for quantized weights (less than 8bits)
-        w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH)
-        kernel_shape_pack = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN,
-                             wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN//w_pack_factor)
-        kernel_arg_buffer = tvm.decl_buffer(
-            kernel_shape_pack,
-            dtype="int8", name="kernel_arg")
-        kernel_bind_buffer = tvm.decl_buffer(
-            kernel.shape, kernel.dtype, name=kernel.op.name,
-            data=kernel_arg_buffer.data)
-
-        # Handle packing for outputs (less than 8bits)
-        o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH)
-        res_shape = topi.util.get_const_tuple(res.shape)
-        res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,)
-        res_arg_buffer = tvm.decl_buffer(
-            res_shape_pack,
-            dtype="int8", name="res_arg")
-        res_bind_buffer = tvm.decl_buffer(
-            res.shape, res.dtype, name=res.op.name,
-            data=res_arg_buffer.data)
-
-        binds = {kernel: kernel_bind_buffer, data: data_bind_buffer, res: res_bind_buffer}
-
         # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
         def get_ref_data():
             # derive min max for input and weight types (max non inclusive)
             a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
             w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
             a_np = np.random.randint(
-                a_min, a_max, size=a_shape).astype("int8")
+                0, 2, size=a_shape).astype("int8")
             w_np = np.random.randint(
-                w_min, w_max, size=w_shape).astype("int8")
+                0, 2, size=w_shape).astype("int8")
             b_np = topi.testing.conv2d_nchw_python(
                 a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype)
             return a_np, w_np, b_np
 
         def verify(s, check_correctness):
-            mod = vta.build(s, [data_arg_buffer, kernel_arg_buffer, bias, res_arg_buffer], "ext_dev",
-                            env.target_host, name="conv2d", binds=binds)
+            mod = vta.build(s, [data, kernel_arg, bias, coeff, res], "ext_dev",
+                            env.target_host, name="conv2d")
             temp = util.tempdir()
 
             mod.save(temp.relpath("conv2d.o"))
@@ -262,8 +248,10 @@ def verify(s, check_correctness):
             ctx = remote.ext_dev(0)
             # Data in original format
             data_orig, kernel_orig, res_ref = get_ref_data()
-            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32")
+            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 2).astype("int32")
             bias_orig = np.abs(bias_orig)
+            coeff_orig = (np.random.uniform(size=(wl.out_filter,)) * 2).astype("int32")
+            coeff_orig = np.abs(coeff_orig)
 
             data_packed = data_orig.reshape(
                 batch_size//env.BATCH, env.BATCH,
@@ -276,6 +264,9 @@ def verify(s, check_correctness):
             bias_packed = bias_orig.reshape(
                 batch_size // env.BATCH, wl.out_filter // env.BLOCK_OUT,
                 1, 1, env.BATCH, env.BLOCK_OUT)
+            coeff_packed = coeff_orig.reshape(
+                batch_size // env.BATCH, wl.out_filter // env.BLOCK_OUT,
+                1, 1, env.BATCH, env.BLOCK_OUT)
 
             # Quantized packing
             data_qpacked = _pack(data_packed, env.INP_WIDTH)
@@ -285,9 +276,10 @@ def verify(s, check_correctness):
             data_arr = tvm.nd.array(data_qpacked, ctx)
             kernel_arr = tvm.nd.array(kernel_qpacked, ctx)
             bias_arr = tvm.nd.array(bias_packed, ctx)
+            coeff_arr = tvm.nd.array(coeff_packed, ctx)
             res_arr = tvm.nd.array(res_np, ctx)
             time_f = f.time_evaluator("conv2d", ctx, number=5)
-            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
+            cost = time_f(data_arr, kernel_arr, bias_arr, coeff_arr, res_arr)
 
             res_unpack = res_arr.asnumpy()
             res_unpack = _unpack(res_unpack.astype("int8"), env.OUT_WIDTH)
@@ -299,6 +291,7 @@ def verify(s, check_correctness):
                 padding = wl.hpad
                 res_ref = res_ref >> 8
                 res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
+                res_ref *= coeff_orig.reshape(wl.out_filter, 1, 1)
                 res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH-1)-1).astype("int8")
                 np.testing.assert_allclose(res_unpack, res_ref)
             return cost
@@ -308,7 +301,7 @@ def conv_normal(print_ir):
             with vta.build_config():
                 s = vta.top.schedule_packed_conv2d([res])
                 if print_ir:
-                    print(vta.lower(s, [data_arg_buffer, kernel_arg_buffer, bias, res_arg_buffer], simple_mode=True))
+                    print(vta.lower(s, [data, kernel_arg, bias, coeff, res], simple_mode=True))
             cost = verify(s, True)
             gops = (num_ops / cost.mean) / float(10 ** 9)
             print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))