diff --git a/vta/config/pynq_sample.json b/vta/config/pynq_sample.json index 5c37108e6b12f..fd6190caa9ed8 100644 --- a/vta/config/pynq_sample.json +++ b/vta/config/pynq_sample.json @@ -1,17 +1,21 @@ { "TARGET" : "pynq", + "HW_VER" : "0.0.1", "HW_FREQ" : 100, "HW_CLK_TARGET" : 8, - "HW_VER" : "0.0.0", + "ALU_EN" : true, + "MUL_EN" : true, + "GEMM_II" : 2, + "TALU_II" : 4, "LOG_INP_WIDTH" : 3, - "LOG_WGT_WIDTH" : 3, + "LOG_WGT_WIDTH" : 1, "LOG_ACC_WIDTH" : 5, "LOG_OUT_WIDTH" : 3, "LOG_BATCH" : 0, - "LOG_BLOCK_IN" : 4, - "LOG_BLOCK_OUT" : 4, + "LOG_BLOCK_IN" : 5, + "LOG_BLOCK_OUT" : 5, "LOG_UOP_BUFF_SIZE" : 15, - "LOG_INP_BUFF_SIZE" : 15, - "LOG_WGT_BUFF_SIZE" : 18, + "LOG_INP_BUFF_SIZE" : 17, + "LOG_WGT_BUFF_SIZE" : 17, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/vta_config.json b/vta/config/vta_config.json index e9cc88a5479dc..8a7bcc01af62b 100644 --- a/vta/config/vta_config.json +++ b/vta/config/vta_config.json @@ -1,9 +1,11 @@ { - "TARGET" : "sim", + "TARGET" : "pynq", + "HW_VER" : "0.0.2", "HW_FREQ" : 100, - "HW_CLK_TARGET" : 8, - "HW_VER" : "0.0.0", - "GEMM_II" : 2, + "HW_CLK_TARGET" : 7, + "ALU_EN" : true, + "MUL_EN" : true, + "GEMM_II" : 1, "TALU_II" : 2, "LOG_INP_WIDTH" : 3, "LOG_WGT_WIDTH" : 1, @@ -13,7 +15,7 @@ "LOG_BLOCK_IN" : 5, "LOG_BLOCK_OUT" : 5, "LOG_UOP_BUFF_SIZE" : 15, - "LOG_INP_BUFF_SIZE" : 17, - "LOG_WGT_BUFF_SIZE" : 17, + "LOG_INP_BUFF_SIZE" : 16, + "LOG_WGT_BUFF_SIZE" : 18, "LOG_ACC_BUFF_SIZE" : 17 } diff --git a/vta/config/vta_config.py b/vta/config/vta_config.py index bc86b6887cc25..664b04db0b7a2 100644 --- a/vta/config/vta_config.py +++ b/vta/config/vta_config.py @@ -38,6 +38,10 @@ def main(): help="print the target") parser.add_argument("--cfg-str", action="store_true", help="print the configuration string") + parser.add_argument("--get-aluen", action="store_true", + help="returns whether ALU is enabled") + parser.add_argument("--get-mulen", action="store_true", + help="returns whether mul in ALU is enabled") parser.add_argument("--get-gemmii", 
action="store_true", help="returns the GEMM core II") parser.add_argument("--get-taluii", action="store_true", @@ -90,7 +94,28 @@ def main(): if not ok_path_list: raise RuntimeError("Cannot find config in %s" % str(path_list)) cfg = json.load(open(ok_path_list[0])) - cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"] + cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"] + # Generate bitstream config string. + # Needs to match the BITSTREAM string in python/vta/environment.py + cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format( + cfg["HW_VER"].replace('.', '_'), + (1 << cfg["LOG_BATCH"]), + (1 << cfg["LOG_BLOCK_IN"]), + (1 << cfg["LOG_BLOCK_OUT"]), + (1 << cfg["LOG_INP_WIDTH"]), + (1 << cfg["LOG_WGT_WIDTH"]), + (1 << cfg["LOG_OUT_WIDTH"]), + cfg["LOG_UOP_BUFF_SIZE"], + cfg["LOG_INP_BUFF_SIZE"], + cfg["LOG_WGT_BUFF_SIZE"], + cfg["LOG_ACC_BUFF_SIZE"], + cfg["HW_FREQ"], + cfg["HW_CLK_TARGET"], + cfg["GEMM_II"]) + if cfg["ALU_EN"]: + cfg["BITSTREAM"] += "_aii{}".format(cfg["TALU_II"]) + if cfg["MUL_EN"] and cfg["ALU_EN"]: + cfg["BITSTREAM"] += "_mul" pkg = get_pkg_config(cfg) if args.target: @@ -119,23 +144,13 @@ def main(): fo.write(pkg.cfg_json) if args.cfg_str: - # Needs to match the BITSTREAM string in python/vta/environment.py - cfg_str = "{}x{}x{}_g{}_a{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format( - (1 << cfg["LOG_BATCH"]), - (1 << cfg["LOG_BLOCK_IN"]), - (1 << cfg["LOG_BLOCK_OUT"]), - cfg["GEMM_II"], - cfg["TALU_II"], - (1 << cfg["LOG_INP_WIDTH"]), - (1 << cfg["LOG_WGT_WIDTH"]), - cfg["LOG_UOP_BUFF_SIZE"], - cfg["LOG_INP_BUFF_SIZE"], - cfg["LOG_WGT_BUFF_SIZE"], - cfg["LOG_ACC_BUFF_SIZE"], - cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) - print(cfg_str) + print(cfg["BITSTREAM"]) + + if args.get_aluen: + print(cfg["ALU_EN"]) + + if args.get_mulen: + print(cfg["MUL_EN"]) if args.get_gemmii: print(cfg["GEMM_II"]) diff 
--git a/vta/hardware/xilinx/Makefile b/vta/hardware/xilinx/Makefile index 508709c6c5edb..fcd5b71d7788f 100644 --- a/vta/hardware/xilinx/Makefile +++ b/vta/hardware/xilinx/Makefile @@ -16,13 +16,9 @@ HSI = hsi # HLS mode MODE = all # Debug flag -DEBUG = false +DEBUG = False # SLURM -SLURM = false -# Prevent generation of DSP -NO_DSP = false -# Prevent generation of ALU -NO_ALU = true +SLURM = False # Process VTA JSON config VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py @@ -44,6 +40,8 @@ VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize) VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize) VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize) VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize) +VTA_ALU_EN := $(shell ${VTA_CONFIG} --get-aluen) +VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen) #--------------------- # FPGA Parameters @@ -87,7 +85,8 @@ $(IP_PATH): $(SRC_DIR)/* cd $(IP_BUILD_PATH) && \ $(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \ -tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \ - $(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \ + $(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \ + $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \ $(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \ $(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \ $(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \ diff --git a/vta/hardware/xilinx/scripts/hls.tcl b/vta/hardware/xilinx/scripts/hls.tcl index 4d593e26c2e38..3bddcc3ad1e60 100644 --- a/vta/hardware/xilinx/scripts/hls.tcl +++ b/vta/hardware/xilinx/scripts/hls.tcl @@ -11,8 +11,8 @@ # Arg 4: path to include sources # Arg 5: mode # Arg 6: debug -# Arg 7: no_dsp -# Arg 8: no_alu +# Arg 7: alu_ena +# Arg 8: mul_ena # Arg 9: target clock period # Arg 10: target II for GEMM # Arg 11: target II for tensor ALU @@ -36,8 +36,8 @@ if { [llength $argv] eq 25 } { set include_dir [lindex $argv 5] set mode [lindex $argv 6] 
set debug [lindex $argv 7] - set no_dsp [lindex $argv 8] - set no_alu [lindex $argv 9] + set alu_ena [lindex $argv 8] + set mul_ena [lindex $argv 9] set target_period [lindex $argv 10] set target_gemm_ii [lindex $argv 11] set target_alu_ii [lindex $argv 12] @@ -59,9 +59,9 @@ if { [llength $argv] eq 25 } { set test_dir "../../src/test" set include_dir "../../include" set mode "all" - set debug "false" - set no_dsp "true" - set no_alu "false" + set debug "False" + set alu_ena "True" + set mul_ena "True" set target_period 8 set target_gemm_ii 10 set target_alu_ii 16 @@ -83,7 +83,7 @@ if { [llength $argv] eq 25 } { # Initializes the HLS design and sets HLS pragmas for memory partitioning. # This is necessary because of a Vivado restriction that doesn't allow for # buses wider than 1024 bits. -proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in block_out no_alu} { +proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} { # Set device number set_part {xc7z020clg484-1} @@ -98,14 +98,14 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in create_clock -period $per -name default # Set pipeline directive - set_directive_pipeline -II $ii "compute/READ_GEMM_UOP" + set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP" - if {$no_alu=="false"} { - set_directive_pipeline -II $ii "compute/READ_ALU_UOP" + if {$alu_ena=="True"} { + set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP" } - # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*ii) - set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $ii}] + # Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii) + set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $g_ii}] set inp_partition_factor [expr {$inp_bus_width / $max_width}] if {$inp_partition_factor == 0} { set inp_reshape_factor [expr {$inp_bus_width / $axi_width}] @@ -118,8 +118,8 @@ proc init_design {per ii 
inp_width wgt_width out_width acc_width batch block_in set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem } - # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*ii)) - set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $ii}] + # Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*g_ii)) + set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $g_ii}] set wgt_partition_factor [expr {$wgt_bus_width / $max_width}] if {$wgt_partition_factor == 0} { set wgt_reshape_factor [expr {$wgt_bus_width / $axi_width}] @@ -132,8 +132,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem } - # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*ii)) - set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $ii}] + # Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*g_ii)) + set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $g_ii}] set out_partition_factor [expr {$out_bus_width / $max_width}] if {$out_partition_factor == 0} { set out_reshape_factor [expr {$out_bus_width / $axi_width}] @@ -147,9 +147,9 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem } # Set accumulator partition factor - # set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $ii}] + # set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $g_ii}] # set acc_reshape_factor [expr {$acc_bus_width / $axi_width}] - # set_directive_array_reshape -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem + # 
set_directive_array_partition -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem } # C define flags to pass to compiler @@ -160,14 +160,14 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \ -DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \ -DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \ -DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size" -if {$debug=="true"} { +if {$debug=="True"} { append cflags " -DVTA_DEBUG=1" } -if {$no_dsp=="true"} { - append cflags " -DNO_DSP" +if {$alu_ena=="True"} { + append cflags " -DALU_EN" } -if {$no_alu=="true"} { - append cflags " -DNO_ALU" +if {$mul_ena=="True"} { + append cflags " -DMUL_EN" } # HLS behavioral sim @@ -178,7 +178,7 @@ if {$mode=="all" || $mode=="sim"} { add_files -tb $sim_dir/vta_test.cc -cflags $cflags add_files -tb $test_dir/test_lib.cc -cflags $cflags open_solution "solution0" - init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu + init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena csim_design -clean close_project } @@ -189,7 +189,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} { set_top fetch add_files $src_dir/vta.cc -cflags $cflags open_solution "solution0" - init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu + init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena csynth_design if {$mode=="all" || $mode=="skip_sim"} { export_design -format ip_catalog @@ -203,7 +203,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} { set_top load add_files $src_dir/vta.cc -cflags $cflags open_solution "solution0" - init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in 
$block_out $no_alu + init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena csynth_design if {$mode=="all" || $mode=="skip_sim"} { export_design -format ip_catalog @@ -217,7 +217,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} { set_top compute add_files $src_dir/vta.cc -cflags $cflags open_solution "solution0" - init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu + init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena csynth_design if {$mode=="all" || $mode=="skip_sim"} { export_design -format ip_catalog @@ -231,7 +231,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} { set_top store add_files $src_dir/vta.cc -cflags $cflags open_solution "solution0" - init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu + init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena csynth_design if {$mode=="all" || $mode=="skip_sim"} { export_design -format ip_catalog diff --git a/vta/hardware/xilinx/sim/vta_test.cc b/vta/hardware/xilinx/sim/vta_test.cc index cd7449bfc8294..266eeaae2f9b8 100644 --- a/vta/hardware/xilinx/sim/vta_test.cc +++ b/vta/hardware/xilinx/sim/vta_test.cc @@ -29,29 +29,41 @@ int main(void) { int status = 0; - // // Run ALU test (vector-scalar operators) - // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false); - // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false); - // status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true); - // status |= 
alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false); - // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false); - - // // Run ALU test (vector-vector operators) - // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false); - // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false); - // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true); - // status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false); - - // // Run blocked GEMM test - // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2); - // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2); - // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1); - // status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1); +#ifdef ALU_EN + // Run ALU test (vector-scalar operators) + status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false); + + // Run ALU test (vector-vector operators) + status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true); + status |= 
alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false); + +#ifdef MUL_EN + status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, false); + status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, true); + status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, false); +#endif // MUL_EN + +#endif // ALU_EN + + // Run blocked GEMM test + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1); + status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1); // Simple GEMM unit test status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, true); diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc index ca384b07635eb..f59f2a53b5a00 100644 --- a/vta/hardware/xilinx/src/vta.cc +++ b/vta/hardware/xilinx/src/vta.cc @@ -212,10 +212,17 @@ void compute( // Accumulator storage static axi_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_TENSOR_ELEMS]; -#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2 +#pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2 // This is necessary to obtain II=1 #pragma HLS DEPENDENCE variable = acc_mem inter false + +#ifdef MUL_EN +// This will limit DSP util when Multipliers are enabled in the ALU +#pragma HLS allocation instances=mul limit=220 operation +#endif // MUL_EN + + // Pop GEMM instruction insn_T insn = gemm_queue.read(); @@ -350,13 +357,13 @@ void compute( } // Read in accum tensor - acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT]; + reg_T 
a_tensor[VTA_BATCH][VTA_BLOCK_OUT]; for (int b = 0; b < VTA_BATCH; b++) { for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) { axi_T packet = acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p]; for (int w = 0; w < AXI_ACC_RATIO; w++) { a_tensor[b][p * AXI_ACC_RATIO + w] = - packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH); + packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH); } } } @@ -368,7 +375,7 @@ void compute( for (int b = 0; b < VTA_BATCH; b++) { for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) { // Initialize the accumulator values - acc_T accum = a_tensor[b][oc]; + reg_T accum = a_tensor[b][oc]; // Dot product sum sum_T tmp = 0; // Inner matrix multiplication loop (input channel/feature) @@ -382,9 +389,9 @@ void compute( tmp += (sum_T) prod; } // Update summation - accum += (acc_T) tmp; + accum += (reg_T) tmp; // Write back result acc_mem - a_tensor[b][oc] = reset_out ? (acc_T) 0 : accum; + a_tensor[b][oc] = reset_out ? (reg_T) 0 : accum; // And output vector o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0); } @@ -395,7 +402,7 @@ void compute( for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) { axi_T packet = 0; for (int w = 0; w < AXI_ACC_RATIO; w++) { - packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = a_tensor[b][p * AXI_ACC_RATIO + w]; + packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = (acc_T) a_tensor[b][p * AXI_ACC_RATIO + w]; } acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p] = packet; } @@ -413,7 +420,7 @@ void compute( } } } -#ifndef NO_ALU +#ifdef ALU_EN else if (opcode == VTA_OPCODE_ALU) { // Iterate over micro op READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) { @@ -427,25 +434,25 @@ void compute( uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in; // Read in src tensor - acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT]; + reg_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT]; for (int b = 0; b < VTA_BATCH; b++) { for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) { axi_T packet = acc_mem[src_idx][b * 
ACC_VEC_AXI_RATIO + p]; for (int w = 0; w < AXI_ACC_RATIO; w++) { src_tensor[b][p * AXI_ACC_RATIO + w] = - packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH); + packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH); } } } // Read in dst tensor - acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT]; + reg_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT]; for (int b = 0; b < VTA_BATCH; b++) { for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) { axi_T packet = acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p]; for (int w = 0; w < AXI_ACC_RATIO; w++) { dst_tensor[b][p * AXI_ACC_RATIO + w] = - packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH); + packet.range(w * VTA_ACC_WIDTH + VTA_REG_WIDTH - 1, w * VTA_ACC_WIDTH); } } } @@ -457,23 +464,40 @@ void compute( for (int i = 0; i < VTA_BATCH; i++) { for (int b = 0; b < VTA_BLOCK_OUT; b++) { // Read in operands - acc_T src_0 = dst_tensor[i][b]; - acc_T src_1 = use_imm ? (acc_T) imm : src_tensor[i][b]; - // Compute Min/Max - acc_T mix_val = src_0 < src_1 ? - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) : - (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0); - dst_tensor[i][b] = mix_val; - o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Sum - acc_T add_val = - src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0); - dst_tensor[i][b] = add_val; - o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0); - // Compute Shift Right - acc_T shr_val = src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0); - dst_tensor[i][b] = shr_val; - o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0); + reg_T src_0 = dst_tensor[i][b]; + reg_T src_1 = use_imm ? (reg_T) imm : src_tensor[i][b]; + aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0); + aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0); + if (alu_opcode == VTA_ALU_OPCODE_MIN || alu_opcode == VTA_ALU_OPCODE_MAX) { + // Compute Min/Max + reg_T mix_val = src_0 < src_1 ? 
+ (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) : + (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0); + dst_tensor[i][b] = mix_val; + o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0); + } else if (alu_opcode == VTA_ALU_OPCODE_ADD) { + // Compute Sum + reg_T add_val = + src_0.range(VTA_REG_WIDTH - 1, 0) + src_1.range(VTA_REG_WIDTH - 1, 0); + dst_tensor[i][b] = add_val; + o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0); + } else if (alu_opcode == VTA_ALU_OPCODE_SHR) { + // Compute Shift Right + reg_T shr_val = src_0 >> shft_by; + dst_tensor[i][b] = shr_val; + o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH-1, 0); + } +#ifdef MUL_EN + else if (alu_opcode == VTA_ALU_OPCODE_MUL) { + // Compute Mul Right + reg_T mul_val = src_0 * mul_by; +#ifdef NO_DSP +#pragma HLS RESOURCE variable = mul_val core = Mul_LUT +#endif // NO_DSP + dst_tensor[i][b] = mul_val; + o_tensor[i][b] = (out_T) mul_val.range(VTA_OUT_WIDTH-1, 0); + } +#endif // MUL_EN } } @@ -482,7 +506,7 @@ void compute( for (int p = 0; p < ACC_VEC_AXI_RATIO; p++) { axi_T packet = 0; for (int w = 0; w < AXI_ACC_RATIO; w++) { - packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = dst_tensor[b][p * AXI_ACC_RATIO + w]; + packet.range((w + 1) * VTA_ACC_WIDTH - 1, w * VTA_ACC_WIDTH) = (acc_T) dst_tensor[b][p * AXI_ACC_RATIO + w]; } acc_mem[dst_idx][b * ACC_VEC_AXI_RATIO + p] = packet; } @@ -493,14 +517,14 @@ void compute( for (int p = 0; p < OUT_VEC_AXI_RATIO; p++) { axi_T packet = 0; for (int w = 0; w < AXI_OUT_RATIO; w++) { - packet.range((w + 1) * VTA_OUT_WIDTH - 1, w * VTA_OUT_WIDTH) = o_tensor[b][p * AXI_OUT_RATIO + w]; + packet.range((w + 1) * VTA_OUT_WIDTH - 1, w * VTA_OUT_WIDTH) = (acc_T) o_tensor[b][p * AXI_OUT_RATIO + w]; } out_mem[dst_idx][b * OUT_VEC_AXI_RATIO + p] = packet; } } } } -#endif // NO_ALU +#endif // ALU_EN // Update offsets dst_offset_in += dst_factor_in; diff --git a/vta/hardware/xilinx/src/vta.h b/vta/hardware/xilinx/src/vta.h index 
f4e1999814ec0..ee3aadeb071ec 100644 --- a/vta/hardware/xilinx/src/vta.h +++ b/vta/hardware/xilinx/src/vta.h @@ -35,22 +35,25 @@ typedef ap_uint axi_T; typedef ap_uint uop_T; /* \typedef inp_T Input datatype*/ -typedef ap_uint inp_T; +typedef ap_int inp_T; /* \typedef wgt_T Weight datatype*/ -typedef ap_uint wgt_T; +typedef ap_int wgt_T; /* \typedef out_T Output datatype*/ -typedef ap_uint out_T; +typedef ap_int out_T; /* \typedef acc_T Accumulator datatype*/ -typedef ap_uint acc_T; +typedef ap_int acc_T; + +/* \typedef reg_T Register file datatype*/ +typedef ap_int reg_T; /* \typedef mul_T Multiplier output datatype*/ -typedef ap_uint mul_T; +typedef ap_int mul_T; /* \typedef sum_T GEMM accumulator datatype*/ -typedef ap_uint sum_T; +typedef ap_int sum_T; /* \typedef uop_idx_T Micro-op SRAM index datatype*/ typedef ap_uint uop_idx_T; @@ -94,11 +97,14 @@ typedef ap_uint memop_pad_T; /* \typedef aluop_opcode_T ALU operation opcode datatype*/ typedef ap_uint aluop_opcode_T; -/* \typedef aluop_opcode_T ALU operation immediate datatype*/ +/* \typedef aluop_imm_T ALU operation immediate datatype*/ typedef ap_int aluop_imm_T; -/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/ -typedef ap_int aluop_sh_imm_T; +/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/ +typedef ap_int aluop_shr_arg_T; + +/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/ +typedef ap_int aluop_mul_arg_T; /*! * \brief Fetch module. diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h index d21e8233aeb41..5d105ab6d940e 100644 --- a/vta/include/vta/hw_spec.h +++ b/vta/include/vta/hw_spec.h @@ -16,6 +16,9 @@ extern "C" { /*! AXI bus width */ #define VTA_AXI_WIDTH 64 +/*! Register file width */ +#define VTA_REG_WIDTH 24 + /*! log2 of instruction data type width */ #define VTA_LOG_INS_WIDTH 7 /*! Instruction data type width */ @@ -50,6 +53,8 @@ extern "C" { #define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN) /*! 
Accumulator vector width */ #define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT) +/*! Register file vector width */ +#define VTA_REG_VECTOR_WIDTH (VTA_REG_WIDTH * VTA_BLOCK_OUT) /*! Output vector width */ #define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT) @@ -98,7 +103,7 @@ extern "C" { /*! Instruction opcode field bitwidth */ #define VTA_OPCODE_BIT_WIDTH 3 /*! ALU opcode field bitwidth */ -#define VTA_ALU_OPCODE_BIT_WIDTH 2 +#define VTA_ALU_OPCODE_BIT_WIDTH 3 /*! Opcode: load encoding */ #define VTA_OPCODE_LOAD 0 @@ -119,6 +124,8 @@ extern "C" { #define VTA_ALU_OPCODE_ADD 2 /*! ALU opcode: shift right by immediate op */ #define VTA_ALU_OPCODE_SHR 3 +/*! ALU opcode: multiply op */ +#define VTA_ALU_OPCODE_MUL 4 /*! Memory type field bitwidth */ #define VTA_MEMOP_ID_BIT_WIDTH 2 @@ -134,10 +141,14 @@ extern "C" { #define VTA_MEMOP_PAD_BIT_WIDTH 4 /*! Load/Store Instruction: padding value encoding width*/ #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2 -/*! ALU Instruction: immediate bitwidth*/ -#define VTA_ALUOP_IMM_BIT_WIDTH 16 /*! GEMM/ALU Instruction: loop max iter bits */ #define VTA_LOOP_ITER_WIDTH 14 +/*! ALU Instruction: immediate bitwidth*/ +#define VTA_ALUOP_IMM_BIT_WIDTH 16 +/*! ALU Instruction: shift arg bitwidth*/ +#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH) +/*! ALU Instruction: multiply arg bitwidth*/ +#define VTA_MUL_ARG_BIT_WIDTH 8 /*! 
Mem ID constant: uop memory */ #define VTA_MEM_ID_UOP 0 diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py index 329e741f7d1ca..73ce062cb62c9 100644 --- a/vta/python/vta/bitstream.py +++ b/vta/python/vta/bitstream.py @@ -43,6 +43,10 @@ def download_bitstream(): env = get_env() + if env.TARGET == "sim": + print("Skipping programming phase in sim mode") + return True + success = False bit = get_bitstream_path() url = os.path.join(BITSTREAM_URL, env.TARGET) diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index a77e29ac3a52e..c270585860bd2 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -37,6 +37,7 @@ class DevContext(object): ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 + ALU_OPCODE_MUL = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 @@ -138,20 +139,6 @@ def __init__(self, cfg): self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8 self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8 self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8 - # Configuration bitstream name - self.BITSTREAM = "{}x{}x{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}.bit".format( - (1 << cfg["LOG_BATCH"]), - (1 << cfg["LOG_BLOCK_IN"]), - (1 << cfg["LOG_BLOCK_OUT"]), - (1 << cfg["LOG_INP_WIDTH"]), - (1 << cfg["LOG_WGT_WIDTH"]), - cfg["LOG_UOP_BUFF_SIZE"], - cfg["LOG_INP_BUFF_SIZE"], - cfg["LOG_WGT_BUFF_SIZE"], - cfg["LOG_ACC_BUFF_SIZE"], - cfg["HW_FREQ"], - cfg["HW_CLK_TARGET"], - cfg["HW_VER"].replace('.', '_')) # dtypes self.acc_dtype = "int%d" % self.ACC_WIDTH self.inp_dtype = "int%d" % self.INP_WIDTH @@ -162,6 +149,27 @@ def __init__(self, cfg): self._mock_env = None self._dev_ctx = None self._last_env = None + # derive bitstream name + self.BITSTREAM = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format( + self.HW_VER.replace('.', '_'), + self.BATCH, + self.BLOCK_IN, + self.BLOCK_OUT, + self.INP_WIDTH, + self.WGT_WIDTH, + self.OUT_WIDTH, + self.LOG_UOP_BUFF_SIZE, + self.LOG_INP_BUFF_SIZE, + 
self.LOG_WGT_BUFF_SIZE, + self.LOG_ACC_BUFF_SIZE, + self.HW_FREQ, + self.HW_CLK_TARGET, + self.GEMM_II) + if self.ALU_EN: + self.BITSTREAM += "_aii{}".format(self.TALU_II) + if self.MUL_EN and self.ALU_EN: + self.BITSTREAM += "_mul" + self.BITSTREAM += ".bit" def __enter__(self): self._last_env = Environment.current diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 30b4808f5e2d8..2a589c546e799 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -22,9 +22,13 @@ class PkgConfig(object): """ cfg_keys = [ "TARGET", + "HW_VER", "HW_FREQ", "HW_CLK_TARGET", - "HW_VER", + "ALU_EN", + "MUL_EN", + "GEMM_II", + "TALU_II", "LOG_INP_WIDTH", "LOG_WGT_WIDTH", "LOG_ACC_WIDTH", @@ -35,7 +39,7 @@ class PkgConfig(object): "LOG_UOP_BUFF_SIZE", "LOG_INP_BUFF_SIZE", "LOG_WGT_BUFF_SIZE", - "LOG_ACC_BUFF_SIZE", + "LOG_ACC_BUFF_SIZE" ] def __init__(self, cfg, proj_root): # include path diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 549d7144d3214..7a73b58278052 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -330,7 +330,9 @@ def _get_workload(data, pad_data, kernel, output): w_str = (i_w + w_pad*2 - k_w) // (o_w - 1) return Workload(i_b, i_h, i_w, i_c, o_c, k_h, k_w, h_pad, w_pad, h_str, w_str) -def schedule_packed_conv2d(outs, plan=None): +def schedule_packed_conv2d(outs, plan=None, skip_load_inp=False, skip_load_wgt=False, + skip_load_acc=False, skip_store_out=False, skip_alu=False, + skip_gemm=False): """ Schedule the packed conv2d. 
""" assert len(outs) == 1 @@ -369,10 +371,14 @@ def _traverse(op): plan = find_schedules(wrkld, vt_only=True, best_only=True)[0] logging.info("Trying to find plan for %s", wrkld) env = get_env() + mock = env.mock - load_inp = load_wgt = load_out = store_out = env.dma_copy - alu = env.alu - gemm = env.gemm + load_inp = mock.dma_copy if skip_load_inp else env.dma_copy + load_wgt = mock.dma_copy if skip_load_wgt else env.dma_copy + load_acc = mock.dma_copy if skip_load_acc else env.dma_copy + store_out = mock.dma_copy if skip_store_out else env.dma_copy + alu = mock.alu if skip_alu else env.alu + gemm = mock.gemm if skip_gemm else env.gemm # schedule1 oshape = topi.util.get_const_tuple(output.shape) @@ -418,7 +424,7 @@ def _traverse(op): for tensor in cache_read_ewise: s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], load_out) + s[tensor].pragma(s[tensor].op.axis[0], load_acc) # virtual threading along output channel axes if plan.oc_nthread > 1: diff --git a/vta/src/sim/sim_driver.cc b/vta/src/sim/sim_driver.cc index 60645818757c5..8123d73c0fb58 100644 --- a/vta/src/sim/sim_driver.cc +++ b/vta/src/sim/sim_driver.cc @@ -406,7 +406,7 @@ class Device { void RunGEMM(const VTAGemInsn* op) { if (!op->reset_reg) { - prof_->gemm_counter += op->iter_out * op->iter_in; + prof_->gemm_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); for (uint32_t y = 0; y < op->iter_out; ++y) { for (uint32_t x = 0; x < op->iter_in; ++x) { for (uint32_t uindex = op->uop_bgn; uindex < op->uop_end; ++uindex) { @@ -458,7 +458,6 @@ class Device { } void RunALU(const VTAAluInsn* op) { - prof_->alu_counter += op->iter_out * op->iter_in; if (op->use_imm) { RunALU_(op); } else { @@ -501,6 +500,8 @@ class Device { template void RunALULoop(const VTAAluInsn* op, F func) { + // Count one ALU op per (y, x, uop) iteration, like gemm_counter in RunGEMM. + prof_->alu_counter += op->iter_out * op->iter_in * (op->uop_end - op->uop_bgn); for (int y = 0; y < op->iter_out; ++y) { for (int x = 0; x < op->iter_in; ++x) { for (int k = op->uop_bgn; 
k < op->uop_end; ++k) { diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc index ca42aea8b9051..f6a810cff7e9f 100644 --- a/vta/tests/hardware/common/test_lib.cc +++ b/vta/tests/hardware/common/test_lib.cc @@ -133,6 +133,8 @@ const char* getOpcodeString(int opcode, bool use_imm) { } } else if (opcode == VTA_ALU_OPCODE_SHR) { return "shr"; + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; } return "unknown op"; } @@ -737,230 +739,246 @@ void printMicroOp(int num_uop, VTAUop *uops) { } } -// int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) { -// // Some assertions -// assert(batch % VTA_BATCH == 0); -// assert(vector_size % VTA_BLOCK_OUT == 0); -// assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm)); -// printf("=====================================================================================\n"); -// printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", -// getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); - -// // Instruction count -// int ins_size = 3 * batch / VTA_BATCH + 2; -// // Micro op count -// int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT; -// // Input/output elements in each transfer -// int tx_size = vector_size / VTA_BLOCK_OUT; -// // Number of input sets to be generated -// int input_sets = (use_imm) ? 
1 : 2; -// // Make sure we don't exceed buffer bounds -// assert(uop_size <= VTA_UOP_BUFF_DEPTH); -// assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH); - -// // Immediate values -// acc_T *immediate = static_cast(malloc(sizeof(acc_T) * batch / VTA_BATCH)); -// for (int b = 0; b < batch / VTA_BATCH; b++) { -// if (opcode == VTA_ALU_OPCODE_MIN) { -// immediate[b] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); -// } else if (opcode == VTA_ALU_OPCODE_MAX) { -// immediate[b] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); -// } else if (opcode == VTA_ALU_OPCODE_ADD) { -// immediate[b] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1))); -// } else if (opcode == VTA_ALU_OPCODE_SHR) { -// immediate[b] = static_cast( -// rand_r(&globalSeed) % VTA_ACC_WIDTH - VTA_ACC_WIDTH/2); -// } -// } - -// // Initialize instructions -// VTAGenericInsn *insn_buf = -// static_cast(allocBuffer(sizeof(VTAGenericInsn) * ins_size)); -// int insn_idx = 0; -// insn_buf[insn_idx++] = -// get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); -// for (int b = 0; b < batch; b += VTA_BATCH) { -// insn_buf[insn_idx++] = get2DLoadStoreInsn( -// VTA_OPCODE_LOAD, // opcode -// VTA_MEM_ID_ACC, // vector size -// 0, // sram offset -// b / VTA_BATCH * tx_size * input_sets, // dram offset -// 1, // y size -// tx_size * input_sets, // x size -// tx_size * input_sets, // x stride -// 0, // y pad -// 0, // x pad -// 0, // pop prev dep -// b > 0, // pop next dep -// 0, // push prev dep -// 0); // push next dep -// insn_buf[insn_idx++] = getALUInsn( -// opcode, // opcode -// tx_size, // vector size -// use_imm, // use imm -// immediate[b / VTA_BATCH], // imm -// uop_compression, // uop compression -// 0, // pop prev dep -// 0, // pop next dep -// 0, // push prev dep -// 1); // push next dep -// 
insn_buf[insn_idx++] = get2DLoadStoreInsn( -// VTA_OPCODE_STORE, // opcode -// VTA_MEM_ID_OUT, // vector size -// 0, // sram offset -// b / VTA_BATCH * tx_size, // dram offset -// 1, // y size -// tx_size, // x size -// tx_size, // x stride -// 0, // y pad -// 0, // x pad -// 1, // pop prev dep -// 0, // pop next dep -// 1, // push prev dep -// 0); // push next dep -// } -// // Finish -// insn_buf[insn_idx++] = getFinishInsn(0, 1); -// // Prepare the uop buffer -// VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression); - -// #if VTA_DEBUG == 1 -// printInstruction(ins_size, insn_buf); -// printMicroOp(uop_size, uop_buf); -// #endif - -// // Initialize the input/output data -// acc_T **inputs = alloc2dArray(batch, vector_size * input_sets); -// for (int i = 0; i < batch; i++) { -// for (int j = 0; j < vector_size * input_sets; j++) { -// if (opcode == VTA_ALU_OPCODE_MIN) { -// inputs[i][j] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); -// } else if (opcode == VTA_ALU_OPCODE_MAX) { -// inputs[i][j] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); -// } else if (opcode == VTA_ALU_OPCODE_ADD) { -// inputs[i][j] = static_cast( -// rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); -// } -// } -// } - -// // Compute reference output -// out_T **outputs_ref = alloc2dArray(batch, vector_size); -// for (int i = 0; i < batch; i++) { -// for (int j = 0; j < vector_size; j++) { -// acc_T tmp = 0; -// if (opcode == VTA_ALU_OPCODE_MIN) { -// if (!use_imm) { -// tmp = inputs[i][j] < inputs[i][j + vector_size] ? -// inputs[i][j] : -// inputs[i][j + vector_size]; -// } else { -// tmp = inputs[i][j] < immediate[i / VTA_BATCH] ? -// inputs[i][j] : -// immediate[i / VTA_BATCH]; -// } -// } else if (opcode == VTA_ALU_OPCODE_MAX) { -// if (!use_imm) { -// tmp = inputs[i][j] > inputs[i][j + vector_size] ? 
-// inputs[i][j] : -// inputs[i][j + vector_size]; -// } else { -// tmp = inputs[i][j] > immediate[i / VTA_BATCH] ? -// inputs[i][j] : -// immediate[i / VTA_BATCH]; -// } -// } else if (opcode == VTA_ALU_OPCODE_ADD) { -// if (!use_imm) { -// tmp = inputs[i][j] + inputs[i][j + vector_size]; -// } else { -// tmp = inputs[i][j] + immediate[i / VTA_BATCH]; -// } -// } else if (opcode == VTA_ALU_OPCODE_SHR) { -// if (immediate[i / VTA_BATCH] >= 0) { -// tmp = inputs[i][j] >> immediate[i / VTA_BATCH]; -// } else { -// tmp = inputs[i][j] << (0 - immediate[i / VTA_BATCH]); -// } -// } -// // Set -// outputs_ref[i][j] = (out_T) tmp; -// } -// } - -// // Pack input buffer -// axi_T *bias_buf = -// static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); -// packBuffer( -// bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT); - -// // Prepare output buffer -// axi_T *output_buf = -// static_cast(allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets)); - -// #ifdef NO_SIM -// // Invoke the VTA -// uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf); -// // Report on timining -// printf("INFO - Synchronization time: %.3fms\n", static_cast(t_fpga) / 1E6); -// printf("INFO - Throughput: %.3fGOps/s\n", static_cast(vector_size * batch) / t_fpga); -// #else -// // Invoke the VTA -// vta(ins_size, -// (volatile insn_T *) insn_buf, -// (volatile uop_T *) uop_buf, -// (volatile axi_T *) NULL, -// (volatile axi_T *) NULL, -// (volatile axi_T *) bias_buf, -// (volatile axi_T *) output_buf); -// #endif - -// // Unpack output buffer -// out_T **outputs = alloc2dArray(batch, vector_size); -// unpackBuffer(outputs, -// output_buf, -// batch, -// vector_size, -// VTA_BATCH, -// VTA_BLOCK_OUT); - -// // Correctness checks -// int err = 0; -// for (int i = 0; i < batch; i++) { -// for (int j = 0; j < vector_size; j++) { -// if (outputs_ref[i][j] != outputs[i][j]) { -// err++; -// #if VTA_DEBUG == 1 -// 
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, -// static_cast(outputs_ref[i][j]), -// static_cast(outputs[i][j])); -// #endif -// } -// } -// } - -// // Free all allocated arrays -// free(immediate); -// free2dArray(inputs, batch, vector_size * input_sets); -// free2dArray(outputs_ref, batch, vector_size); -// free2dArray(outputs, batch, vector_size); -// freeBuffer(insn_buf); -// freeBuffer(uop_buf); -// freeBuffer(bias_buf); -// freeBuffer(output_buf); - -// if (err == 0) { -// printf("INFO - ALU test successful!\n"); -// return 0; -// } else { -// printf("INFO - ALU test failed, got %d errors!\n", err); -// return -1; -// } -// } +int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) { + // Some assertions + assert(batch % VTA_BATCH == 0); + assert(vector_size % VTA_BLOCK_OUT == 0); + printf("=====================================================================================\n"); + printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", + getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression); + + // Instruction count + int ins_size = 3 * batch / VTA_BATCH + 2; + // Micro op count + int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT; + // Input/output elements in each transfer + int tx_size = vector_size / VTA_BLOCK_OUT; + // Number of input sets to be generated + int input_sets = (use_imm) ? 
1 : 2; + // Make sure we don't exceed buffer bounds + assert(uop_size <= VTA_UOP_BUFF_DEPTH); + assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH); + + // Immediate values + acc_T *immediate = static_cast(malloc(sizeof(acc_T) * batch / VTA_BATCH)); + for (int b = 0; b < batch / VTA_BATCH; b++) { + if (opcode == VTA_ALU_OPCODE_MIN) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MAX) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_ADD) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + immediate[b] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2))); + } + } + + // Initialize instructions + VTAGenericInsn *insn_buf = + static_cast(allocBuffer(sizeof(VTAGenericInsn) * ins_size)); + int insn_idx = 0; + insn_buf[insn_idx++] = + get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0); + for (int b = 0; b < batch; b += VTA_BATCH) { + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_LOAD, // opcode + VTA_MEM_ID_ACC, // vector size + 0, // sram offset + b / VTA_BATCH * tx_size * input_sets, // dram offset + 1, // y size + tx_size * input_sets, // x size + tx_size * input_sets, // x stride + 0, // y pad + 0, // x pad + 0, // pop prev dep + b > 0, // pop next dep + 0, // push prev dep + 0); // push next dep + insn_buf[insn_idx++] = getALUInsn( + opcode, // opcode + tx_size, // vector size + use_imm, // use imm + immediate[b / 
VTA_BATCH], // imm + uop_compression, // uop compression + 0, // pop prev dep + 0, // pop next dep + 0, // push prev dep + 1); // push next dep + insn_buf[insn_idx++] = get2DLoadStoreInsn( + VTA_OPCODE_STORE, // opcode + VTA_MEM_ID_OUT, // vector size + 0, // sram offset + b / VTA_BATCH * tx_size, // dram offset + 1, // y size + tx_size, // x size + tx_size, // x stride + 0, // y pad + 0, // x pad + 1, // pop prev dep + 0, // pop next dep + 1, // push prev dep + 0); // push next dep + } + // Finish + insn_buf[insn_idx++] = getFinishInsn(0, 1); + // Prepare the uop buffer + VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression); + +#if VTA_DEBUG == 1 + printInstruction(ins_size, insn_buf); + printMicroOp(uop_size, uop_buf); +#endif + + // Initialize the input/output data + acc_T **inputs = alloc2dArray(batch, vector_size * input_sets); + for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size * input_sets; j++) { + if (opcode == VTA_ALU_OPCODE_MIN) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MAX) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_ADD) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3))); + } else if (opcode == VTA_ALU_OPCODE_SHR) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2))); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + inputs[i][j] = static_cast( + rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2))); + } + } + } + + // Compute reference output + out_T **outputs_ref = alloc2dArray(batch, vector_size); + for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size; j++) { + acc_T out_val = 0; + acc_T imm_val = 
immediate[i / VTA_BATCH]; + acc_T src_val = use_imm ? acc_T(0) : inputs[i][j + vector_size]; /* the second operand set is only allocated when !use_imm (input_sets == 2); reading it otherwise is out of bounds */ + if (opcode == VTA_ALU_OPCODE_MIN) { + if (!use_imm) { + out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val; + } else { + out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val; + } + } else if (opcode == VTA_ALU_OPCODE_MAX) { + if (!use_imm) { + out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val; + } else { + out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val; + } + } else if (opcode == VTA_ALU_OPCODE_ADD) { + if (!use_imm) { + out_val = inputs[i][j] + src_val; + } else { + out_val = inputs[i][j] + imm_val; + } + } else if (opcode == VTA_ALU_OPCODE_SHR) { + if (!use_imm) { + if (src_val >= 0) { + out_val = inputs[i][j] >> src_val; + } else { + out_val = inputs[i][j] << (0 - src_val); + } + } else { + if (imm_val >= 0) { + out_val = inputs[i][j] >> imm_val; + } else { + out_val = inputs[i][j] << (0 - imm_val); + } + } + } else if (opcode == VTA_ALU_OPCODE_MUL) { + if (!use_imm) { + out_val = inputs[i][j] * src_val; + } else { + out_val = inputs[i][j] * imm_val; + } + } + // Set + outputs_ref[i][j] = (out_T) out_val; + } + } + + // Pack input buffer + axi_T *bias_buf = + static_cast(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets)); + packBuffer( + bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT); + + // Prepare output buffer + axi_T *output_buf = + static_cast(allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets)); + +#ifdef NO_SIM + // Invoke the VTA + uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf); + // Report on timing + printf("INFO - Synchronization time: %.3fms\n", static_cast(t_fpga) / 1E6); + printf("INFO - Throughput: %.3fGOps/s\n", static_cast(vector_size * batch) / t_fpga); +#else + // Invoke the VTA + vta(ins_size, + (volatile insn_T *) insn_buf, + (volatile uop_T *) uop_buf, + (volatile axi_T *) NULL, + (volatile axi_T *) NULL, + (volatile axi_T *) bias_buf, + 
(volatile axi_T *) output_buf); +#endif + + // Unpack output buffer + out_T **outputs = alloc2dArray(batch, vector_size); + unpackBuffer(outputs, + output_buf, + batch, + vector_size, + VTA_BATCH, + VTA_BLOCK_OUT); + + // Correctness checks + int err = 0; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < vector_size; j++) { + if (outputs_ref[i][j] != outputs[i][j]) { + err++; +#if VTA_DEBUG == 1 + printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, + static_cast(outputs_ref[i][j]), + static_cast(outputs[i][j])); +#endif + } + } + } + + // Free all allocated arrays + free(immediate); + free2dArray(inputs, batch, vector_size * input_sets); + free2dArray(outputs_ref, batch, vector_size); + free2dArray(outputs, batch, vector_size); + freeBuffer(insn_buf); + freeBuffer(uop_buf); + freeBuffer(bias_buf); + freeBuffer(output_buf); + + if (err == 0) { + printf("INFO - ALU test successful!\n"); + return 0; + } else { + printf("INFO - ALU test failed, got %d errors!\n", err); + return -1; + } +} int blocked_gemm_test(int batch, int channels, int block, bool uop_compression, int virtual_threads) { diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index b95103be182e2..5b7ceb7404e60 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -176,21 +176,42 @@ def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (batch_size//env.BATCH, wl.out_filter//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, 
name="kernel", dtype=env.acc_dtype) + coeff = tvm.placeholder(bias_shape, name="kernel", dtype=env.acc_dtype) + + # Handle quantized inputs (less than 8 bits) + # x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH) + # data_shape_pack = data_shape[:-1] + (data_shape[-1]//x_pack_factor,) + # data_arg = tvm.placeholder( + # data_shape_pack, + # dtype="int8", name="data_arg") + # data = vta.reinterpret(data_arg, data_shape, dtype=env.inp_dtype) + + # Handle quantized kernels (less than 8 bits) + w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) + kernel_shape_pack = kernel_shape[:-1] + (kernel_shape[-1]//w_pack_factor,) + kernel_arg = tvm.placeholder( + kernel_shape_pack, + dtype="int8", name="kernel_arg") + kernel = vta.reinterpret(kernel_arg, kernel_shape, dtype=env.wgt_dtype) res_conv = vta.top.packed_conv2d( data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride)) res = topi.right_shift(res_conv, 8) res = topi.add(res, bias) + res = topi.multiply(res, coeff) res = my_clip(res, 0, (1 << env.OUT_WIDTH-1)-1) - res = topi.cast(res, "int8") + + # Handle quantized outputs (less than 8 bits) + # o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH) + res_shape = topi.util.get_const_tuple(res.shape) + # res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,) + # res_arg = vta.reinterpret(res, res_shape_pack, dtype="int8") # To compute number of ops, use a x2 factor for FMA + fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 + fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter a_shape = (batch_size, wl.in_filter, wl.height, wl.width) @@ -202,57 +223,22 @@ def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True): assert wl.hpad == wl.wpad padding = wl.hpad - # Handle packing for quantized activations (less than 8bits) - x_pack_factor = 1 << (3 - env.LOG_INP_WIDTH) - data_shape_pack = (batch_size//env.BATCH, 
wl.in_filter//env.BLOCK_IN, - wl.height, wl.width, env.BATCH, env.BLOCK_IN//x_pack_factor) - data_arg_buffer = tvm.decl_buffer( - data_shape_pack, - dtype="int8", name="data_arg") - data_bind_buffer = tvm.decl_buffer( - data.shape, data.dtype, name=data.op.name, - data=data_arg_buffer.data) - - # Handle packing for quantized weights (less than 8bits) - w_pack_factor = 1 << (3 - env.LOG_WGT_WIDTH) - kernel_shape_pack = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, - wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN//w_pack_factor) - kernel_arg_buffer = tvm.decl_buffer( - kernel_shape_pack, - dtype="int8", name="kernel_arg") - kernel_bind_buffer = tvm.decl_buffer( - kernel.shape, kernel.dtype, name=kernel.op.name, - data=kernel_arg_buffer.data) - - # Handle packing for outputs (less than 8bits) - o_pack_factor = 1 << (3 - env.LOG_OUT_WIDTH) - res_shape = topi.util.get_const_tuple(res.shape) - res_shape_pack = res_shape[:-1] + (res_shape[-1]//o_pack_factor,) - res_arg_buffer = tvm.decl_buffer( - res_shape_pack, - dtype="int8", name="res_arg") - res_bind_buffer = tvm.decl_buffer( - res.shape, res.dtype, name=res.op.name, - data=res_arg_buffer.data) - - binds = {kernel: kernel_bind_buffer, data: data_bind_buffer, res: res_bind_buffer} - # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") def get_ref_data(): # derive min max for input and weight types (max non inclusive) a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) a_np = np.random.randint( - a_min, a_max, size=a_shape).astype("int8") + 0, 2, size=a_shape).astype("int8") w_np = np.random.randint( - w_min, w_max, size=w_shape).astype("int8") + 0, 2, size=w_shape).astype("int8") b_np = topi.testing.conv2d_nchw_python( a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype) return a_np, w_np, b_np def verify(s, check_correctness): - mod = vta.build(s, [data_arg_buffer, 
kernel_arg_buffer, bias, res_arg_buffer], "ext_dev", - env.target_host, name="conv2d", binds=binds) + mod = vta.build(s, [data, kernel_arg, bias, coeff, res], "ext_dev", + env.target_host, name="conv2d") temp = util.tempdir() mod.save(temp.relpath("conv2d.o")) @@ -262,8 +248,10 @@ def verify(s, check_correctness): ctx = remote.ext_dev(0) # Data in original format data_orig, kernel_orig, res_ref = get_ref_data() - bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32") + bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 2).astype("int32") bias_orig = np.abs(bias_orig) + coeff_orig = (np.random.uniform(size=(wl.out_filter,)) * 2).astype("int32") + coeff_orig = np.abs(coeff_orig) data_packed = data_orig.reshape( batch_size//env.BATCH, env.BATCH, @@ -276,6 +264,9 @@ def verify(s, check_correctness): bias_packed = bias_orig.reshape( batch_size // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) + coeff_packed = coeff_orig.reshape( + batch_size // env.BATCH, wl.out_filter // env.BLOCK_OUT, + 1, 1, env.BATCH, env.BLOCK_OUT) # Quantized packing data_qpacked = _pack(data_packed, env.INP_WIDTH) @@ -285,9 +276,10 @@ def verify(s, check_correctness): data_arr = tvm.nd.array(data_qpacked, ctx) kernel_arr = tvm.nd.array(kernel_qpacked, ctx) bias_arr = tvm.nd.array(bias_packed, ctx) + coeff_arr = tvm.nd.array(coeff_packed, ctx) res_arr = tvm.nd.array(res_np, ctx) time_f = f.time_evaluator("conv2d", ctx, number=5) - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) + cost = time_f(data_arr, kernel_arr, bias_arr, coeff_arr, res_arr) res_unpack = res_arr.asnumpy() res_unpack = _unpack(res_unpack.astype("int8"), env.OUT_WIDTH) @@ -299,6 +291,7 @@ def verify(s, check_correctness): padding = wl.hpad res_ref = res_ref >> 8 res_ref += bias_orig.reshape(wl.out_filter, 1, 1) + res_ref *= coeff_orig.reshape(wl.out_filter, 1, 1) res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH-1)-1).astype("int8") np.testing.assert_allclose(res_unpack, 
res_ref) return cost @@ -308,7 +301,7 @@ def conv_normal(print_ir): with vta.build_config(): s = vta.top.schedule_packed_conv2d([res]) if print_ir: - print(vta.lower(s, [data_arg_buffer, kernel_arg_buffer, bias, res_arg_buffer], simple_mode=True)) + print(vta.lower(s, [data, kernel_arg, bias, coeff, res], simple_mode=True)) cost = verify(s, True) gops = (num_ops / cost.mean) / float(10 ** 9) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))