Skip to content

Commit

Permalink
[Hardware] Multiplier integration, new vta conf format (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
tmoreau89 committed Nov 25, 2018
1 parent adc588f commit 4a8e105
Show file tree
Hide file tree
Showing 16 changed files with 532 additions and 427 deletions.
15 changes: 9 additions & 6 deletions vta/config/pynq_sample.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
{
"TARGET" : "pynq",
"HW_VER" : "0.0.1",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 8,
"HW_VER" : "0.0.0",
"ALU" : true,
"GEMM_II" : 2,
"TALU_II" : 4,
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 3,
"LOG_WGT_WIDTH" : 1,
"LOG_ACC_WIDTH" : 5,
"LOG_OUT_WIDTH" : 3,
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BLOCK_IN" : 5,
"LOG_BLOCK_OUT" : 5,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_INP_BUFF_SIZE" : 17,
"LOG_WGT_BUFF_SIZE" : 17,
"LOG_ACC_BUFF_SIZE" : 17
}
14 changes: 8 additions & 6 deletions vta/config/vta_config.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
{
"TARGET" : "sim",
"TARGET" : "pynq",
"HW_VER" : "0.0.2",
"HW_FREQ" : 100,
"HW_CLK_TARGET" : 8,
"HW_VER" : "0.0.0",
"GEMM_II" : 2,
"HW_CLK_TARGET" : 7,
"ALU_EN" : true,
"MUL_EN" : true,
"GEMM_II" : 1,
"TALU_II" : 2,
"LOG_INP_WIDTH" : 3,
"LOG_WGT_WIDTH" : 1,
Expand All @@ -13,7 +15,7 @@
"LOG_BLOCK_IN" : 5,
"LOG_BLOCK_OUT" : 5,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 17,
"LOG_WGT_BUFF_SIZE" : 17,
"LOG_INP_BUFF_SIZE" : 16,
"LOG_WGT_BUFF_SIZE" : 18,
"LOG_ACC_BUFF_SIZE" : 17
}
51 changes: 33 additions & 18 deletions vta/config/vta_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def main():
help="print the target")
parser.add_argument("--cfg-str", action="store_true",
help="print the configuration string")
parser.add_argument("--get-aluen", action="store_true",
help="returns whether ALU is enabled")
parser.add_argument("--get-mulen", action="store_true",
help="returns whether mul in ALU is enabled")
parser.add_argument("--get-gemmii", action="store_true",
help="returns the GEMM core II")
parser.add_argument("--get-taluii", action="store_true",
Expand Down Expand Up @@ -90,7 +94,28 @@ def main():
if not ok_path_list:
raise RuntimeError("Cannot find config in %s" % str(path_list))
cfg = json.load(open(ok_path_list[0]))
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] + cfg["LOG_ACC_WIDTH"] - cfg["LOG_OUT_WIDTH"]
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"]
# Generate bitstream config string.
# Needs to match the BITSTREAM string in python/vta/environment.py
cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
cfg["HW_VER"].replace('.', '_'),
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
(1 << cfg["LOG_OUT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"],
cfg["HW_FREQ"],
cfg["HW_CLK_TARGET"],
cfg["GEMM_II"])
if cfg["ALU_EN"]:
cfg["BITSTREAM"] += "_aii{}".format(cfg["TALU_II"])
if cfg["MUL_EN"] and cfg["ALU_EN"]:
cfg["BITSTREAM"] += "_mul"
pkg = get_pkg_config(cfg)

if args.target:
Expand Down Expand Up @@ -119,23 +144,13 @@ def main():
fo.write(pkg.cfg_json)

if args.cfg_str:
# Needs to match the BITSTREAM string in python/vta/environment.py
cfg_str = "{}x{}x{}_g{}_a{}_{}bx{}b_{}_{}_{}_{}_{}MHz_{}ns_v{}".format(
(1 << cfg["LOG_BATCH"]),
(1 << cfg["LOG_BLOCK_IN"]),
(1 << cfg["LOG_BLOCK_OUT"]),
cfg["GEMM_II"],
cfg["TALU_II"],
(1 << cfg["LOG_INP_WIDTH"]),
(1 << cfg["LOG_WGT_WIDTH"]),
cfg["LOG_UOP_BUFF_SIZE"],
cfg["LOG_INP_BUFF_SIZE"],
cfg["LOG_WGT_BUFF_SIZE"],
cfg["LOG_ACC_BUFF_SIZE"],
cfg["HW_FREQ"],
cfg["HW_CLK_TARGET"],
cfg["HW_VER"].replace('.', '_'))
print(cfg_str)
print(cfg["BITSTREAM"])

if args.get_aluen:
print(cfg["ALU_EN"])

if args.get_mulen:
print(cfg["MUL_EN"])

if args.get_gemmii:
print(cfg["GEMM_II"])
Expand Down
13 changes: 6 additions & 7 deletions vta/hardware/xilinx/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,9 @@ HSI = hsi
# HLS mode
MODE = all
# Debug flag
DEBUG = false
DEBUG = False
# SLURM
SLURM = false
# Prevent generation of DSP
NO_DSP = false
# Prevent generation of ALU
NO_ALU = true
SLURM = False

# Process VTA JSON config
VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
Expand All @@ -44,6 +40,8 @@ VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
VTA_ACC_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-accbuffsize)
VTA_OUT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-outbuffsize)
VTA_ALU_EN := $(shell ${VTA_CONFIG} --get-aluen)
VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen)

#---------------------
# FPGA Parameters
Expand Down Expand Up @@ -87,7 +85,8 @@ $(IP_PATH): $(SRC_DIR)/*
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
$(MODE) $(DEBUG) $(NO_DSP) $(NO_ALU) $(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
Expand Down
58 changes: 29 additions & 29 deletions vta/hardware/xilinx/scripts/hls.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
# Arg 4: path to include sources
# Arg 5: mode
# Arg 6: debug
# Arg 7: no_dsp
# Arg 8: no_alu
# Arg 7: alu_ena
# Arg 8: mul_ena
# Arg 9: target clock period
# Arg 10: target II for GEMM
# Arg 11: target II for tensor ALU
Expand All @@ -36,8 +36,8 @@ if { [llength $argv] eq 25 } {
set include_dir [lindex $argv 5]
set mode [lindex $argv 6]
set debug [lindex $argv 7]
set no_dsp [lindex $argv 8]
set no_alu [lindex $argv 9]
set alu_ena [lindex $argv 8]
set mul_ena [lindex $argv 9]
set target_period [lindex $argv 10]
set target_gemm_ii [lindex $argv 11]
set target_alu_ii [lindex $argv 12]
Expand All @@ -59,9 +59,9 @@ if { [llength $argv] eq 25 } {
set test_dir "../../src/test"
set include_dir "../../include"
set mode "all"
set debug "false"
set no_dsp "true"
set no_alu "false"
set debug "False"
set alu_ena "True"
set mul_ena "True"
set target_period 8
set target_gemm_ii 10
set target_alu_ii 16
Expand All @@ -83,7 +83,7 @@ if { [llength $argv] eq 25 } {
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in block_out no_alu} {
proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {

# Set device number
set_part {xc7z020clg484-1}
Expand All @@ -98,14 +98,14 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
create_clock -period $per -name default

# Set pipeline directive
set_directive_pipeline -II $ii "compute/READ_GEMM_UOP"
set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP"

if {$no_alu=="false"} {
set_directive_pipeline -II $ii "compute/READ_ALU_UOP"
if {$alu_ena=="True"} {
set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP"
}

# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*ii)
set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $ii}]
# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii))
set inp_bus_width [expr {(1 << ($inp_width + $block_in + $batch)) / $g_ii}]
set inp_partition_factor [expr {$inp_bus_width / $max_width}]
if {$inp_partition_factor == 0} {
set inp_reshape_factor [expr {$inp_bus_width / $axi_width}]
Expand All @@ -118,8 +118,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
}
# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*ii))
set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $ii}]
# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/(1024*g_ii))
set wgt_bus_width [expr {(1 << ($wgt_width + $block_in + $block_out)) / $g_ii}]
set wgt_partition_factor [expr {$wgt_bus_width / $max_width}]
if {$wgt_partition_factor == 0} {
set wgt_reshape_factor [expr {$wgt_bus_width / $axi_width}]
Expand All @@ -132,8 +132,8 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
}
# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*ii))
set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $ii}]
# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/(1024*g_ii))
set out_bus_width [expr {(1 << ($out_width + $block_out + $batch)) / $g_ii}]
set out_partition_factor [expr {$out_bus_width / $max_width}]
if {$out_partition_factor == 0} {
set out_reshape_factor [expr {$out_bus_width / $axi_width}]
Expand All @@ -147,9 +147,9 @@ proc init_design {per ii inp_width wgt_width out_width acc_width batch block_in
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
}
# Set accumulator partition factor
# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $ii}]
# set acc_bus_width [expr {(1 << ($acc_width + $block_out + $batch)) / $g_ii}]
# set acc_reshape_factor [expr {$acc_bus_width / $axi_width}]
# set_directive_array_reshape -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
# set_directive_array_partition -type block -factor $acc_reshape_factor -dim 2 "compute" acc_mem
}

# C define flags to pass to compiler
Expand All @@ -160,14 +160,14 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
if {$debug=="true"} {
if {$debug=="True"} {
append cflags " -DVTA_DEBUG=1"
}
if {$no_dsp=="true"} {
append cflags " -DNO_DSP"
if {$alu_ena=="True"} {
append cflags " -DALU_EN"
}
if {$no_alu=="true"} {
append cflags " -DNO_ALU"
if {$mul_ena=="True"} {
append cflags " -DMUL_EN"
}

# HLS behavioral sim
Expand All @@ -178,7 +178,7 @@ if {$mode=="all" || $mode=="sim"} {
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csim_design -clean
close_project
}
Expand All @@ -189,7 +189,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -203,7 +203,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -217,7 +217,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -231,7 +231,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $target_gemm_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $no_alu
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand Down
58 changes: 35 additions & 23 deletions vta/hardware/xilinx/sim/vta_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,41 @@ int main(void) {

int status = 0;

// // Run ALU test (vector-scalar operators)
// status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
// status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
// status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
// status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);

// // Run ALU test (vector-vector operators)
// status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
// status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
// status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
// status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);

// // Run blocked GEMM test
// status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
// status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
// status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
// status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
#ifdef ALU_EN
// Run ALU test (vector-scalar operators)
status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, true, VTA_BLOCK_OUT, 128, false);

// Run ALU test (vector-vector operators)
status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MIN, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MAX, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_ADD, false, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_SHR, false, VTA_BLOCK_OUT, 128, false);

#ifdef MUL_EN
status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MUL, true, VTA_BLOCK_OUT, 128, false);
status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, true);
status |= alu_test(VTA_ALU_OPCODE_MUL, false, VTA_BLOCK_OUT, 128, false);
#endif // MUL_EN

#endif // ALU_EN

// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);

// Simple GEMM unit test
status |= gemm_test(4 * VTA_BATCH, 4 * VTA_BLOCK_OUT, 4 * VTA_BLOCK_IN, true);
Expand Down
Loading

0 comments on commit 4a8e105

Please sign in to comment.