Skip to content

Commit

Permalink
[VTA] Design refactoring and bug fixes (apache#20)
Browse files Browse the repository at this point in the history
* updating driver and dll paths for latest pynq v2.3 image

* unifying tcl scripting for multiple FPGA backends, making bus width parameterizable

* fix coherent interface on Ultra96

* 2d padded load template

* simplifying tensor load/store

* streamlining GEMM and ALU pipeline

* refactor VTA for simpler instruction decoding

* fixed bug, mixed DSP/LUT GEMM support

* adding latency directive in DSPs for better pipelining and timing closure

* dual channel memory interface for Ultra96

* hardware bug fixes, working compilation on PYNQ and ULTRA96

* checking for correctness

* support for pynq v2.3

* report inference time in ms

* update CMAKE for Pynq v2.3

* testing hardware support for batch norm

* adding support for multiplication in ALU

* defaulting to coherent buffers in runtime

* increasing size of CMA buffer allocation limit
  • Loading branch information
tmoreau89 committed Jan 2, 2019
1 parent c061463 commit 807bd51
Show file tree
Hide file tree
Showing 23 changed files with 2,388 additions and 3,822 deletions.
13 changes: 3 additions & 10 deletions cmake/modules/VTA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,9 @@ elseif(PYTHON)
set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
endif(APPLE)

# PYNQ rules for Pynq v2.3
if(${VTA_TARGET} STREQUAL "pynq")
find_library(__cma_lib NAMES cma PATH /usr/lib)
target_link_libraries(vta ${__cma_lib})
endif()
# Ultra96 rules
if(${VTA_TARGET} STREQUAL "ultra96")
find_library(__sds_lib NAMES sds_lib PATH /usr/lib)
target_link_libraries(vta ${__sds_lib})
endif()
# PYNQ rules for pynq v2.3
find_library(__cma_lib NAMES cma PATH /usr/lib)
target_link_libraries(vta ${__cma_lib})
else()
message(STATUS "Cannot found python in env, VTA build is skipped..")
endif()
1 change: 1 addition & 0 deletions vta/config/pynq_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BUS_WIDTH" : 6,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
Expand Down
1 change: 1 addition & 0 deletions vta/config/ultra96_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"LOG_BATCH" : 0,
"LOG_BLOCK_IN" : 4,
"LOG_BLOCK_OUT" : 4,
"LOG_BUS_WIDTH" : 7,
"LOG_UOP_BUFF_SIZE" : 15,
"LOG_INP_BUFF_SIZE" : 15,
"LOG_WGT_BUFF_SIZE" : 18,
Expand Down
5 changes: 5 additions & 0 deletions vta/config/vta_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def main():
help="returns log of tensor block in dimension")
parser.add_argument("--get-blockout", action="store_true",
help="returns log of tensor block out dimension")
parser.add_argument("--get-buswidth", action="store_true",
help="returns log of bus width in b")
parser.add_argument("--get-uopbuffsize", action="store_true",
help="returns log of micro-op buffer size in B")
parser.add_argument("--get-inpbuffsize", action="store_true",
Expand Down Expand Up @@ -182,6 +184,9 @@ def main():
if args.get_blockout:
print(cfg["LOG_BLOCK_OUT"])

if args.get_buswidth:
print(cfg["LOG_BUS_WIDTH"])

if args.get_uopbuffsize:
print(cfg["LOG_UOP_BUFF_SIZE"])

Expand Down
7 changes: 4 additions & 3 deletions vta/hardware/xilinx/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ VIVADO = vivado
HSI = hsi

# HLS mode
MODE = skip_sim
MODE = all
# Debug flag
DEBUG = False
# SLURM
Expand All @@ -35,6 +35,7 @@ VTA_OUT_WIDTH := $(shell ${VTA_CONFIG} --get-outwidth)
VTA_BATCH := $(shell ${VTA_CONFIG} --get-batch)
VTA_IN_BLOCK := $(shell ${VTA_CONFIG} --get-blockin)
VTA_OUT_BLOCK := $(shell ${VTA_CONFIG} --get-blockout)
VTA_BUS_WIDTH := $(shell ${VTA_CONFIG} --get-buswidth)
VTA_UOP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-uopbuffsize)
VTA_INP_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-inpbuffsize)
VTA_WGT_BUFF_SIZE := $(shell ${VTA_CONFIG} --get-wgtbuffsize)
Expand Down Expand Up @@ -90,7 +91,7 @@ $(IP_PATH): $(SRC_DIR)/*
$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) $(VTA_BUS_WIDTH) \
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
ifeq ($(SLURM), True)
Expand All @@ -101,7 +102,7 @@ endif
$(BIT_PATH): $(IP_PATH)
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/ultra96.tcl \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(VTA_TARGET) $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) \
$(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
Expand Down
87 changes: 34 additions & 53 deletions vta/hardware/xilinx/scripts/hls.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@
# Arg 17: batch size (log)
# Arg 18: in block size (log)
# Arg 19: out block size (log)
# Arg 20: uop buffer size in B (log)
# Arg 21: inp buffer size in B (log)
# Arg 22: wgt buffer size in B (log)
# Arg 23: acc buffer size in B (log)
# Arg 24: out buffer size in B (log)

if { [llength $argv] eq 26 } {
# Arg 20: bus width in b (log)
# Arg 21: uop buffer size in B (log)
# Arg 22: inp buffer size in B (log)
# Arg 23: wgt buffer size in B (log)
# Arg 24: acc buffer size in B (log)
# Arg 25: out buffer size in B (log)

if { [llength $argv] eq 27 } {
set target [lindex $argv 2]
set src_dir [lindex $argv 3]
set sim_dir [lindex $argv 4]
Expand All @@ -50,43 +51,24 @@ if { [llength $argv] eq 26 } {
set batch [lindex $argv 18]
set block_in [lindex $argv 19]
set block_out [lindex $argv 20]
set uop_buff_size [lindex $argv 21]
set inp_buff_size [lindex $argv 22]
set wgt_buff_size [lindex $argv 23]
set acc_buff_size [lindex $argv 24]
set out_buff_size [lindex $argv 25]
set bus_width [lindex $argv 21]
set uop_buff_size [lindex $argv 22]
set inp_buff_size [lindex $argv 23]
set wgt_buff_size [lindex $argv 24]
set acc_buff_size [lindex $argv 25]
set out_buff_size [lindex $argv 26]
} else {
set target "pynq"
set src_dir "../src"
set sim_dir "../sim"
set test_dir "../../src/test"
set include_dir "../../include"
set mode "all"
set debug "False"
set alu_ena "True"
set mul_ena "True"
set target_period 8
set target_gemm_ii 10
set target_alu_ii 16
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_in 4
set block_out 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
puts "Not enough arguments provided!"
exit
}

puts "about to start doing some stuff"


# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
proc init_design {target per g_ii a_ii bus_width inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {

# Set device number
if {$target=="pynq"} {
Expand All @@ -95,28 +77,25 @@ proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width b
set_part {xczu3eg-sbva484-1-e}
} elseif {$target=="zcu102"} {
set_part {xczu9eg-ffvb1156-2-e}
} elseif {$target=="f1"} {
set_part {xcvu9p-flgb2104-2-i}
# config_interface -m_axi_addr64
}

# Max bus width (supported by Vivado)
set max_width 1024

# Set axi width (TODO derive from top level config)
if {$target=="pynq"} {
set axi_width 64
} elseif {$target=="ultra96"} {
set axi_width 128
} elseif {$target=="zcu102"} {
set axi_width 128
}
# Set axi width
set axi_width [expr {1 << $bus_width}]

# Set the clock frequency
create_clock -period $per -name default

# Set pipeline directive
set_directive_pipeline -II $g_ii "compute/READ_GEMM_UOP"
set_directive_pipeline -II $g_ii "gemm/READ_GEMM_UOP"

if {$alu_ena=="True"} {
set_directive_pipeline -II $a_ii "compute/READ_ALU_UOP"
set_directive_pipeline -II $a_ii "alu/READ_ALU_UOP"
}

# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/(1024*g_ii)
Expand Down Expand Up @@ -174,7 +153,8 @@ set cflags "-I $include_dir -I $src_dir -I $test_dir \
-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size -DVTA_LOG_BUS_WIDTH=$bus_width \
-DVTA_GEMM_II=$target_gemm_ii"
if {$debug=="True"} {
append cflags " -DVTA_DEBUG=1"
}
Expand All @@ -185,6 +165,7 @@ if {$mul_ena=="True"} {
append cflags " -DMUL_EN"
}


# HLS behavioral sim
if {$mode=="all" || $mode=="sim"} {
open_project vta_sim
Expand All @@ -193,7 +174,7 @@ if {$mode=="all" || $mode=="sim"} {
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csim_design -clean
close_project
}
Expand All @@ -204,7 +185,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -218,7 +199,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -232,7 +213,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand All @@ -246,7 +227,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
init_design $target $target_period $target_gemm_ii $target_alu_ii $bus_width $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
csynth_design
if {$mode=="all" || $mode=="skip_sim"} {
export_design -format ip_catalog
Expand Down
Loading

0 comments on commit 807bd51

Please sign in to comment.