From b6e24f0718b7001eadcc94395798113f28a29613 Mon Sep 17 00:00:00 2001 From: Seah <54855793+SeahK@users.noreply.github.com> Date: Mon, 22 May 2023 16:59:46 -0700 Subject: [PATCH 01/24] Residual addition FSM (#296) Added resadd support for LoopMatmul FSM, operand reuse in scratchpad, added stride option for CONV --------- Co-authored-by: Seah Kim --- software/gemmini-rocc-tests | 2 +- software/libgemmini | 2 +- src/main/scala/gemmini/LoopConv.scala | 53 +++++++----- src/main/scala/gemmini/LoopMatmul.scala | 102 ++++++++++++++++++------ 4 files changed, 115 insertions(+), 44 deletions(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 13e7e1fc..bb489531 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 13e7e1fce1a8d332eea563c14130136ef0533b16 +Subproject commit bb4895319b4a7f7181a613a3853bc82887000ee0 diff --git a/software/libgemmini b/software/libgemmini index 4be22079..d914e1c6 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit 4be220794cfdb834e8ecc2ee7becdf8632cc268c +Subproject commit d914e1c6fe9ad81ab266b7ba3247e7ce36b3c9f8 diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index bc87ae10..a46a4576 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -16,6 +16,9 @@ class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_b val in_channels = UInt(large_iterator_bitwidth.W) val out_channels = UInt(large_iterator_bitwidth.W) val out_dim = UInt(large_iterator_bitwidth.W) + val out_stride = UInt(large_iterator_bitwidth.W) //stride for output activation + val in_stride = UInt(large_iterator_bitwidth.W) //stride for input activation + val weight_stride = UInt(large_iterator_bitwidth.W) //stride for weight val pool_out_dim = UInt(small_iterator_bitwidth.W) val stride = UInt(tiny_iterator_bitwidth.W) val padding = UInt(tiny_iterator_bitwidth.W) @@ -272,11 +275,11 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw val icol_padded = icol +& undilated(lpad).zext val is_zeros = irow < 0.S || irow >= irows_unpadded.zext || icol < 0.S || icol >= icols_unpadded.zext - val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_channels * (input_w/8).U) + val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_stride * (input_w/8).U) // Addresses val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt, - (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_channels +& ich) * (input_w/8).U).asUInt) + (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_stride +& ich) * (input_w/8).U).asUInt) val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset)) val spad_addr = Mux(req.trans_input_3120, // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)" @@ -333,7 +336,7 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw io.idle := state === idle && !command_p.io.busy io.loop_id := req.loop_id - command_p.io.in.valid := state =/= idle && !io.wait_for_prev_loop + command_p.io.in.valid := state =/= idle && !io.wait_for_prev_loop && (req.dram_addr =/= 0.U) command_p.io.in.bits.cmd := Mux(state === config, config_cmd, mvin_cmd) command_p.io.in.bits.dram_addr := dram_addr command_p.io.in.bits.spad_addr := spad_addr @@ -355,7 +358,9 @@ class 
LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw } // Sending outputs - when(command_p.io.in.fire) { + when(req.dram_addr === 0.U){ + state := idle + }.elsewhen(command_p.io.in.fire) { when (state === config) { state := ld }.otherwise { @@ -442,7 +447,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit out_channels_per_bank * kcols * krows * kchs) val addr_start = req.addr_end - B_rows - val dram_stride = MuxCase(out_channels, Seq( + val dram_stride = MuxCase(weight_stride, Seq( req.dw -> 1.U, req.trans_weight_1203 -> (kernel_dim * kernel_dim * out_channels), req.trans_weight_0132 -> in_channels @@ -455,7 +460,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit val kch = Reg(UInt(large_iterator_bitwidth.W)) // Addresses - val dram_offset = MuxCase(((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * out_channels +& och) * (input_w/8).U, Seq( + val dram_offset = MuxCase(((krow*kernel_dim*in_channels +& kcol*in_channels +& kch) * weight_stride +& och) * (input_w/8).U, Seq( req.dw -> (krow * kernel_dim +& kcol) * (input_w/8).U, req.trans_weight_1203 -> (((kch*kernel_dim*kernel_dim +& krow*kernel_dim +& kcol) * out_channels +& och) * (input_w/8).U), req.trans_weight_0132 -> (((krow*kernel_dim*out_channels +& kcol*out_channels +& och) * in_channels +& kch) * (input_w/8).U) @@ -512,7 +517,7 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit io.idle := state === idle && !command_p.io.busy io.loop_id := req.loop_id - command_p.io.in.valid := state =/= idle && !io.wait_for_prev_loop + command_p.io.in.valid := state =/= idle && !io.wait_for_prev_loop && (req.dram_addr =/= 0.U) command_p.io.in.bits.cmd := Mux(state === config, config_cmd, mvin_cmd) command_p.io.in.bits.dram_addr := dram_addr command_p.io.in.bits.spad_addr := spad_addr @@ -534,7 +539,9 @@ class LoopConvLdWeight(block_size: Int, coreMaxAddrBits: Int, large_iterator_bit } // Sending outputs - when(command_p.io.in.fire) { + when(req.dram_addr === 0.U){ + state := idle + }.elsewhen(command_p.io.in.fire) { when (state === config) { state := ld }.otherwise { @@ -880,11 +887,11 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: // Addresses val dram_offset = Mux(req.trans_output_1203, ((orow*out_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U, - ((b*out_dim*out_dim +& orow*out_dim +& ocol) * out_channels +& och) * (input_w/8).U) + ((b*out_dim*out_dim +& orow*out_dim +& ocol) * out_stride +& och) * (input_w/8).U) val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset) val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol - val pool_dram_addr = req.dram_addr + ((b * pool_out_dim * pool_out_dim) * out_channels + och) * (input_w/8).U + val pool_dram_addr = req.dram_addr + ((b * pool_out_dim * pool_out_dim) * out_stride + och) * (input_w/8).U val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols // Sizes @@ -933,7 +940,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: val pre_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) pre_pool_config_cmd_rs2 := DontCare pre_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE - pre_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U + pre_pool_config_cmd_rs2.stride := out_stride * (input_w / 
8).U pre_pool_config_cmd.rs2 := pre_pool_config_cmd_rs2.asUInt val post_pool_config_cmd = Wire(new RoCCCommand) @@ -949,7 +956,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: val post_pool_config_cmd_rs2 = Wire(config_mvout_rs2_t.cloneType) post_pool_config_cmd_rs2 := DontCare post_pool_config_cmd_rs2.acc_scale := ACC_SCALE_NO_CHANGE - post_pool_config_cmd_rs2.stride := out_channels * (input_w / 8).U + post_pool_config_cmd_rs2.stride := out_stride * (input_w / 8).U post_pool_config_cmd.rs2 := post_pool_config_cmd_rs2.asUInt val pool_cmd = Wire(new RoCCCommand) @@ -1070,6 +1077,8 @@ class LoopConvState(val block_size: Int, val large_iterator_bitwidth: Int, val s val dw = Bool() val max_pixels_per_row = UInt(small_iterator_bitwidth.W) + val a_ex_spad_id = UInt(2.W) + val b_ex_spad_id = UInt(2.W) val configured = Bool() @@ -1306,11 +1315,14 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: is (LOOP_CONV_WS_CONFIG_4) { loop_being_configured.inner_bounds.orows := cmd.bits.cmd.rs1(63, 48) loop_being_configured.inner_bounds.prad := cmd.bits.cmd.rs1(47, 32) - loop_being_configured.inner_bounds.pupad := cmd.bits.cmd.rs1(31, 16) - loop_being_configured.inner_bounds.pdpad := cmd.bits.cmd.rs1(15, 0) + loop_being_configured.inner_bounds.pupad := cmd.bits.cmd.rs1(31, 21) + loop_being_configured.inner_bounds.pdpad := cmd.bits.cmd.rs1(20, 10) + loop_being_configured.outer_bounds.kernel_dilation := cmd.bits.cmd.rs1(9, 0) loop_being_configured.inner_bounds.ocols := cmd.bits.cmd.rs2(15, 0) - loop_being_configured.outer_bounds.kernel_dilation := cmd.bits.cmd.rs2(31, 16) + loop_being_configured.outer_bounds.in_stride := cmd.bits.cmd.rs2(63, 48) + loop_being_configured.outer_bounds.weight_stride := cmd.bits.cmd.rs2(47, 32) + loop_being_configured.outer_bounds.out_stride := cmd.bits.cmd.rs2(31, 16) } is (LOOP_CONV_WS_CONFIG_5) { @@ -1334,6 +1346,9 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: !has_first_layer_optimizations.B || config_max_pixels_per_row === 0.U, 1.U, config_max_pixels_per_row) + loop_being_configured.a_ex_spad_id := cmd.bits.cmd.rs1(19, 18) + loop_being_configured.b_ex_spad_id := cmd.bits.cmd.rs1(17, 16) + loop_being_configured.wrot180 := has_training_convs.B && cmd.bits.cmd.rs1(1) loop_being_configured.input_dilated := has_training_convs.B && cmd.bits.cmd.rs2(2) loop_being_configured.trans_output_1203 := has_training_convs.B && cmd.bits.cmd.rs1(2) @@ -1387,7 +1402,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: ld_input.io.req.bits.outer_bounds := loop_requesting_ld_input.outer_bounds ld_input.io.req.bits.inner_bounds := loop_requesting_ld_input.inner_bounds ld_input.io.req.bits.derived_params := loop_requesting_ld_input.derived_params() - ld_input.io.req.bits.addr_start := loop_requesting_ld_input.a_addr_start + ld_input.io.req.bits.addr_start := Mux(loop_requesting_ld_input.a_ex_spad_id === 0.U, loop_requesting_ld_input.a_addr_start, (loop_requesting_ld_input.a_ex_spad_id - 1.U) * (max_addr / concurrent_loops).U) ld_input.io.req.bits.dram_addr := loop_requesting_ld_input.input_dram_addr ld_input.io.req.bits.downsample := loop_requesting_ld_input.downsample ld_input.io.req.bits.max_pixels_per_row := loop_requesting_ld_input.max_pixels_per_row @@ -1407,7 +1422,7 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: ld_weights.io.req.bits.outer_bounds := loop_requesting_ld_weights.outer_bounds 
ld_weights.io.req.bits.inner_bounds := loop_requesting_ld_weights.inner_bounds ld_weights.io.req.bits.derived_params := loop_requesting_ld_weights.derived_params() - ld_weights.io.req.bits.addr_end := loop_requesting_ld_weights.b_addr_end + ld_weights.io.req.bits.addr_end := Mux(loop_requesting_ld_weights.b_ex_spad_id === 0.U, loop_requesting_ld_weights.b_addr_end, (loop_requesting_ld_weights.b_ex_spad_id) * (max_addr / concurrent_loops).U) ld_weights.io.req.bits.dram_addr := loop_requesting_ld_weights.weights_dram_addr ld_weights.io.req.bits.trans_weight_1203 := loop_requesting_ld_weights.trans_weight_1203 ld_weights.io.req.bits.trans_weight_0132 := loop_requesting_ld_weights.trans_weight_0132 @@ -1426,8 +1441,8 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: ex.io.req.bits.outer_bounds := loop_requesting_ex.outer_bounds ex.io.req.bits.inner_bounds := loop_requesting_ex.inner_bounds ex.io.req.bits.derived_params := loop_requesting_ex.derived_params() - ex.io.req.bits.a_addr_start := loop_requesting_ex.a_addr_start - ex.io.req.bits.b_addr_end := loop_requesting_ex.b_addr_end + ex.io.req.bits.a_addr_start := Mux(loop_requesting_ex.a_ex_spad_id === 0.U, loop_requesting_ex.a_addr_start, (loop_requesting_ex.a_ex_spad_id - 1.U) * (max_addr / concurrent_loops).U) + ex.io.req.bits.b_addr_end := Mux(loop_requesting_ex.b_ex_spad_id === 0.U, loop_requesting_ex.b_addr_end, (loop_requesting_ex.b_ex_spad_id) * (max_addr / concurrent_loops).U) ex.io.req.bits.c_addr_start := ex_c_addr_start ex.io.req.bits.wrot180 := loop_requesting_ex.wrot180 ex.io.req.bits.downsample := loop_requesting_ex.downsample diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index c9e6fed3..11ed4006 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -22,6 +22,7 @@ class LoopMatmulLdAReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val transpose = Bool() val addr_start = UInt(log2Up(max_addr).W) val loop_id = UInt(log2Up(concurrent_loops).W) + val is_resadd = Bool() } class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int, @@ -80,18 +81,23 @@ class LoopMatmulLdA(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd_rs2.num_cols := cols.asUInt mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr) mvin_cmd.rs2 := mvin_cmd_rs2.asUInt + when(req.is_resadd){ + mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, sp_addr, accumulate = false.B, read_full = false.B) + } io.req.ready := state === idle io.i := i io.k := k io.idle := state === idle - io.cmd.valid := state =/= idle && !io.rob_overloaded + io.cmd.valid := state =/= idle && !io.rob_overloaded && req.dram_addr =/= 0.U io.cmd.bits := mvin_cmd io.loop_id := req.loop_id - when (io.cmd.fire) { + when(req.dram_addr === 0.U){ + state := idle + }.elsewhen(io.cmd.fire) { // The order here is k, j, i val i_blocks = Mux(req.transpose, max_blocks, 1.U) val k_blocks = Mux(req.transpose, 1.U, max_blocks) @@ -127,6 +133,7 @@ class LoopMatmulLdBReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val transpose = Bool() val addr_end = UInt(log2Up(max_addr+1).W) val loop_id = UInt(log2Up(concurrent_loops).W) + val is_resadd = Bool() } class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, input_w: Int, @@ -168,7 +175,7 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, 
iterator_bitwidth: In val max_col_dim = Mux(req.transpose, req.max_k, req.max_j) val max_blocks = Mux(max_col_dim <= max_block_len.U, max_col_dim, max_block_len.U) - val sp_addr_start = req.addr_end - req.max_k * req.max_j * block_size.U + val sp_addr_start = Mux(req.is_resadd, req.addr_end, req.addr_end - req.max_k * req.max_j * block_size.U) val dram_offset = (row_iterator * req.dram_stride + col_iterator) * block_size.U * (input_w/8).U val dram_addr = req.dram_addr + LoopMatmul.castDramOffset(dram_offset) @@ -189,17 +196,23 @@ class LoopMatmulLdB(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In mvin_cmd_rs2.local_addr := cast_to_sp_addr(mvin_cmd_rs2.local_addr, sp_addr) mvin_cmd.rs2 := mvin_cmd_rs2.asUInt + when (req.is_resadd){ + mvin_cmd_rs2.local_addr := cast_to_acc_addr(mvin_cmd_rs2.local_addr, sp_addr, accumulate = true.B, read_full = false.B) + } + io.req.ready := state === idle io.k := k io.j := j io.idle := state === idle - io.cmd.valid := state =/= idle && !io.rob_overloaded + io.cmd.valid := state =/= idle && !io.rob_overloaded && req.dram_addr =/= 0.U io.cmd.bits := mvin_cmd io.loop_id := req.loop_id - when (io.cmd.fire) { + when(req.dram_addr === 0.U){ + state := idle + }.elsewhen(io.cmd.fire) { // The order here is k, j, i val j_blocks = Mux(req.transpose, 1.U, max_blocks) val k_blocks = Mux(req.transpose, max_blocks, 1.U) @@ -333,6 +346,7 @@ class LoopMatmulExecuteReq(val block_size: Int, val coreMaxAddrBits: Int, val it val b_addr_end = UInt(log2Up(max_addr+1).W) val c_addr_start = UInt(log2Up(max_acc_addr).W) val loop_id = UInt(log2Up(concurrent_loops).W) + val skip = Bool() } class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_addr: Int, max_acc_addr: Int, concurrent_loops: Int, @@ -446,12 +460,14 @@ class LoopMatmulExecute(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth val ldd_ahead = io.ldd_completed val ld_ahead = lda_ahead && ldb_ahead && ldd_ahead - io.cmd.valid := state =/= idle && !io.rob_overloaded && ld_ahead + io.cmd.valid := state =/= idle && !io.rob_overloaded && ld_ahead && !req.skip io.cmd.bits := Mux(state === pre, pre_cmd, comp_cmd) io.loop_id := req.loop_id - when (io.cmd.fire) { + when(req.skip) { + state := idle + }.elsewhen (io.cmd.fire) { when (state === pre) { state := comp }.otherwise { @@ -492,6 +508,7 @@ class LoopMatmulStCReq(val block_size: Int, val coreMaxAddrBits: Int, val iterat val act = UInt(Activation.bitwidth.W) val addr_start = UInt(log2Up(max_acc_addr).W) val loop_id = UInt(log2Up(concurrent_loops).W) + val is_resadd = Bool() } class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: Int, max_acc_addr: Int, input_w: Int, acc_w: Int, max_block_len: Int, concurrent_loops: Int, mvout_rs2_t: MvoutRs2) @@ -607,11 +624,14 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In io.idle := state === idle // The order here is k, j, i when not doing LAYERNORM or SOFTMAX - val ex_ahead = io.ex_completed || + val ex_ahead = WireInit(io.ex_completed || ((req.act =/= Activation.LAYERNORM) && (req.act =/= Activation.SOFTMAX) && (io.ex_k === req.max_k - 1.U && (io.ex_j >= j + blocks || - ((io.ex_j === j + blocks - 1.U) && io.ex_i > i)))) + ((io.ex_j === j + blocks - 1.U) && io.ex_i > i))))) + when(req.is_resadd){ + ex_ahead := io.ex_completed || (io.ex_i > i || (io.ex_i === i && io.ex_j >= j + blocks)) + } io.cmd.valid := state =/= idle && !io.rob_overloaded && ex_ahead && req.dram_addr =/= 0.U io.cmd.bits := MuxCase(mvout_cmd, Seq( @@ -698,6 
+718,8 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val val full_c = Bool() val ex_accumulate = Bool() + val a_ex_spad_id = UInt(2.W) + val b_ex_spad_id = UInt(2.W) val configured = Bool() val running = Bool() @@ -718,6 +740,7 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val val a_addr_start = UInt(log2Up(max_addr).W) val b_addr_end = UInt(log2Up(max_addr+1).W) + val resadd_addr_start = UInt(log2Up(max_acc_addr).W) def reset(): Unit = { configured := false.B @@ -735,6 +758,8 @@ class LoopMatmulState(val iterator_bitwidth: Int, val coreMaxAddrBits: Int, val ex_completed := false.B ldd_completed := false.B st_completed := false.B + + //is_resadd := false.B } } @@ -769,9 +794,12 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val loop_being_configured_id = Mux(head_loop.configured, tail_loop_id, head_loop_id) val loop_being_configured = loops(loop_being_configured_id) + val is_resadd = RegInit(false.B) + + val max_all_addr = if(max_addr > max_acc_addr) max_addr else max_acc_addr // Create inner modules - val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) - val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) + val ldA = Module(new LoopMatmulLdA(block_size, coreMaxAddrBits, iterator_bitwidth, max_all_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) + val ldB = Module(new LoopMatmulLdB(block_size, coreMaxAddrBits, iterator_bitwidth, max_all_addr, input_w, max_block_len, concurrent_loops, mvin_rs2_t)) val ldD = Module(new LoopMatmulLdD(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, max_block_len_acc, concurrent_loops, mvin_rs2_t)) val ex = Module(new LoopMatmulExecute(block_size, coreMaxAddrBits, iterator_bitwidth, max_addr, max_acc_addr, concurrent_loops, preload_rs1_t, preload_rs2_t, compute_rs1_t, compute_rs2_t)) val stC = Module(new LoopMatmulStC(block_size, coreMaxAddrBits, iterator_bitwidth, max_acc_addr, input_w, acc_w, max_block_len, concurrent_loops, mvout_rs2_t)) @@ -786,8 +814,10 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size ldab_arb.io.inA <> ldA.io.cmd ldab_arb.io.inB <> ldB.io.cmd val ab_loads_on_same_loop = ldA.io.loop_id === ldB.io.loop_id - ldab_arb.io.forceA := !ab_loads_on_same_loop && ldA.io.loop_id === head_loop_id - ldab_arb.io.forceB := !ab_loads_on_same_loop && ldB.io.loop_id === head_loop_id + val forceA = !ab_loads_on_same_loop && ldA.io.loop_id === head_loop_id + val forceB = !ab_loads_on_same_loop && ldB.io.loop_id === head_loop_id + ldab_arb.io.forceA := Mux(is_resadd, ab_loads_on_same_loop && !ldA.io.idle, forceA) + ldab_arb.io.forceB := Mux(is_resadd, forceB || ldA.io.idle, forceB) ldab_arb.io.weightA := 0.U ldab_arb.io.inA_idle := ldA.io.idle ldab_arb.io.inB_idle := ldB.io.idle @@ -853,6 +883,17 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size stC.io.ex_j := ex.io.j stC.io.ex_i := ex.io.i + // when loop matmul is used as resadd unroller + // skip ex + // track ldB instead of ex + when(is_resadd){ + stC.io.ex_completed := (ldA.io.loop_id =/= stC.io.loop_id || ldA.io.idle) && (ldB.io.loop_id =/= stC.io.loop_id || ldB.io.idle) + stC.io.ex_k := 0.U // req.max_k shall be 1 + stC.io.ex_j := ldB.io.j + stC.io.ex_i := ldB.io.k + //ldB.io.rob_overloaded 
:= ld_utilization >= max_lds.U || !((ldA.io.loop_id =/= ldB.io.loop_id) || ldA.io.idle) + } + val loops_configured = RegInit(0.U(16.W)) dontTouch(loops_configured) @@ -896,8 +937,11 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size loop_being_configured.low_d := cmd.bits.cmd.rs1(2) loop_being_configured.act := cmd.bits.cmd.rs1(8+Activation.bitwidth-1, 8) // TODO magic numbers + loop_being_configured.a_ex_spad_id := cmd.bits.cmd.rs1(19, 18) + loop_being_configured.b_ex_spad_id := cmd.bits.cmd.rs1(17, 16) loop_being_configured.a_transpose := cmd.bits.cmd.rs2(0) loop_being_configured.b_transpose := cmd.bits.cmd.rs2(1) + is_resadd := cmd.bits.cmd.rs2(2) loop_being_configured.configured := true.B @@ -913,15 +957,16 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val loop_requesting_ldA_id = Mux(head_loop.lda_started, tail_loop_id, head_loop_id) val loop_requesting_ldA = loops(loop_requesting_ldA_id) - ldA.io.req.bits.max_k := loop_requesting_ldA.max_k + ldA.io.req.bits.max_k := Mux(is_resadd, loop_requesting_ldA.max_j, loop_requesting_ldA.max_k) ldA.io.req.bits.max_i := loop_requesting_ldA.max_i - ldA.io.req.bits.pad_k := loop_requesting_ldA.pad_k + ldA.io.req.bits.pad_k := Mux(is_resadd, loop_requesting_ldA.pad_j, loop_requesting_ldA.pad_k) ldA.io.req.bits.pad_i := loop_requesting_ldA.pad_i ldA.io.req.bits.dram_addr := loop_requesting_ldA.a_dram_addr ldA.io.req.bits.dram_stride := loop_requesting_ldA.a_dram_stride ldA.io.req.bits.transpose := loop_requesting_ldA.a_transpose - ldA.io.req.bits.addr_start := loop_requesting_ldA.a_addr_start + ldA.io.req.bits.addr_start := Mux(loop_requesting_ldA.a_ex_spad_id === 0.U, loop_requesting_ldA.a_addr_start, (loop_requesting_ldA.a_ex_spad_id - 1.U) * (max_addr / concurrent_loops).U) ldA.io.req.bits.loop_id := loop_requesting_ldA_id + ldA.io.req.bits.is_resadd := is_resadd ldA.io.req.valid := !loop_requesting_ldA.lda_started && loop_requesting_ldA.configured @@ -933,14 +978,15 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val loop_requesting_ldB_id = Mux(head_loop.ldb_started, tail_loop_id, head_loop_id) val loop_requesting_ldB = loops(loop_requesting_ldB_id) ldB.io.req.bits.max_j := loop_requesting_ldB.max_j - ldB.io.req.bits.max_k := loop_requesting_ldB.max_k + ldB.io.req.bits.max_k := Mux(is_resadd, loop_requesting_ldB.max_i, loop_requesting_ldB.max_k) ldB.io.req.bits.pad_j := loop_requesting_ldB.pad_j - ldB.io.req.bits.pad_k := loop_requesting_ldB.pad_k + ldB.io.req.bits.pad_k := Mux(is_resadd, loop_requesting_ldB.pad_i, loop_requesting_ldB.pad_k) ldB.io.req.bits.dram_addr := loop_requesting_ldB.b_dram_addr ldB.io.req.bits.dram_stride := loop_requesting_ldB.b_dram_stride ldB.io.req.bits.transpose := loop_requesting_ldB.b_transpose - ldB.io.req.bits.addr_end := loop_requesting_ldB.b_addr_end + ldB.io.req.bits.addr_end := Mux(loop_requesting_ldB.b_ex_spad_id === 0.U, loop_requesting_ldB.b_addr_end, (loop_requesting_ldB.b_ex_spad_id) * (max_addr / concurrent_loops).U) ldB.io.req.bits.loop_id := loop_requesting_ldB_id + ldB.io.req.bits.is_resadd := is_resadd ldB.io.req.valid := !loop_requesting_ldB.ldb_started && loop_requesting_ldB.configured @@ -958,15 +1004,16 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size ex.io.req.bits.pad_k := loop_requesting_ex.pad_k ex.io.req.bits.pad_i := loop_requesting_ex.pad_i ex.io.req.bits.accumulate := loop_requesting_ex.ex_accumulate - ex.io.req.bits.a_addr_start := 
loop_requesting_ex.a_addr_start - ex.io.req.bits.b_addr_end := loop_requesting_ex.b_addr_end + ex.io.req.bits.a_addr_start := Mux(loop_requesting_ex.a_ex_spad_id === 0.U, loop_requesting_ex.a_addr_start, (loop_requesting_ex.a_ex_spad_id - 1.U) * (max_addr / concurrent_loops).U) + ex.io.req.bits.b_addr_end := Mux(loop_requesting_ex.b_ex_spad_id === 0.U, loop_requesting_ex.b_addr_end, (loop_requesting_ex.b_ex_spad_id) * (max_addr / concurrent_loops).U) ex.io.req.bits.a_tranpose := loop_requesting_ex.a_transpose ex.io.req.bits.b_tranpose := loop_requesting_ex.b_transpose ex.io.req.bits.c_addr_start := ex_c_addr_start ex.io.req.bits.loop_id := loop_requesting_ex_id + ex.io.req.bits.skip := is_resadd ex.io.req.valid := !loop_requesting_ex.ex_started && loop_requesting_ex.lda_started && - loop_requesting_ex.ldb_started && loop_requesting_ex.ldd_started && loop_requesting_ex.configured + loop_requesting_ex.ldb_started && loop_requesting_ex.ldd_started && loop_requesting_ex.configured when (ex.io.req.fire) { loop_requesting_ex.running := true.B @@ -1002,7 +1049,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size val loop_requesting_st_id = Mux(head_loop.st_started, tail_loop_id, head_loop_id) val loop_requesting_st = loops(loop_requesting_st_id) - stC.io.req.bits.max_k := loop_requesting_st.max_k + stC.io.req.bits.max_k := Mux(is_resadd, 1.U, loop_requesting_st.max_k) stC.io.req.bits.max_j := loop_requesting_st.max_j stC.io.req.bits.max_i := loop_requesting_st.max_i stC.io.req.bits.pad_j := loop_requesting_st.pad_j @@ -1013,6 +1060,8 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size stC.io.req.bits.act := loop_requesting_st.act stC.io.req.bits.addr_start := st_c_addr_start stC.io.req.bits.loop_id := loop_requesting_st_id + stC.io.req.bits.is_resadd := is_resadd + stC.io.req.valid := !loop_requesting_st.st_started && loop_requesting_st.ex_started && loop_requesting_st.configured @@ -1025,6 +1074,12 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size } } + when(is_resadd){ + ldA.io.req.bits.addr_start := loop_requesting_ldA.resadd_addr_start + ldB.io.req.bits.addr_end := loop_requesting_ldB.resadd_addr_start + stC.io.req.bits.addr_start := loop_requesting_st.resadd_addr_start + stC.io.req.valid := !loop_requesting_st.st_started && loop_requesting_st.configured + } // Handle completed signals when (ldA.io.idle && loops(ldA.io.loop_id).running && loops(ldA.io.loop_id).lda_started) { loops(ldA.io.loop_id).lda_completed := true.B @@ -1057,6 +1112,7 @@ class LoopMatmul(block_size: Int, coreMaxAddrBits: Int, reservation_station_size l.reset() l.a_addr_start := (i * (max_addr / concurrent_loops)).U l.b_addr_end := ((i+1) * (max_addr / concurrent_loops)).U + l.resadd_addr_start := (i * (max_acc_addr / concurrent_loops)).U } } } From 34c0e695c38856fe7355b60a74752f140ec30fb8 Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Wed, 14 Jun 2023 22:32:58 -0700 Subject: [PATCH 02/24] fix bitwidth issue in PE.scala when WS-only dataflow is used --- src/main/scala/gemmini/PE.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/PE.scala b/src/main/scala/gemmini/PE.scala index 9518942f..6e065125 100644 --- a/src/main/scala/gemmini/PE.scala +++ b/src/main/scala/gemmini/PE.scala @@ -61,7 +61,8 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, // elaboration/synthesis tools often fail to consolidate and de-duplicate // MAC units. 
To force mac circuitry to be re-used, we create a "mac_unit" // module here which just performs a single MAC operation - val mac_unit = Module(new MacUnit(inputType, cType, outputType)) + val mac_unit = Module(new MacUnit(inputType, + if (df == Dataflow.WS) outputType else accType, outputType)) val a = io.in_a val b = io.in_b From f13847e839baee4a3c8d5d51e76a3dc9e9c1933f Mon Sep 17 00:00:00 2001 From: Hasan Genc Date: Thu, 15 Jun 2023 11:07:59 -0700 Subject: [PATCH 03/24] bump submodules --- software/gemmini-rocc-tests | 2 +- software/libgemmini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index bb489531..ab748c52 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit bb4895319b4a7f7181a613a3853bc82887000ee0 +Subproject commit ab748c525a1152b7205c74aee0c24aa0463cf124 diff --git a/software/libgemmini b/software/libgemmini index d914e1c6..d0247b7e 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit d914e1c6fe9ad81ab266b7ba3247e7ce36b3c9f8 +Subproject commit d0247b7e3e1155584cb215fc0cd943cbeada89dd From 6adb9676ea33b359f5583cade7805e83a6a0acc0 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 15 Jun 2023 22:36:35 -0700 Subject: [PATCH 04/24] Bump libgemmini --- software/libgemmini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/libgemmini b/software/libgemmini index d0247b7e..71d804a4 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit d0247b7e3e1155584cb215fc0cd943cbeada89dd +Subproject commit 71d804a4b27d509eb27b914056a4f94f82c9aa96 From 931be9391383e360189b831ddf6daeea6f6078f4 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 15 Jun 2023 15:15:20 -0700 Subject: [PATCH 05/24] Bump to latest CY --- .github/scripts/enable-conda.sh | 2 +- CHIPYARD.hash | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/enable-conda.sh b/.github/scripts/enable-conda.sh index 184ead9b..c064dd7a 100644 --- a/.github/scripts/enable-conda.sh +++ b/.github/scripts/enable-conda.sh @@ -7,7 +7,7 @@ conda activate base if ! 
{ conda env list | grep 'chipyard'; } >/dev/null 2>&1; then conda create -n chipyard conda activate chipyard - conda install -c conda-forge conda-lock + conda install -c conda-forge conda-lock=1.4 fi conda activate chipyard diff --git a/CHIPYARD.hash b/CHIPYARD.hash index ed413fdd..43ea698b 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -569917e2f30616f85a841d16a92914ae98ad7184 +7874f5148ef6d50c5191c375ee1fd047593f52ca From 771c84c51ac66b1e4f37c8d5fdf70aa0ea9bc237 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 20 Jun 2023 17:06:13 -0700 Subject: [PATCH 06/24] Bump to chisel3.6 --- build.sbt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 80242277..e915cf5d 100644 --- a/build.sbt +++ b/build.sbt @@ -7,9 +7,8 @@ version := "3.1.0" scalaVersion := "2.13.10" libraryDependencies ++= Seq( - "edu.berkeley.cs" %% "chisel3" % "3.5.6", + "edu.berkeley.cs" %% "chisel3" % "3.6.0", "edu.berkeley.cs" %% "rocketchip" % "1.2.+", - "edu.berkeley.cs" %% "chisel-iotesters" % "2.5.6", "org.scalanlp" %% "breeze" % "1.1") resolvers ++= Seq( From 80f451169b8727beb379d7cc7e2fab86ad8c8040 Mon Sep 17 00:00:00 2001 From: "Ruohan (Richard) Yan" Date: Mon, 26 Jun 2023 08:09:02 -0700 Subject: [PATCH 07/24] Normalizer optimizations (#260) Fused inv_sum_exp / inv_stddev with scale; 16-cycle sqrt unit; 2x fast fp divisions; heterogeneous accscale lanes --- build.sbt | 4 + software/gemmini-rocc-tests | 2 +- src/main/scala/gemmini/AccumulatorScale.scala | 184 +++++++++++--- src/main/scala/gemmini/Arithmetic.scala | 24 +- src/main/scala/gemmini/GemminiConfigs.scala | 2 +- src/main/scala/gemmini/LoopMatmul.scala | 2 +- src/main/scala/gemmini/Normalizer.scala | 238 ++++++++++++++++-- src/main/scala/gemmini/Scratchpad.scala | 11 +- 8 files changed, 380 insertions(+), 87 deletions(-) diff --git a/build.sbt b/build.sbt index 80242277..49b24651 100644 --- a/build.sbt +++ b/build.sbt @@ -16,3 +16,7 @@ resolvers ++= Seq( Resolver.sonatypeRepo("snapshots"), Resolver.sonatypeRepo("releases"), Resolver.mavenLocal) + +// specified commit BEFORE scala bump to 2.13 for compatibility +// need this version for MulRecFN and fast divider +// lazy val newHardfloat = RootProject(uri("https://github.com/ucb-bar/berkeley-hardfloat.git#74cc28")) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index ab748c52..9b243700 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit ab748c525a1152b7205c74aee0c24aa0463cf124 +Subproject commit 9b243700448dd10236aa1d627e7618f3f449f2f0 diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index e4aaa192..e1da8d23 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -28,7 +28,7 @@ class AccumulatorScaleIO[T <: Data: Arithmetic, U <: Data]( class AccScaleDataWithIndex[T <: Data: Arithmetic, U <: Data](t: T, u: U) extends Bundle { val scale = u.cloneType - val act = UInt(2.W) // TODO magic number + val act = UInt(Activation.bitwidth.W) val igelu_qb = t.cloneType val igelu_qc = t.cloneType val iexp_qln2 = t.cloneType @@ -56,19 +56,28 @@ class AccScalePipe[T <: Data, U <: Data](t: T, rDataType: Vec[Vec[T]], scale_fun val e = io.in.bits.data + val act = io.in.bits.act + // make sure no normalizations gets passed in if no functional units present + assert(has_normalizations.B || (!io.in.fire) || + (act =/= Activation.LAYERNORM && act =/= 
Activation.IGELU)) + val e_act = MuxCase(e, Seq( - (has_nonlinear_activations.B && io.in.bits.act === Activation.RELU) -> e.relu, - (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.LAYERNORM) -> - (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev), - (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.IGELU) -> + (has_nonlinear_activations.B && act === Activation.RELU) -> e.relu, + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) -> + (e - io.in.bits.mean), + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.IGELU) -> AccumulatorScale.igelu(e, io.in.bits.igelu_qb, io.in.bits.igelu_qc), - (has_nonlinear_activations.B && has_normalizations.B && io.in.bits.act === Activation.SOFTMAX) -> - scale_func( - AccumulatorScale.iexp(e - io.in.bits.max, io.in.bits.iexp_qln2, io.in.bits.iexp_qln2_inv, io.in.bits.igelu_qb, io.in.bits.igelu_qc), - io.in.bits.inv_sum_exp.asTypeOf(scale_t)), + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) -> + AccumulatorScale.iexp(e - io.in.bits.max, io.in.bits.iexp_qln2, io.in.bits.iexp_qln2_inv, io.in.bits.igelu_qb, io.in.bits.igelu_qc), )) - val e_scaled = scale_func(e_act, io.in.bits.scale) + val e_scaled = scale_func(e_act, MuxCase(io.in.bits.scale, Seq( + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) -> + io.in.bits.inv_stddev, + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) -> + io.in.bits.inv_sum_exp.asTypeOf(scale_t) + )).asTypeOf(scale_t)) + val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) out.bits.data := e_clipped @@ -108,16 +117,20 @@ class AccumulatorScale[T <: Data, U <: Data]( val e_act = MuxCase(e, Seq( (has_nonlinear_activations.B && act === Activation.RELU) -> e.relu, (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) -> - (e - io.in.bits.mean).mult_with_reciprocal(io.in.bits.inv_stddev), + (e - io.in.bits.mean), (has_nonlinear_activations.B && has_normalizations.B && act === Activation.IGELU) -> AccumulatorScale.igelu(e, igelu_qb, igelu_qc), (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) -> - scale_func( - AccumulatorScale.iexp(e - io.in.bits.max, iexp_qln2, iexp_qln2_inv, igelu_qb, igelu_qc), - io.in.bits.inv_sum_exp.asTypeOf(scale_t)), + AccumulatorScale.iexp(e - io.in.bits.max, iexp_qln2, iexp_qln2_inv, igelu_qb, igelu_qc), )) - val e_scaled = scale_func(e_act, scale) + val e_scaled = scale_func(e_act, MuxCase(scale, Seq( + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.LAYERNORM) -> + io.in.bits.inv_stddev, + (has_nonlinear_activations.B && has_normalizations.B && act === Activation.SOFTMAX) -> + io.in.bits.inv_sum_exp.asTypeOf(scale_t) + )).asTypeOf(scale_t)) + val e_clipped = e_scaled.clippedToWidthOf(rDataType.head.head) e_clipped @@ -178,15 +191,66 @@ class AccumulatorScale[T <: Data, U <: Data]( tail_oh := (tail_oh << 1).asUInt | tail_oh(nEntries-1) } - val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } + val num_units_with_norm = 4 // TODO: move to configs + + val inputs_norm = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } + val inputs_non_norm = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } + + val norm_mask = regs.map(r => r.valid && ( + 
(r.bits.acc_read_resp.act === Activation.SOFTMAX) || + (r.bits.acc_read_resp.act === Activation.LAYERNORM) || + (r.bits.acc_read_resp.act === Activation.IGELU) + )) + + // input: norm_mask + // output: {b2, b1, b0} <-> b_i = whether entry i should use functional units with norm (1 = should) + val static_assignment_policy = Wire(Vec(1 << nEntries, UInt(nEntries.W))) + println("static policy for " + num_units_with_norm + " norm units:") + for (i <- 0 until (1 << nEntries)) { + val binaryString = String.format("%" + nEntries + "s", i.toBinaryString) + .replace(' ', '0').toCharArray.toList + val num_norm : Int = binaryString.count(_ == '1') + val ratio_of_norm_entries = num_norm.toFloat / nEntries.toFloat + val ratio_of_norm_units = num_units_with_norm.toFloat / num_scale_units.toFloat + if (ratio_of_norm_entries >= ratio_of_norm_units) { + // use norm units for all norm entries + static_assignment_policy(i.U) := i.U + println("input pattern " + binaryString.mkString("") + ": " + binaryString.mkString("")) + } else { + def flip_n_zeros (s: List[Char], n: Int): List[Char] = { + if (s.nonEmpty) { + if ((s.head == '0') && (n > 0)) + '1' :: flip_n_zeros(s.tail, n - 1) + else + s.head :: flip_n_zeros(s.tail, n) + } else { + assert(n == 0, "cannot flip " + n + " zeros in an empty string") + List.empty + } + } + val flippedString = flip_n_zeros( + binaryString, Math.round(ratio_of_norm_units * nEntries) - num_norm) + val flipped = Integer.parseInt(flippedString.mkString(""), 2) + static_assignment_policy(i.U) := flipped.U + println("input pattern " + binaryString.mkString("") + ": " + flipped.toBinaryString) + } + } + + // val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } + val current_policy = Wire(UInt(nEntries.W)) + val norm_mask_int = Wire(UInt(nEntries.W)) + norm_mask_int := VecInit(norm_mask).asUInt() + dontTouch(norm_mask_int) + current_policy := static_assignment_policy(norm_mask_int) + for (i <- 0 until nEntries) { for (w <- 0 until width) { - val input = inputs(i*width+w) + val input = inputs_norm(i*width+w) val acc_read_resp = regs(i).bits.acc_read_resp - input.valid := regs(i).valid && !fired_masks(i)(w) + input.valid := regs(i).valid && !fired_masks(i)(w) && /*norm_mask(i)*/ current_policy(i) input.bits.data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) input.bits.full_data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) input.bits.scale := acc_read_resp.scale @@ -206,8 +270,47 @@ class AccumulatorScale[T <: Data, U <: Data]( } } } + + for (i <- 0 until nEntries) { + for (w <- 0 until width) { + val input = inputs_non_norm(i*width+w) + + val acc_read_resp = regs(i).bits.acc_read_resp + + input.valid := regs(i).valid && !fired_masks(i)(w) && (!current_policy(i)) + input.bits.data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) + input.bits.full_data := acc_read_resp.data(w / acc_read_data(0).size)(w % acc_read_data(0).size) + input.bits.scale := acc_read_resp.scale + input.bits.act := acc_read_resp.act + input.bits.igelu_qb := DontCare + input.bits.igelu_qc := DontCare + input.bits.iexp_qln2 := DontCare + input.bits.iexp_qln2_inv := DontCare + input.bits.mean := DontCare + input.bits.max := DontCare + input.bits.inv_stddev := DontCare + input.bits.inv_sum_exp := DontCare + input.bits.id := i.U + input.bits.index := w.U + if (num_scale_units == num_units_with_norm) { + input.ready := false.B + } + when (input.fire) { + fired_masks(i)(w) := true.B + } + } 
+ } + for (i <- 0 until num_scale_units) { - val arbIn = inputs.zipWithIndex.filter({ case (_, w) => w % num_scale_units == i }).map(_._1) + val norm_supported = (i < num_units_with_norm) && has_normalizations + + val arbIn = + if (norm_supported) + // for norm units, prioritize norm operations + inputs_norm.zipWithIndex.filter({ case (_, w) => w % num_units_with_norm == i }).map(_._1) + else + inputs_non_norm.zipWithIndex.filter({ case (_, w) => w % (num_scale_units - num_units_with_norm) == (i - num_units_with_norm) }).map(_._1) + val arb = Module(new RRArbiter(new AccScaleDataWithIndex(t, scale_t)(ev), arbIn.length)) arb.io.in <> arbIn arb.io.out.ready := true.B @@ -217,22 +320,34 @@ class AccumulatorScale[T <: Data, U <: Data]( when (reset.asBool) { arbOut.valid := false.B } - val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, has_nonlinear_activations, - has_normalizations)) + val pipe = Module(new AccScalePipe(t, rDataType, scale_func, scale_t, latency, + has_nonlinear_activations, norm_supported)) + pipe.io.in := arbOut val pipe_out = pipe.io.out for (j <- 0 until nEntries) { for (w <- 0 until width) { - if ((j*width+w) % num_scale_units == i) { - val id0 = w % acc_read_data(0).size - val id1 = w / acc_read_data(0).size - when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) { - out_regs(j).data (id1)(id0) := pipe_out.bits.data - out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data - completed_masks(j)(w) := true.B + val id0 = w % acc_read_data(0).size + val id1 = w / acc_read_data(0).size + if ((j*width+w) % num_units_with_norm == i) { + when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) { + out_regs(j).data (id1)(id0) := pipe_out.bits.data + out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data + completed_masks(j)(w) := true.B + } + } + if (num_scale_units > num_units_with_norm) { + if ((j*width+w) % (num_scale_units - num_units_with_norm) == (i - num_units_with_norm)) { + val id0 = w % acc_read_data(0).size + val id1 = w / acc_read_data(0).size + when (pipe_out.fire && pipe_out.bits.id === j.U && pipe_out.bits.index === w.U) { + out_regs(j).data (id1)(id0) := pipe_out.bits.data + out_regs(j).full_data(id1)(id0) := pipe_out.bits.full_data + completed_masks(j)(w) := true.B + } + } } - } } } } @@ -276,16 +391,17 @@ object AccumulatorScale { val zero = q.zero def neg(x: T) = zero-x - // qln2_inv needs scale to be - // 1 / (2 ** 16) / S - + // qln2_inv needs scale to be 1 / (2 ** 16) / S // qln2_inv / S / (2 ** 16) = 1 / ln2 // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) val neg_q_iexp = neg(q) - val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive + val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive + val z_iexp_saturated = Wire(z_iexp.cloneType) + z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt()(_)).reduce(_ | _), 32.S, z_iexp) val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) // we dont want a rounding shift - (q_poly_iexp.asUInt.do_>>(z_iexp.asUInt(5, 0))).asTypeOf(q) + // TODO: z overflow + (q_poly_iexp.asUInt().do_>>(z_iexp_saturated.asUInt())).asTypeOf(q) }} diff --git a/src/main/scala/gemmini/Arithmetic.scala b/src/main/scala/gemmini/Arithmetic.scala index c6792578..7bd8d9e8 100644 --- a/src/main/scala/gemmini/Arithmetic.scala +++ b/src/main/scala/gemmini/Arithmetic.scala @@ -43,9 +43,9 @@ abstract class 
ArithmeticOps[T <: Data](self: T) { def minimum: T // Optional parameters, which only need to be defined if you want to enable various optimizations for transformers - def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[T])] = None + def divider(denom_t: UInt, options: Int = 0): Option[(DecoupledIO[UInt], DecoupledIO[T])] = None def sqrt: Option[(DecoupledIO[UInt], DecoupledIO[T])] = None - def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = None + def reciprocal[U <: Data](u: U, options: Int = 0): Option[(DecoupledIO[UInt], DecoupledIO[U])] = None def mult_with_reciprocal[U <: Data](reciprocal: U) = self } @@ -131,7 +131,7 @@ object Arithmetic { override def identity: SInt = 1.S override def minimum: SInt = (-(1 << (self.getWidth-1))).S - override def divider(denom_t: UInt): Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = { + override def divider(denom_t: UInt, options: Int = 0): Option[(DecoupledIO[UInt], DecoupledIO[SInt])] = { // TODO this uses a floating point divider, but we should use an integer divider instead val input = Wire(Decoupled(denom_t.cloneType)) @@ -174,7 +174,7 @@ object Arithmetic { val denom_rec = uin_to_float(input.bits) // Instantiate the hardloat divider - val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, options)) input.ready := divider.io.inReady divider.io.inValid := input.valid @@ -244,7 +244,7 @@ object Arithmetic { Some((input, output)) } - override def reciprocal[U <: Data](u: U): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match { + override def reciprocal[U <: Data](u: U, options: Int = 0): Option[(DecoupledIO[UInt], DecoupledIO[U])] = u match { case Float(expWidth, sigWidth) => val input = Wire(Decoupled(UInt(0.W))) val output = Wire(Decoupled(u.cloneType)) @@ -266,7 +266,7 @@ object Arithmetic { val one_rec = in_to_float(1.S) // Instantiate the hardloat divider - val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, options)) input.ready := divider.io.inReady divider.io.inValid := input.valid @@ -311,14 +311,12 @@ object Arithmetic { val reciprocal_rec = recFNFromFN(expWidth, sigWidth, recip.bits) // Instantiate the hardloat divider - val muladder = Module(new MulAddRecFN(expWidth, sigWidth)) - muladder.io.op := 0.U + val muladder = Module(new MulRecFN(expWidth, sigWidth)) muladder.io.roundingMode := consts.round_near_even muladder.io.detectTininess := consts.tininess_afterRounding muladder.io.a := self_rec muladder.io.b := reciprocal_rec - muladder.io.c := 0.U float_to_in(muladder.io.out) @@ -341,15 +339,13 @@ object Arithmetic { t_resizer.io.detectTininess := consts.tininess_afterRounding val t_rec_resized = t_resizer.io.out - val muladder = Module(new MulAddRecFN(self.expWidth, self.sigWidth)) + val muladder = Module(new MulRecFN(self.expWidth, self.sigWidth)) - muladder.io.op := 0.U muladder.io.roundingMode := consts.round_near_even // consts.round_near_maxMag muladder.io.detectTininess := consts.tininess_afterRounding muladder.io.a := self_rec muladder.io.b := t_rec_resized - muladder.io.c := 0.U val out = Wire(Float(self.expWidth, self.sigWidth)) out.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) @@ -451,15 +447,13 @@ object Arithmetic { assert(shift_exp =/= 0.U, "scaling by denormalized numbers is not currently supported") // Multiply self and 2^(-u) - val muladder = Module(new MulAddRecFN(self.expWidth, self.sigWidth)) 
+ val muladder = Module(new MulRecFN(self.expWidth, self.sigWidth)) - muladder.io.op := 0.U muladder.io.roundingMode := consts.round_near_even // consts.round_near_maxMag muladder.io.detectTininess := consts.tininess_afterRounding muladder.io.a := self_rec muladder.io.b := shift_rec - muladder.io.c := 0.U val result = Wire(Float(self.expWidth, self.sigWidth)) result.bits := fNFromRecFN(self.expWidth, self.sigWidth, muladder.io.out) diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 573581ec..98254299 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -494,7 +494,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( if (has_normalizations) { header ++= "#define HAS_NORMALIZATIONS\n" - header ++= "#define NORM_STAT_IDS 4\n\n" + header ++= "#define NORM_STAT_IDS 2\n\n" } header ++= s"#endif // $guard\n" diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 11ed4006..91a4afbe 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -572,7 +572,7 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In val ln_cmd = Reg(UInt(iterator_bitwidth.W)) val ln_stat_id = Reg(UInt(iterator_bitwidth.W)) - val NORM_STAT_IDS = 4 // TODO magic number + val NORM_STAT_IDS = 2 // TODO magic number val ln_norm_cmds = VecInit(VecInit(NormCmd.SUM, NormCmd.MEAN), VecInit(NormCmd.VARIANCE, NormCmd.INV_STDDEV), VecInit(NormCmd.RESET, NormCmd.RESET)) diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index 67dd18ac..d7991bb2 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -5,7 +5,7 @@ import chisel3._ import chisel3.experimental.ChiselEnum import chisel3.util._ import gemmini.AccumulatorScale.iexp -import hardfloat.{DivSqrtRecFN_small, INToRecFN, consts, fNFromRecFN} +import hardfloat.{DivSqrtRecFN_small, INToRecFN, MulRecFN, consts, fNFromRecFN, recFNFromFN} class NormalizedInput[T <: Data: Arithmetic, U <: Data](max_len: Int, num_stats: Int, fullDataType: Vec[Vec[T]], scale_t: U) extends Bundle { @@ -94,7 +94,6 @@ class MaxLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)( // Each lane computes a sum, or an error-squared sum import ev._ - import NormCmd._ class LaneOutput extends Bundle { val result = acc_t.cloneType @@ -117,7 +116,17 @@ class MaxLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)( Mux(i.U < io.ins.bits.len, d.withWidthOf(acc_t), d.minimum) } - val result = data.reduce({ (max, x) => Mux(x > max, x, max) }) + def treeMax(x: Seq[T]): T = { + if (x.length == 1) { + x.head + } else { + val a = treeMax(x.slice(0, x.length / 2)) // ayy slice + val b = treeMax(x.slice(x.length / 2, x.length)) + Mux(a > b, a, b) + } + } + + val result = treeMax(data) val pipe = Module(new Pipeline[LaneOutput](new LaneOutput, latency)()) @@ -134,6 +143,97 @@ class MaxLanes[T <: Data](num_stats: Int, acc_t: T, n_lanes: Int, latency: Int)( io.busy := pipe.io.busy } +class IntSqrt(width: Int) extends Module { + val N = (width + 1) >> 1 + + val input = IO(Flipped(Decoupled(UInt(width.W)))) + val output = IO(Decoupled(UInt(N.W))) + + val x = Reg(UInt(width.W)) + val a = Reg(UInt(width.W)) + val t = Wire(UInt(width.W)) + val q = Reg(UInt(N.W)) + val sign = Wire(UInt(1.W)) + val busy = RegInit(false.B) + val resultValid = 
RegInit(false.B) + val counter = Reg(UInt(log2Ceil(N).W)) + + input.ready := ! busy + output.valid := resultValid + output.bits := DontCare + + t := Cat(a, x(width - 1, width - 2)) - Cat(q, 1.U(2.W)) + sign := t(width - 1) + output.bits := q + + when(busy) { + when (!resultValid) { + counter := counter - 1.U + x := Cat(x(width - 3, 0), 0.U(2.W)) + a := Mux(sign.asBool, Cat(a(width - 3, 0), x(width - 1, width - 2)), t) + q := Cat(q(N - 2, 0), ~sign) + + when(counter === 0.U) { + resultValid := true.B + } + } + + when(output.ready && resultValid) { + busy := false.B + resultValid := false.B + } + }.otherwise { + when(input.valid) { + val inputBundle = input.deq() + x := inputBundle + a := 0.U + q := 0.U + busy := true.B + counter := (N - 1).U + } + } +} + +class MulPipe[T <: Data, U <: Data](scale_t: U)(implicit ev: Arithmetic[T]) + extends Module { + + val io = IO(new Bundle { + val ins = Flipped(Decoupled(new Bundle { + val x = scale_t.cloneType + val y = scale_t.cloneType + })) + + val out = Decoupled(scale_t.cloneType) + }) + + scale_t match { + case Float(expWidth, sigWidth) => + val self_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.x.asUInt()) + val scale_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.y.asUInt()) + + val mul = Module(new MulRecFN(expWidth, sigWidth)) + + mul.io.roundingMode := consts.round_near_even + mul.io.detectTininess := consts.tininess_afterRounding + + mul.io.a := self_rec + mul.io.b := scale_rec + + val mul_result = fNFromRecFN(expWidth, sigWidth, mul.io.out).asTypeOf(scale_t) + + val pipe = Module(new Pipeline(scale_t.cloneType, 2)()) + + pipe.io.in.valid := io.ins.valid + pipe.io.in.bits := mul_result + io.ins.ready := pipe.io.in.ready + +// pipe.io.out.ready := io.out.ready +// io.out.bits := pipe.io.out.bits +// io.out.valid := pipe.io.out.valid + io.out <> pipe.io.out + } +} + class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_stats: Int, latency: Int, fullDataType: Vec[Vec[T]], scale_t: U) (implicit ev: Arithmetic[T]) extends Module { @@ -155,9 +255,12 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val idle, output = Value val get_sum = Value val get_mean, waiting_for_mean = Value - val get_variance, waiting_for_variance, get_stddev, waiting_for_stddev, get_inv_stddev, waiting_for_inv_stddev = Value + val get_variance, waiting_for_variance, get_stddev, waiting_for_stddev = Value + val get_inv_stddev, waiting_for_inv_stddev = Value + val get_scaled_inv_stddev, waiting_for_scaled_inv_stddev = Value val get_max = Value val get_inv_sum_exp, waiting_for_inv_sum_exp = Value + val get_scaled_inv_sum_exp, waiting_for_scaled_inv_sum_exp = Value } import State._ @@ -208,6 +311,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ io.out.valid := stats(out_stats_id).state === output io.out.bits.acc_read_resp := stats(out_stats_id).req.acc_read_resp + io.out.bits.mean := stats(out_stats_id).mean io.out.bits.max := stats(out_stats_id).max io.out.bits.inv_stddev := stats(out_stats_id).inv_stddev.asTypeOf(scale_t) @@ -227,7 +331,8 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val len = Mux(stat.elems_left % n_lanes.U === 0.U, n_lanes.U, stat.elems_left % n_lanes.U) - lanes.io.ins.valid := stat.state === get_sum && stat.vec_groups_left > 0.U + lanes.io.ins.valid := stat.state === get_sum && stat.vec_groups_left > 0.U && + !max_lanes.io.busy // TODO We should be able to start the accumulation lanes if the max-lanes are busy with a 
different stat-id lanes.io.ins.bits.data := stat.vec_grouped(stat.vec_groups_left-1.U) lanes.io.ins.bits.mean := stat.mean lanes.io.ins.bits.max := stat.max @@ -286,8 +391,9 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(max_out_lanes_stats_id) when (max_lanes.io.out.fire()) { - stat.running_max := Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max) - //stat.max := Mux(max_lanes.io.out.bits.result > stat.max, max_lanes.io.out.bits.result, stat.max) + val new_max = Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max) + stat.running_max := new_max + stat.max := new_max } } @@ -296,7 +402,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ (s.state === get_mean || s.state === get_variance) -> i.U } ) val sum_to_divide = stats(sum_to_divide_id).sum - val (divider_in, divider_out) = sum_to_divide.divider(stats.head.count).get + val (divider_in, divider_out) = sum_to_divide.divider(stats.head.count, 16).get { // Divider input @@ -328,12 +434,18 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ (s.state === get_stddev) -> i.U } ) val variance_to_sqrt = stats(variance_to_sqrt_id).inv_stddev - val (sqrt_in, sqrt_out) = variance_to_sqrt.sqrt.get + + val sqrt_unit = Module(new IntSqrt(acc_t.getWidth)) + val sqrt_in = sqrt_unit.input + val sqrt_out = sqrt_unit.output + +// val (sqrt_in, sqrt_out) = variance_to_sqrt.sqrt.get { // Sqrt input val stat = stats(variance_to_sqrt_id) + sqrt_in.bits := variance_to_sqrt.asUInt() sqrt_in.valid := stat.state === get_stddev } @@ -353,7 +465,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ when (stat.state === waiting_for_stddev) { stat.inv_stddev := Mux(sqrt_out.bits.asUInt === acc_t.zero.asUInt, 1.S(acc_t.getWidth.W).asTypeOf(acc_t), - sqrt_out.bits + sqrt_out.bits.asTypeOf(acc_t) ) } } @@ -363,7 +475,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ (s.state === get_inv_stddev) -> i.U } ) val stddev_to_inv = stats(stddev_to_inv_id).inv_stddev - val (reciprocal_in, reciprocal_out) = stddev_to_inv.reciprocal(scale_t).get + val (reciprocal_in, reciprocal_out) = stddev_to_inv.reciprocal(scale_t, 16).get { // Reciprocal input @@ -388,6 +500,40 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ } } + val inv_stddev_to_scale_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_scaled_inv_stddev) -> i.U } + ) + + val inv_stddev_scale_mul_pipe = Module(new MulPipe(scale_t)) + + { + // Scale input + val stat = stats(inv_stddev_to_scale_id) + + val ins = inv_stddev_scale_mul_pipe.io.ins + ins.bits.x := stats(inv_stddev_to_scale_id).inv_stddev.asTypeOf(scale_t) + ins.bits.y := stats(inv_stddev_to_scale_id).req.acc_read_resp.scale + ins.valid := stat.state === get_scaled_inv_stddev + } + + { + // Scale output + val waiting_for_scale_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_scaled_inv_stddev) -> i.U } + ) + val stat = stats(waiting_for_scale_id) + + val out = inv_stddev_scale_mul_pipe.io.out + out.ready := stat.state === waiting_for_scaled_inv_stddev + + when (stat.state === waiting_for_scaled_inv_stddev) { + stat.inv_stddev := out.bits.asTypeOf(stat.inv_stddev) + } + } + + val sum_exp_to_inv_id = MuxCase((num_stats-1).U, stats.zipWithIndex.map { case (s,i) => 
(s.state === get_inv_sum_exp) -> i.U } @@ -416,7 +562,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val one_rec = in_to_float(127.S) // softmax maximum is 127 for signed int8 // Instantiate the hardloat divider - val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 0)) + val divider = Module(new DivSqrtRecFN_small(expWidth, sigWidth, 16)) exp_divider_in.ready := divider.io.inReady divider.io.inValid := exp_divider_in.valid @@ -454,6 +600,39 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ } } + val inv_sum_exp_to_scale_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === get_scaled_inv_sum_exp) -> i.U } + ) + + val inv_sum_exp_scale_mul_pipe = Module(new MulPipe(scale_t)) + + { + // Scale input + val stat = stats(inv_sum_exp_to_scale_id) + + val ins = inv_sum_exp_scale_mul_pipe.io.ins + ins.bits.x := stats(inv_sum_exp_to_scale_id).inv_sum_exp.asTypeOf(scale_t) + ins.bits.y := stats(inv_sum_exp_to_scale_id).req.acc_read_resp.scale + ins.valid := stat.state === get_scaled_inv_sum_exp + } + + { + // Scale output + val waiting_for_scale_id = MuxCase((num_stats-1).U, + stats.zipWithIndex.map { case (s,i) => + (s.state === waiting_for_scaled_inv_sum_exp) -> i.U } + ) + val stat = stats(waiting_for_scale_id) + + val out = inv_sum_exp_scale_mul_pipe.io.out + out.ready := stat.state === waiting_for_scaled_inv_sum_exp + + when (stat.state === waiting_for_scaled_inv_sum_exp) { + stat.inv_sum_exp := out.bits.asTypeOf(stat.inv_sum_exp) + } + } + // State transitions for (((stat, next_state), id) <- (stats zip next_states).zipWithIndex) { val state = stat.state @@ -527,14 +706,26 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_stddev) { - next_state := Mux(reciprocal_out.fire(), idle, state) - done := reciprocal_out.fire() + next_state := Mux(reciprocal_out.fire(), state.next, state) + done := false.B + }.elsewhen(state === get_scaled_inv_stddev) { + next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire() && inv_stddev_to_scale_id === id.U, state.next, state) + done := false.B + }.elsewhen(state === waiting_for_scaled_inv_stddev) { + next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire(), idle, state) + done := inv_stddev_scale_mul_pipe.io.out.fire() }.elsewhen(state === get_inv_sum_exp) { next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_sum_exp) { - next_state := Mux(exp_divider_out.fire(), idle, state) - done := exp_divider_out.fire() + next_state := Mux(exp_divider_out.fire(), state.next, state) + done := false.B + }.elsewhen(state === get_scaled_inv_sum_exp) { + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire() && inv_sum_exp_to_scale_id === id.U, state.next, state) + done := false.B + }.elsewhen(state === waiting_for_scaled_inv_sum_exp) { + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire(), idle, state) + done := inv_sum_exp_scale_mul_pipe.io.out.fire() }.otherwise { assert(false.B, "invalid state in Normalizer") next_state := DontCare @@ -544,9 +735,6 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ when (io.in.fire() && in_stats_id === id.U) { next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output, Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum)) 
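The two MulPipe instances introduced in this patch give LayerNorm and Softmax one extra step each: once the reciprocal (or the exp divider) finishes, the result is multiplied by the request's accumulator scale before the stat is released, which is what the new get_scaled_* / waiting_for_scaled_* states sequence through. Below is a minimal, hypothetical sketch of that decoupled multiply stage — module and port names are made up, and a truncating UInt product stands in for the hardfloat recFNFromFN/MulRecFN datapath the real MulPipe uses.

```scala
import chisel3._
import chisel3.util._

// Simplified stand-in for the scale-multiply stage: a one-entry
// decoupled multiplier. Operands arrive together on `ins`; the
// product is held until the consumer drains it on `out`.
class ScaleMulSketch(w: Int) extends Module {
  val io = IO(new Bundle {
    val ins = Flipped(Decoupled(new Bundle {
      val x = UInt(w.W) // e.g. the freshly computed inv_stddev
      val y = UInt(w.W) // e.g. req.acc_read_resp.scale
    }))
    val out = Decoupled(UInt(w.W))
  })

  val busy   = RegInit(false.B)
  val result = Reg(UInt(w.W))

  // Accept a new operand pair when the result register is free,
  // or is being drained in this same cycle.
  io.ins.ready := !busy || io.out.ready
  io.out.valid := busy
  io.out.bits  := result

  when (io.out.fire) { busy := false.B }
  when (io.ins.fire) {
    result := (io.ins.bits.x * io.ins.bits.y)(w - 1, 0) // truncating product, for illustration only
    busy   := true.B
  }
}
```

In the patch itself a stat drives ins.valid while it sits in get_scaled_inv_stddev (or get_scaled_inv_sum_exp), then holds out.ready in the corresponding waiting_for_scaled_* state and writes the product back over inv_stddev / inv_sum_exp — the same handshake the state-transition block encodes further down.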
- when (io.in.bits.cmd === NormCmd.SUM_EXP) { - stat.max := stat.running_max - } } } @@ -570,20 +758,20 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ when(reset_running_state) { stat.sum := acc_t.zero stat.count := Mux(is_input, io.in.bits.len, 0.U) - } - - when (state =/= get_inv_sum_exp && next_state === get_inv_sum_exp) { stat.running_max := acc_t.minimum } - } - dontTouch(stats) +// when (state =/= get_max && next_state === get_max) { +// stat.running_max := acc_t.minimum +// stat.max := acc_t.minimum +// } + } // Assertions assert(PopCount(stats.map(s => s.state === waiting_for_mean || s.state === waiting_for_variance)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now") assert(PopCount(stats.map(_.state === waiting_for_stddev)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now") assert(PopCount(stats.map(_.state === waiting_for_inv_stddev)) <= 1.U, "we don't support pipelining the divider/sqrt-unit/inv-unit right now") - assert(PopCount(stats.map(_.state === output)) <= 1.U, "multiple outputs at same time") +// assert(PopCount(stats.map(_.state === output)) <= 1.U, "multiple outputs at same time") assert(acc_t.getWidth == scale_t.getWidth, "we use the same variable to hold both the variance and the inv-stddev, so we need them to see the width") // Resets diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index ceb8b0ae..b0c468c7 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -559,20 +559,11 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, val acc_row_t = Vec(meshColumns, Vec(tileColumns, accType)) val spad_row_t = Vec(meshColumns, Vec(tileColumns, inputType)) -// val acc_norm_unit = Module(new Normalizer( -// max_len = block_cols, -// num_reduce_lanes = -1, -// num_stats = 4, -// latency = 4, -// fullDataType = acc_row_t, -// scale_t = acc_scale_t, -// )) - val (acc_norm_unit_in, acc_norm_unit_out) = Normalizer( is_passthru = !config.has_normalizations, max_len = block_cols, num_reduce_lanes = -1, - num_stats = 4, + num_stats = 2, latency = 4, fullDataType = acc_row_t, scale_t = acc_scale_t, From f60effe4564c3d202e1fcc075f1033dce4009816 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 11 Jul 2023 12:00:17 -0700 Subject: [PATCH 08/24] Bump Chipyard --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index ed413fdd..07c62705 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -569917e2f30616f85a841d16a92914ae98ad7184 +ef3409f87ff2988fa862ea48c995d2c27c93c7a2 From 41041bca9d77350ae56555a4c90f96ffaff0e4f2 Mon Sep 17 00:00:00 2001 From: Abraham Gonzalez Date: Thu, 13 Jul 2023 12:56:27 -0700 Subject: [PATCH 09/24] Port to better CI (#307) * Port to better CI * Bump CY * Use make parallelism for running tests * Use default Chipyard build flags * Cleanup run RTL CI script * REMOVE ME * Create JAVA_TMP_DIR in CI * Setup conda w/ install-gemmini.sh * Move conda stuff into scripts * Don't cd unnecessarily * Don't parallelize RTL tests --- .github/scripts/defaults.sh | 31 +----- .github/scripts/do-rtl-build.sh | 5 +- .github/scripts/enable-conda.sh | 13 --- .github/scripts/install-gemmini.sh | 18 ++-- .github/scripts/remove-chipyard.sh | 10 -- .github/scripts/run-tests-rtl.sh | 7 +- .github/scripts/run-tests-spike.sh | 6 +- .github/workflows/config.yml | 160 +++++++++++------------------ CHIPYARD.hash 
| 2 +- 9 files changed, 77 insertions(+), 175 deletions(-) delete mode 100644 .github/scripts/enable-conda.sh delete mode 100755 .github/scripts/remove-chipyard.sh diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh index b49ae66e..df48cb36 100755 --- a/.github/scripts/defaults.sh +++ b/.github/scripts/defaults.sh @@ -1,14 +1,5 @@ #!/bin/bash -# shared variables between the different services -# -# CircleCI set values: -# $SERVER - points to the millennium build server -# $AWS_SERVER - points to the aws manager instance -# $CI_DIR - home directory on build server -# $CI_AWS_DIR - home directory on aws - - ############# # SHARED VARS ############# @@ -17,26 +8,10 @@ CI_MAKE_NPROC=4 LOCAL_MAKE_NPROC=$CI_MAKE_NPROC -# verilator version -VERILATOR_VERSION=v4.034 +# local variables +LOCAL_CHECKOUT_DIR=$GITHUB_WORKSPACE -# local variables (aka within the docker container) -LOCAL_WORK_DIR=$HOME -LOCAL_CHECKOUT_DIR=$GITHUB_WORKSPACE/ -LOCAL_RISCV_DIR=$HOME/riscv-tools-install -LOCAL_ESP_DIR=$HOME/esp-tools-install -LOCAL_CHIPYARD_DIR=$HOME/chipyard +LOCAL_CHIPYARD_DIR=$REMOTE_WORK_DIR LOCAL_SIM_DIR=$LOCAL_CHIPYARD_DIR/sims/verilator -LOCAL_VERILATOR_DIR=$HOME/verilator-install -LOCAL_CONDA=/opt/conda/ CICONFIG=chipyard.config.WithNoDebug_GemminiRocketConfig - -echo "::set-output name=LOCAL_WORK_DIR::$LOCAL_WORK_DIR" -echo "::set-output name=LOCAL_CHECKOUT_DIR::$LOCAL_CHECKOUT_DIR" -echo "::set-output name=LOCAL_RISCV_DIR::$LOCAL_RISCV_DIR" -echo "::set-output name=LOCAL_ESP_DIR::$LOCAL_ESP_DIR" -echo "::set-output name=LOCAL_CHIPYARD_DIR::$LOCAL_CHIPYARD_DIR" -echo "::set-output name=LOCAL_SIM_DIR::$LOCAL_SIM_DIR" -echo "::set-output name=LOCAL_VERILATOR_DIR::$LOCAL_VERILATOR_DIR" -echo "::set-output name=LOCAL_CONDA::$LOCAL_CONDA" diff --git a/.github/scripts/do-rtl-build.sh b/.github/scripts/do-rtl-build.sh index 3e26a04f..59fe7585 100755 --- a/.github/scripts/do-rtl-build.sh +++ b/.github/scripts/do-rtl-build.sh @@ -7,12 +7,11 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh -source $SCRIPT_DIR/enable-conda.sh +eval "$(conda shell.bash hook)" cd $LOCAL_CHIPYARD_DIR source env.sh cd $LOCAL_SIM_DIR make -C $LOCAL_SIM_DIR clean -make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR VERILATOR_OPT_FLAGS="-O0 -OG" JAVA_OPTS="-Xmx2500M -Xss8M" SBT_OPTS="-Dsbt.ivy.home=$LOCAL_CHIPYARD_DIR/.ivy2 -Dsbt.supershell=false -Dsbt.global.base=$LOCAL_CHIPYARD_DIR/.sbt -Dsbt.boot.directory=$LOCAL_CHIPYARD_DIR/.sbt/boot" CONFIG=$CICONFIG - +make -j$LOCAL_MAKE_NPROC -C $LOCAL_SIM_DIR CONFIG=$CICONFIG diff --git a/.github/scripts/enable-conda.sh b/.github/scripts/enable-conda.sh deleted file mode 100644 index c064dd7a..00000000 --- a/.github/scripts/enable-conda.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -export PATH="$LOCAL_CONDA/bin:$PATH" -conda init -source ~/.bashrc -conda activate base -if ! 
{ conda env list | grep 'chipyard'; } >/dev/null 2>&1; then - conda create -n chipyard - conda activate chipyard - conda install -c conda-forge conda-lock=1.4 -fi -conda activate chipyard - diff --git a/.github/scripts/install-gemmini.sh b/.github/scripts/install-gemmini.sh index daf2d891..3722ab72 100755 --- a/.github/scripts/install-gemmini.sh +++ b/.github/scripts/install-gemmini.sh @@ -1,11 +1,5 @@ #!/bin/bash -#------------------------------------------------------------- -# installs gemmini -# -# run location: circle ci docker image -#------------------------------------------------------------- - # turn echo on and error on earliest command set -ex @@ -13,17 +7,18 @@ set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh -source $SCRIPT_DIR/enable-conda.sh +eval "$(conda shell.bash hook)" + +mkdir -p $JAVA_TMP_DIR -cd $HOME -rm -rf chipyard -git clone --progress --verbose https://github.com/ucb-bar/chipyard.git chipyard +git clone --progress --verbose https://github.com/ucb-bar/chipyard.git $LOCAL_CHIPYARD_DIR cd $LOCAL_CHIPYARD_DIR git fetch git checkout $(cat $LOCAL_CHECKOUT_DIR/CHIPYARD.hash) -./build-setup.sh riscv-tools -f -s 6 -s 7 -s 8 -s 9 +export MAKEFLAGS="-j32" +./build-setup.sh riscv-tools -f -s 6 -s 7 -s 8 -s 9 -v source env.sh @@ -36,4 +31,3 @@ cd $LOCAL_CHECKOUT_DIR git submodule update --init --recursive software/gemmini-rocc-tests rm -rf $LOCAL_CHIPYARD_DIR/generators/gemmini/* $LOCAL_CHIPYARD_DIR/generators/gemmini/.git* mv -f $LOCAL_CHECKOUT_DIR/* $LOCAL_CHECKOUT_DIR/.git* $LOCAL_CHIPYARD_DIR/generators/gemmini/ - diff --git a/.github/scripts/remove-chipyard.sh b/.github/scripts/remove-chipyard.sh deleted file mode 100755 index 8b82019e..00000000 --- a/.github/scripts/remove-chipyard.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -set -ex - -SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" -source $SCRIPT_DIR/defaults.sh - -rm -rf $LOCAL_CHIPYARD_DIR -rm -rf $LOCAL_CONDA - diff --git a/.github/scripts/run-tests-rtl.sh b/.github/scripts/run-tests-rtl.sh index e179bd37..8c1c1636 100755 --- a/.github/scripts/run-tests-rtl.sh +++ b/.github/scripts/run-tests-rtl.sh @@ -1,11 +1,12 @@ #!/bin/bash +# turn echo on and error on earliest command set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh -source $SCRIPT_DIR/enable-conda.sh +eval "$(conda shell.bash hook)" cd $LOCAL_CHIPYARD_DIR source env.sh @@ -14,5 +15,5 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests CFLAGS=-DFAST ./build.sh cd build -make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_CHIPYARD_DIR/sims/verilator/ CONFIG=$CICONFIG run-binary-hex BINARY='" - +make test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_SIM_DIR CONFIG=$CICONFIG run-binary-hex BINARY='" +#make -j$LOCAL_MAKE_NPROC test-baremetal-bareMetalC RUNNER="'make -C $LOCAL_SIM_DIR CONFIG=$CICONFIG run-binary-hex BINARY='" diff --git a/.github/scripts/run-tests-spike.sh b/.github/scripts/run-tests-spike.sh index 41774cfd..57c9b963 100755 --- a/.github/scripts/run-tests-spike.sh +++ b/.github/scripts/run-tests-spike.sh @@ -1,11 +1,12 @@ #!/bin/bash +# turn echo on and error on earliest command set -ex SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" source $SCRIPT_DIR/defaults.sh -source $SCRIPT_DIR/enable-conda.sh +eval "$(conda shell.bash hook)" cd $LOCAL_CHIPYARD_DIR source env.sh @@ -25,5 +26,4 @@ cd $LOCAL_CHIPYARD_DIR/generators/gemmini/software/gemmini-rocc-tests ./build.sh cd build -make test-baremetal - +make -j$LOCAL_MAKE_NPROC test-baremetal diff 
--git a/.github/workflows/config.yml b/.github/workflows/config.yml index f4a2dfda..c3dbbdd6 100644 --- a/.github/workflows/config.yml +++ b/.github/workflows/config.yml @@ -1,118 +1,74 @@ name: Gemmini CI + on: [push] -jobs: - install-gemmini: - name: gemmini-install - runs-on: ubuntu-latest - container: - image: ucbbar/chipyard-ci-image:3f9150 - options: --entrypoint /bin/bash - steps: - - name: checkout - uses: actions/checkout@v2 - - name: get paths - run: .github/scripts/defaults.sh - id: get-paths - - name: install gemmini - run: .github/scripts/install-gemmini.sh +defaults: + run: + shell: bash -leo pipefail {0} - - name: cache gemmini install - uses: actions/cache@v2 - with: - path: | - ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_CONDA }} - key: gemmini-install-${{ github.ref }}-${{ github.sha }} +env: + REMOTE_WORK_DIR: /scratch/buildbot/gemmini-ci-shared/gemmini-${{ github.sha }} + JAVA_TMP_DIR: /tmp/gemmini-${{ github.sha }}-full - build-gemmini-config: - name: build-gemmini-config - runs-on: self-hosted - needs: install-gemmini - container: - image: ucbbar/chipyard-ci-image:3f9150 - options: --entrypoint /bin/bash +jobs: + install-gemmini: + runs-on: jktqos steps: - - name: checkout - uses: actions/checkout@v2 - - name: get paths - run: .github/scripts/defaults.sh - id: get-paths - - - name: remove chipyard - run: .github/scripts/remove-chipyard.sh - - - name: restore cache gemmini install - uses: actions/cache@v2 - with: - path: | - ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_CONDA }} - key: gemmini-install-${{ github.ref }}-${{ github.sha }} - - - name: Building Gemmini Config using Verilator - run: .github/scripts/do-rtl-build.sh - - - name: cache build-gemmini-config install - uses: actions/cache@v2 - with: - path: | - ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_CONDA }} - key: build-gemmini-config-${{ github.ref }}-${{ github.sha }} + - name: Delete old checkout + run: | + rm -rf ${{ github.workspace }}/* || true + rm -rf ${{ github.workspace }}/.* || true + - uses: actions/checkout@v3 + - name: Setup repository + run: | + .github/scripts/install-gemmini.sh spike-run-tests: - name: spike-run-tests - runs-on: ubuntu-latest + runs-on: jktqos needs: install-gemmini - container: - image: ucbbar/chipyard-ci-image:3f9150 - options: --entrypoint /bin/bash steps: - - name: checkout - uses: actions/checkout@v2 - - name: get paths - run: .github/scripts/defaults.sh - id: get-paths - - - name: remove chipyard - run: .github/scripts/remove-chipyard.sh + - name: Delete old checkout + run: | + rm -rf ${{ github.workspace }}/* || true + rm -rf ${{ github.workspace }}/.* || true + - uses: actions/checkout@v3 + - name: Run Gemmini Spike tests + run: | + .github/scripts/run-tests-spike.sh - - name: restore cache gemmini install - uses: actions/cache@v2 - with: - path: | - ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_CONDA }} - key: gemmini-install-${{ github.ref }}-${{ github.sha }} - - - name: run-tests - run: .github/scripts/run-tests-spike.sh + build-gemmini-config: + runs-on: jktqos + needs: install-gemmini + steps: + - name: Delete old checkout + run: | + rm -rf ${{ github.workspace }}/* || true + rm -rf ${{ github.workspace }}/.* || true + - uses: actions/checkout@v3 + - name: Building Gemmini Config using Verilator + run: | + .github/scripts/do-rtl-build.sh rtl-run-tests: - name: rtl-run-tests - runs-on: ubuntu-latest + 
runs-on: jktqos needs: build-gemmini-config - container: - image: ucbbar/chipyard-ci-image:3f9150 - options: --entrypoint /bin/bash steps: - - name: checkout - uses: actions/checkout@v2 - - name: get paths - run: .github/scripts/defaults.sh - id: get-paths - - - name: remove chipyard - run: .github/scripts/remove-chipyard.sh + - name: Delete old checkout + run: | + rm -rf ${{ github.workspace }}/* || true + rm -rf ${{ github.workspace }}/.* || true + - uses: actions/checkout@v3 + - name: Run Gemmini Config tests using Verilator + run: | + .github/scripts/run-tests-rtl.sh - - name: restore cache build-gemmini-config install - uses: actions/cache@v2 - with: - path: | - ${{ steps.get-paths.outputs.LOCAL_CHIPYARD_DIR }} - ${{ steps.get-paths.outputs.LOCAL_CONDA }} - key: build-gemmini-config-${{ github.ref }}-${{ github.sha }} - - - name: run-tests - run: .github/scripts/run-tests-rtl.sh + cleanup: + name: cleanup + needs: [spike-run-tests, rtl-run-tests] + runs-on: jktqos + if: ${{ always() }} + steps: + - name: Delete repo copy + run: | + rm -rf ${{ env.REMOTE_WORK_DIR }} + rm -rf ${{ env.JAVA_TMP_DIR }} diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 43ea698b..0adca0cd 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -7874f5148ef6d50c5191c375ee1fd047593f52ca +54536ed7173dd0aa7bdcc12c2d67453441c30042 From 16a9d3fefbe7f1007a3ed59883ac8d8ab44af19c Mon Sep 17 00:00:00 2001 From: Seah Kim Date: Sun, 23 Jul 2023 16:35:22 -0700 Subject: [PATCH 10/24] adding non square conv --- src/main/scala/gemmini/LoopConv.scala | 44 +++++++++++++++------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index a46a4576..07dfefdc 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -12,14 +12,17 @@ import Util._ class LoopConvOuterBounds(val large_iterator_bitwidth: Int, val small_iterator_bitwidth: Int, val tiny_iterator_bitwidth: Int) extends Bundle { val batch_size = UInt(large_iterator_bitwidth.W) - val in_dim = UInt(small_iterator_bitwidth.W) + val in_row_dim = UInt(small_iterator_bitwidth.W) + val in_col_dim = UInt(small_iterator_bitwidth.W) val in_channels = UInt(large_iterator_bitwidth.W) val out_channels = UInt(large_iterator_bitwidth.W) - val out_dim = UInt(large_iterator_bitwidth.W) + val out_col_dim = UInt(large_iterator_bitwidth.W) + val out_row_dim = UInt(large_iterator_bitwidth.W) val out_stride = UInt(large_iterator_bitwidth.W) //stride for output activation val in_stride = UInt(large_iterator_bitwidth.W) //stride for input activation val weight_stride = UInt(large_iterator_bitwidth.W) //stride for weight - val pool_out_dim = UInt(small_iterator_bitwidth.W) + val pool_out_row_dim = UInt(small_iterator_bitwidth.W) + val pool_out_col_dim = UInt(small_iterator_bitwidth.W) val stride = UInt(tiny_iterator_bitwidth.W) val padding = UInt(tiny_iterator_bitwidth.W) val kernel_dim = UInt(tiny_iterator_bitwidth.W) @@ -278,8 +281,8 @@ class LoopConvLdInput(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitw val dram_stride = Mux(req.trans_input_3120, batch_size * (input_w/8).U, in_stride * (input_w/8).U) // Addresses - val dram_offset = Mux(req.trans_input_3120, (((ich * in_dim * in_dim +& irow*in_dim +& icol) * batches +& b) * (input_w/8).U).asUInt, - (((b * in_dim * in_dim +& irow*in_dim +& icol) * in_stride +& ich) * (input_w/8).U).asUInt) + val dram_offset = Mux(req.trans_input_3120, (((ich * in_col_dim * in_row_dim +& irow*in_col_dim +& icol) * 
batches +& b) * (input_w/8).U).asUInt, + (((b * in_row_dim * in_col_dim +& irow*in_col_dim +& icol) * in_stride +& ich) * (input_w/8).U).asUInt) val dram_addr = Mux(is_zeros, 0.U, req.dram_addr + LoopConv.castDramOffset(dram_offset)) val spad_addr = Mux(req.trans_input_3120, // To prevent Verilator errors, we replace some "/ block_size.U" calls here with ">> log2Up(block_size)" @@ -886,12 +889,12 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: // Addresses val dram_offset = Mux(req.trans_output_1203, - ((orow*out_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U, - ((b*out_dim*out_dim +& orow*out_dim +& ocol) * out_stride +& och) * (input_w/8).U) + ((orow*out_col_dim*batch_size +& ocol*batch_size +& b) * out_channels +& och) * (input_w/8).U, + ((b*out_row_dim*out_col_dim +& orow*out_col_dim +& ocol) * out_stride +& och) * (input_w/8).U) val dram_addr = req.dram_addr + LoopConv.castDramOffset(dram_offset) val spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols +& orow * ocols +& ocol - val pool_dram_addr = req.dram_addr + ((b * pool_out_dim * pool_out_dim) * out_stride + och) * (input_w/8).U + val pool_dram_addr = req.dram_addr + ((b * pool_out_col_dim * pool_out_row_dim) * out_stride + och) * (input_w/8).U val pool_spad_addr = acc_addr_start +& (och / block_size.U(och.getWidth.W)) * batches * orows * ocols +& b * orows * ocols // Sizes @@ -928,7 +931,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: pre_pool_config_cmd_rs1.orows := orows pre_pool_config_cmd_rs1.pocols := pocols pre_pool_config_cmd_rs1.porows := porows - pre_pool_config_cmd_rs1.pool_out_dim := pool_out_dim + pre_pool_config_cmd_rs1.pool_out_dim := pool_out_col_dim pre_pool_config_cmd_rs1.lpad := plpad pre_pool_config_cmd_rs1.upad := pupad pre_pool_config_cmd_rs1.pool_size := pool_size @@ -1279,20 +1282,22 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: is (LOOP_CONV_WS_CONFIG_1) { loop_being_configured.outer_bounds.out_channels := cmd.bits.cmd.rs1(63, 48) loop_being_configured.outer_bounds.in_channels := cmd.bits.cmd.rs1(47, 32) - loop_being_configured.outer_bounds.in_dim := cmd.bits.cmd.rs1(31, 16) + loop_being_configured.outer_bounds.in_row_dim := cmd.bits.cmd.rs1(31, 16) loop_being_configured.outer_bounds.batch_size := cmd.bits.cmd.rs1(15, 0) - loop_being_configured.outer_bounds.padding := cmd.bits.cmd.rs2(63, 48) - loop_being_configured.outer_bounds.stride := cmd.bits.cmd.rs2(47, 32) - loop_being_configured.outer_bounds.pool_out_dim := cmd.bits.cmd.rs2(31, 16) - loop_being_configured.outer_bounds.out_dim := cmd.bits.cmd.rs2(15, 0) + loop_being_configured.outer_bounds.padding := cmd.bits.cmd.rs2(63, 56) + loop_being_configured.outer_bounds.stride := cmd.bits.cmd.rs2(55, 48) + loop_being_configured.outer_bounds.out_col_dim := cmd.bits.cmd.rs2(47, 32) + loop_being_configured.outer_bounds.pool_out_row_dim := cmd.bits.cmd.rs2(31, 16) + loop_being_configured.outer_bounds.out_row_dim := cmd.bits.cmd.rs2(15, 0) } is (LOOP_CONV_WS_CONFIG_2) { loop_being_configured.outer_bounds.kernel_dim := cmd.bits.cmd.rs1(63, 48) - loop_being_configured.outer_bounds.pool_size := (if (!has_max_pool) 1.U else cmd.bits.cmd.rs1(47, 32)) - loop_being_configured.outer_bounds.pool_stride := (if (!has_max_pool) 1.U else cmd.bits.cmd.rs1(31, 16)) - loop_being_configured.outer_bounds.pool_padding := (if (!has_max_pool) 0.U else cmd.bits.cmd.rs1(15, 0)) + 
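For non-square convolutions the LOOP_CONV_WS_CONFIG_2 rs1 word is repacked in this hunk so that pool_out_col_dim gets its own 16-bit field, with pool_stride and pool_padding shrinking to 8 bits each. A hypothetical Bundle view of the new layout (the name is illustrative; the field widths mirror the bit slices in the added lines that follow):

```scala
import chisel3._

// Sketch of the repacked CONFIG_2 rs1 word. Chisel Bundles map the first
// field to the most-significant bits, so kernel_dim lands on bits 63:48,
// pool_out_col_dim on 47:32, pool_size on 31:16, pool_stride on 15:8,
// and pool_padding on 7:0 — matching the slices used in the diff.
class LoopConvWsConfig2Rs1Sketch extends Bundle {
  val kernel_dim       = UInt(16.W)
  val pool_out_col_dim = UInt(16.W)
  val pool_size        = UInt(16.W)
  val pool_stride      = UInt(8.W)
  val pool_padding     = UInt(8.W)
}

// Usage sketch: reinterpret the raw 64-bit rs1 as the bundle above.
// val fields = cmd.bits.cmd.rs1.asTypeOf(new LoopConvWsConfig2Rs1Sketch)
```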
loop_being_configured.outer_bounds.pool_out_col_dim := cmd.bits.cmd.rs1(47, 32) + loop_being_configured.outer_bounds.pool_size := (if (!has_max_pool) 1.U else cmd.bits.cmd.rs1(31, 16)) + loop_being_configured.outer_bounds.pool_stride := (if (!has_max_pool) 1.U else cmd.bits.cmd.rs1(15, 8)) + loop_being_configured.outer_bounds.pool_padding := (if (!has_max_pool) 0.U else cmd.bits.cmd.rs1(7, 0)) loop_being_configured.inner_bounds.batches := cmd.bits.cmd.rs2(63, 48) loop_being_configured.inner_bounds.porows := cmd.bits.cmd.rs2(47, 32) @@ -1308,8 +1313,9 @@ class LoopConv (block_size: Int, coreMaxAddrBits: Int, reservation_station_size: loop_being_configured.inner_bounds.rpad := cmd.bits.cmd.rs2(63, 48) loop_being_configured.inner_bounds.upad := cmd.bits.cmd.rs2(47, 32) - loop_being_configured.inner_bounds.dpad := cmd.bits.cmd.rs2(31, 16) - loop_being_configured.inner_bounds.plpad := cmd.bits.cmd.rs2(15, 0) + loop_being_configured.inner_bounds.dpad := cmd.bits.cmd.rs2(31, 24) + loop_being_configured.inner_bounds.plpad := cmd.bits.cmd.rs2(23, 16) + loop_being_configured.outer_bounds.in_col_dim := cmd.bits.cmd.rs2(15, 0) } is (LOOP_CONV_WS_CONFIG_4) { From 641fefbeb843da17e1496041af1c20679406e04e Mon Sep 17 00:00:00 2001 From: Seah Kim Date: Sun, 23 Jul 2023 19:53:42 -0700 Subject: [PATCH 11/24] bump rocc tests for non square conv --- software/gemmini-rocc-tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index 9b243700..1a1a1c6b 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit 9b243700448dd10236aa1d627e7618f3f449f2f0 +Subproject commit 1a1a1c6bd60df6d7cae3d87aac96c8f406cae084 From 1514d11a960f3e0f5b69828261dcfa42e5e4017c Mon Sep 17 00:00:00 2001 From: Seah Kim Date: Sun, 23 Jul 2023 19:56:08 -0700 Subject: [PATCH 12/24] bump spike --- software/libgemmini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/libgemmini b/software/libgemmini index 71d804a4..d873aa8b 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit 71d804a4b27d509eb27b914056a4f94f82c9aa96 +Subproject commit d873aa8b8f39a01bca225044970745632816ce3d From ff55883636545b43afa828f6470f832d13b8c297 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Tue, 25 Jul 2023 13:17:19 -0700 Subject: [PATCH 13/24] Remove asUInt --- src/main/scala/gemmini/AccumulatorScale.scala | 8 ++++---- src/main/scala/gemmini/Normalizer.scala | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index e1da8d23..2541f9b1 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -239,7 +239,7 @@ class AccumulatorScale[T <: Data, U <: Data]( // val inputs = Seq.fill(width*nEntries) { Wire(Decoupled(new AccScaleDataWithIndex(t, scale_t)(ev))) } val current_policy = Wire(UInt(nEntries.W)) val norm_mask_int = Wire(UInt(nEntries.W)) - norm_mask_int := VecInit(norm_mask).asUInt() + norm_mask_int := VecInit(norm_mask).asUInt dontTouch(norm_mask_int) current_policy := static_assignment_policy(norm_mask_int) @@ -395,13 +395,13 @@ object AccumulatorScale { // qln2_inv / S / (2 ** 16) = 1 / ln2 // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) val neg_q_iexp = neg(q) - val z_iexp = (neg_q_iexp * qln2_inv).asUInt().do_>>(16).asTypeOf(q) // q is non-positive + val z_iexp = (neg_q_iexp * 
qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive val z_iexp_saturated = Wire(z_iexp.cloneType) - z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt()(_)).reduce(_ | _), 32.S, z_iexp) + z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp) val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) // we dont want a rounding shift // TODO: z overflow - (q_poly_iexp.asUInt().do_>>(z_iexp_saturated.asUInt())).asTypeOf(q) + (q_poly_iexp.asUInt.do_>>(z_iexp_saturated.asUInt)).asTypeOf(q) }} diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index d7991bb2..de3f301b 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -208,8 +208,8 @@ class MulPipe[T <: Data, U <: Data](scale_t: U)(implicit ev: Arithmetic[T]) scale_t match { case Float(expWidth, sigWidth) => - val self_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.x.asUInt()) - val scale_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.y.asUInt()) + val self_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.x.asUInt) + val scale_rec = recFNFromFN(expWidth, sigWidth, io.ins.bits.y.asUInt) val mul = Module(new MulRecFN(expWidth, sigWidth)) @@ -445,7 +445,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ // Sqrt input val stat = stats(variance_to_sqrt_id) - sqrt_in.bits := variance_to_sqrt.asUInt() + sqrt_in.bits := variance_to_sqrt.asUInt sqrt_in.valid := stat.state === get_stddev } From 8c8b38b9dea3e4d6ba9695346a883b2094bd7388 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sun, 20 Aug 2023 10:08:40 -0700 Subject: [PATCH 14/24] Fix missing connections with DontCares --- src/main/scala/gemmini/DMA.scala | 4 ++++ src/main/scala/gemmini/FrontendTLB.scala | 5 +++-- src/main/scala/gemmini/Scratchpad.scala | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index 1fd0be82..dac1a369 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -108,6 +108,7 @@ class StreamReader[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T io.resp.bits.bytes_read := RegEnable(xactTracker.io.peek.entry.bytes_to_read, beatPacker.io.req.fire) io.resp.bits.last := beatPacker.io.out.bits.last + io.counter := DontCare io.counter.collect(core.module.io.counter) io.counter.collect(xactTracker.io.counter) } @@ -231,6 +232,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf tlb_q.io.enq <> tlb_arb.io.out io.tlb.req.valid := tlb_q.io.deq.valid + io.tlb.req.bits := DontCare io.tlb.req.bits.tlb_req.vaddr := tlb_q.io.deq.bits.vaddr io.tlb.req.bits.tlb_req.passthrough := false.B io.tlb.req.bits.tlb_req.size := 0.U // send_size @@ -305,6 +307,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf } // Performance counter + io.counter := DontCare CounterEventIO.init(io.counter) io.counter.connectEventSignal(CounterEvent.RDMA_ACTIVE_CYCLE, state =/= s_idle) io.counter.connectEventSignal(CounterEvent.RDMA_TLB_WAIT_CYCLES, io.tlb.resp.miss) @@ -522,6 +525,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: tlb_q.io.enq <> tlb_arb.io.out io.tlb.req.valid := tlb_q.io.deq.fire + io.tlb.req.bits := DontCare io.tlb.req.bits.tlb_req.vaddr := tlb_q.io.deq.bits.vaddr io.tlb.req.bits.tlb_req.passthrough := false.B io.tlb.req.bits.tlb_req.size 
:= 0.U // send_size diff --git a/src/main/scala/gemmini/FrontendTLB.scala b/src/main/scala/gemmini/FrontendTLB.scala index 4a2e3d21..83229501 100644 --- a/src/main/scala/gemmini/FrontendTLB.scala +++ b/src/main/scala/gemmini/FrontendTLB.scala @@ -30,7 +30,7 @@ class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters: extends CoreModule { val lgMaxSize = log2Ceil(maxSize) - val io = new Bundle { + val io = IO(new Bundle { val req = Flipped(Valid(new DecoupledTLBReq(lgMaxSize))) val resp = new TLBResp val ptw = new TLBPTWIO @@ -38,7 +38,7 @@ class DecoupledTLB(entries: Int, maxSize: Int, use_firesim_simulation_counters: val exp = new TLBExceptionIO val counter = new CounterEventIO() - } + }) val interrupt = RegInit(false.B) io.exp.interrupt := interrupt @@ -156,6 +156,7 @@ class FrontendTLB(nClients: Int, entries: Int, maxSize: Int, use_tlb_register_fi // TODO Return the sum of the TLB counters, rather than just the counters of the first TLB. This only matters if we're // not using the shared TLB + io.counter := DontCare tlbs.foreach(_.io.counter.external_reset := false.B) io.counter.collect(tlbs.head.io.counter) } diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index b0c468c7..cdd63062 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -825,6 +825,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, } // Counter connection + io.counter := DontCare io.counter.collect(reader.module.io.counter) io.counter.collect(writer.module.io.counter) } From 181a7329411ca57f9029195ac439c859126d0fd7 Mon Sep 17 00:00:00 2001 From: SeahK Date: Sun, 28 Jan 2024 01:44:32 -0800 Subject: [PATCH 15/24] bump spike --- software/libgemmini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/libgemmini b/software/libgemmini index d873aa8b..2b0f1cf6 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit d873aa8b8f39a01bca225044970745632816ce3d +Subproject commit 2b0f1cf61f9ffaa6fe3efdb58e56c31954b93d36 From 435bb864e49644bf6fb42a31fc081e0ef589fc31 Mon Sep 17 00:00:00 2001 From: SeahK Date: Sat, 16 Mar 2024 00:41:54 -0700 Subject: [PATCH 16/24] rtml fp32 config --- src/main/scala/gemmini/ConfigsFP.scala | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index ceb9d2a0..31383d17 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -111,6 +111,28 @@ object GemminiFPConfigs { } + val chipFP32Config = FP32DefaultConfig.copy(sp_capacity=CapacityInKilobytes(32), acc_capacity=CapacityInKilobytes(8), dataflow=Dataflow.WS, + acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => {t}, 1, Float(8, 24), -1, identity = "1.0", + c_str = "((x))" + )), + mvin_scale_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 3, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), + mvin_scale_acc_args=None, + acc_singleported=false, + acc_sub_banks = 1, + acc_banks = 2, + mesh_output_delay = 2, + tile_latency = 1, + acc_latency = 3, + ex_read_from_acc=false, + ex_write_to_spad=false, + has_training_convs = false, + hardcode_d_to_garbage_addr = true, + acc_read_full_width = false, + max_in_flight_mem_reqs = 16, + headerFileName = "gemmini_params_fp32.h", + num_counter = 0, + clock_gate = true + ) //===========FP32 Default Config========= class 
GemminiFP32DefaultConfig extends Config((site, here, up) => { @@ -123,6 +145,18 @@ class GemminiFP32DefaultConfig extends Config((site, here, up) => { ) }) +class ChipFP32GemminiConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( + gemminiConfig: GemminiArrayConfig[T,U,V] = GemminiFPConfigs.chipFP32Config +) extends Config((site, here, up) => { + case BuildRoCC => up(BuildRoCC) ++ Seq( + (p: Parameters) => { + implicit val q = p + val gemmini = LazyModule(new Gemmini(gemminiConfig)) + gemmini + } + ) +}) + //===========FP16 Default Config========= class GemminiFP16DefaultConfig extends Config((site, here, up) => { From 08cfde926f8a9cc9942795924cc8c19164b42134 Mon Sep 17 00:00:00 2001 From: SeahK Date: Sat, 16 Mar 2024 00:42:49 -0700 Subject: [PATCH 17/24] cut critical path --- src/main/scala/gemmini/VectorScalarMultiplier.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/VectorScalarMultiplier.scala b/src/main/scala/gemmini/VectorScalarMultiplier.scala index 2311b381..153fd23c 100644 --- a/src/main/scala/gemmini/VectorScalarMultiplier.scala +++ b/src/main/scala/gemmini/VectorScalarMultiplier.scala @@ -198,6 +198,8 @@ object VectorScalarMultiplier { ) = { assert(!is_acc || is_mvin) val vsm = Module(new VectorScalarMultiplier(scale_args, cols, t, tag_t)) - (vsm.io.req, vsm.io.resp) + val vsm_in_q = Module(new Queue(chiselTypeOf(vsm.io.req.bits), 2)) + vsm.io.req <> vsm_in_q.io.deq + (vsm_in_q.io.enq, vsm.io.resp) } } From 3e960fcf5fcac2bc96736925a03b9d615d895a76 Mon Sep 17 00:00:00 2001 From: SeahK Date: Sat, 16 Mar 2024 01:06:23 -0700 Subject: [PATCH 18/24] FP compilation fail fixes --- src/main/scala/gemmini/AccumulatorScale.scala | 2 +- src/main/scala/gemmini/ConfigsFP.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 2541f9b1..f93275d6 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -397,7 +397,7 @@ object AccumulatorScale { val neg_q_iexp = neg(q) val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive val z_iexp_saturated = Wire(z_iexp.cloneType) - z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp) + z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S.asTypeOf(z_iexp), z_iexp) val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) // we dont want a rounding shift diff --git a/src/main/scala/gemmini/ConfigsFP.scala b/src/main/scala/gemmini/ConfigsFP.scala index 31383d17..8644f9db 100644 --- a/src/main/scala/gemmini/ConfigsFP.scala +++ b/src/main/scala/gemmini/ConfigsFP.scala @@ -109,7 +109,6 @@ object GemminiFPConfigs { mvin_scale_acc_args = Some(ScaleArguments((t: Float, u: Float) => t * u, 4, Float(8, 24), -1, identity = "1.0", c_str="((x) * (scale))")), ) -} val chipFP32Config = FP32DefaultConfig.copy(sp_capacity=CapacityInKilobytes(32), acc_capacity=CapacityInKilobytes(8), dataflow=Dataflow.WS, acc_scale_args = Some(ScaleArguments((t: Float, u: Float) => {t}, 1, Float(8, 24), -1, identity = "1.0", @@ -133,6 +132,7 @@ object GemminiFPConfigs { num_counter = 0, clock_gate = true ) +} //===========FP32 Default Config========= class GemminiFP32DefaultConfig extends Config((site, here, up) => { From 39e46e2b6c8d0afb4557a8df7a95591383606dc4 Mon Sep 17 00:00:00 2001 From: SeahK Date: Fri, 22 Mar 
2024 13:32:44 -0700 Subject: [PATCH 19/24] update chipyard hash --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index 07c62705..eabe7a12 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -ef3409f87ff2988fa862ea48c995d2c27c93c7a2 +b4aae0ddfdc5aaced32e0df90b633eab5b8327ca From a3eb23a38609212f271ab8f71198a9afb93d5625 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sat, 20 Apr 2024 09:37:41 -0700 Subject: [PATCH 20/24] Switch to chisel6-compatible APIs --- src/main/scala/gemmini/Controller.scala | 2 +- src/main/scala/gemmini/DMA.scala | 5 +- src/main/scala/gemmini/LoopConv.scala | 2 +- src/main/scala/gemmini/LoopMatmul.scala | 6 +-- src/main/scala/gemmini/NormCmd.scala | 1 - src/main/scala/gemmini/Normalizer.scala | 53 +++++++++---------- .../scala/gemmini/ReservationStation.scala | 8 +-- src/main/scala/gemmini/Scratchpad.scala | 2 +- .../gemmini/TransposePreloadUnroller.scala | 1 - 9 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 0fdda55f..33e5fd93 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -403,7 +403,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] // Debugging signals val pipeline_stall_counter = RegInit(0.U(32.W)) - when (io.cmd.fire()) { + when (io.cmd.fire) { pipeline_stall_counter := 0.U }.elsewhen(io.busy) { pipeline_stall_counter := pipeline_stall_counter + 1.U diff --git a/src/main/scala/gemmini/DMA.scala b/src/main/scala/gemmini/DMA.scala index dac1a369..29252955 100644 --- a/src/main/scala/gemmini/DMA.scala +++ b/src/main/scala/gemmini/DMA.scala @@ -3,7 +3,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.DataMirror import org.chipsalliance.cde.config.Parameters import freechips.rocketchip.diplomacy.{IdRange, LazyModule, LazyModuleImp} @@ -211,7 +210,7 @@ class StreamReaderCore[T <: Data, U <: Data, V <: Data](config: GemminiArrayConf )._2 class TLBundleAWithInfo extends Bundle { - val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits) + val tl_a = tl.a.bits.cloneType val vaddr = Output(UInt(vaddrBits.W)) val status = Output(new MStatus) } @@ -499,7 +498,7 @@ class StreamWriter[T <: Data: Arithmetic](nXacts: Int, beatBits: Int, maxBytes: )._2 class TLBundleAWithInfo extends Bundle { - val tl_a = DataMirror.internal.chiselTypeClone[TLBundleA](tl.a.bits) + val tl_a = tl.a.bits.cloneType val vaddr = Output(UInt(vaddrBits.W)) val status = Output(new MStatus) } diff --git a/src/main/scala/gemmini/LoopConv.scala b/src/main/scala/gemmini/LoopConv.scala index 07dfefdc..2db2b034 100644 --- a/src/main/scala/gemmini/LoopConv.scala +++ b/src/main/scala/gemmini/LoopConv.scala @@ -974,7 +974,7 @@ class LoopConvSt(block_size: Int, coreMaxAddrBits: Int, large_iterator_bitwidth: io.loop_id := req.loop_id command_p.io.in.valid := state =/= idle && !skip && io.ex_completed - command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd, Seq( + command_p.io.in.bits.cmd := MuxLookup(state.asUInt, mvout_cmd)(Seq( pre_pool_config.asUInt -> pre_pool_config_cmd, pool.asUInt -> pool_cmd, post_pool_config.asUInt -> post_pool_config_cmd) diff --git a/src/main/scala/gemmini/LoopMatmul.scala b/src/main/scala/gemmini/LoopMatmul.scala index 91a4afbe..07d8d49e 100644 --- a/src/main/scala/gemmini/LoopMatmul.scala +++ b/src/main/scala/gemmini/LoopMatmul.scala @@ -643,7 +643,7 @@ class 
LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In when (req.dram_addr === 0.U) { state := idle - }.elsewhen (io.cmd.fire() && state === st) { + }.elsewhen (io.cmd.fire && state === st) { // The order here is k, j, i val next_i = floorAdd(i, 1.U, req.max_i) val next_j = floorAdd(j, max_blocks, req.max_j, next_i === 0.U) @@ -654,9 +654,9 @@ class LoopMatmulStC(block_size: Int, coreMaxAddrBits: Int, iterator_bitwidth: In when (next_i === 0.U && next_j === 0.U) { state := idle } - }.elsewhen (io.cmd.fire() && state === ln_config) { + }.elsewhen (io.cmd.fire && state === ln_config) { state := ln_st - }.elsewhen (io.cmd.fire() && state === ln_st) { + }.elsewhen (io.cmd.fire && state === ln_st) { val next_j = floorAdd(j, max_blocks, req.max_j) val next_stat_id = floorAdd(ln_stat_id, 1.U, ln_stat_ids, next_j === 0.U) val next_cmd = floorAdd(ln_cmd, 1.U, ln_norm_cmds.size.U, next_j === 0.U && next_stat_id === 0.U) diff --git a/src/main/scala/gemmini/NormCmd.scala b/src/main/scala/gemmini/NormCmd.scala index 515fabb0..52da8cde 100644 --- a/src/main/scala/gemmini/NormCmd.scala +++ b/src/main/scala/gemmini/NormCmd.scala @@ -3,7 +3,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.ChiselEnum object NormCmd extends ChiselEnum { val RESET, SUM, MEAN, VARIANCE, INV_STDDEV, MAX, SUM_EXP, INV_SUM_EXP = Value diff --git a/src/main/scala/gemmini/Normalizer.scala b/src/main/scala/gemmini/Normalizer.scala index de3f301b..c22e9af8 100644 --- a/src/main/scala/gemmini/Normalizer.scala +++ b/src/main/scala/gemmini/Normalizer.scala @@ -2,7 +2,6 @@ package gemmini import chisel3._ -import chisel3.experimental.ChiselEnum import chisel3.util._ import gemmini.AccumulatorScale.iexp import hardfloat.{DivSqrtRecFN_small, INToRecFN, MulRecFN, consts, fNFromRecFN, recFNFromFN} @@ -348,7 +347,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ lanes.io.ins.bits.stats_id := in_lanes_stats_id lanes.io.ins.bits.iexp_const := iexp_const - when (lanes.io.ins.fire()) { + when (lanes.io.ins.fire) { stat.elems_left := stat.elems_left - len } } @@ -359,7 +358,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(out_lanes_stats_id) - when (lanes.io.out.fire()) { + when (lanes.io.out.fire) { stat.sum := stat.sum + lanes.io.out.bits.result } } @@ -379,7 +378,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ max_lanes.io.ins.bits.len := len max_lanes.io.ins.bits.stats_id := max_in_lanes_stats_id - when (max_lanes.io.ins.fire()) { + when (max_lanes.io.ins.fire) { stat.elems_left := stat.elems_left - len } } @@ -390,7 +389,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val stat = stats(max_out_lanes_stats_id) - when (max_lanes.io.out.fire()) { + when (max_lanes.io.out.fire) { val new_max = Mux(max_lanes.io.out.bits.result > stat.running_max, max_lanes.io.out.bits.result, stat.running_max) stat.running_max := new_max stat.max := new_max @@ -645,13 +644,13 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ next_state := idle done := DontCare }.elsewhen(state === output) { - next_state := Mux(io.out.fire() && out_stats_id === id.U, idle, state) - done := io.out.fire() && out_stats_id === id.U + next_state := Mux(io.out.fire && out_stats_id === id.U, idle, state) + done := io.out.fire && out_stats_id === id.U }.elsewhen(state === get_max) { val is_last_lane_input = stat.vec_groups_left === 0.U 
|| (stat.vec_groups_left === 1.U && max_lanes.io.ins.bits.stats_id === id.U && - max_lanes.io.ins.fire()) + max_lanes.io.ins.fire) next_state := Mux( is_last_lane_input, @@ -667,7 +666,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ val is_last_lane_input = stat.vec_groups_left === 0.U || (stat.vec_groups_left === 1.U && lanes.io.ins.bits.stats_id === id.U && - lanes.io.ins.fire()) + lanes.io.ins.fire) next_state := Mux( is_last_lane_input, @@ -688,51 +687,51 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ done := is_last_lane_input && cmd =/= NormCmd.MEAN && cmd =/= NormCmd.INV_STDDEV && cmd =/= NormCmd.INV_SUM_EXP }.elsewhen(state === get_mean || state === get_variance) { - next_state := Mux(divider_in.fire() && sum_to_divide_id === id.U, state.next, state) + next_state := Mux(divider_in.fire && sum_to_divide_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_mean) { - next_state := Mux(divider_out.fire(), idle, state) - done := divider_out.fire() + next_state := Mux(divider_out.fire, idle, state) + done := divider_out.fire }.elsewhen(state === waiting_for_variance) { - next_state := Mux(divider_out.fire(), get_stddev, state) + next_state := Mux(divider_out.fire, get_stddev, state) done := false.B }.elsewhen(state === get_stddev) { - next_state := Mux(sqrt_in.fire() && variance_to_sqrt_id === id.U, state.next, state) + next_state := Mux(sqrt_in.fire && variance_to_sqrt_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_stddev) { - next_state := Mux(sqrt_out.fire(), state.next, state) + next_state := Mux(sqrt_out.fire, state.next, state) done := false.B }.elsewhen(state === get_inv_stddev) { - next_state := Mux(reciprocal_in.fire() && stddev_to_inv_id === id.U, state.next, state) + next_state := Mux(reciprocal_in.fire && stddev_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_stddev) { - next_state := Mux(reciprocal_out.fire(), state.next, state) + next_state := Mux(reciprocal_out.fire, state.next, state) done := false.B }.elsewhen(state === get_scaled_inv_stddev) { - next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire() && inv_stddev_to_scale_id === id.U, state.next, state) + next_state := Mux(inv_stddev_scale_mul_pipe.io.ins.fire && inv_stddev_to_scale_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_scaled_inv_stddev) { - next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire(), idle, state) - done := inv_stddev_scale_mul_pipe.io.out.fire() + next_state := Mux(inv_stddev_scale_mul_pipe.io.out.fire, idle, state) + done := inv_stddev_scale_mul_pipe.io.out.fire }.elsewhen(state === get_inv_sum_exp) { - next_state := Mux(exp_divider_in.fire() && sum_exp_to_inv_id === id.U, state.next, state) + next_state := Mux(exp_divider_in.fire && sum_exp_to_inv_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_inv_sum_exp) { - next_state := Mux(exp_divider_out.fire(), state.next, state) + next_state := Mux(exp_divider_out.fire, state.next, state) done := false.B }.elsewhen(state === get_scaled_inv_sum_exp) { - next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire() && inv_sum_exp_to_scale_id === id.U, state.next, state) + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.ins.fire && inv_sum_exp_to_scale_id === id.U, state.next, state) done := false.B }.elsewhen(state === waiting_for_scaled_inv_sum_exp) { - next_state := 
Mux(inv_sum_exp_scale_mul_pipe.io.out.fire(), idle, state) - done := inv_sum_exp_scale_mul_pipe.io.out.fire() + next_state := Mux(inv_sum_exp_scale_mul_pipe.io.out.fire, idle, state) + done := inv_sum_exp_scale_mul_pipe.io.out.fire }.otherwise { assert(false.B, "invalid state in Normalizer") next_state := DontCare done := DontCare } - when (io.in.fire() && in_stats_id === id.U) { + when (io.in.fire && in_stats_id === id.U) { next_state := Mux(io.in.bits.cmd === NormCmd.RESET, output, Mux(io.in.bits.cmd === NormCmd.MAX, get_max, get_sum)) } @@ -747,7 +746,7 @@ class Normalizer[T <: Data, U <: Data](max_len: Int, num_reduce_lanes: Int, num_ (state === get_mean && next_state =/= get_mean) || (state === get_variance && next_state =/= get_variance) - val is_input = io.in.fire() && in_stats_id === id.U + val is_input = io.in.fire && in_stats_id === id.U when (is_input) { stat.req := io.in.bits diff --git a/src/main/scala/gemmini/ReservationStation.scala b/src/main/scala/gemmini/ReservationStation.scala index 47dd5ef1..3c2600df 100644 --- a/src/main/scala/gemmini/ReservationStation.scala +++ b/src/main/scala/gemmini/ReservationStation.scala @@ -19,7 +19,7 @@ class ReservationStationIssue[T <: Data](cmd_t: T, id_width: Int) extends Bundle val cmd = Output(cmd_t.cloneType) val rob_id = Output(UInt(id_width.W)) - def fire(dummy: Int=0) = valid && ready + def fire = valid && ready } // TODO we don't need to store the full command in here. We should be able to release the command directly into the relevant controller and only store the associated metadata in the ROB. This would reduce the size considerably @@ -178,7 +178,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val new_entry_oh = new_allocs_oh_ld ++ new_allocs_oh_ex ++ new_allocs_oh_st new_entry_oh.foreach(_ := false.B) - val alloc_fire = io.alloc.fire() + val alloc_fire = io.alloc.fire io.alloc.ready := false.B when (io.alloc.valid) { @@ -410,7 +410,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val from_conv_fsm = entries_type(issue_id).bits.cmd.from_conv_fsm val from_matmul_fsm = entries_type(issue_id).bits.cmd.from_matmul_fsm - when (io.fire()) { + when (io.fire) { entries_type.zipWithIndex.foreach { case (e, i) => when (issue_sel(i)) { e.bits.issued := true.B @@ -519,7 +519,7 @@ class ReservationStation[T <: Data : Arithmetic, U <: Data, V <: Data](config: G val cycles_since_issue = RegInit(0.U(16.W)) - when (io.issue.ld.fire() || io.issue.st.fire() || io.issue.ex.fire() || !io.busy || io.completed.fire) { + when (io.issue.ld.fire || io.issue.st.fire || io.issue.ex.fire || !io.busy || io.completed.fire) { cycles_since_issue := 0.U }.elsewhen(io.busy) { cycles_since_issue := cycles_since_issue + 1.U diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index cdd63062..d07614b3 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -597,7 +597,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, acc_scale_unit.io.in.valid := acc_norm_unit_out.valid && acc_waiting_to_be_scaled acc_scale_unit.io.in.bits := acc_norm_unit_out.bits - when (acc_scale_unit.io.in.fire()) { + when (acc_scale_unit.io.in.fire) { write_issue_q.io.enq <> write_scale_q.io.deq } diff --git a/src/main/scala/gemmini/TransposePreloadUnroller.scala b/src/main/scala/gemmini/TransposePreloadUnroller.scala index 68407344..878eaa1a 100644 --- 
a/src/main/scala/gemmini/TransposePreloadUnroller.scala +++ b/src/main/scala/gemmini/TransposePreloadUnroller.scala @@ -2,7 +2,6 @@ package gemmini import chisel3._ import chisel3.util._ -import chisel3.experimental.ChiselEnum import org.chipsalliance.cde.config.Parameters import Util._ import midas.targetutils.PerfCounter From 25809f78323a729ef76fb68f3cedd8a24da2942b Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Sat, 29 Jun 2024 21:11:03 -0700 Subject: [PATCH 21/24] Update to latest rocket-chip --- src/main/scala/gemmini/Configs.scala | 2 +- src/main/scala/gemmini/Controller.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/gemmini/Configs.scala b/src/main/scala/gemmini/Configs.scala index a849883d..4c83ff8e 100644 --- a/src/main/scala/gemmini/Configs.scala +++ b/src/main/scala/gemmini/Configs.scala @@ -4,7 +4,7 @@ import chisel3._ import org.chipsalliance.cde.config.{Config, Parameters} import freechips.rocketchip.diplomacy.LazyModule import freechips.rocketchip.subsystem._ -import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet, XLen} +import freechips.rocketchip.tile.{BuildRoCC, OpcodeSet} import freechips.rocketchip.rocket._ import freechips.rocketchip.tile._ import freechips.rocketchip.system._ diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index 33e5fd93..c67f92df 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -32,7 +32,7 @@ class Gemmini[T <: Data : Arithmetic, U <: Data, V <: Data](val config: GemminiA System.exit(1) } - val xLen = p(XLen) + val xLen = p(TileKey).core.xLen val spad = LazyModule(new Scratchpad(config)) override lazy val module = new GemminiModule(this) From 6b8abdf692fbaf561e92c3bfe00cfdb78eee2c1b Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Thu, 11 Jul 2024 15:05:48 -0700 Subject: [PATCH 22/24] Update cy reference --- CHIPYARD.hash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHIPYARD.hash b/CHIPYARD.hash index eabe7a12..3cc0b75f 100644 --- a/CHIPYARD.hash +++ b/CHIPYARD.hash @@ -1 +1 @@ -b4aae0ddfdc5aaced32e0df90b633eab5b8327ca +d75934b0327e8ba44973769d17794df8c2c8ee8b From d8372efd504522e46183f7fba15771ca1c97fd88 Mon Sep 17 00:00:00 2001 From: SeahK Date: Thu, 11 Jul 2024 18:34:29 -0700 Subject: [PATCH 23/24] add option to disable loopconv --- src/main/scala/gemmini/Controller.scala | 5 +++-- src/main/scala/gemmini/GemminiConfigs.scala | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/scala/gemmini/Controller.scala b/src/main/scala/gemmini/Controller.scala index c67f92df..66d4db9b 100644 --- a/src/main/scala/gemmini/Controller.scala +++ b/src/main/scala/gemmini/Controller.scala @@ -142,7 +142,7 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] val max_exs = reservation_station_entries_ex val max_sts = reservation_station_entries_st - val (conv_cmd, loop_conv_unroller_busy) = withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.conv_ld_completed, reservation_station.io.conv_st_completed, reservation_station.io.conv_ex_completed, + val (conv_cmd, loop_conv_unroller_busy) = if (has_loop_conv) withClock (gated_clock) { LoopConv(raw_cmd, reservation_station.io.conv_ld_completed, reservation_station.io.conv_st_completed, reservation_station.io.conv_ex_completed, meshRows*tileRows, coreMaxAddrBits, reservation_station_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, 
accType.getWidth, dma_maxbytes, new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits), new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), @@ -151,8 +151,9 @@ class GemminiModule[T <: Data: Arithmetic, U <: Data, V <: Data] new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new ComputeRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), has_training_convs, has_max_pool, has_first_layer_optimizations, has_dw_convs) } + else (raw_cmd, false.B) - val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(conv_cmd, reservation_station.io.matmul_ld_completed, reservation_station.io.matmul_st_completed, reservation_station.io.matmul_ex_completed, + val (loop_cmd, loop_matmul_unroller_busy) = withClock (gated_clock) { LoopMatmul(if (has_loop_conv) conv_cmd else raw_cmd, reservation_station.io.matmul_ld_completed, reservation_station.io.matmul_st_completed, reservation_station.io.matmul_ex_completed, meshRows*tileRows, coreMaxAddrBits, reservation_station_entries, max_lds, max_exs, max_sts, sp_banks * sp_bank_entries, acc_banks * acc_bank_entries, inputType.getWidth, accType.getWidth, dma_maxbytes, new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvin_rows_bits, mvin_cols_bits, local_addr_t), new PreloadRs(mvout_rows_bits, mvout_cols_bits, local_addr_t), diff --git a/src/main/scala/gemmini/GemminiConfigs.scala b/src/main/scala/gemmini/GemminiConfigs.scala index 98254299..40bc5303 100644 --- a/src/main/scala/gemmini/GemminiConfigs.scala +++ b/src/main/scala/gemmini/GemminiConfigs.scala @@ -88,6 +88,7 @@ case class GemminiArrayConfig[T <: Data : Arithmetic, U <: Data, V <: Data]( has_dw_convs: Boolean = true, has_normalizations: Boolean = false, has_first_layer_optimizations: Boolean = true, + has_loop_conv: Boolean = true, use_firesim_simulation_counters: Boolean = false, From 050cbe6a17b6fd5dc197f45af9e52bf8cac69ead Mon Sep 17 00:00:00 2001 From: richardyrh Date: Tue, 16 Jul 2024 14:59:18 -0700 Subject: [PATCH 24/24] fix deadlock --- src/main/scala/gemmini/Scratchpad.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index d07614b3..927781f1 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -499,7 +499,7 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, bio.read.resp.ready := Mux(bio.read.resp.bits.fromDMA, dma_read_resp.ready, ex_read_resp.ready) dma_read_pipe.ready := writer.module.io.req.ready && - !write_issue_q.io.deq.bits.laddr.is_acc_addr && write_issue_q.io.deq.bits.laddr.sp_bank() === i.U && // I believe we don't need to check that write_issue_q is valid here, because if the SRAM's resp is valid, then that means that the write_issue_q's deq should also be valid + ((!write_issue_q.io.deq.bits.laddr.is_acc_addr && write_issue_q.io.deq.bits.laddr.sp_bank() === i.U) && write_issue_q.io.deq.valid) && !write_issue_q.io.deq.bits.laddr.is_garbage() when (dma_read_pipe.fire) { writeData.valid := true.B
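The final "fix deadlock" hunk tightens the condition under which a scratchpad bank's DMA read response may be drained: the head of write_issue_q must not only point at this bank, it must also be valid, contradicting the earlier comment that claimed the validity check was unnecessary. The sketch below illustrates the general pattern with hypothetical names; it is not the Scratchpad code itself, just the handshake idea — draining a response against a stale, invalid issue-queue head can consume data the writer never sees and stall the pipeline.

```scala
import chisel3._
import chisel3.util._

// Minimal sketch: only hand a response to the writer when the issue-queue
// head is valid *and* actually targets this consumer.
class DrainWhenIssueValidSketch extends Module {
  val io = IO(new Bundle {
    val resp      = Flipped(Decoupled(UInt(32.W))) // e.g. a bank's DMA read response
    val issueHead = Flipped(Decoupled(UInt(2.W)))  // head of a write-issue queue (bank id)
    val myBankId  = Input(UInt(2.W))
    val out       = Decoupled(UInt(32.W))          // data handed to the writer
  })

  val matches = io.issueHead.bits === io.myBankId

  // Accept the response only when downstream can take it AND the issue-queue
  // head is both valid and aimed at this bank (the added `valid` qualification).
  io.resp.ready      := io.out.ready && io.issueHead.valid && matches
  io.out.valid       := io.resp.valid && io.issueHead.valid && matches
  io.out.bits        := io.resp.bits
  // Retire the issue-queue entry together with the response it describes.
  io.issueHead.ready := io.resp.fire
}
```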