diff --git a/software/gemmini-rocc-tests b/software/gemmini-rocc-tests index f44e5093..7005bcc6 160000 --- a/software/gemmini-rocc-tests +++ b/software/gemmini-rocc-tests @@ -1 +1 @@ -Subproject commit f44e509380bdfc3bdbba92acab7377f5e94b29ed +Subproject commit 7005bcc62fcd45f4d0930ca47e3483f3ab4e8ef3 diff --git a/software/libgemmini b/software/libgemmini index 3ea939a0..19d82156 160000 --- a/software/libgemmini +++ b/software/libgemmini @@ -1 +1 @@ -Subproject commit 3ea939a05373d51770616360591e42baba0d4dd1 +Subproject commit 19d821561bd7c66689871c1cf586cd4db461ddf9 diff --git a/src/main/scala/gemmini/AccumulatorScale.scala b/src/main/scala/gemmini/AccumulatorScale.scala index 2541f9b1..2bed3702 100644 --- a/src/main/scala/gemmini/AccumulatorScale.scala +++ b/src/main/scala/gemmini/AccumulatorScale.scala @@ -386,22 +386,35 @@ object AccumulatorScale { } def iexp[T <: Data](q: T, qln2: T, qln2_inv: T, qb: T, qc: T)(implicit ev: Arithmetic[T]): T = { + // import ev._ + + // val zero = q.zero + // def neg(x: T) = zero-x + + // // qln2_inv needs scale to be 1 / (2 ** 16) / S + // // qln2_inv / S / (2 ** 16) = 1 / ln2 + // // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) + // val neg_q_iexp = neg(q) + // val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive + // val z_iexp_saturated = Wire(z_iexp.cloneType) + // z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp) + // val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) + // val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) + // // we dont want a rounding shift + // // TODO: z overflow + // (q_poly_iexp.asUInt.do_>>(z_iexp_saturated.asUInt)).asTypeOf(q) + import ev._ val zero = q.zero + val one = q.identity def neg(x: T) = zero-x - // qln2_inv needs scale to be 1 / (2 ** 16) / S - // qln2_inv / S / (2 ** 16) = 1 / ln2 - // q * qln2_inv = x / S / ln2 * S * (2 ** 16) = x / ln2 * (2 ** 16) - val neg_q_iexp = neg(q) - val z_iexp = (neg_q_iexp * qln2_inv).asUInt.do_>>(16).asTypeOf(q) // q is non-positive - val z_iexp_saturated = Wire(z_iexp.cloneType) - z_iexp_saturated := Mux((5 until 16).map(z_iexp.asUInt(_)).reduce(_ | _), 32.S, z_iexp) - val qp_iexp = q.mac(z_iexp, qln2).withWidthOf(q) - val q_poly_iexp = qc.mac(qp_iexp + qb, qp_iexp + qb).withWidthOf(q) - // we dont want a rounding shift - // TODO: z overflow - (q_poly_iexp.asUInt.do_>>(z_iexp_saturated.asUInt)).asTypeOf(q) + val q_sign = Mux(q.zero > q, neg(one), one) + val q_abs = Mux(q.zero > q, neg(q), q) + val q_clipped = Mux(q_abs > neg(qb), neg(qb), q_abs) + val q_poly = qc.mac(q_clipped + qb, q_clipped + qb).withWidthOf(q) + val q_erf = (q_sign * q_poly).withWidthOf(q) + (q * (q_erf + qc)).withWidthOf(q) }} diff --git a/src/main/scala/gemmini/ExecuteController.scala b/src/main/scala/gemmini/ExecuteController.scala index 00680e48..4a8ff129 100644 --- a/src/main/scala/gemmini/ExecuteController.scala +++ b/src/main/scala/gemmini/ExecuteController.scala @@ -121,6 +121,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val bd_transpose = Reg(Bool()) val config_initialized = RegInit(false.B) + val is_gemv = WireInit(true.B) + val a_should_be_fed_into_transposer = Mux(current_dataflow === Dataflow.OS.id.U, !a_transpose, a_transpose) val a_address_place = Mux(preload_cmd_place === 0.U, 1.U, Mux(a_should_be_fed_into_transposer, 2.U, 0.U)) @@ -252,10 +254,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In val c_addr_stride = Reg(UInt(16.W)) // TODO magic numbers val a_address = (0 until tileColumns).map(i => a_address_rs1(i) + a_addr_offset(i)) - val b_address = b_address_rs2 + b_fire_counter - dontTouch(b_address) - dontTouch(b_address_rs2) - val d_address = d_address_rs1 + (block_size.U - 1.U - d_fire_counter) + val b_address = Mux(is_gemv, b_address_rs2, b_address_rs2 + b_fire_counter) + val d_address = Mux(is_gemv, d_address_rs1, d_address_rs1 + (block_size.U - 1.U - d_fire_counter)) dontTouch(d_address) val dataAbank = a_address.map(address => address.sp_bank()) @@ -458,8 +458,6 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In for (i <- 0 until sp_banks) { // val matching_a = dataAbank.indexOf(i.U) val matching_a = if (i < 4) i else -1; // TODO temp fix bc indexOf() doesn't work for some reason - val matching_a_wire = WireInit(matching_a.S(4.W)); - dontTouch(matching_a_wire) val read_a = if (matching_a == -1) false.B else a_valid(matching_a) && !a_read_from_acc && start_inputting_a && !multiply_garbage && a_row_is_not_all_zeros(matching_a) && !(im2col_wire&&im2col_en) val read_b = b_valid && !b_read_from_acc && dataBbank === i.U && start_inputting_b && !accumulate_zeros && b_row_is_not_all_zeros //&& !im2col_wire dontTouch(b_valid) @@ -607,6 +605,8 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In } } + // is_gemv := config_ex_rs1.is_gemv.asBool + a_addr_stride := config_ex_rs1.a_stride // TODO this needs to be kept in sync with ROB.scala c_addr_stride := config_ex_rs2.c_stride // TODO this needs to be kept in sync with ROB.scala config_initialized := true.B @@ -634,7 +634,7 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In //start_inputting_a := current_dataflow === Dataflow.OS.id.U //start_inputting_d := true.B - + start_inputting_a := a_should_be_fed_into_transposer start_inputting_b := b_should_be_fed_into_transposer start_inputting_d := true.B @@ -930,15 +930,15 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In } } - // TODO integrate this fully - val gemv_mode = RegInit(true.B) dontTouch(dataB) dontTouch(cntl_valid) dontTouch(mesh.io.a.valid) dontTouch(dataD) + dontTouch(is_gemv) - when (gemv_mode) { + when (is_gemv) { when ((current_dataflow === Dataflow.WS.id.U).asBool) { + // transpose A for (tc <- 0 until tileColumns) { for (mr <- 0 until meshRows) { for (tr <- 0 until tileRows) { @@ -946,21 +946,31 @@ class ExecuteController[T <: Data, U <: Data, V <: Data](xLen: Int, tagWidth: In } } } + // pass in duplicated elements of weights vector in reverse order for (tc <- 0 until tileColumns) { for (mc <- 0 until meshColumns) { - mesh.io.d.bits(mc)(tc) := dataD.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(0) + mesh.io.d.bits(mc)(tc) := dataD.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(meshRows.U - d_fire_counter) + } + } + // duplicate one element of the bias vector to the mesh + for (tc <- 0 until tileColumns) { + for (mc <- 0 until meshColumns) { + mesh.io.b.bits(mc)(tc) := dataB.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(b_fire_counter-1.U) } } - mesh.io.b.bits := dataB.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType))) }.otherwise { // TODO this only works when casted this way mesh.io.a.bits := dataA.asTypeOf(Vec(meshRows, Vec(tileColumns, Vec(tileRows, inputType)))) for (tc <- 0 until tileColumns) { for (mc <- 0 until meshColumns) { - mesh.io.b.bits(mc)(tc) := dataB.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(0) + mesh.io.b.bits(mc)(tc) := dataB.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(b_fire_counter-1.U) + } + } + for (tc <- 0 until tileColumns) { + for (mc <- 0 until meshColumns) { + mesh.io.d.bits(mc)(tc) := dataD.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType)))(0)(d_fire_counter-1.U) } } - mesh.io.d.bits := dataD.asTypeOf(Vec(meshColumns, Vec(tileColumns, inputType))) } }.otherwise { for (tc <- 0 until tileColumns) { diff --git a/src/main/scala/gemmini/Scratchpad.scala b/src/main/scala/gemmini/Scratchpad.scala index cdd63062..9fa3c3a1 100644 --- a/src/main/scala/gemmini/Scratchpad.scala +++ b/src/main/scala/gemmini/Scratchpad.scala @@ -251,9 +251,9 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, // From acc are ordered val write_norm_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2)) val write_scale_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+2)) - val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+1, pipe=true)) - val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) // TODO can't this just be a normal queue? - + val write_issue_q = Module(new Queue(new ScratchpadMemWriteRequest(local_addr_t, accType.getWidth, acc_scale_t_bits), spad_read_delay+1, pipe=true, flow=true)) + val read_issue_q = Module(new Queue(new ScratchpadMemReadRequest(local_addr_t, mvin_scale_t_bits), spad_read_delay+1, pipe=true)) + write_dispatch_q.ready := false.B write_norm_q.io.enq.valid := false.B @@ -444,10 +444,10 @@ class Scratchpad[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, io.busy := writer.module.io.busy || reader.module.io.busy || write_issue_q.io.deq.valid || write_norm_q.io.deq.valid || write_scale_q.io.deq.valid || write_dispatch_q.valid val spad_mems = { - val banks = Seq.fill(sp_banks) { Module(new ScratchpadBank( + val banks = Seq.tabulate(sp_banks) { bankId => Module(new ScratchpadBank( sp_bank_entries, spad_w, aligned_to, config.sp_singleported, - use_shared_ext_mem, is_dummy + use_shared_ext_mem, is_dummy=bankId > 5 )) } val bank_ios = VecInit(banks.map(_.io)) // Reading from the SRAM banks