diff --git a/src/main/resources/vsrc/RoccBlackBox.v b/src/main/resources/vsrc/RoccBlackBox.v index b8fa74df91..4588cd38de 100644 --- a/src/main/resources/vsrc/RoccBlackBox.v +++ b/src/main/resources/vsrc/RoccBlackBox.v @@ -119,6 +119,7 @@ module RoccBlackBox input rocc_mem_s2_xcpt_ae_ld, input rocc_mem_s2_xcpt_ae_st, input rocc_mem_ordered, + input rocc_mem_store_pending, input rocc_mem_perf_acquire, input rocc_mem_perf_release, input rocc_mem_perf_grant, @@ -159,6 +160,7 @@ module RoccBlackBox output [fLen:0] rocc_fpu_req_bits_in1, output [fLen:0] rocc_fpu_req_bits_in2, output [fLen:0] rocc_fpu_req_bits_in3, + output rocc_fpu_req_bits_vec, output rocc_fpu_resp_ready, input rocc_fpu_resp_valid, input [fLen:0] rocc_fpu_resp_bits_data, diff --git a/src/main/scala/rocket/AMOALU.scala b/src/main/scala/rocket/AMOALU.scala index 0aff648446..47295d18db 100644 --- a/src/main/scala/rocket/AMOALU.scala +++ b/src/main/scala/rocket/AMOALU.scala @@ -10,6 +10,7 @@ import org.chipsalliance.cde.config.Parameters class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { val size = Wire(UInt(log2Up(log2Up(maxSize)+1).W)) size := typ + val dat_padded = dat.pad(maxSize*8) def misaligned: Bool = (addr & ((1.U << size) - 1.U)(log2Up(maxSize)-1,0)).orR @@ -24,8 +25,8 @@ class StoreGen(typ: UInt, addr: UInt, dat: UInt, maxSize: Int) { } protected def genData(i: Int): UInt = - if (i >= log2Up(maxSize)) dat - else Mux(size === i.U, Fill(1 << (log2Up(maxSize)-i), dat((8 << i)-1,0)), genData(i+1)) + if (i >= log2Up(maxSize)) dat_padded + else Mux(size === i.U, Fill(1 << (log2Up(maxSize)-i), dat_padded((8 << i)-1,0)), genData(i+1)) def data = genData(0) def wordData = genData(2) diff --git a/src/main/scala/rocket/CSR.scala b/src/main/scala/rocket/CSR.scala index 5c0e540a80..8095efaa00 100644 --- a/src/main/scala/rocket/CSR.scala +++ b/src/main/scala/rocket/CSR.scala @@ -245,6 +245,7 @@ class CSRDecodeIO(implicit p: Parameters) extends CoreBundle { val fp_illegal = Output(Bool()) val vector_illegal = Output(Bool()) val fp_csr = Output(Bool()) + val vector_csr = Output(Bool()) val rocc_illegal = Output(Bool()) val read_illegal = Output(Bool()) val write_illegal = Output(Bool()) @@ -914,6 +915,7 @@ class CSRFile( io_dec.fp_illegal := io.status.fs === 0.U || reg_mstatus.v && reg_vsstatus.fs === 0.U || !reg_misa('f'-'a') io_dec.vector_illegal := io.status.vs === 0.U || reg_mstatus.v && reg_vsstatus.vs === 0.U || !reg_misa('v'-'a') io_dec.fp_csr := decodeFast(fp_csrs.keys.toList) + io_dec.vector_csr := decodeFast(vector_csrs.keys.toList) io_dec.rocc_illegal := io.status.xs === 0.U || reg_mstatus.v && reg_vsstatus.xs === 0.U || !reg_misa('x'-'a') val csr_addr_legal = reg_mstatus.prv >= CSR.mode(addr) || usingHypervisor.B && !reg_mstatus.v && reg_mstatus.prv === PRV.S.U && CSR.mode(addr) === PRV.H.U diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index c2dc4843b9..e196fbc573 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -93,7 +93,7 @@ class DCache(staticIdForMetadataUseOnly: Int, val crossing: ClockCrossingType)(i class DCacheTLBPort(implicit p: Parameters) extends CoreBundle()(p) { val req = Flipped(Decoupled(new TLBReq(coreDataBytes.log2))) - val s1_resp = Output(new TLBResp) + val s1_resp = Output(new TLBResp(coreDataBytes.log2)) val s2_kill = Input(Bool()) } @@ -926,6 +926,7 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { val s1_isSlavePortAccess = s1_req.no_xcpt val s2_isSlavePortAccess = s2_req.no_xcpt io.cpu.ordered := !(s1_valid && !s1_isSlavePortAccess || s2_valid && !s2_isSlavePortAccess || cached_grant_wait || uncachedInFlight.asUInt.orR) + io.cpu.store_pending := (cached_grant_wait && isWrite(s2_req.cmd)) || uncachedInFlight.asUInt.orR val s1_xcpt_valid = tlb.io.req.valid && !s1_isSlavePortAccess && !s1_nack io.cpu.s2_xcpt := Mux(RegNext(s1_xcpt_valid), s2_tlb_xcpt, 0.U.asTypeOf(s2_tlb_xcpt)) diff --git a/src/main/scala/rocket/DebugROB.scala b/src/main/scala/rocket/DebugROB.scala index 3c4422d87e..39dec03a78 100644 --- a/src/main/scala/rocket/DebugROB.scala +++ b/src/main/scala/rocket/DebugROB.scala @@ -29,7 +29,6 @@ class WidenedTracedInstruction extends Bundle { // These is not synthesizable, they use a C++ blackbox to implement the // write-back reordering class DebugROBPushTrace(implicit val p: Parameters) extends BlackBox with HasBlackBoxResource with HasCoreParameters { - require(traceHasWdata && (vLen max xLen) <= 512) val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) @@ -45,7 +44,6 @@ class DebugROBPushTrace(implicit val p: Parameters) extends BlackBox with HasBla class DebugROBPushWb(implicit val p: Parameters) extends BlackBox with HasBlackBoxResource with HasCoreParameters { - require(traceHasWdata && (vLen max xLen) <= 512) val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) @@ -59,7 +57,6 @@ class DebugROBPushWb(implicit val p: Parameters) extends BlackBox } class DebugROBPopTrace(implicit val p: Parameters) extends BlackBox with HasBlackBoxResource with HasCoreParameters { - require(traceHasWdata && (vLen max xLen) <= 512) val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) diff --git a/src/main/scala/rocket/HellaCache.scala b/src/main/scala/rocket/HellaCache.scala index f0d7bdf8d0..29333c13fa 100644 --- a/src/main/scala/rocket/HellaCache.scala +++ b/src/main/scala/rocket/HellaCache.scala @@ -187,6 +187,7 @@ class HellaCacheIO(implicit p: Parameters) extends CoreBundle()(p) { val s2_gpa_is_pte = Input(Bool()) val uncached_resp = tileParams.dcache.get.separateUncachedResp.option(Flipped(Decoupled(new HellaCacheResp))) val ordered = Input(Bool()) + val store_pending = Input(Bool()) // there is a store in a store buffer somewhere val perf = Input(new HellaCachePerfEvents()) val keep_clock_enabled = Output(Bool()) // should D$ avoid clock-gating itself? diff --git a/src/main/scala/rocket/HellaCacheArbiter.scala b/src/main/scala/rocket/HellaCacheArbiter.scala index 4b9fc08f4c..cdc287d504 100644 --- a/src/main/scala/rocket/HellaCacheArbiter.scala +++ b/src/main/scala/rocket/HellaCacheArbiter.scala @@ -63,6 +63,7 @@ class HellaCacheArbiter(n: Int)(implicit p: Parameters) extends Module io.requestor(i).s2_gpa := io.mem.s2_gpa io.requestor(i).s2_gpa_is_pte := io.mem.s2_gpa_is_pte io.requestor(i).ordered := io.mem.ordered + io.requestor(i).store_pending := io.mem.store_pending io.requestor(i).perf := io.mem.perf io.requestor(i).s2_nack := io.mem.s2_nack && s2_id === i.U io.requestor(i).s2_nack_cause_raw := io.mem.s2_nack_cause_raw diff --git a/src/main/scala/rocket/IDecode.scala b/src/main/scala/rocket/IDecode.scala index 86ee7eef58..7c3aea6ecb 100644 --- a/src/main/scala/rocket/IDecode.scala +++ b/src/main/scala/rocket/IDecode.scala @@ -44,6 +44,7 @@ class IntCtrlSigs(aluFn: ALUFN = ALUFN())(implicit val p: Parameters) extends Bu val fence = Bool() val amo = Bool() val dp = Bool() + val vec = Bool() def default: List[BitPat] = // jal renf1 fence.i @@ -433,6 +434,15 @@ class D64Decode(aluFn: ALUFN = ALUFN())(implicit val p: Parameters) extends Deco FCVT_D_LU-> List(Y,Y,N,N,N,N,N,Y,A2_X, A1_RS1, IMM_X, DW_X, aluFn.FN_X, N,M_X, N,N,N,Y,N,N,N,CSR.N,N,N,N,Y)) } +class VCFGDecode(aluFn: ALUFN = ALUFN())(implicit val p: Parameters) extends DecodeConstants +{ + val table: Array[(BitPat, List[BitPat])] = Array( + VSETVLI -> List(Y,N,N,N,N,N,N,Y,A2_X, A1_X, IMM_X, DW_X, aluFn.FN_X, N,M_X, N,N,N,N,N,N,Y,CSR.N,N,N,N,N), + VSETIVLI -> List(Y,N,N,N,N,N,N,N,A2_X, A1_X, IMM_X, DW_X, aluFn.FN_X, N,M_X, N,N,N,N,N,N,Y,CSR.N,N,N,N,N), + VSETVL -> List(Y,N,N,N,N,N,Y,Y,A2_X, A1_X, IMM_X, DW_X, aluFn.FN_X, N,M_X, N,N,N,N,N,N,Y,CSR.N,N,N,N,N)) +} + + class RoCCDecode(aluFn: ALUFN = ALUFN())(implicit val p: Parameters) extends DecodeConstants { val table: Array[(BitPat, List[BitPat])] = Array( diff --git a/src/main/scala/rocket/NBDcache.scala b/src/main/scala/rocket/NBDcache.scala index dfe6e9ac59..a0f9f72b91 100644 --- a/src/main/scala/rocket/NBDcache.scala +++ b/src/main/scala/rocket/NBDcache.scala @@ -56,6 +56,7 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa val mem_access = Decoupled(new TLBundleA(edge.bundle)) val mem_ack = Flipped(Valid(new TLBundleD(edge.bundle))) val replay_next = Output(Bool()) + val store_pending = Output(Bool()) }) def beatOffset(addr: UInt) = addr.extract(beatOffBits - 1, wordOffBits) @@ -119,6 +120,7 @@ class IOMSHR(id: Int)(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCa io.resp.bits.data_word_bypass := loadgen.wordData io.resp.bits.store_data := req.data io.resp.bits.replay := true.B + io.store_pending := state =/= s_idle && isWrite(req.cmd) when (io.req.fire) { req := io.req.bits @@ -335,6 +337,7 @@ class MSHRFile(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCacheModu val probe_rdy = Output(Bool()) val fence_rdy = Output(Bool()) val replay_next = Output(Bool()) + val store_pending = Output(Bool()) }) // determine if the request is cacheable or not @@ -443,6 +446,8 @@ class MSHRFile(implicit edge: TLEdgeOut, p: Parameters) extends L1HellaCacheModu TLArbiter.lowestFromSeq(edge, io.mem_acquire, mshrs.map(_.io.mem_acquire) ++ mmios.map(_.io.mem_access)) TLArbiter.lowestFromSeq(edge, io.mem_finish, mshrs.map(_.io.mem_finish)) + io.store_pending := sdq_val =/= 0.U || mmios.map(_.io.store_pending).orR + io.resp <> resp_arb.io.out io.req.ready := Mux(!cacheable, mmio_rdy, @@ -1051,6 +1056,7 @@ class NonBlockingDCacheModule(outer: NonBlockingDCache) extends HellaCacheModule io.cpu.resp.bits.data_word_bypass := loadgen.wordData io.cpu.resp.bits.data_raw := s2_data_word io.cpu.ordered := mshrs.io.fence_rdy && !s1_valid && !s2_valid + io.cpu.store_pending := mshrs.io.store_pending io.cpu.replay_next := (s1_replay && s1_read) || mshrs.io.replay_next val s1_xcpt_valid = dtlb.io.req.valid && !s1_nack diff --git a/src/main/scala/rocket/RocketCore.scala b/src/main/scala/rocket/RocketCore.scala index ec2f9bcea9..2e96155cca 100644 --- a/src/main/scala/rocket/RocketCore.scala +++ b/src/main/scala/rocket/RocketCore.scala @@ -51,6 +51,7 @@ case class RocketCoreParams( debugROB: Option[DebugROBParams] = None, // if size < 1, SW ROB, else HW ROB haveCease: Boolean = true, // non-standard CEASE instruction haveSimTimeout: Boolean = true, // add plusarg for simulation timeout + vector: Option[RocketCoreVectorParams] = None ) extends CoreParams { val lgPauseCycles = 5 val haveFSDirty = false @@ -62,6 +63,10 @@ case class RocketCoreParams( val instBits: Int = if (useCompressed) 16 else 32 val lrscCycles: Int = 80 // worst case is 14 mispredicted branches + slop val traceHasWdata: Boolean = debugROB.isDefined // ooo wb, so no wdata in trace + override val useVector = vector.isDefined + override val vectorUseDCache = vector.map(_.useDCache).getOrElse(false) + override def vLen = vector.map(_.vLen).getOrElse(0) + override def vMemDataBits = vector.map(_.vMemDataBits).getOrElse(0) override val customIsaExt = Option.when(haveCease)("xrocket") // CEASE instruction override def minFLen: Int = fpu.map(_.minFLen).getOrElse(32) override def customCSRs(implicit p: Parameters) = new RocketCustomCSRs @@ -133,6 +138,7 @@ trait HasRocketCoreIO extends HasRocketCoreParameters { val cease = Output(Bool()) val wfi = Output(Bool()) val traceStall = Input(Bool()) + val vector = if (usingVector) Some(Flipped(new VectorCoreIO)) else None }) } @@ -219,6 +225,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) Seq(new FenceIDecode(tile.dcache.flushOnFenceI, aluFn)) ++: coreParams.haveCFlush.option(new CFlushDecode(tile.dcache.canSupportCFlushLine, aluFn)) ++: rocketParams.haveCease.option(new CeaseDecode(aluFn)) ++: + usingVector.option(new VCFGDecode(aluFn)) ++: Seq(new IDecode(aluFn)) } flatMap(_.table) @@ -241,6 +248,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val ex_reg_inst = Reg(Bits()) val ex_reg_raw_inst = Reg(UInt()) val ex_reg_wphit = Reg(Vec(nBreakpoints, Bool())) + val ex_reg_set_vconfig = Reg(Bool()) val mem_reg_xcpt_interrupt = Reg(Bool()) val mem_reg_valid = Reg(Bool()) @@ -253,6 +261,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val mem_reg_slow_bypass = Reg(Bool()) val mem_reg_load = Reg(Bool()) val mem_reg_store = Reg(Bool()) + val mem_reg_set_vconfig = Reg(Bool()) val mem_reg_sfence = Reg(Bool()) val mem_reg_pc = Reg(UInt()) val mem_reg_inst = Reg(Bits()) @@ -270,6 +279,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val wb_reg_replay = Reg(Bool()) val wb_reg_flush_pipe = Reg(Bool()) val wb_reg_cause = Reg(UInt()) + val wb_reg_set_vconfig = Reg(Bool()) val wb_reg_sfence = Reg(Bool()) val wb_reg_pc = Reg(UInt()) val wb_reg_mem_size = Reg(UInt()) @@ -298,6 +308,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) require(!(coreParams.useRVE && coreParams.fpu.nonEmpty), "Can't select both RVE and floating-point") require(!(coreParams.useRVE && coreParams.useHypervisor), "Can't select both RVE and Hypervisor") val id_ctrl = Wire(new IntCtrlSigs(aluFn)).decode(id_inst(0), decode_table) + val lgNXRegs = if (coreParams.useRVE) 4 else 5 val regAddrMask = (1 << lgNXRegs) - 1 @@ -322,11 +333,45 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val id_csr_ren = id_ctrl.csr.isOneOf(CSR.S, CSR.C) && id_expanded_inst(0).rs1 === 0.U val id_csr = Mux(id_system_insn && id_ctrl.mem, CSR.N, Mux(id_csr_ren, CSR.R, id_ctrl.csr)) val id_csr_flush = id_system_insn || (id_csr_en && !id_csr_ren && csr.io.decode(0).write_flush) + val id_set_vconfig = Seq(Instructions.VSETVLI, Instructions.VSETIVLI, Instructions.VSETVL).map(_ === id_inst(0)).orR && usingVector.B + + id_ctrl.vec := false.B + if (usingVector) { + val v_decode = rocketParams.vector.get.decoder(p) + v_decode.io.inst := id_inst(0) + v_decode.io.vconfig := csr.io.vector.get.vconfig + when (v_decode.io.legal) { + id_ctrl.legal := !csr.io.vector.get.vconfig.vtype.vill + id_ctrl.fp := v_decode.io.fp + id_ctrl.rocc := false.B + id_ctrl.branch := false.B + id_ctrl.jal := false.B + id_ctrl.jalr := false.B + id_ctrl.rxs2 := v_decode.io.read_rs2 + id_ctrl.rxs1 := v_decode.io.read_rs1 + id_ctrl.mem := false.B + id_ctrl.rfs1 := v_decode.io.read_frs1 + id_ctrl.rfs2 := false.B + id_ctrl.rfs3 := false.B + id_ctrl.wfd := v_decode.io.write_frd + id_ctrl.mul := false.B + id_ctrl.div := false.B + id_ctrl.wxd := v_decode.io.write_rd + id_ctrl.csr := CSR.N + id_ctrl.fence_i := false.B + id_ctrl.fence := false.B + id_ctrl.amo := false.B + id_ctrl.dp := false.B + id_ctrl.vec := true.B + } + } + val id_illegal_insn = !id_ctrl.legal || (id_ctrl.mul || id_ctrl.div) && !csr.io.status.isa('m'-'a') || id_ctrl.amo && !csr.io.status.isa('a'-'a') || - id_ctrl.fp && (csr.io.decode(0).fp_illegal || io.fpu.illegal_rm) || + id_ctrl.fp && (csr.io.decode(0).fp_illegal || (io.fpu.illegal_rm && !id_ctrl.vec)) || + (id_ctrl.vec) && (csr.io.decode(0).vector_illegal || csr.io.vector.map(_.vconfig.vtype.vill).getOrElse(false.B)) || id_ctrl.dp && !csr.io.status.isa('d'-'a') || ibuf.io.inst(0).bits.rvc && !csr.io.status.isa('c'-'a') || id_raddr2_illegal && id_ctrl.rxs2 || @@ -350,7 +395,9 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) (io.rocc.busy || ex_reg_valid && ex_ctrl.rocc || mem_reg_valid && mem_ctrl.rocc || wb_reg_valid && wb_ctrl.rocc) val id_csr_rocc_write = tile.roccCSRs.flatten.map(_.id.U === id_inst(0)(31,20)).orR && id_csr_en && !id_csr_ren + val id_vec_busy = io.vector.map(v => v.backend_busy || v.trap_check_busy).getOrElse(false.B) val id_do_fence = WireDefault(id_rocc_busy && (id_ctrl.fence || id_csr_rocc_write) || + id_vec_busy && id_ctrl.fence || id_mem_busy && (id_ctrl.amo && id_amo_rl || id_ctrl.fence_i || id_reg_fence && (id_ctrl.mem || id_ctrl.rocc))) val bpu = Module(new BreakpointUnit(nBreakpoints)) @@ -418,6 +465,23 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) A2_IMM -> ex_imm, A2_SIZE -> Mux(ex_reg_rvc, 2.S, 4.S))) + val (ex_new_vl, ex_new_vconfig) = if (usingVector) { + val ex_new_vtype = VType.fromUInt(MuxCase(ex_rs(1), Seq( + ex_reg_inst(31,30).andR -> ex_reg_inst(29,20), + !ex_reg_inst(31) -> ex_reg_inst(30,20)))) + val ex_avl = Mux(ex_ctrl.rxs1, + Mux(ex_reg_inst(19,15) === 0.U, + Mux(ex_reg_inst(11,6) === 0.U, csr.io.vector.get.vconfig.vl, ex_new_vtype.vlMax), + ex_rs(0) + ), + ex_reg_inst(19,15)) + val ex_new_vl = ex_new_vtype.vl(ex_avl, csr.io.vector.get.vconfig.vl, false.B, false.B, false.B) + val ex_new_vconfig = Wire(new VConfig) + ex_new_vconfig.vtype := ex_new_vtype + ex_new_vconfig.vl := ex_new_vl + (Some(ex_new_vl), Some(ex_new_vconfig)) + } else { (None, None) } + val alu = Module(aluFn match { case _: ALUFN => new ALU }) @@ -507,13 +571,15 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) ex_reg_pc := ibuf.io.pc ex_reg_btb_resp := ibuf.io.btb_resp ex_reg_wphit := bpu.io.bpwatch.map { bpw => bpw.ivalid(0) } + ex_reg_set_vconfig := id_set_vconfig && !id_xcpt } // replay inst in ex stage? val ex_pc_valid = ex_reg_valid || ex_reg_replay || ex_reg_xcpt_interrupt val wb_dcache_miss = wb_ctrl.mem && !io.dmem.resp.valid val replay_ex_structural = ex_ctrl.mem && !io.dmem.req.ready || - ex_ctrl.div && !div.io.req.ready + ex_ctrl.div && !div.io.req.ready || + ex_ctrl.vec && !io.vector.map(_.ex.ready).getOrElse(true.B) val replay_ex_load_use = wb_dcache_miss && ex_reg_load_use val replay_ex = ex_reg_replay || (ex_reg_valid && (replay_ex_structural || replay_ex_load_use)) val ctrl_killx = take_pc_mem_wb || replay_ex || !ex_reg_valid @@ -564,6 +630,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) mem_reg_flush_pipe := ex_reg_flush_pipe mem_reg_slow_bypass := ex_slow_bypass mem_reg_wphit := ex_reg_wphit + mem_reg_set_vconfig := ex_reg_set_vconfig mem_reg_cause := ex_cause mem_reg_inst := ex_reg_inst @@ -572,13 +639,17 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) mem_reg_hls_or_dv := io.dmem.req.bits.dv mem_reg_pc := ex_reg_pc // IDecode ensured they are 1H - mem_reg_wdata := alu.io.out + mem_reg_wdata := Mux(ex_reg_set_vconfig, ex_new_vl.getOrElse(alu.io.out), alu.io.out) mem_br_taken := alu.io.cmp_out + when (ex_ctrl.rxs2 && (ex_ctrl.mem || ex_ctrl.rocc || ex_sfence)) { val size = Mux(ex_ctrl.rocc, log2Ceil(xLen/8).U, ex_reg_mem_size) mem_reg_rs2 := new StoreGen(size, 0.U, ex_rs(1), coreDataBytes).data } + if (usingVector) { when (ex_reg_set_vconfig) { + mem_reg_rs2 := ex_new_vconfig.get.asUInt + } } when (ex_ctrl.jalr && csr.io.status.debug) { // flush I$ on D-mode JALR to effect uncached fetch without D$ flush mem_ctrl.fence_i := true.B @@ -606,21 +677,23 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val dcache_kill_mem = mem_reg_valid && mem_ctrl.wxd && io.dmem.replay_next // structural hazard on writeback port val fpu_kill_mem = mem_reg_valid && mem_ctrl.fp && io.fpu.nack_mem - val replay_mem = dcache_kill_mem || mem_reg_replay || fpu_kill_mem + val vec_kill_mem = mem_reg_valid && mem_ctrl.mem && io.vector.map(_.mem.block_mem).getOrElse(false.B) + val vec_kill_all = mem_reg_valid && io.vector.map(_.mem.block_all).getOrElse(false.B) + val replay_mem = dcache_kill_mem || mem_reg_replay || fpu_kill_mem || vec_kill_mem || vec_kill_all val killm_common = dcache_kill_mem || take_pc_wb || mem_reg_xcpt || !mem_reg_valid div.io.kill := killm_common && RegNext(div.io.req.fire) - val ctrl_killm = killm_common || mem_xcpt || fpu_kill_mem + val ctrl_killm = killm_common || mem_xcpt || fpu_kill_mem || vec_kill_mem // writeback stage wb_reg_valid := !ctrl_killm wb_reg_replay := replay_mem && !take_pc_wb - wb_reg_xcpt := mem_xcpt && !take_pc_wb + wb_reg_xcpt := mem_xcpt && !take_pc_wb && !io.vector.map(_.mem.block_all).getOrElse(false.B) wb_reg_flush_pipe := !ctrl_killm && mem_reg_flush_pipe when (mem_pc_valid) { wb_ctrl := mem_ctrl wb_reg_sfence := mem_reg_sfence wb_reg_wdata := Mux(!mem_reg_xcpt && mem_ctrl.fp && mem_ctrl.wxd, io.fpu.toint_data, mem_int_wdata) - when (mem_ctrl.rocc || mem_reg_sfence) { + when (mem_ctrl.rocc || mem_reg_sfence || mem_reg_set_vconfig) { wb_reg_rs2 := mem_reg_rs2 } wb_reg_cause := mem_cause @@ -632,7 +705,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) wb_reg_hfence_g := mem_ctrl.mem_cmd === M_HFENCEG wb_reg_pc := mem_reg_pc wb_reg_wphit := mem_reg_wphit | bpu.io.bpwatch.map { bpw => (bpw.rvalid(0) && mem_reg_load) || (bpw.wvalid(0) && mem_reg_store) } - + wb_reg_set_vconfig := mem_reg_set_vconfig } val (wb_xcpt, wb_cause) = checkExceptions(List( @@ -663,11 +736,12 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val wb_pc_valid = wb_reg_valid || wb_reg_replay || wb_reg_xcpt val wb_wxd = wb_reg_valid && wb_ctrl.wxd - val wb_set_sboard = wb_ctrl.div || wb_dcache_miss || wb_ctrl.rocc + val wb_set_sboard = wb_ctrl.div || wb_dcache_miss || wb_ctrl.rocc || wb_ctrl.vec val replay_wb_common = io.dmem.s2_nack || wb_reg_replay val replay_wb_rocc = wb_reg_valid && wb_ctrl.rocc && !io.rocc.cmd.ready val replay_wb_csr: Bool = wb_reg_valid && csr.io.rw_stall - val replay_wb = replay_wb_common || replay_wb_rocc || replay_wb_csr + val replay_wb_vec = wb_reg_valid && io.vector.map(_.wb.replay).getOrElse(false.B) + val replay_wb = replay_wb_common || replay_wb_rocc || replay_wb_csr || replay_wb_vec take_pc_wb := replay_wb || wb_xcpt || csr.io.eret || wb_reg_flush_pipe // writeback arbitration @@ -677,30 +751,44 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val dmem_resp_valid = io.dmem.resp.valid && io.dmem.resp.bits.has_data val dmem_resp_replay = dmem_resp_valid && io.dmem.resp.bits.replay - div.io.resp.ready := !wb_wxd - val ll_wdata = WireDefault(div.io.resp.bits.data) - val ll_waddr = WireDefault(div.io.resp.bits.tag) - val ll_wen = WireDefault(div.io.resp.fire) + val ll_arb = Module(new Arbiter(new Bundle { + val data = UInt(xLen.W) + val tag = UInt(5.W) + }, 3)) // div, rocc, vec + ll_arb.io.in.foreach(_.valid := false.B) + ll_arb.io.in.foreach(_.bits := DontCare) + val ll_wdata = WireInit(ll_arb.io.out.bits.data) + val ll_waddr = WireInit(ll_arb.io.out.bits.tag) + val ll_wen = WireInit(ll_arb.io.out.fire) + ll_arb.io.out.ready := !wb_wxd + + div.io.resp.ready := ll_arb.io.in(0).ready + ll_arb.io.in(0).valid := div.io.resp.valid + ll_arb.io.in(0).bits.data := div.io.resp.bits.data + ll_arb.io.in(0).bits.tag := div.io.resp.bits.tag + if (usingRoCC) { - io.rocc.resp.ready := !wb_wxd - when (io.rocc.resp.fire) { - div.io.resp.ready := false.B - ll_wdata := io.rocc.resp.bits.data - ll_waddr := io.rocc.resp.bits.rd - ll_wen := true.B - } + io.rocc.resp.ready := ll_arb.io.in(1).ready + ll_arb.io.in(1).valid := io.rocc.resp.valid + ll_arb.io.in(1).bits.data := io.rocc.resp.bits.data + ll_arb.io.in(1).bits.tag := io.rocc.resp.bits.rd } else { // tie off RoCC io.rocc.resp.ready := false.B io.rocc.mem.req.ready := false.B } + + io.vector.map { v => + v.resp.ready := Mux(v.resp.bits.fp, !(dmem_resp_valid && dmem_resp_fpu), ll_arb.io.in(2).ready) + ll_arb.io.in(2).valid := v.resp.valid && !v.resp.bits.fp + ll_arb.io.in(2).bits.data := v.resp.bits.data + ll_arb.io.in(2).bits.tag := v.resp.bits.rd + } // Dont care mem since not all RoCC need accessing memory io.rocc.mem := DontCare when (dmem_resp_replay && dmem_resp_xpu) { - div.io.resp.ready := false.B - if (usingRoCC) - io.rocc.resp.ready := false.B + ll_arb.io.out.ready := false.B ll_waddr := dmem_resp_waddr ll_wen := true.B } @@ -726,11 +814,15 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) csr.io.interrupts := io.interrupts csr.io.hartid := io.hartid io.fpu.fcsr_rm := csr.io.fcsr_rm - csr.io.fcsr_flags := io.fpu.fcsr_flags + val vector_fcsr_flags = io.vector.map(_.set_fflags.bits).getOrElse(0.U(5.W)) + val vector_fcsr_flags_valid = io.vector.map(_.set_fflags.valid).getOrElse(false.B) + csr.io.fcsr_flags.valid := io.fpu.fcsr_flags.valid | vector_fcsr_flags_valid + csr.io.fcsr_flags.bits := (io.fpu.fcsr_flags.bits & Fill(5, io.fpu.fcsr_flags.valid)) | (vector_fcsr_flags & Fill(5, vector_fcsr_flags_valid)) io.fpu.time := csr.io.time(31,0) io.fpu.hartid := io.hartid csr.io.rocc_interrupt := io.rocc.interrupt csr.io.pc := wb_reg_pc + val tval_dmem_addr = !wb_reg_xcpt val tval_any_addr = tval_dmem_addr || wb_reg_cause.isOneOf(Causes.breakpoint.U, Causes.fetch_access.U, Causes.fetch_page_fault.U, Causes.fetch_guest_page_fault.U) @@ -751,6 +843,40 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val mhtinst_read_pseudo = (io.imem.gpa_is_pte && htval_valid_imem) || (io.dmem.s2_gpa_is_pte && htval_valid_dmem) (htval, mhtinst_read_pseudo) } + + csr.io.vector.foreach { v => + v.set_vconfig.valid := wb_reg_set_vconfig && wb_reg_valid + v.set_vconfig.bits := wb_reg_rs2.asTypeOf(new VConfig) + v.set_vs_dirty := wb_valid && wb_ctrl.vec + v.set_vstart.valid := wb_valid && wb_reg_set_vconfig + v.set_vstart.bits := 0.U + } + + io.vector.foreach { v => + when (v.wb.retire || v.wb.xcpt || wb_ctrl.vec) { + csr.io.pc := v.wb.pc + csr.io.retire := v.wb.retire + csr.io.inst(0) := v.wb.inst + when (v.wb.xcpt && !wb_reg_xcpt) { + wb_xcpt := true.B + wb_cause := v.wb.cause + csr.io.tval := v.wb.tval + } + } + v.wb.store_pending := io.dmem.store_pending + v.wb.vxrm := csr.io.vector.get.vxrm + v.wb.frm := csr.io.fcsr_rm + csr.io.vector.get.set_vxsat := v.set_vxsat + when (v.set_vconfig.valid) { + csr.io.vector.get.set_vconfig.valid := true.B + csr.io.vector.get.set_vconfig.bits := v.set_vconfig.bits + } + when (v.set_vstart.valid) { + csr.io.vector.get.set_vstart.valid := true.B + csr.io.vector.get.set_vstart.bits := v.set_vstart.bits + } + } + csr.io.htval := htval csr.io.mhtinst_read_pseudo := mhtinst_read_pseudo io.ptw.ptbr := csr.io.ptbr @@ -764,6 +890,8 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) csr.io.rw.addr := wb_reg_inst(31,20) csr.io.rw.cmd := CSR.maskCmd(wb_reg_valid, wb_ctrl.csr) csr.io.rw.wdata := wb_reg_wdata + + io.rocc.csrs <> csr.io.roccCSRs io.trace.time := csr.io.time io.trace.insns := csr.io.trace @@ -772,16 +900,25 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) if (sz < 1) { // use unsynthesizable ROB val csr_trace_with_wdata = WireInit(csr.io.trace(0)) csr_trace_with_wdata.wdata.get := rf_wdata + val should_wb = WireInit((wb_ctrl.wfd || (wb_ctrl.wxd && wb_waddr =/= 0.U)) && !csr.io.trace(0).exception) + val has_wb = WireInit(wb_ctrl.wxd && wb_wen && !wb_set_sboard) + val wb_addr = WireInit(wb_waddr + Mux(wb_ctrl.wfd, 32.U, 0.U)) + + io.vector.foreach { v => when (v.wb.retire) { + should_wb := v.wb.rob_should_wb + has_wb := false.B + wb_addr := Cat(v.wb.rob_should_wb_fp, csr_trace_with_wdata.insn(11,7)) + }} + DebugROB.pushTrace(clock, reset, io.hartid, csr_trace_with_wdata, - (wb_ctrl.wfd || (wb_ctrl.wxd && wb_waddr =/= 0.U)) && !csr.io.trace(0).exception, - wb_ctrl.wxd && wb_wen && !wb_set_sboard, - wb_waddr + Mux(wb_ctrl.wfd, 32.U, 0.U)) + should_wb, has_wb, wb_addr) io.trace.insns(0) := DebugROB.popTrace(clock, reset, io.hartid) DebugROB.pushWb(clock, reset, io.hartid, ll_wen, rf_waddr, rf_wdata) } else { // synthesizable ROB (no FPRs) + require(!usingVector, "Synthesizable ROB does not support vector implementations") val csr_trace_with_wdata = WireInit(csr.io.trace(0)) csr_trace_with_wdata.wdata.get := rf_wdata @@ -843,6 +980,10 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val fp_data_hazard_mem = id_ctrl.fp && mem_ctrl.wfd && checkHazards(fp_hazard_targets, _ === mem_waddr) val id_mem_hazard = mem_reg_valid && (data_hazard_mem && mem_cannot_bypass || fp_data_hazard_mem) id_load_use := mem_reg_valid && data_hazard_mem && mem_ctrl.mem + val id_vconfig_hazard = id_ctrl.vec && ( + (ex_reg_valid && ex_reg_set_vconfig) || + (mem_reg_valid && mem_reg_set_vconfig) || + (wb_reg_valid && wb_reg_set_vconfig)) // stall for RAW/WAW hazards on load/AMO misses and mul/div in writeback. val data_hazard_wb = wb_ctrl.wxd && checkHazards(hazard_targets, _ === wb_waddr) @@ -851,8 +992,9 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val id_stall_fpu = if (usingFPU) { val fp_sboard = new Scoreboard(32) - fp_sboard.set((wb_dcache_miss && wb_ctrl.wfd || io.fpu.sboard_set) && wb_valid, wb_waddr) - fp_sboard.clear(dmem_resp_replay && dmem_resp_fpu, dmem_resp_waddr) + fp_sboard.set(((wb_dcache_miss || wb_ctrl.vec) && wb_ctrl.wfd || io.fpu.sboard_set) && wb_valid, wb_waddr) + val v_ll = io.vector.map(v => v.resp.fire && v.resp.bits.fp).getOrElse(false.B) + fp_sboard.clear((dmem_resp_replay && dmem_resp_fpu) || v_ll, io.fpu.ll_resp_tag) fp_sboard.clear(io.fpu.sboard_clr, io.fpu.sboard_clra) checkHazards(fp_hazard_targets, fp_sboard.read _) @@ -869,8 +1011,10 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val ctrl_stalld = id_ex_hazard || id_mem_hazard || id_wb_hazard || id_sboard_hazard || + id_vconfig_hazard || csr.io.singleStep && (ex_reg_valid || mem_reg_valid || wb_reg_valid) || id_csr_en && csr.io.decode(0).fp_csr && !io.fpu.fcsr_rdy || + id_csr_en && csr.io.decode(0).vector_csr && id_vec_busy || id_ctrl.fp && id_stall_fpu || id_ctrl.mem && dcache_blocked || // reduce activity during D$ misses id_ctrl.rocc && rocc_blocked || // reduce activity while RoCC is busy @@ -890,7 +1034,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) mem_npc)) // flush or branch misprediction io.imem.flush_icache := wb_reg_valid && wb_ctrl.fence_i && !io.dmem.s2_nack io.imem.might_request := { - imem_might_request_reg := ex_pc_valid || mem_pc_valid || io.ptw.customCSRs.disableICacheClockGate + imem_might_request_reg := ex_pc_valid || mem_pc_valid || io.ptw.customCSRs.disableICacheClockGate || io.vector.map(_.trap_check_busy).getOrElse(false.B) imem_might_request_reg } io.imem.progress := RegNext(wb_reg_valid && !replay_wb_common) @@ -933,12 +1077,37 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) io.fpu.killm := killm_common io.fpu.inst := id_inst(0) io.fpu.fromint_data := ex_rs(0) - io.fpu.dmem_resp_val := dmem_resp_valid && dmem_resp_fpu - io.fpu.dmem_resp_data := (if (minFLen == 32) io.dmem.resp.bits.data_word_bypass else io.dmem.resp.bits.data) - io.fpu.dmem_resp_type := io.dmem.resp.bits.size - io.fpu.dmem_resp_tag := dmem_resp_waddr + io.fpu.ll_resp_val := dmem_resp_valid && dmem_resp_fpu + io.fpu.ll_resp_data := (if (minFLen == 32) io.dmem.resp.bits.data_word_bypass else io.dmem.resp.bits.data) + io.fpu.ll_resp_type := io.dmem.resp.bits.size + io.fpu.ll_resp_tag := dmem_resp_waddr io.fpu.keep_clock_enabled := io.ptw.customCSRs.disableCoreClockGate + io.fpu.v_sew := csr.io.vector.map(_.vconfig.vtype.vsew).getOrElse(0.U) + + io.vector.map { v => + when (!(dmem_resp_valid && dmem_resp_fpu)) { + io.fpu.ll_resp_val := v.resp.valid && v.resp.bits.fp + io.fpu.ll_resp_data := v.resp.bits.data + io.fpu.ll_resp_type := v.resp.bits.size + io.fpu.ll_resp_tag := v.resp.bits.rd + } + } + + io.vector.foreach { v => + v.ex.valid := ex_reg_valid && (ex_ctrl.vec || rocketParams.vector.get.issueVConfig.B && ex_reg_set_vconfig) && !ctrl_killx + v.ex.inst := ex_reg_inst + v.ex.vconfig := csr.io.vector.get.vconfig + v.ex.vstart := Mux(mem_reg_valid && mem_ctrl.vec || wb_reg_valid && wb_ctrl.vec, 0.U, csr.io.vector.get.vstart) + v.ex.rs1 := ex_rs(0) + v.ex.rs2 := ex_rs(1) + v.ex.pc := ex_reg_pc + v.mem.frs1 := io.fpu.store_data + v.killm := killm_common + v.status := csr.io.status + } + + io.dmem.req.valid := ex_reg_valid && ex_ctrl.mem val ex_dcache_tag = Cat(ex_waddr, ex_ctrl.fp) require(coreParams.dcacheReqTagBits >= ex_dcache_tag.getWidth) @@ -960,7 +1129,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) io.dmem.s1_data.data := (if (fLen == 0) mem_reg_rs2 else Mux(mem_ctrl.fp, Fill(coreDataBits / fLen, io.fpu.store_data), mem_reg_rs2)) io.dmem.s1_data.mask := DontCare - io.dmem.s1_kill := killm_common || mem_ldst_xcpt || fpu_kill_mem + io.dmem.s1_kill := killm_common || mem_ldst_xcpt || fpu_kill_mem || vec_kill_mem io.dmem.s2_kill := false.B // don't let D$ go to sleep if we're probably going to use it soon io.dmem.keep_clock_enabled := ibuf.io.inst(0).valid && id_ctrl.mem && !csr.io.csr_stall @@ -1087,7 +1256,7 @@ class Rocket(tile: RocketTile)(implicit p: Parameters) extends CoreModule()(p) val rocketImpl = withClock (gated_clock) { new RocketImpl } def checkExceptions(x: Seq[(Bool, UInt)]) = - (x.map(_._1).reduce(_||_), PriorityMux(x)) + (WireInit(x.map(_._1).reduce(_||_)), WireInit(PriorityMux(x))) def coverExceptions(exceptionValid: Bool, cause: UInt, labelPrefix: String, coverCausesLabels: Seq[(Int, String)]): Unit = { for ((coverCause, label) <- coverCausesLabels) { diff --git a/src/main/scala/rocket/TLB.scala b/src/main/scala/rocket/TLB.scala index 27ef9f008b..6f78db43cb 100644 --- a/src/main/scala/rocket/TLB.scala +++ b/src/main/scala/rocket/TLB.scala @@ -72,7 +72,7 @@ class TLBExceptions extends Bundle { val inst = Bool() } -class TLBResp(implicit p: Parameters) extends CoreBundle()(p) { +class TLBResp(lgMaxSize: Int = 3)(implicit p: Parameters) extends CoreBundle()(p) { // lookup responses val miss = Bool() /** physical address */ @@ -93,6 +93,10 @@ class TLBResp(implicit p: Parameters) extends CoreBundle()(p) { val must_alloc = Bool() /** if this address is prefetchable for caches*/ val prefetchable = Bool() + /** size/cmd of request that generated this response*/ + val size = UInt(log2Ceil(lgMaxSize + 1).W) + val cmd = UInt(M_SZ.W) + } class TLBEntryData(implicit p: Parameters) extends CoreBundle()(p) { @@ -317,7 +321,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T /** request from Core */ val req = Flipped(Decoupled(new TLBReq(lgMaxSize))) /** response to Core */ - val resp = Output(new TLBResp()) + val resp = Output(new TLBResp(lgMaxSize)) /** SFence Input */ val sfence = Flipped(Valid(new SFenceReq)) /** IO to PTW */ @@ -646,6 +650,8 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T io.resp.prefetchable := (prefetchable_array & hits).orR && edge.manager.managers.forall(m => !m.supportsAcquireB || m.supportsHint).B io.resp.miss := do_refill || vsatp_mode_mismatch || tlb_miss || multipleHits io.resp.paddr := Cat(ppn, io.req.bits.vaddr(pgIdxBits-1, 0)) + io.resp.size := io.req.bits.size + io.resp.cmd := io.req.bits.cmd io.resp.gpa_is_pte := vstage1_en && r_gpa_is_pte io.resp.gpa := { val page = Mux(!vstage1_en, Cat(bad_gpa, vpn), r_gpa >> pgIdxBits) diff --git a/src/main/scala/rocket/VectorUnit.scala b/src/main/scala/rocket/VectorUnit.scala new file mode 100644 index 0000000000..1a81c48d1f --- /dev/null +++ b/src/main/scala/rocket/VectorUnit.scala @@ -0,0 +1,98 @@ +package freechips.rocketchip.rocket + +import chisel3._ +import chisel3.util._ +import org.chipsalliance.cde.config._ +import freechips.rocketchip.tile._ +import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.tilelink._ + +case class RocketCoreVectorParams( + build: Parameters => RocketVectorUnit, + vLen: Int, + vMemDataBits: Int, + decoder: Parameters => RocketVectorDecoder, + useDCache: Boolean, + issueVConfig: Boolean +) + +class VectorCoreIO(implicit p: Parameters) extends CoreBundle()(p) { + val status = Input(new MStatus) + val ex = new Bundle { + val valid = Input(Bool()) + val ready = Output(Bool()) + val inst = Input(UInt(32.W)) + val pc = Input(UInt(vaddrBitsExtended.W)) + val vconfig = Input(new VConfig) + val vstart = Input(UInt(log2Ceil(maxVLMax).W)) + val rs1 = Input(UInt(xLen.W)) + val rs2 = Input(UInt(xLen.W)) + } + val killm = Input(Bool()) + val mem = new Bundle { + val frs1 = Input(UInt(fLen.W)) + val block_mem = Output(Bool()) + val block_all = Output(Bool()) + } + + val wb = new Bundle { + val store_pending = Input(Bool()) + val replay = Output(Bool()) + val retire = Output(Bool()) + val inst = Output(UInt(32.W)) + val rob_should_wb = Output(Bool()) // debug + val rob_should_wb_fp = Output(Bool()) // debug + val pc = Output(UInt(vaddrBitsExtended.W)) + val xcpt = Output(Bool()) + val cause = Output(UInt(log2Ceil(Causes.all.max).W)) + val tval = Output(UInt(coreMaxAddrBits.W)) + val vxrm = Input(UInt(2.W)) + val frm = Input(UInt(3.W)) + } + + val resp = Decoupled(new Bundle { + val fp = Bool() + val size = UInt(2.W) + val rd = UInt(5.W) + val data = UInt((xLen max fLen).W) + }) + + val set_vstart = Valid(UInt(log2Ceil(maxVLMax).W)) + val set_vxsat = Output(Bool()) + val set_vconfig = Valid(new VConfig) + val set_fflags = Valid(UInt(5.W)) + + val trap_check_busy = Output(Bool()) + val backend_busy = Output(Bool()) +} + +abstract class RocketVectorUnit(implicit p: Parameters) extends LazyModule { + val module: RocketVectorUnitModuleImp + val tlNode: TLNode = TLIdentityNode() + val atlNode: TLNode = TLIdentityNode() +} + +class RocketVectorUnitModuleImp(outer: RocketVectorUnit) extends LazyModuleImp(outer) { + val io = IO(new Bundle { + val core = new VectorCoreIO + val tlb = Flipped(new DCacheTLBPort) + val dmem = new HellaCacheIO + + val fp_req = Decoupled(new FPInput()) + val fp_resp = Flipped(Decoupled(new FPResult())) + }) +} + +abstract class RocketVectorDecoder(implicit p: Parameters) extends CoreModule()(p) { + val io = IO(new Bundle { + val inst = Input(UInt(32.W)) + val vconfig = Input(new VConfig) + val legal = Output(Bool()) + val fp = Output(Bool()) + val read_rs1 = Output(Bool()) + val read_rs2 = Output(Bool()) + val read_frs1 = Output(Bool()) + val write_rd = Output(Bool()) + val write_frd = Output(Bool()) + }) +} diff --git a/src/main/scala/tile/FPU.scala b/src/main/scala/tile/FPU.scala index 71d6e8d827..94f037e4a4 100644 --- a/src/main/scala/tile/FPU.scala +++ b/src/main/scala/tile/FPU.scala @@ -45,6 +45,7 @@ trait HasFPUCtrlSigs { val div = Bool() val sqrt = Bool() val wflags = Bool() + val vec = Bool() } class FPUCtrlSigs extends Bundle with HasFPUCtrlSigs @@ -57,121 +58,122 @@ class FPUDecoder(implicit p: Parameters) extends FPUModule()(p) { private val X2 = BitPat.dontCare(2) - val default = List(X,X,X,X,X,X,X,X2,X2,X,X,X,X,X,X,X) + val default = List(X,X,X,X,X,X,X,X2,X2,X,X,X,X,X,X,X,N) val h: Array[(BitPat, List[BitPat])] = - Array(FLH -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N), - FSH -> List(Y,N,N,Y,N,Y,X, I, H,N,Y,N,N,N,N,N), - FMV_H_X -> List(N,Y,N,N,N,X,X, H, I,Y,N,N,N,N,N,N), - FCVT_H_W -> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y), - FCVT_H_WU-> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y), - FCVT_H_L -> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y), - FCVT_H_LU-> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y), - FMV_X_H -> List(N,N,Y,N,N,N,X, I, H,N,Y,N,N,N,N,N), - FCLASS_H -> List(N,N,Y,N,N,N,X, H, H,N,Y,N,N,N,N,N), - FCVT_W_H -> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y), - FCVT_WU_H-> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y), - FCVT_L_H -> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y), - FCVT_LU_H-> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y), - FCVT_S_H -> List(N,Y,Y,N,N,N,X, H, S,N,N,Y,N,N,N,Y), - FCVT_H_S -> List(N,Y,Y,N,N,N,X, S, H,N,N,Y,N,N,N,Y), - FEQ_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y), - FLT_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y), - FLE_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y), - FSGNJ_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N), - FSGNJN_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N), - FSGNJX_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N), - FMIN_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,Y), - FMAX_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,Y), - FADD_H -> List(N,Y,Y,Y,N,N,Y, H, H,N,N,N,Y,N,N,Y), - FSUB_H -> List(N,Y,Y,Y,N,N,Y, H, H,N,N,N,Y,N,N,Y), - FMUL_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,N,Y,N,N,Y), - FMADD_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y), - FMSUB_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y), - FNMADD_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y), - FNMSUB_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y), - FDIV_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,N,N,Y,N,Y), - FSQRT_H -> List(N,Y,Y,N,N,N,X, H, H,N,N,N,N,N,Y,Y)) + Array(FLH -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N,N), + FSH -> List(Y,N,N,Y,N,Y,X, I, H,N,Y,N,N,N,N,N,N), + FMV_H_X -> List(N,Y,N,N,N,X,X, H, I,Y,N,N,N,N,N,N,N), + FCVT_H_W -> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y,N), + FCVT_H_WU-> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y,N), + FCVT_H_L -> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y,N), + FCVT_H_LU-> List(N,Y,N,N,N,X,X, H, H,Y,N,N,N,N,N,Y,N), + FMV_X_H -> List(N,N,Y,N,N,N,X, I, H,N,Y,N,N,N,N,N,N), + FCLASS_H -> List(N,N,Y,N,N,N,X, H, H,N,Y,N,N,N,N,N,N), + FCVT_W_H -> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y,N), + FCVT_WU_H-> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y,N), + FCVT_L_H -> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y,N), + FCVT_LU_H-> List(N,N,Y,N,N,N,X, H,X2,N,Y,N,N,N,N,Y,N), + FCVT_S_H -> List(N,Y,Y,N,N,N,X, H, S,N,N,Y,N,N,N,Y,N), + FCVT_H_S -> List(N,Y,Y,N,N,N,X, S, H,N,N,Y,N,N,N,Y,N), + FEQ_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y,N), + FLT_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y,N), + FLE_H -> List(N,N,Y,Y,N,N,N, H, H,N,Y,N,N,N,N,Y,N), + FSGNJ_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N,N), + FSGNJN_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N,N), + FSGNJX_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,N,N), + FMIN_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,Y,N), + FMAX_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,Y,N,N,N,Y,N), + FADD_H -> List(N,Y,Y,Y,N,N,Y, H, H,N,N,N,Y,N,N,Y,N), + FSUB_H -> List(N,Y,Y,Y,N,N,Y, H, H,N,N,N,Y,N,N,Y,N), + FMUL_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,N,Y,N,N,Y,N), + FMADD_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y,N), + FMSUB_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y,N), + FNMADD_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y,N), + FNMSUB_H -> List(N,Y,Y,Y,Y,N,N, H, H,N,N,N,Y,N,N,Y,N), + FDIV_H -> List(N,Y,Y,Y,N,N,N, H, H,N,N,N,N,Y,N,Y,N), + FSQRT_H -> List(N,Y,Y,N,N,N,X, H, H,N,N,N,N,N,Y,Y,N)) val f: Array[(BitPat, List[BitPat])] = - Array(FLW -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N), - FSW -> List(Y,N,N,Y,N,Y,X, I, S,N,Y,N,N,N,N,N), - FMV_W_X -> List(N,Y,N,N,N,X,X, S, I,Y,N,N,N,N,N,N), - FCVT_S_W -> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y), - FCVT_S_WU-> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y), - FCVT_S_L -> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y), - FCVT_S_LU-> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y), - FMV_X_W -> List(N,N,Y,N,N,N,X, I, S,N,Y,N,N,N,N,N), - FCLASS_S -> List(N,N,Y,N,N,N,X, S, S,N,Y,N,N,N,N,N), - FCVT_W_S -> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y), - FCVT_WU_S-> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y), - FCVT_L_S -> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y), - FCVT_LU_S-> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y), - FEQ_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y), - FLT_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y), - FLE_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y), - FSGNJ_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N), - FSGNJN_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N), - FSGNJX_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N), - FMIN_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,Y), - FMAX_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,Y), - FADD_S -> List(N,Y,Y,Y,N,N,Y, S, S,N,N,N,Y,N,N,Y), - FSUB_S -> List(N,Y,Y,Y,N,N,Y, S, S,N,N,N,Y,N,N,Y), - FMUL_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,N,Y,N,N,Y), - FMADD_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y), - FMSUB_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y), - FNMADD_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y), - FNMSUB_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y), - FDIV_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,N,N,Y,N,Y), - FSQRT_S -> List(N,Y,Y,N,N,N,X, S, S,N,N,N,N,N,Y,Y)) + Array(FLW -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N,N), + FSW -> List(Y,N,N,Y,N,Y,X, I, S,N,Y,N,N,N,N,N,N), + FMV_W_X -> List(N,Y,N,N,N,X,X, S, I,Y,N,N,N,N,N,N,N), + FCVT_S_W -> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y,N), + FCVT_S_WU-> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y,N), + FCVT_S_L -> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y,N), + FCVT_S_LU-> List(N,Y,N,N,N,X,X, S, S,Y,N,N,N,N,N,Y,N), + FMV_X_W -> List(N,N,Y,N,N,N,X, I, S,N,Y,N,N,N,N,N,N), + FCLASS_S -> List(N,N,Y,N,N,N,X, S, S,N,Y,N,N,N,N,N,N), + FCVT_W_S -> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y,N), + FCVT_WU_S-> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y,N), + FCVT_L_S -> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y,N), + FCVT_LU_S-> List(N,N,Y,N,N,N,X, S,X2,N,Y,N,N,N,N,Y,N), + FEQ_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y,N), + FLT_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y,N), + FLE_S -> List(N,N,Y,Y,N,N,N, S, S,N,Y,N,N,N,N,Y,N), + FSGNJ_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N,N), + FSGNJN_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N,N), + FSGNJX_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,N,N), + FMIN_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,Y,N), + FMAX_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,Y,N,N,N,Y,N), + FADD_S -> List(N,Y,Y,Y,N,N,Y, S, S,N,N,N,Y,N,N,Y,N), + FSUB_S -> List(N,Y,Y,Y,N,N,Y, S, S,N,N,N,Y,N,N,Y,N), + FMUL_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,N,Y,N,N,Y,N), + FMADD_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y,N), + FMSUB_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y,N), + FNMADD_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y,N), + FNMSUB_S -> List(N,Y,Y,Y,Y,N,N, S, S,N,N,N,Y,N,N,Y,N), + FDIV_S -> List(N,Y,Y,Y,N,N,N, S, S,N,N,N,N,Y,N,Y,N), + FSQRT_S -> List(N,Y,Y,N,N,N,X, S, S,N,N,N,N,N,Y,Y,N)) val d: Array[(BitPat, List[BitPat])] = - Array(FLD -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N), - FSD -> List(Y,N,N,Y,N,Y,X, I, D,N,Y,N,N,N,N,N), - FMV_D_X -> List(N,Y,N,N,N,X,X, D, I,Y,N,N,N,N,N,N), - FCVT_D_W -> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y), - FCVT_D_WU-> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y), - FCVT_D_L -> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y), - FCVT_D_LU-> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y), - FMV_X_D -> List(N,N,Y,N,N,N,X, I, D,N,Y,N,N,N,N,N), - FCLASS_D -> List(N,N,Y,N,N,N,X, D, D,N,Y,N,N,N,N,N), - FCVT_W_D -> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y), - FCVT_WU_D-> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y), - FCVT_L_D -> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y), - FCVT_LU_D-> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y), - FCVT_S_D -> List(N,Y,Y,N,N,N,X, D, S,N,N,Y,N,N,N,Y), - FCVT_D_S -> List(N,Y,Y,N,N,N,X, S, D,N,N,Y,N,N,N,Y), - FEQ_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y), - FLT_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y), - FLE_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y), - FSGNJ_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N), - FSGNJN_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N), - FSGNJX_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N), - FMIN_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,Y), - FMAX_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,Y), - FADD_D -> List(N,Y,Y,Y,N,N,Y, D, D,N,N,N,Y,N,N,Y), - FSUB_D -> List(N,Y,Y,Y,N,N,Y, D, D,N,N,N,Y,N,N,Y), - FMUL_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,N,Y,N,N,Y), - FMADD_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y), - FMSUB_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y), - FNMADD_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y), - FNMSUB_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y), - FDIV_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,N,N,Y,N,Y), - FSQRT_D -> List(N,Y,Y,N,N,N,X, D, D,N,N,N,N,N,Y,Y)) + Array(FLD -> List(Y,Y,N,N,N,X,X,X2,X2,N,N,N,N,N,N,N,N), + FSD -> List(Y,N,N,Y,N,Y,X, I, D,N,Y,N,N,N,N,N,N), + FMV_D_X -> List(N,Y,N,N,N,X,X, D, I,Y,N,N,N,N,N,N,N), + FCVT_D_W -> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y,N), + FCVT_D_WU-> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y,N), + FCVT_D_L -> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y,N), + FCVT_D_LU-> List(N,Y,N,N,N,X,X, D, D,Y,N,N,N,N,N,Y,N), + FMV_X_D -> List(N,N,Y,N,N,N,X, I, D,N,Y,N,N,N,N,N,N), + FCLASS_D -> List(N,N,Y,N,N,N,X, D, D,N,Y,N,N,N,N,N,N), + FCVT_W_D -> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y,N), + FCVT_WU_D-> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y,N), + FCVT_L_D -> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y,N), + FCVT_LU_D-> List(N,N,Y,N,N,N,X, D,X2,N,Y,N,N,N,N,Y,N), + FCVT_S_D -> List(N,Y,Y,N,N,N,X, D, S,N,N,Y,N,N,N,Y,N), + FCVT_D_S -> List(N,Y,Y,N,N,N,X, S, D,N,N,Y,N,N,N,Y,N), + FEQ_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y,N), + FLT_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y,N), + FLE_D -> List(N,N,Y,Y,N,N,N, D, D,N,Y,N,N,N,N,Y,N), + FSGNJ_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N,N), + FSGNJN_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N,N), + FSGNJX_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,N,N), + FMIN_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,Y,N), + FMAX_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,Y,N,N,N,Y,N), + FADD_D -> List(N,Y,Y,Y,N,N,Y, D, D,N,N,N,Y,N,N,Y,N), + FSUB_D -> List(N,Y,Y,Y,N,N,Y, D, D,N,N,N,Y,N,N,Y,N), + FMUL_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,N,Y,N,N,Y,N), + FMADD_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y,N), + FMSUB_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y,N), + FNMADD_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y,N), + FNMSUB_D -> List(N,Y,Y,Y,Y,N,N, D, D,N,N,N,Y,N,N,Y,N), + FDIV_D -> List(N,Y,Y,Y,N,N,N, D, D,N,N,N,N,Y,N,Y,N), + FSQRT_D -> List(N,Y,Y,N,N,N,X, D, D,N,N,N,N,N,Y,Y,N)) val fcvt_hd: Array[(BitPat, List[BitPat])] = - Array(FCVT_H_D -> List(N,Y,Y,N,N,N,X, D, H,N,N,Y,N,N,N,Y), - FCVT_D_H -> List(N,Y,Y,N,N,N,X, H, D,N,N,Y,N,N,N,Y)) + Array(FCVT_H_D -> List(N,Y,Y,N,N,N,X, D, H,N,N,Y,N,N,N,Y,N), + FCVT_D_H -> List(N,Y,Y,N,N,N,X, H, D,N,N,Y,N,N,N,Y,N)) + val vfmv_f_s: Array[(BitPat, List[BitPat])] = + Array(VFMV_F_S -> List(N,Y,N,N,N,N,X,X2,X2,N,N,N,N,N,N,N,Y)) - val insns = (minFLen, fLen) match { + val insns = ((minFLen, fLen) match { case (32, 32) => f case (16, 32) => h ++ f case (32, 64) => f ++ d case (16, 64) => h ++ f ++ d ++ fcvt_hd - case other => throw new Exception(s"minFLen = ${minFLen} & fLen = ${fLen} is an unsupported configuration") - } + }) ++ (if (usingVector) vfmv_f_s else Array[(BitPat, List[BitPat])]()) val decoder = DecodeLogic(io.inst, default, insns) val s = io.sigs val sigs = Seq(s.ldst, s.wen, s.ren1, s.ren2, s.ren3, s.swap12, s.swap23, s.typeTagIn, s.typeTagOut, s.fromint, s.toint, - s.fastpipe, s.fma, s.div, s.sqrt, s.wflags) + s.fastpipe, s.fma, s.div, s.sqrt, s.wflags, s.vec) sigs zip decoder map {case(s,d) => s := d} } @@ -185,13 +187,15 @@ class FPUCoreIO(implicit p: Parameters) extends CoreBundle()(p) { val fcsr_rm = Input(Bits(FPConstants.RM_SZ.W)) val fcsr_flags = Valid(Bits(FPConstants.FLAGS_SZ.W)) + val v_sew = Input(UInt(3.W)) + val store_data = Output(Bits(fLen.W)) val toint_data = Output(Bits(xLen.W)) - val dmem_resp_val = Input(Bool()) - val dmem_resp_type = Input(Bits(3.W)) - val dmem_resp_tag = Input(UInt(5.W)) - val dmem_resp_data = Input(Bits(fLen.W)) + val ll_resp_val = Input(Bool()) + val ll_resp_type = Input(Bits(3.W)) + val ll_resp_tag = Input(UInt(5.W)) + val ll_resp_data = Input(Bits(fLen.W)) val valid = Input(Bool()) val fcsr_rdy = Output(Bool()) @@ -468,6 +472,7 @@ class FPToInt(implicit p: Parameters) extends FPUModule()(p) with ShouldBeRetime val tag = in.typeTagOut val store = (floatTypes.map(t => if (t == FType.H) Fill(maxType.ieeeWidth / minXLen, ieee(in.in1)(15, 0).sextTo(minXLen)) else Fill(maxType.ieeeWidth / t.ieeeWidth, ieee(in.in1)(t.ieeeWidth - 1, 0))): Seq[UInt])(tag) + val toint = WireDefault(store) val intType = WireDefault(in.fmt(0)) io.out.bits.store := store @@ -740,18 +745,31 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { val fp_decoder = Module(new FPUDecoder) fp_decoder.io.inst := io.inst - val id_ctrl = fp_decoder.io.sigs + val id_ctrl = WireInit(fp_decoder.io.sigs) + coreParams match { case r: RocketCoreParams => r.vector.map(v => { + val v_decode = v.decoder(p) // Only need to get ren1 + v_decode.io.inst := io.inst + v_decode.io.vconfig := DontCare // core deals with this + when (v_decode.io.legal && v_decode.io.read_frs1) { + id_ctrl.ren1 := true.B + id_ctrl.swap12 := false.B + id_ctrl.toint := true.B + id_ctrl.typeTagIn := I + id_ctrl.typeTagOut := Mux(io.v_sew === 3.U, D, S) + } + when (v_decode.io.write_frd) { id_ctrl.wen := true.B } + })} val ex_reg_valid = RegNext(io.valid, false.B) val ex_reg_inst = RegEnable(io.inst, io.valid) val ex_reg_ctrl = RegEnable(id_ctrl, io.valid) val ex_ra = List.fill(3)(Reg(UInt())) - // load response - val load_wb = RegNext(io.dmem_resp_val) - val load_wb_typeTag = RegEnable(io.dmem_resp_type(1,0) - typeTagWbOffset, io.dmem_resp_val) - val load_wb_data = RegEnable(io.dmem_resp_data, io.dmem_resp_val) - val load_wb_tag = RegEnable(io.dmem_resp_tag, io.dmem_resp_val) + // load/vector response + val load_wb = RegNext(io.ll_resp_val) + val load_wb_typeTag = RegEnable(io.ll_resp_type(1,0) - typeTagWbOffset, io.ll_resp_val) + val load_wb_data = RegEnable(io.ll_resp_data, io.ll_resp_val) + val load_wb_tag = RegEnable(io.ll_resp_tag, io.ll_resp_val) class FPUImpl { // entering gated-clock domain @@ -835,6 +853,10 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { req.fmaCmd := ex_reg_inst(3,2) | (!ex_ctrl.ren3 && ex_reg_inst(27)) when (ex_cp_valid) { req := io.cp_req.bits + when (io.cp_req.bits.swap12) { + req.in1 := io.cp_req.bits.in2 + req.in2 := io.cp_req.bits.in1 + } when (io.cp_req.bits.swap23) { req.in2 := io.cp_req.bits.in3 req.in3 := io.cp_req.bits.in2 @@ -870,6 +892,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { val divSqrt_wen = WireDefault(false.B) val divSqrt_inFlight = WireDefault(false.B) val divSqrt_waddr = Reg(UInt(5.W)) + val divSqrt_cp = Reg(Bool()) val divSqrt_typeTag = Wire(UInt(log2Up(floatTypes.size).W)) val divSqrt_wdata = Wire(UInt((fLen+1).W)) val divSqrt_flags = Wire(UInt(FPConstants.FLAGS_SZ.W)) @@ -934,6 +957,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { } val waddr = Mux(divSqrt_wen, divSqrt_waddr, wbInfo(0).rd) + val wb_cp = Mux(divSqrt_wen, divSqrt_cp, wbInfo(0).cp) val wtypeTag = Mux(divSqrt_wen, divSqrt_typeTag, wbInfo(0).typeTag) val wdata = box(Mux(divSqrt_wen, divSqrt_wdata, (pipes.map(_.res.data): Seq[UInt])(wbInfo(0).pipeid)), wtypeTag) val wexc = (pipes.map(_.res.exc): Seq[UInt])(wbInfo(0).pipeid) @@ -951,11 +975,12 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { DebugROB.pushWb(clock, reset, io.hartid, (!wbInfo(0).cp && wen(0)) || divSqrt_wen, waddr + 32.U, ieee(wdata)) } - when (wbInfo(0).cp && wen(0)) { + when (wb_cp && (wen(0) || divSqrt_wen)) { io.cp_resp.bits.data := wdata io.cp_resp.valid := true.B } - io.cp_req.ready := !ex_reg_valid + // Avoid structural hazards and nacking of external requests + io.cp_req.ready := !ex_reg_valid && !mem_reg_valid && !wb_reg_valid val wb_toint_valid = wb_reg_valid && wb_ctrl.toint val wb_toint_exc = RegEnable(fpiu.io.out.bits.exc, mem_ctrl.toint) @@ -968,9 +993,9 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { val divSqrt_write_port_busy = (mem_ctrl.div || mem_ctrl.sqrt) && wen.orR io.fcsr_rdy := !(ex_reg_valid && ex_ctrl.wflags || mem_reg_valid && mem_ctrl.wflags || wb_reg_valid && wb_ctrl.toint || wen.orR || divSqrt_inFlight) io.nack_mem := write_port_busy || divSqrt_write_port_busy || divSqrt_inFlight - io.dec <> fp_decoder.io.sigs + io.dec <> id_ctrl def useScoreboard(f: ((Pipe, Int)) => Bool) = pipes.zipWithIndex.filter(_._1.lat > 3).map(x => f(x)).fold(false.B)(_||_) - io.sboard_set := wb_reg_valid && !wb_cp_valid && RegNext(useScoreboard(_._1.cond(mem_ctrl)) || mem_ctrl.div || mem_ctrl.sqrt) + io.sboard_set := wb_reg_valid && !wb_cp_valid && RegNext(useScoreboard(_._1.cond(mem_ctrl)) || mem_ctrl.div || mem_ctrl.sqrt || mem_ctrl.vec) io.sboard_clr := !wb_cp_valid && (divSqrt_wen || (wen(0) && useScoreboard(x => wbInfo(0).pipeid === x._2.U))) io.sboard_clra := waddr ccover(io.sboard_clr && load_wb, "DUAL_WRITEBACK", "load and FMA writeback on same cycle") @@ -982,6 +1007,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { val divSqrt_killed = RegNext(divSqrt_inValid && killm, true.B) when (divSqrt_inValid) { divSqrt_waddr := mem_reg_inst(11,7) + divSqrt_cp := mem_cp_valid } ccover(divSqrt_inFlight && divSqrt_killed, "DIV_KILLED", "divide killed after issued to divider") @@ -1021,7 +1047,7 @@ class FPU(cfg: FPUParams)(implicit p: Parameters) extends FPUModule()(p) { mem_reg_valid || mem_cp_valid || // MEM stage wb_reg_valid || wb_cp_valid || // WB stage wen.orR || divSqrt_inFlight || // post-WB stage - io.dmem_resp_val // load writeback + io.ll_resp_val // load writeback } // leaving gated-clock domain val fpuImpl = withClock (gated_clock) { new FPUImpl } diff --git a/src/main/scala/tile/LazyRoCC.scala b/src/main/scala/tile/LazyRoCC.scala index d218d44bb8..3b869b9aca 100644 --- a/src/main/scala/tile/LazyRoCC.scala +++ b/src/main/scala/tile/LazyRoCC.scala @@ -95,7 +95,7 @@ trait HasLazyRoCC extends CanHavePTW { this: BaseTile => } trait HasLazyRoCCModule extends CanHavePTWModule - with HasCoreParameters { this: RocketTileModuleImp with HasFpuOpt => + with HasCoreParameters { this: RocketTileModuleImp => val (respArb, cmdRouter) = if(outer.roccs.nonEmpty) { val respArb = Module(new RRArbiter(new RoCCResponse()(outer.p), outer.roccs.size)) @@ -108,23 +108,6 @@ trait HasLazyRoCCModule extends CanHavePTWModule dcachePorts += dcIF.io.cache respArb.io.in(i) <> Queue(rocc.module.io.resp) } - - fpuOpt foreach { fpu => - val nFPUPorts = outer.roccs.count(_.usesFPU) - if (usingFPU && nFPUPorts > 0) { - val fpArb = Module(new InOrderArbiter(new FPInput()(outer.p), new FPResult()(outer.p), nFPUPorts)) - val fp_rocc_ios = outer.roccs.filter(_.usesFPU).map(_.module.io) - fpArb.io.in_req <> fp_rocc_ios.map(_.fpu_req) - fp_rocc_ios.zip(fpArb.io.in_resp).foreach { - case (rocc, arb) => rocc.fpu_resp <> arb - } - fpu.io.cp_req <> fpArb.io.out_req - fpArb.io.out_resp <> fpu.io.cp_resp - } else { - fpu.io.cp_req.valid := false.B - fpu.io.cp_resp.ready := false.B - } - } (Some(respArb), Some(cmdRouter)) } else { (None, None) diff --git a/src/main/scala/tile/RocketTile.scala b/src/main/scala/tile/RocketTile.scala index ceee449f5e..f1d4803d31 100644 --- a/src/main/scala/tile/RocketTile.scala +++ b/src/main/scala/tile/RocketTile.scala @@ -22,7 +22,7 @@ import freechips.rocketchip.rocket.{ } import freechips.rocketchip.subsystem.HierarchicalElementCrossingParamsLike import freechips.rocketchip.prci.ClockSinkParameters -import freechips.rocketchip.util.Annotated +import freechips.rocketchip.util.{Annotated, InOrderArbiter} import freechips.rocketchip.util.BooleanToAugmentedBoolean @@ -39,7 +39,7 @@ case class RocketTileParams( blockerCtrlAddr: Option[BigInt] = None, clockSinkParams: ClockSinkParameters = ClockSinkParameters(), boundaryBuffers: Option[RocketTileBoundaryBufferParams] = None - ) extends InstantiableTileParams[RocketTile] { + ) extends InstantiableTileParams[RocketTile] { require(icache.isDefined) require(dcache.isDefined) val baseName = "rockettile" @@ -93,7 +93,7 @@ class RocketTile private( masterNode :=* tlOtherMastersNode DisableMonitors { implicit p => tlSlaveXbar.node :*= slaveNode } - nDCachePorts += 1 /*core */ + (dtim_adapter.isDefined).toInt + nDCachePorts += 1 /*core */ + (dtim_adapter.isDefined).toInt + rocketParams.core.vector.map(_.useDCache.toInt).getOrElse(0) val dtimProperty = dtim_adapter.map(d => Map( "sifive,dtim" -> d.device.asProperty)).getOrElse(Nil) @@ -112,6 +112,11 @@ class RocketTile private( } } + val vector_unit = rocketParams.core.vector.map(v => LazyModule(v.build(p))) + vector_unit.foreach(vu => tlMasterXbar.node :=* vu.atlNode) + vector_unit.foreach(vu => tlOtherMastersNode :=* vu.tlNode) + + ResourceBinding { Resource(cpuDevice, "reg").bind(ResourceAddress(tileId)) } @@ -138,6 +143,10 @@ class RocketTileModuleImp(outer: RocketTile) extends BaseTileModuleImp(outer) Annotated.params(this, outer.rocketParams) val core = Module(new Rocket(outer)(outer.p)) + outer.vector_unit.foreach { v => + core.io.vector.get <> v.module.io.core + v.module.io.tlb <> outer.dcache.module.io.tlb_port + } // reset vector is connected in the Frontend to s2_pc core.io.reset_vector := DontCare @@ -177,12 +186,15 @@ class RocketTileModuleImp(outer: RocketTile) extends BaseTileModuleImp(outer) dcachePorts += core.io.dmem // TODO outer.dcachePorts += () => module.core.io.dmem ?? fpuOpt foreach { fpu => core.io.fpu :<>= fpu.io.waiveAs[FPUCoreIO](_.cp_req, _.cp_resp) - fpu.io.cp_req := DontCare - fpu.io.cp_resp := DontCare } if (fpuOpt.isEmpty) { core.io.fpu := DontCare } + outer.vector_unit foreach { v => if (outer.rocketParams.core.vector.get.useDCache) { + dcachePorts += v.module.io.dmem + } else { + v.module.io.dmem := DontCare + } } core.io.ptw <> ptw.io.dpath // Connect the coprocessor interfaces @@ -226,4 +238,27 @@ class RocketTileModuleImp(outer: RocketTile) extends BaseTileModuleImp(outer) trait HasFpuOpt { this: RocketTileModuleImp => val fpuOpt = outer.tileParams.core.fpu.map(params => Module(new FPU(params)(outer.p))) + fpuOpt.foreach { fpu => + val nRoCCFPUPorts = outer.roccs.count(_.usesFPU) + val nFPUPorts = nRoCCFPUPorts + outer.rocketParams.core.useVector.toInt + if (nFPUPorts > 0) { + val fpArb = Module(new InOrderArbiter(new FPInput()(outer.p), new FPResult()(outer.p), nFPUPorts)) + fpu.io.cp_req <> fpArb.io.out_req + fpArb.io.out_resp <> fpu.io.cp_resp + + val fp_rocc_ios = outer.roccs.filter(_.usesFPU).map(_.module.io) + for (i <- 0 until nRoCCFPUPorts) { + fpArb.io.in_req(i) <> fp_rocc_ios(i).fpu_req + fp_rocc_ios(i).fpu_resp <> fpArb.io.in_resp(i) + } + outer.vector_unit.foreach(vu => { + fpArb.io.in_req(nRoCCFPUPorts) <> vu.module.io.fp_req + vu.module.io.fp_resp <> fpArb.io.in_resp(nRoCCFPUPorts) + }) + } else { + fpu.io.cp_req.valid := false.B + fpu.io.cp_req.bits := DontCare + fpu.io.cp_resp.ready := false.B + } + } } diff --git a/src/main/scala/tilelink/SRAM.scala b/src/main/scala/tilelink/SRAM.scala index 3b3b8eecd5..48849a06bf 100644 --- a/src/main/scala/tilelink/SRAM.scala +++ b/src/main/scala/tilelink/SRAM.scala @@ -50,7 +50,7 @@ class TLRAM( supportsPutFull = TransferSizes(1, beatBytes), supportsArithmetic = if (atomics) TransferSizes(1, beatBytes) else TransferSizes.none, supportsLogical = if (atomics) TransferSizes(1, beatBytes) else TransferSizes.none, - fifoId = Some(0))), // requests are handled in order + fifoId = Some(0)).v2copy(name=devName)), // requests are handled in order beatBytes = beatBytes, minLatency = 1))) // no bypass needed for this device diff --git a/src/main/scala/util/DelayQueue.scala b/src/main/scala/util/DelayQueue.scala index cdfa2f5074..47a9ace983 100644 --- a/src/main/scala/util/DelayQueue.scala +++ b/src/main/scala/util/DelayQueue.scala @@ -15,23 +15,23 @@ import chisel3.util._ * @param timer cycle count timer * @param entries cycle delay */ -class DelayQueue[T <: Data](gen: T, entries: Int) extends Module { +class DelayQueue[T <: Data](gen: T, entries: Int, width: Int) extends Module { val io = IO(new Bundle { val enq = Flipped(DecoupledIO(gen)) val deq = DecoupledIO(gen) - val timer = Input(UInt()) - val delay = Input(UInt()) + val timer = Input(UInt(width.W)) + val delay = Input(UInt(width.W)) }) val q = Module(new Queue(new Bundle { - val data = gen - val time = UInt(io.timer.getWidth.W) + val data = gen.cloneType + val time = UInt(width.W) }, entries, flow=true)) - val delay_r = RegInit(0.U(io.delay.getWidth.W)) + val delay_r = RegInit(0.U(width.W)) when (delay_r =/= io.delay) { delay_r := io.delay - assert(q.io.count == 0, "Undefined behavior when delay is changed while queue has elements.") + //assert(q.io.count == 0, "Undefined behavior when delay is changed while queue has elements.") } q.io.enq.bits.data := io.enq.bits @@ -53,7 +53,7 @@ object DelayQueue { * @param depth queue size */ def apply[T <: Data](source: DecoupledIO[T], timer: UInt, delay: UInt, depth: Int): DecoupledIO[T] = { - val delayQueue = Module(new DelayQueue(chiselTypeOf(source.bits), depth)) + val delayQueue = Module(new DelayQueue(chiselTypeOf(source.bits), depth, timer.getWidth)) delayQueue.io.enq <> source delayQueue.io.timer := timer delayQueue.io.delay := delay