From 28365499356ce3a42fc7718c82378520f154947c Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Thu, 4 Jun 2020 18:03:06 -0700 Subject: [PATCH 1/4] TLB: Check for PutPartial support separately from PutFull Previously, the TLB would report that M_PWR (PutPartial) was permissible anywhere that M_XWR (PutFull) was. This is fine for cache transactions, since the caches can handle PutPartial internally. It's not fine for uncached transactions, since PutFull support does not imply PutPartial support. So, check for PutPartial support separately, similar to AMOs. If the access is cached, assume the access can proceed, since it'll be handled in cache. If the access is uncached, check the PMA. Rocket never issues M_PWR, so there's no functional change on that front. --- src/main/scala/rocket/TLB.scala | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/main/scala/rocket/TLB.scala b/src/main/scala/rocket/TLB.scala index 769ab62ba30..60aa0d97538 100644 --- a/src/main/scala/rocket/TLB.scala +++ b/src/main/scala/rocket/TLB.scala @@ -64,6 +64,7 @@ class TLBEntryData(implicit p: Parameters) extends CoreBundle()(p) { val pw = Bool() val px = Bool() val pr = Bool() + val ppp = Bool() // PutPartial val pal = Bool() // AMO logical val paa = Bool() // AMO arithmetic val eff = Bool() // get/put effects @@ -199,6 +200,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T val deny_access_to_debug = mpu_priv <= PRV.M && p(DebugModuleKey).map(dmp => dmp.address.contains(mpu_physaddr)).getOrElse(false) val prot_r = fastCheck(_.supportsGet) && !deny_access_to_debug && pmp.io.r val prot_w = fastCheck(_.supportsPutFull) && !deny_access_to_debug && pmp.io.w + val prot_pp = fastCheck(_.supportsPutPartial) val prot_al = fastCheck(_.supportsLogical) val prot_aa = fastCheck(_.supportsArithmetic) val prot_x = fastCheck(_.executable) && !deny_access_to_debug && pmp.io.x @@ -226,6 +228,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T newEntry.pr := prot_r newEntry.pw := prot_w newEntry.px := prot_x + newEntry.ppp := prot_pp newEntry.pal := prot_al newEntry.paa := prot_aa newEntry.eff := prot_eff @@ -260,8 +263,10 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~ptw_ae_array val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt) val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt) + val ppp_array = Cat(Fill(nPhysicalEntries, prot_pp), normal_entries.map(_.ppp).asUInt) val paa_array = Cat(Fill(nPhysicalEntries, prot_aa), normal_entries.map(_.paa).asUInt) val pal_array = Cat(Fill(nPhysicalEntries, prot_al), normal_entries.map(_.pal).asUInt) + val ppp_array_if_cached = ppp_array | c_array val paa_array_if_cached = paa_array | Mux(usingAtomicsInCache, c_array, 0.U) val pal_array_if_cached = pal_array | Mux(usingAtomicsInCache, c_array, 0.U) val prefetchable_array = Cat((cacheable && homogeneous) << (nPhysicalEntries-1), normal_entries.map(_.c).asUInt) @@ -280,6 +285,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T val cmd_lrsc = Bool(usingAtomics) && io.req.bits.cmd.isOneOf(M_XLR, M_XSC) val cmd_amo_logical = Bool(usingAtomics) && isAMOLogical(io.req.bits.cmd) val cmd_amo_arithmetic = Bool(usingAtomics) && isAMOArithmetic(io.req.bits.cmd) + val cmd_put_partial = io.req.bits.cmd === M_PWR val cmd_read = isRead(io.req.bits.cmd) val cmd_write = isWrite(io.req.bits.cmd) val cmd_write_perms = cmd_write || @@ -292,9 +298,11 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U) val ae_st_array = Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) | + Mux(cmd_put_partial, ~ppp_array_if_cached, 0.U) | Mux(cmd_amo_logical, ~pal_array_if_cached, 0.U) | Mux(cmd_amo_arithmetic, ~paa_array_if_cached, 0.U) val must_alloc_array = + Mux(cmd_put_partial, ~ppp_array, 0.U) | Mux(cmd_amo_logical, ~paa_array, 0.U) | Mux(cmd_amo_arithmetic, ~pal_array, 0.U) | Mux(cmd_lrsc, ~0.U(pal_array.getWidth.W), 0.U) From f34d7cad59702c301bf8b555fbd834e17fff3331 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Thu, 4 Jun 2020 22:22:24 -0700 Subject: [PATCH 2/4] Remove newly unnecessary D$ PutPartial assertion --- src/main/scala/rocket/DCache.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index cd051e76c15..c8fb8145012 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -897,7 +897,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { // We could consider turning some of these into dynamic PMA checks. require(!m.supportsAcquireB || m.supportsGet, "With a vector unit, cacheable memory must support Get") require(!m.supportsAcquireT || m.supportsPutPartial, "With a vector unit, cacheable memory must support PutPartial") - require(!m.supportsPutFull || m.supportsPutPartial, "With a vector unit, writable memory must support PutPartial") } } From 35e7da90f3f344bf52a3577fd6d7b09f20ca4f95 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 5 Jun 2020 14:02:10 -0700 Subject: [PATCH 3/4] D$: break up AMOALUs if coreDataBits > xLen Rather than having one big ALU with a coreDataBits-sized adder, have two smaller ALUs with shorter critical paths. This is just a QoR improvement for e.g. RV32D, which has xLen=32 and coreDataBits=64. For other uses of the D$ that require coreDataBits>64, it's also a functional fix. --- src/main/scala/rocket/DCache.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala index c8fb8145012..7d83e7822ec 100644 --- a/src/main/scala/rocket/DCache.scala +++ b/src/main/scala/rocket/DCache.scala @@ -877,15 +877,17 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) { // AMOs if (usingRMW) { - // when xLen < coreDataBits (e.g. RV32D), this AMOALU is wider than necessary - val amoalu = Module(new AMOALU(coreDataBits)) - amoalu.io.mask := pstore1_mask - amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR) - amoalu.io.lhs := s2_data_word - amoalu.io.rhs := pstore1_data - pstore1_storegen_data := (if (!usingDataScratchpad) amoalu.io.out else { + val amoalus = (0 until coreDataBits / xLen).map { i => + val amoalu = Module(new AMOALU(xLen)) + amoalu.io.mask := pstore1_mask >> (i * xBytes) + amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR) + amoalu.io.lhs := s2_data_word >> (i * xLen) + amoalu.io.rhs := pstore1_data >> (i * xLen) + amoalu + } + pstore1_storegen_data := (if (!usingDataScratchpad) amoalus.map(_.io.out).asUInt else { val mask = FillInterleaved(8, Mux(s2_correct, 0.U, pstore1_mask)) - amoalu.io.out_unmasked & mask | s2_data_word_corrected & ~mask + amoalus.map(_.io.out_unmasked).asUInt & mask | s2_data_word_corrected & ~mask }) } else if (!usingAtomics) { assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation") From f2a71ae28add5e5d9c413957f3e31274700257d2 Mon Sep 17 00:00:00 2001 From: Andrew Waterman Date: Fri, 5 Jun 2020 14:04:54 -0700 Subject: [PATCH 4/4] Temporarily disable tests hitting OOM in Actions --- regression/run-test-bucket | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/regression/run-test-bucket b/regression/run-test-bucket index 5ed757cdcc0..231be123666 100755 --- a/regression/run-test-bucket +++ b/regression/run-test-bucket @@ -51,8 +51,9 @@ bucket_number=$1 set -x case "${bucket_number}" in 1) - travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1 - travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1 + # Temporarily disable this bucket, which is hitting OOM on Actions + #travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1 + #travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1 ;; 2)