From 28365499356ce3a42fc7718c82378520f154947c Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Thu, 4 Jun 2020 18:03:06 -0700
Subject: [PATCH 1/4] TLB: Check for PutPartial support separately from PutFull

Previously, the TLB would report that M_PWR (PutPartial) was permissible
anywhere that M_XWR (PutFull) was.  This is fine for cached transactions,
since the caches can handle PutPartial internally.  It's not fine for
uncached transactions, since PutFull support does not imply PutPartial
support.

So, check for PutPartial support separately, similar to AMOs.  If the
access is cached, assume the access can proceed, since it'll be handled
in cache.  If the access is uncached, check the PMA.

Rocket never issues M_PWR, so there's no functional change on that front.
---
 src/main/scala/rocket/TLB.scala | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/main/scala/rocket/TLB.scala b/src/main/scala/rocket/TLB.scala
index 769ab62ba30..60aa0d97538 100644
--- a/src/main/scala/rocket/TLB.scala
+++ b/src/main/scala/rocket/TLB.scala
@@ -64,6 +64,7 @@ class TLBEntryData(implicit p: Parameters) extends CoreBundle()(p) {
   val pw = Bool()
   val px = Bool()
   val pr = Bool()
+  val ppp = Bool() // PutPartial
   val pal = Bool() // AMO logical
   val paa = Bool() // AMO arithmetic
   val eff = Bool() // get/put effects
@@ -199,6 +200,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val deny_access_to_debug = mpu_priv <= PRV.M && p(DebugModuleKey).map(dmp => dmp.address.contains(mpu_physaddr)).getOrElse(false)
   val prot_r = fastCheck(_.supportsGet) && !deny_access_to_debug && pmp.io.r
   val prot_w = fastCheck(_.supportsPutFull) && !deny_access_to_debug && pmp.io.w
+  val prot_pp = fastCheck(_.supportsPutPartial)
   val prot_al = fastCheck(_.supportsLogical)
   val prot_aa = fastCheck(_.supportsArithmetic)
   val prot_x = fastCheck(_.executable) && !deny_access_to_debug && pmp.io.x
@@ -226,6 +228,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
     newEntry.pr := prot_r
     newEntry.pw := prot_w
     newEntry.px := prot_x
+    newEntry.ppp := prot_pp
     newEntry.pal := prot_al
     newEntry.paa := prot_aa
     newEntry.eff := prot_eff
@@ -260,8 +263,10 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~ptw_ae_array
   val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt)
   val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt)
+  val ppp_array = Cat(Fill(nPhysicalEntries, prot_pp), normal_entries.map(_.ppp).asUInt)
   val paa_array = Cat(Fill(nPhysicalEntries, prot_aa), normal_entries.map(_.paa).asUInt)
   val pal_array = Cat(Fill(nPhysicalEntries, prot_al), normal_entries.map(_.pal).asUInt)
+  val ppp_array_if_cached = ppp_array | c_array
   val paa_array_if_cached = paa_array | Mux(usingAtomicsInCache, c_array, 0.U)
   val pal_array_if_cached = pal_array | Mux(usingAtomicsInCache, c_array, 0.U)
   val prefetchable_array = Cat((cacheable && homogeneous) << (nPhysicalEntries-1), normal_entries.map(_.c).asUInt)
@@ -280,6 +285,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val cmd_lrsc = Bool(usingAtomics) && io.req.bits.cmd.isOneOf(M_XLR, M_XSC)
   val cmd_amo_logical = Bool(usingAtomics) && isAMOLogical(io.req.bits.cmd)
   val cmd_amo_arithmetic = Bool(usingAtomics) && isAMOArithmetic(io.req.bits.cmd)
+  val cmd_put_partial = io.req.bits.cmd === M_PWR
   val cmd_read = isRead(io.req.bits.cmd)
   val cmd_write = isWrite(io.req.bits.cmd)
   val cmd_write_perms = cmd_write ||
@@ -292,9 +298,11 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U)
   val ae_st_array =
     Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) |
+    Mux(cmd_put_partial, ~ppp_array_if_cached, 0.U) |
     Mux(cmd_amo_logical, ~pal_array_if_cached, 0.U) |
     Mux(cmd_amo_arithmetic, ~paa_array_if_cached, 0.U)
   val must_alloc_array =
+    Mux(cmd_put_partial, ~ppp_array, 0.U) |
     Mux(cmd_amo_logical, ~paa_array, 0.U) |
     Mux(cmd_amo_arithmetic, ~pal_array, 0.U) |
     Mux(cmd_lrsc, ~0.U(pal_array.getWidth.W), 0.U)

From f34d7cad59702c301bf8b555fbd834e17fff3331 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Thu, 4 Jun 2020 22:22:24 -0700
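Subject: [PATCH 2/4] Remove newly unnecessary D$ PutPartial assertion

The TLB now checks PutPartial support separately from PutFull for each
access, so the D$ no longer needs to require at elaboration time that
all writable memory also supports PutPartial.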
---
 src/main/scala/rocket/DCache.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index cd051e76c15..c8fb8145012 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -897,7 +897,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
       // We could consider turning some of these into dynamic PMA checks.
       require(!m.supportsAcquireB || m.supportsGet, "With a vector unit, cacheable memory must support Get")
       require(!m.supportsAcquireT || m.supportsPutPartial, "With a vector unit, cacheable memory must support PutPartial")
-      require(!m.supportsPutFull || m.supportsPutPartial, "With a vector unit, writable memory must support PutPartial")
     }
   }
 

From 35e7da90f3f344bf52a3577fd6d7b09f20ca4f95 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Fri, 5 Jun 2020 14:02:10 -0700
Subject: [PATCH 3/4] D$: break up AMOALUs if coreDataBits > xLen

Rather than having one big ALU with a coreDataBits-sized adder,
instantiate coreDataBits/xLen smaller, xLen-wide ALUs with shorter
critical paths.  This is just a QoR improvement for e.g. RV32D, which
has xLen=32 and coreDataBits=64 (and therefore gets two ALUs).
For other uses of the D$ that require coreDataBits>64, it's also a
functional fix.
---
 src/main/scala/rocket/DCache.scala | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index c8fb8145012..7d83e7822ec 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -877,15 +877,17 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
 
   // AMOs
   if (usingRMW) {
-    // when xLen < coreDataBits (e.g. RV32D), this AMOALU is wider than necessary
-    val amoalu = Module(new AMOALU(coreDataBits))
-    amoalu.io.mask := pstore1_mask
-    amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR)
-    amoalu.io.lhs := s2_data_word
-    amoalu.io.rhs := pstore1_data
-    pstore1_storegen_data := (if (!usingDataScratchpad) amoalu.io.out else {
+    val amoalus = (0 until coreDataBits / xLen).map { i =>
+      val amoalu = Module(new AMOALU(xLen))
+      amoalu.io.mask := pstore1_mask >> (i * xBytes)
+      amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR)
+      amoalu.io.lhs := s2_data_word >> (i * xLen)
+      amoalu.io.rhs := pstore1_data >> (i * xLen)
+      amoalu
+    }
+    pstore1_storegen_data := (if (!usingDataScratchpad) amoalus.map(_.io.out).asUInt else {
       val mask = FillInterleaved(8, Mux(s2_correct, 0.U, pstore1_mask))
-      amoalu.io.out_unmasked & mask | s2_data_word_corrected & ~mask
+      amoalus.map(_.io.out_unmasked).asUInt & mask | s2_data_word_corrected & ~mask
     })
   } else if (!usingAtomics) {
     assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation")

From f2a71ae28add5e5d9c413957f3e31274700257d2 Mon Sep 17 00:00:00 2001
From: Andrew Waterman <andrew@sifive.com>
Date: Fri, 5 Jun 2020 14:04:54 -0700
Subject: [PATCH 4/4] Temporarily disable tests hitting OOM in Actions

---
 regression/run-test-bucket | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/regression/run-test-bucket b/regression/run-test-bucket
index 5ed757cdcc0..231be123666 100755
--- a/regression/run-test-bucket
+++ b/regression/run-test-bucket
@@ -51,8 +51,9 @@ bucket_number=$1
 set -x
 case "${bucket_number}" in
   1)
-    travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
-    travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
+    # Temporarily disable this bucket, which is hitting OOM on Actions
+    #travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
+    #travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
     ;;
 
   2)