diff --git a/regression/run-test-bucket b/regression/run-test-bucket
index 5ed757cdcc0..231be123666 100755
--- a/regression/run-test-bucket
+++ b/regression/run-test-bucket
@@ -51,8 +51,9 @@ bucket_number=$1
 set -x
 case "${bucket_number}" in
   1)
-    travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
-    travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
+    # Temporarily disable this bucket, which is hitting OOM on Actions
+    #travis_wait 100 make emulator-ndebug -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
+    #travis_wait 100 make emulator-regression-tests -C regression SUITE=UnittestSuite JVM_MEMORY=3G VERILATOR_THREADS=1
     ;;
 
   2)
diff --git a/src/main/scala/rocket/DCache.scala b/src/main/scala/rocket/DCache.scala
index cd051e76c15..7d83e7822ec 100644
--- a/src/main/scala/rocket/DCache.scala
+++ b/src/main/scala/rocket/DCache.scala
@@ -877,15 +877,17 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
 
   // AMOs
   if (usingRMW) {
-    // when xLen < coreDataBits (e.g. RV32D), this AMOALU is wider than necessary
-    val amoalu = Module(new AMOALU(coreDataBits))
-    amoalu.io.mask := pstore1_mask
-    amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR)
-    amoalu.io.lhs := s2_data_word
-    amoalu.io.rhs := pstore1_data
-    pstore1_storegen_data := (if (!usingDataScratchpad) amoalu.io.out else {
+    val amoalus = (0 until coreDataBits / xLen).map { i =>
+      val amoalu = Module(new AMOALU(xLen))
+      amoalu.io.mask := pstore1_mask >> (i * xBytes)
+      amoalu.io.cmd := (if (usingAtomicsInCache) pstore1_cmd else M_XWR)
+      amoalu.io.lhs := s2_data_word >> (i * xLen)
+      amoalu.io.rhs := pstore1_data >> (i * xLen)
+      amoalu
+    }
+    pstore1_storegen_data := (if (!usingDataScratchpad) amoalus.map(_.io.out).asUInt else {
       val mask = FillInterleaved(8, Mux(s2_correct, 0.U, pstore1_mask))
-      amoalu.io.out_unmasked & mask | s2_data_word_corrected & ~mask
+      amoalus.map(_.io.out_unmasked).asUInt & mask | s2_data_word_corrected & ~mask
     })
   } else if (!usingAtomics) {
     assert(!(s1_valid_masked && s1_read && s1_write), "unsupported D$ operation")
@@ -897,7 +899,6 @@ class DCacheModule(outer: DCache) extends HellaCacheModule(outer) {
       // We could consider turning some of these into dynamic PMA checks.
       require(!m.supportsAcquireB || m.supportsGet, "With a vector unit, cacheable memory must support Get")
       require(!m.supportsAcquireT || m.supportsPutPartial, "With a vector unit, cacheable memory must support PutPartial")
-      require(!m.supportsPutFull || m.supportsPutPartial, "With a vector unit, writable memory must support PutPartial")
     }
   }
 
diff --git a/src/main/scala/rocket/TLB.scala b/src/main/scala/rocket/TLB.scala
index 769ab62ba30..60aa0d97538 100644
--- a/src/main/scala/rocket/TLB.scala
+++ b/src/main/scala/rocket/TLB.scala
@@ -64,6 +64,7 @@ class TLBEntryData(implicit p: Parameters) extends CoreBundle()(p) {
   val pw = Bool()
   val px = Bool()
   val pr = Bool()
+  val ppp = Bool() // PutPartial
   val pal = Bool() // AMO logical
   val paa = Bool() // AMO arithmetic
   val eff = Bool() // get/put effects
@@ -199,6 +200,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val deny_access_to_debug = mpu_priv <= PRV.M && p(DebugModuleKey).map(dmp => dmp.address.contains(mpu_physaddr)).getOrElse(false)
   val prot_r = fastCheck(_.supportsGet) && !deny_access_to_debug && pmp.io.r
   val prot_w = fastCheck(_.supportsPutFull) && !deny_access_to_debug && pmp.io.w
+  val prot_pp = fastCheck(_.supportsPutPartial)
   val prot_al = fastCheck(_.supportsLogical)
   val prot_aa = fastCheck(_.supportsArithmetic)
   val prot_x = fastCheck(_.executable) && !deny_access_to_debug && pmp.io.x
@@ -226,6 +228,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
     newEntry.pr := prot_r
     newEntry.pw := prot_w
     newEntry.px := prot_x
+    newEntry.ppp := prot_pp
     newEntry.pal := prot_al
     newEntry.paa := prot_aa
     newEntry.eff := prot_eff
@@ -260,8 +263,10 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~ptw_ae_array
   val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt)
   val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt)
+  val ppp_array = Cat(Fill(nPhysicalEntries, prot_pp), normal_entries.map(_.ppp).asUInt)
   val paa_array = Cat(Fill(nPhysicalEntries, prot_aa), normal_entries.map(_.paa).asUInt)
   val pal_array = Cat(Fill(nPhysicalEntries, prot_al), normal_entries.map(_.pal).asUInt)
+  val ppp_array_if_cached = ppp_array | c_array
   val paa_array_if_cached = paa_array | Mux(usingAtomicsInCache, c_array, 0.U)
   val pal_array_if_cached = pal_array | Mux(usingAtomicsInCache, c_array, 0.U)
   val prefetchable_array = Cat((cacheable && homogeneous) << (nPhysicalEntries-1), normal_entries.map(_.c).asUInt)
@@ -280,6 +285,7 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val cmd_lrsc = Bool(usingAtomics) && io.req.bits.cmd.isOneOf(M_XLR, M_XSC)
   val cmd_amo_logical = Bool(usingAtomics) && isAMOLogical(io.req.bits.cmd)
   val cmd_amo_arithmetic = Bool(usingAtomics) && isAMOArithmetic(io.req.bits.cmd)
+  val cmd_put_partial = io.req.bits.cmd === M_PWR
   val cmd_read = isRead(io.req.bits.cmd)
   val cmd_write = isWrite(io.req.bits.cmd)
   val cmd_write_perms = cmd_write ||
@@ -292,9 +298,11 @@ class TLB(instruction: Boolean, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: T
   val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U)
   val ae_st_array =
     Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) |
+    Mux(cmd_put_partial, ~ppp_array_if_cached, 0.U) |
     Mux(cmd_amo_logical, ~pal_array_if_cached, 0.U) |
     Mux(cmd_amo_arithmetic, ~paa_array_if_cached, 0.U)
   val must_alloc_array =
+    Mux(cmd_put_partial, ~ppp_array, 0.U) |
     Mux(cmd_amo_logical, ~paa_array, 0.U) |
     Mux(cmd_amo_arithmetic, ~pal_array, 0.U) |
     Mux(cmd_lrsc, ~0.U(pal_array.getWidth.W), 0.U)