diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 95c165b533ee..e5760d9f0039 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/filter diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 1e4026b9ef87..ca751bab253f 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator { Var yo, yi, xo, xi, ci, xii, yii; if (get_target().has_gpu_feature()) { normalize + .never_partition_all() .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) @@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator { for (int l = 1; l < levels; l++) { downsampled[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_tile(x, y, xi, yi, 16, 16); @@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator { for (int l = 3; l < levels; l += 2) { interpolated[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp) .tile(xi, yi, xii, yii, 2, 2) @@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator { upsampledx[1] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 1) .unroll(xi) @@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator { interpolated[1] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 2) .unroll(xi) @@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator { interpolated[2] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_threads(x, y); @@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator { // the local_laplacian app. 
downsampled[l] .compute_root() + .never_partition(x) .reorder(x, c, y) .split(y, yo, yi, 8) .parallel(yo) @@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> { .compute_at(downsampled[1], yi) .reorder(c, x, y) .unroll(c) - .vectorize(x, vec); + .vectorize(x, vec) + .never_partition(y); normalize .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) + .never_partition(y) .split(x, xo, xi, vec) .split(y, yo, yi, 32) .reorder(xi, c, xo, yi, yo) @@ -182,6 +191,7 @@ interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) + .never_partition_all() .vectorize(x, vec); } diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index a9f57b4de81a..a2c9991151f8 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/process diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ef305837c6cc..860540e74517 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) Func color; float eps = 0.01f; - color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps); + color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps); // Convert back to 16-bit - output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); + output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f)); /* ESTIMATES */ // (This can be useful in conjunction with RunGen and benchmarks as well @@ -102,10 +102,15 @@ // Nothing. 
} else if (get_target().has_gpu_feature()) { // GPU schedule. - // 3.19ms on an RTX 2060. + // 2.9ms on an RTX 2060. + + // All loop partitioning disabled, which has no effect on runtime, + // but saves 15% compile time and 45% ptx shader code size. remap.compute_root(); Var xi, yi; - output.compute_root().gpu_tile(x, y, xi, yi, 16, 8); + output.compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, 16, 8); for (int j = 0; j < J; j++) { int blockw = 16, blockh = 8; if (j > 3) { @@ -113,10 +118,20 @@ class LocalLaplacian : public Halide::Generator { blockh = 2; } if (j > 0) { - inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); - gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh); + inGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); + gPyramid[j] + .compute_root() + .reorder(k, x, y) + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } - outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); + outGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } } else { // CPU schedule. @@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator { remap.compute_root(); Var yo; - output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8); - gray.compute_root().parallel(y, 32).vectorize(x, 8); + output + .reorder(c, x, y) + .split(y, yo, y, 64) + .parallel(yo) + .vectorize(x, 8); + gray + .compute_root() + .never_partition(y) + .parallel(y, 32) + .vectorize(x, 8); for (int j = 1; j < 5; j++) { inGPyramid[j] .compute_root() @@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator { .store_at(output, yo) .compute_at(output, y) .fold_storage(y, 4) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); + if (j > 1) { + // Turn off loop partitioning at higher pyramid levels. 
This + // shaves about 3% off code size and compile time without + // affecting performance. + inGPyramid[j].never_partition_all(); + gPyramid[j].never_partition_all(); + } } outGPyramid[0] .compute_at(output, y) .hoist_storage(output, yo) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); for (int j = 5; j < J; j++) { inGPyramid[j].compute_root(); gPyramid[j].compute_root().parallel(k); diff --git a/src/Func.cpp b/src/Func.cpp index a8190876c6b2..37b64df5af5b 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1649,6 +1649,38 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) { return *this; } +Stage &Stage::never_partition(const std::vector<VarOrRVar> &vars) { + for (const auto &v : vars) { + partition(v, Partition::Never); + } + return *this; +} + +Stage &Stage::never_partition_all() { + definition.schedule().touched() = true; + vector<Dim> &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Never; + } + return *this; +} + +Stage &Stage::always_partition(const std::vector<VarOrRVar> &vars) { + for (const auto &v : vars) { + partition(v, Partition::Always); + } + return *this; +} + +Stage &Stage::always_partition_all() { + definition.schedule().touched() = true; + vector<Dim> &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Always; + } + return *this; +} + Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, @@ -2342,6 +2374,30 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) { return *this; } +Func &Func::never_partition(const std::vector<VarOrRVar> &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition(vars); + return *this; +} + +Func &Func::never_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition_all(); + return *this; +} + +Func &Func::always_partition(const std::vector<VarOrRVar> &vars) { + invalidate_cache(); + 
Stage(func, func.definition(), 0).always_partition(vars); + return *this; +} + +Func &Func::always_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition_all(); + return *this; +} + Func &Func::bound(const Var &var, Expr min, Expr extent) { user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n"; user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n"; diff --git a/src/Func.h b/src/Func.h index 2cad7160b823..ccadef338c29 100644 --- a/src/Func.h +++ b/src/Func.h @@ -349,6 +349,11 @@ class Stage { Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &partition(const VarOrRVar &var, Partition partition_policy); + Stage &never_partition_all(); + Stage &never_partition(const std::vector<VarOrRVar> &vars); + Stage &always_partition_all(); + Stage &always_partition(const std::vector<VarOrRVar> &vars); + Stage &tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, @@ -380,6 +385,20 @@ class Stage { return reorder(collected_args); } + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return never_partition(collected_args); + } + + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return always_partition(collected_args); + } + Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name); Stage specialize(const Expr &condition); void specialize_fail(const std::string &message); @@ -1450,6 +1469,40 @@ class Func { * The
default policy is Auto. */ Func &partition(const VarOrRVar &var, Partition partition_policy); + /** Set the loop partition policy to Never for a vector of Vars and + * RVars. */ + Func &never_partition(const std::vector<VarOrRVar> &vars); + + /** Set the loop partition policy to Never for some number of Vars and RVars. */ + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return never_partition(collected_args); + } + + /** Set the loop partition policy to Never for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &never_partition_all(); + + /** Set the loop partition policy to Always for a vector of Vars and + * RVars. */ + Func &always_partition(const std::vector<VarOrRVar> &vars); + + /** Set the loop partition policy to Always for some number of Vars and RVars. */ + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return always_partition(collected_args); + } + + /** Set the loop partition policy to Always for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &always_partition_all(); + /** Statically declare that the range over which a function should * be evaluated is given by the second and third arguments. This * can let Halide perform some optimizations. E.g. 
if you know diff --git a/src/Generator.h b/src/Generator.h index 9bc335b52ed7..1df0a1dda15b 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -3052,6 +3052,7 @@ class NamesInterface { using LoopLevel = Halide::LoopLevel; using MemoryType = Halide::MemoryType; using NameMangling = Halide::NameMangling; + using Partition = Halide::Partition; using Pipeline = Halide::Pipeline; using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy; using RDom = Halide::RDom; diff --git a/src/LoopPartitioningDirective.h b/src/LoopPartitioningDirective.h index 3189add52d1a..c4c14de48f2a 100644 --- a/src/LoopPartitioningDirective.h +++ b/src/LoopPartitioningDirective.h @@ -20,8 +20,9 @@ enum class Partition { /** Disallow loop partitioning. */ Never, - /** Force partitioning of the loop. If Halide can't find a way to partition this loop, - * it will raise an error. */ + /** Force partitioning of the loop, even in the tail cases of outer + * partitioned loops. If Halide can't find a way to partition this loop, it + * will raise an error. */ Always }; diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 7e2060d25c49..e2cf610d373a 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -517,10 +517,13 @@ class PartitionLoops : public IRMutator { using IRMutator::visit; bool in_gpu_loop = false; + bool in_tail = false; Stmt visit(const For *op) override { - // Do not partition if the schedule explicitly forbids. - if (op->partition_policy == Partition::Never) { + // Do not partition if the schedule explicitly forbids, or if it's set + // to automatic and we're in a loop tail. + if (op->partition_policy == Partition::Never || + (op->partition_policy == Partition::Auto && in_tail)) { return IRMutator::visit(op); } @@ -687,6 +690,13 @@ class PartitionLoops : public IRMutator { // Recurse on the middle section. 
simpler_body = mutate(simpler_body); + // Recurse on the prologue and epilogue, just for loops set to Partition::Always + { + ScopedValue s(in_tail, true); + epilogue = mutate(epilogue); + prologue = mutate(prologue); + } + // Construct variables for the bounds of the simplified middle section Expr min_steady = op->min, max_steady = op->extent + op->min; Expr prologue_val, epilogue_val; diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index fe834199f015..10a46ed94e2e 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -127,12 +127,12 @@ int main(int argc, char **argv) { count_partitions(g, 1); } - // The slicing applies to every loop level starting from the - // outermost one, but only recursively simplifies the clean steady - // state. It either splits things three (start, middle, end). So - // adding a boundary condition to a 2D computation will produce 5 - // code paths for the top, bottom, left, right, and center of the - // image. + // The slicing applies to every loop level starting from the outermost one, + // but only recursively simplifies the clean steady state. It either splits + // things three (start, middle, end). So adding a boundary condition to a 2D + // computation will produce 5 code paths for the top, bottom, left, right, + // and center of the image. With explicit control over loop partitioning, we + // might produce more or fewer. { Var y; Func g; @@ -144,7 +144,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition y, always partition x:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Never); count_partitions(h2, 3); // We expect left-center-right @@ -153,7 +152,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x, always partition y:\n"; Func h2 = h; - // check that disabling works. 
h2.partition(x, Partition::Never); h2.partition(y, Partition::Always); count_partitions(h2, 3); // We expect top-middle-bottom @@ -162,7 +160,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Never); count_partitions(h2, 1); @@ -171,10 +168,19 @@ int main(int argc, char **argv) { { debug(1) << "Always partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Always); - count_partitions(h2, 5); + // All loops get partitioned, including the tails of outer loops, so we expect 9 zones: + /* + ---------------------------------------------- + | top left | top middle | top right | + | ------------------------------------------ | + | left | middle | right | + | ------------------------------------------ | + | bottom left | bottom middle | bottom right | + ---------------------------------------------- + */ + count_partitions(h2, 9); } }