diff --git a/apps/interpolate/Makefile b/apps/interpolate/Makefile index 95c165b533ee..e5760d9f0039 100644 --- a/apps/interpolate/Makefile +++ b/apps/interpolate/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/filter diff --git a/apps/interpolate/interpolate_generator.cpp b/apps/interpolate/interpolate_generator.cpp index 1e4026b9ef87..ca751bab253f 100644 --- a/apps/interpolate/interpolate_generator.cpp +++ b/apps/interpolate/interpolate_generator.cpp @@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator { Var yo, yi, xo, xi, ci, xii, yii; if (get_target().has_gpu_feature()) { normalize + .never_partition_all() .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) @@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator { for (int l = 1; l < levels; l++) { downsampled[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_tile(x, y, xi, yi, 16, 16); @@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator { for (int l = 3; l < levels; l += 2) { interpolated[l] .compute_root() + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp) .tile(xi, yi, xii, yii, 2, 2) @@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator { upsampledx[1] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 1) .unroll(xi) @@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator { interpolated[1] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .tile(x, y, xi, yi, 2, 2) .unroll(xi) @@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator { interpolated[2] .compute_at(normalize, x) + .never_partition_all() .reorder(c, x, y) .unroll(c) .gpu_threads(x, y); @@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator { // the local_laplacian app. 
downsampled[l] .compute_root() + .never_partition(x) .reorder(x, c, y) .split(y, yo, yi, 8) .parallel(yo) @@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> { .compute_at(downsampled[1], yi) .reorder(c, x, y) .unroll(c) - .vectorize(x, vec); + .vectorize(x, vec) + .never_partition(y); normalize .bound(x, 0, input.width()) .bound(y, 0, input.height()) .bound(c, 0, 3) + .never_partition(y) .split(x, xo, xi, vec) .split(y, yo, yi, 32) .reorder(xi, c, xo, yi, yo) @@ -182,6 +191,7 @@ interpolated[l] .store_at(normalize, yo) .compute_at(normalize, yi) + .never_partition_all() .vectorize(x, vec); } diff --git a/apps/local_laplacian/Makefile b/apps/local_laplacian/Makefile index a9f57b4de81a..a2c9991151f8 100644 --- a/apps/local_laplacian/Makefile +++ b/apps/local_laplacian/Makefile @@ -1,6 +1,7 @@ include ../support/Makefile.inc .PHONY: build clean test +.SECONDARY: build: $(BIN)/$(HL_TARGET)/process diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp index ef305837c6cc..860540e74517 100644 --- a/apps/local_laplacian/local_laplacian_generator.cpp +++ b/apps/local_laplacian/local_laplacian_generator.cpp @@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> { // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) Func color; float eps = 0.01f; - color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps); + color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps); // Convert back to 16-bit - output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); + output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f)); /* ESTIMATES */ // (This can be useful in conjunction with RunGen and benchmarks as well @@ -102,10 +102,15 @@ // Nothing. 
} else if (get_target().has_gpu_feature()) { // GPU schedule. - // 3.19ms on an RTX 2060. + // 2.9ms on an RTX 2060. + + // All loop partitioning disabled, which has no effect on runtime, + // but saves 15% compile time and 45% ptx shader code size. remap.compute_root(); Var xi, yi; - output.compute_root().gpu_tile(x, y, xi, yi, 16, 8); + output.compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, 16, 8); for (int j = 0; j < J; j++) { int blockw = 16, blockh = 8; if (j > 3) { @@ -113,10 +118,20 @@ class LocalLaplacian : public Halide::Generator { blockh = 2; } if (j > 0) { - inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); - gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh); + inGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); + gPyramid[j] + .compute_root() + .reorder(k, x, y) + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } - outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh); + outGPyramid[j] + .compute_root() + .never_partition_all() + .gpu_tile(x, y, xi, yi, blockw, blockh); } } else { // CPU schedule. @@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator { remap.compute_root(); Var yo; - output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8); - gray.compute_root().parallel(y, 32).vectorize(x, 8); + output + .reorder(c, x, y) + .split(y, yo, y, 64) + .parallel(yo) + .vectorize(x, 8); + gray + .compute_root() + .never_partition(y) + .parallel(y, 32) + .vectorize(x, 8); for (int j = 1; j < 5; j++) { inGPyramid[j] .compute_root() @@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator { .store_at(output, yo) .compute_at(output, y) .fold_storage(y, 4) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); + if (j > 1) { + // Turn off loop partitioning at higher pyramid levels. 
This + // shaves about 3% off code size and compile time without + // affecting performance. + inGPyramid[j].never_partition_all(); + gPyramid[j].never_partition_all(); + } } outGPyramid[0] .compute_at(output, y) .hoist_storage(output, yo) - .vectorize(x, 8); + .vectorize(x, 8, TailStrategy::RoundUp); for (int j = 5; j < J; j++) { inGPyramid[j].compute_root(); gPyramid[j].compute_root().parallel(k); diff --git a/src/Func.cpp b/src/Func.cpp index a8190876c6b2..37b64df5af5b 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -1649,6 +1649,38 @@ Stage &Stage::partition(const VarOrRVar &var, Partition policy) { return *this; } +Stage &Stage::never_partition(const std::vector<VarOrRVar> &vars) { + for (const auto &v : vars) { + partition(v, Partition::Never); + } + return *this; +} + +Stage &Stage::never_partition_all() { + definition.schedule().touched() = true; + vector<Dim> &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Never; + } + return *this; +} + +Stage &Stage::always_partition(const std::vector<VarOrRVar> &vars) { + for (const auto &v : vars) { + partition(v, Partition::Always); + } + return *this; +} + +Stage &Stage::always_partition_all() { + definition.schedule().touched() = true; + vector<Dim> &dims = definition.schedule().dims(); + for (auto &dim : dims) { + dim.partition_policy = Partition::Always; + } + return *this; +} + Stage &Stage::tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, @@ -2342,6 +2374,30 @@ Func &Func::partition(const VarOrRVar &var, Partition policy) { return *this; } +Func &Func::never_partition(const std::vector<VarOrRVar> &vars) { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition(vars); + return *this; +} + +Func &Func::never_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).never_partition_all(); + return *this; +} + +Func &Func::always_partition(const std::vector<VarOrRVar> &vars) { + invalidate_cache(); + 
Stage(func, func.definition(), 0).always_partition(vars); + return *this; +} + +Func &Func::always_partition_all() { + invalidate_cache(); + Stage(func, func.definition(), 0).always_partition_all(); + return *this; +} + Func &Func::bound(const Var &var, Expr min, Expr extent) { user_assert(!min.defined() || Int(32).can_represent(min.type())) << "Can't represent min bound in int32\n"; user_assert(extent.defined()) << "Extent bound of a Func can't be undefined\n"; diff --git a/src/Func.h b/src/Func.h index 2cad7160b823..ccadef338c29 100644 --- a/src/Func.h +++ b/src/Func.h @@ -349,6 +349,11 @@ class Stage { Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto); Stage &partition(const VarOrRVar &var, Partition partition_policy); + Stage &never_partition_all(); + Stage &never_partition(const std::vector<VarOrRVar> &vars); + Stage &always_partition_all(); + Stage &always_partition(const std::vector<VarOrRVar> &vars); + Stage &tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, @@ -380,6 +385,20 @@ class Stage { return reorder(collected_args); } + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return never_partition(collected_args); + } + + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return always_partition(collected_args); + } + Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name); Stage specialize(const Expr &condition); void specialize_fail(const std::string &message); @@ -1450,6 +1469,40 @@ class Func { * The
default policy is Auto. */ Func &partition(const VarOrRVar &var, Partition partition_policy); + /** Set the loop partition policy to Never for a vector of Vars and + * RVars. */ + Func &never_partition(const std::vector<VarOrRVar> &vars); + + /** Set the loop partition policy to Never for some number of Vars and RVars. */ + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type + never_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return never_partition(collected_args); + } + + /** Set the loop partition policy to Never for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &never_partition_all(); + + /** Set the loop partition policy to Always for a vector of Vars and + * RVars. */ + Func &always_partition(const std::vector<VarOrRVar> &vars); + + /** Set the loop partition policy to Always for some number of Vars and RVars. */ + template<typename... Args> + HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type + always_partition(const VarOrRVar &x, Args &&...args) { + std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...}; + return always_partition(collected_args); + } + + /** Set the loop partition policy to Always for all Vars and RVar of the + * initial definition of the Func. It must be called separately on any + * update definitions. */ + Func &always_partition_all(); + /** Statically declare that the range over which a function should * be evaluated is given by the second and third arguments. This * can let Halide perform some optimizations. E.g. 
if you know diff --git a/src/Generator.h b/src/Generator.h index 9bc335b52ed7..1df0a1dda15b 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -3052,6 +3052,7 @@ class NamesInterface { using LoopLevel = Halide::LoopLevel; using MemoryType = Halide::MemoryType; using NameMangling = Halide::NameMangling; + using Partition = Halide::Partition; using Pipeline = Halide::Pipeline; using PrefetchBoundStrategy = Halide::PrefetchBoundStrategy; using RDom = Halide::RDom; diff --git a/src/LoopPartitioningDirective.h b/src/LoopPartitioningDirective.h index 3189add52d1a..c4c14de48f2a 100644 --- a/src/LoopPartitioningDirective.h +++ b/src/LoopPartitioningDirective.h @@ -20,8 +20,9 @@ enum class Partition { /** Disallow loop partitioning. */ Never, - /** Force partitioning of the loop. If Halide can't find a way to partition this loop, - * it will raise an error. */ + /** Force partitioning of the loop, even in the tail cases of outer + * partitioned loops. If Halide can't find a way to partition this loop, it + * will raise an error. */ Always }; diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 7e2060d25c49..e2cf610d373a 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -517,10 +517,13 @@ class PartitionLoops : public IRMutator { using IRMutator::visit; bool in_gpu_loop = false; + bool in_tail = false; Stmt visit(const For *op) override { - // Do not partition if the schedule explicitly forbids. - if (op->partition_policy == Partition::Never) { + // Do not partition if the schedule explicitly forbids, or if it's set + // to automatic and we're in a loop tail. + if (op->partition_policy == Partition::Never || + (op->partition_policy == Partition::Auto && in_tail)) { return IRMutator::visit(op); } @@ -687,6 +690,13 @@ class PartitionLoops : public IRMutator { // Recurse on the middle section. 
simpler_body = mutate(simpler_body); + // Recurse on the prologue and epilogue, just for loops set to Partition::Always + { + ScopedValue s(in_tail, true); + epilogue = mutate(epilogue); + prologue = mutate(prologue); + } + // Construct variables for the bounds of the simplified middle section Expr min_steady = op->min, max_steady = op->extent + op->min; Expr prologue_val, epilogue_val; diff --git a/test/correctness/likely.cpp b/test/correctness/likely.cpp index fe834199f015..10a46ed94e2e 100644 --- a/test/correctness/likely.cpp +++ b/test/correctness/likely.cpp @@ -127,12 +127,12 @@ int main(int argc, char **argv) { count_partitions(g, 1); } - // The slicing applies to every loop level starting from the - // outermost one, but only recursively simplifies the clean steady - // state. It either splits things three (start, middle, end). So - // adding a boundary condition to a 2D computation will produce 5 - // code paths for the top, bottom, left, right, and center of the - // image. + // The slicing applies to every loop level starting from the outermost one, + // but only recursively simplifies the clean steady state. It either splits + // things three (start, middle, end). So adding a boundary condition to a 2D + // computation will produce 5 code paths for the top, bottom, left, right, + // and center of the image. With explicit control over loop partitioning, we + // might produce more or fewer. { Var y; Func g; @@ -144,7 +144,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition y, always partition x:\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Never); count_partitions(h2, 3); // We expect left-center-right @@ -153,7 +152,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x, always partition y:\n"; Func h2 = h; - // check that disabling works. 
h2.partition(x, Partition::Never); h2.partition(y, Partition::Always); count_partitions(h2, 3); // We expect top-middle-bottom @@ -162,7 +160,6 @@ int main(int argc, char **argv) { { debug(1) << "Never partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Never); h2.partition(y, Partition::Never); count_partitions(h2, 1); @@ -171,10 +168,19 @@ int main(int argc, char **argv) { { debug(1) << "Always partition x and y.\n"; Func h2 = h; - // check that disabling works. h2.partition(x, Partition::Always); h2.partition(y, Partition::Always); - count_partitions(h2, 5); + // All loops get partitioned, including the tails of outer loops, so we expect 9 zones: + /* + ---------------------------------------------- + | top left | top middle | top right | + | ------------------------------------------ | + | left | middle | right | + | ------------------------------------------ | + | bottom left | bottom middle | bottom right | + ---------------------------------------------- + */ + count_partitions(h2, 9); } }