
[TOPI] Use integer arithmetic for topi.image.resize #13530

Closed

Conversation

Lunderberg
Contributor

Prior to this commit, floating-point expressions were used to map between different-sized pixel arrays. These floating-point expressions are less aggressively optimized by RewriteSimplifier, which can prevent some optimizations.

This was first noticed during investigation into issue #13508. Benchmarks of topi.image.resize showed 1000x and 50x performance improvements using the LLVM and CUDA backends, respectively, by using integer expressions instead of floating point. This performance improvement is partly driven by enabling
tir.transform.VectorizeLoops to recognize vectorizable indices, where the round-trip through floating point previously prevented that optimization.
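
As a rough illustration (a minimal sketch, not the exact expressions used in this PR): with the half_pixel coordinate transformation, the floating-point source index round((x + 0.5) * in_w / out_w - 0.5) can be rewritten purely in integer arithmetic as floordiv((2*x + 1) * in_w, 2 * out_w) (ignoring exact-.5 tie-breaking), which keeps the index visible to integer analysis.

import tvm
from tvm import te, tir

in_w, out_w = 224, 64  # example sizes only
x = te.var("x", dtype="int32")

# Float-based index: the round-trip through float32 hides the structure
# from integer analysis.
float_idx = tir.round((x.astype("float32") + 0.5) * (in_w / out_w) - 0.5).astype("int32")

# Integer-only equivalent (up to tie-breaking on exact halves).
int_idx = tir.floordiv((2 * x + 1) * in_w, 2 * out_w)

analyzer = tvm.arith.Analyzer()
print(analyzer.simplify(int_idx))    # remains a plain integer expression
print(analyzer.simplify(float_idx))  # the float cast and round are left in place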

@tvm-bot
Collaborator

tvm-bot commented Dec 1, 2022

Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from Reviewers by @-ing them in a comment.

  • No users to tag found in teams: topi. See #10317 for details

Generated by tvm-bot

@Lunderberg
Contributor Author

The performance testing script and the results on the LLVM, CUDA, and Vulkan backends are included below.

[Image: benchmark results on the LLVM, CUDA, and Vulkan backends]

Test script

First, apply the following diff to patch float-indexing back into topi.image.resize.

diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py
index 0383dd7ae..25675b614 100644
--- a/python/tvm/topi/image/resize.py
+++ b/python/tvm/topi/image/resize.py
@@ -661,6 +661,13 @@ def _resize_2d(
             roi[2],
         )
 
+    import os
+
+    use_float_indexing = "float" in os.environ.get("RESIZE_INDEXING", "")
+    if use_float_indexing:
+        in_x = in_x.astype("float32")
+        in_y = in_y.astype("float32")
+
     if method == "nearest_neighbor":
         if rounding_method == "":
             if coordinate_transformation_mode == "align_corners":

Then, run the following script.

#!/usr/bin/env python3

import os
import sys
import time

import numpy as np
import pytest

import tvm
from tvm import topi, te


pytest_plugins = [
    "pytest-benchmark",
    "tvm.testing.plugin",
]


def resize2d_dx_compute(inp, dy):
    """compute definition for resize2d_dx op"""
    size = (64, 32)
    layout = "NCHW"
    method = "cubic"
    coord_trans = "half_pixel"
    rounding_method = ""
    cubic_alpha = -0.75
    cubic_exclude = 0
    out_dtype = "float32"

    out = topi.image.resize2d(
        inp,
        (None, None, None, None),
        size,
        layout,
        method,
        coord_trans,
        rounding_method,
        bicubic_alpha=cubic_alpha,
        bicubic_exclude=cubic_exclude,
        out_dtype=out_dtype,
    )
    grads = tvm.te.gradient(out, [inp], head=dy)
    return grads


resize_indexing = tvm.testing.parameter("resize_int_indexing", "resize_float_indexing")
constrained_booleans = tvm.testing.parameter(
    by_dict={"kApplyConstraintsToBooleanBranches": "true", "": ""}
)


def _run_benchmark(
    benchmark, stage, resize_indexing, constrained_booleans="", target=None, dev=None
):
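    # RESIZE_INDEXING is read by the patched resize.py above;
    # REMOVE_NO_OP_CONSTRAINED_BOOLEANS selects the kApplyConstraintsToBooleanBranches
    # variant being benchmarked (see the parameter definitions above).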
    os.environ["RESIZE_INDEXING"] = resize_indexing
    os.environ["REMOVE_NO_OP_CONSTRAINED_BOOLEANS"] = constrained_booleans

    inp_shape = (32, 3, 32, 32)
    dy_shape = (32, 3, 64, 32)

    inp = tvm.te.placeholder(inp_shape, name="inp")
    dy = tvm.te.placeholder(dy_shape, name="dy")

    if stage == "topi":
        benchmark(resize2d_dx_compute, inp, dy)
        return
    else:
        grad = resize2d_dx_compute(inp, dy)[0]

    # if resize_indexing == "resize_float_indexing" and constrained_booleans:
    if constrained_booleans:
        pytest.skip("Runs too slowly to effectively benchmark")

    target = tvm.target.Target(target)
    with target:
        if "gpu" in target.keys:
            sch = topi.cuda.injective.schedule_injective(grad)
        else:
            sch = topi.x86.injective.schedule_injective(grad)

    if stage == "tvm.lower":
        benchmark(tvm.lower, sch, [inp, dy, grad], simple_mode=True)
        return
    else:
        tvm.lower(sch, [inp, dy, grad], simple_mode=True)

    if stage == "tvm.build":
        benchmark(tvm.build, sch, [inp, dy, grad], target=target)
        return
    else:
        func = tvm.build(sch, [inp, dy, grad], target=target)

    inp_np = np.random.uniform(size=inp_shape).astype(inp.dtype)
    dy_np = np.random.uniform(size=dy_shape).astype(inp.dtype)

    inp_tvm = tvm.nd.array(inp_np, dev)
    dy_tvm = tvm.nd.array(dy_np, dev)
    grad_tvm = tvm.nd.empty(grad.shape, grad.dtype, dev)

    def execute_read():
        func(inp_tvm, dy_tvm, grad_tvm)
        dev.sync()

    if stage == "execute":
        iterations = 10 if "llvm" == target.kind.name else 100
        benchmark.pedantic(execute_read, iterations=iterations, warmup_rounds=5, rounds=10)
        return


def test_benchmark_topi(benchmark, resize_indexing):
    _run_benchmark(benchmark, "topi", resize_indexing)


def test_benchmark_lowering(benchmark, resize_indexing, constrained_booleans, target):
    _run_benchmark(benchmark, "tvm.lower", resize_indexing, constrained_booleans, target)


def test_benchmark_build(benchmark, resize_indexing, constrained_booleans, target):
    _run_benchmark(benchmark, "tvm.build", resize_indexing, constrained_booleans, target)


def test_benchmark_execute(benchmark, resize_indexing, constrained_booleans, target, dev):
    _run_benchmark(benchmark, "execute", resize_indexing, constrained_booleans, target, dev)


if __name__ == "__main__":
    sys.exit(pytest.main(sys.argv))

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch 3 times, most recently from 4e4d4aa to 82c553f Compare December 1, 2022 17:11
@Lunderberg
Contributor Author

@comaniac @tqchen This PR should be the last step in closing issue #13508.

@huanmei9 @wrongtest-intellif If I understand #12315 correctly, this PR is effectively a stronger form of the same type of simplification. The main difference is that where #12315 applied to cases where the input index was an integer multiple of the resized index, this PR applies to any of the integer-to-integer indexing computations in topi.image.resize. Since the two PRs are closely related, any review/feedback would be appreciated.

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 82c553f to c27d266 Compare December 1, 2022 17:39
A review comment thread on the following text from the patch:

> to check if an optimization is permissible (e.g. vectorized
> computations require linear buffer access), use of integer
> expressions may provide significant performance benefits.
> However, writing the simplified form

Member

this sentence is not complete

@Lunderberg
Contributor Author

Thank you, and updated.

src/tir/transforms/remove_no_op.cc (outdated review thread, resolved)
@wrongtest-intellif
Contributor

> this PR is effectively a stronger form

Yes. Many thanks for the extension!

For #12315, it only tried to handle limited cases, due to the lack of a fraction representation.

It originally aimed to solve a tiling issue, because backend integer analysis works poorly with float values. We could not tile the resize space properly before... I think that after this PR, every mode of resize can be scheduled well :).

@Lunderberg
Contributor Author

@wrongtest-intellif Thank you, and that makes sense! I wanted to make sure that I was extending, and not removing, existing functionality.

Good point on the fraction representation. I was originally uncertain whether it would be overengineering, as compared to writing out the explicit integer formulas. The fun surprise for me was realizing that this approach would automatically handle cases where a user-provided parameter could be inferred to be a fraction (e.g. resizing with a cubic spline based on user-provided spline stiffness alpha).
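
For anyone curious what "inferred to be a fraction" can look like, here is a minimal standalone sketch (the helper below is illustrative, not code from this PR): the default cubic alpha of -0.75 is exactly a small rational, so index arithmetic involving it can stay in integers, while a value like 0.1 is not an exact binary fraction and would fall back to floating point.

from fractions import Fraction

def as_small_fraction(value, max_denominator=64):
    """Return the exact rational value of a float if its denominator is small."""
    frac = Fraction(value)  # exact rational value of the binary float
    return frac if frac.denominator <= max_denominator else None

print(as_small_fraction(-0.75))  # Fraction(-3, 4): the cubic alpha stays exact
print(as_small_fraction(0.1))    # None: 0.1 is not an exact binary fraction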

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from c980554 to 3a98d27 Compare January 4, 2023 16:39
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 5, 2023
Negative numerators to modulo/remainder operations are not supported
by the Vulkan API.  While the SPIR-V instructions
[`OpSRem`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSRem)
and
[`OpSMod`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSMod)
have identical semantics to `tir::Mod` and `tir::FloorMod`,
respectively, use of either instruction within Vulkan results in
undefined behavior.  From the [Vulkan
spec](https://registry.khronos.org/vulkan/specs/1.3/html/chap37.html#spirvenv-op-prec):

> For the OpSRem and OpSMod instructions, if either operand is
> negative the result is undefined.
>
> Note: While the OpSRem and OpSMod instructions are supported by the
> Vulkan environment, they require non-negative values and thus do not
> enable additional functionality beyond what OpUMod provides.

This issue was first noticed in
apache#13530, where use of integer
arithmetic resulted in negative numerators.  This hadn't caused issues
previously, because most uses of div/mod have a denominator that is a
power of two.  In these cases, `tir.LowerIntrin` implements floordiv
and floormod using only bitwise operations.  When the denominator
isn't a power of two, both `tir::FloorDiv` and `tir::FloorMod` are
implemented in terms of `tir::Mod`, which triggers the undefined
behavior for negative numerators.

This commit implements additional simplification rules that
preferentially removes negative values from the numerators.  For
example, simplifying `floormod(i - 2, 8)` to `floormod(i + 6, 8)`, and
simplifying `floordiv(i - 2, 8)` to `floordiv(i + 6, 8) - 1`.  These
handle the most common case, where some index variable is being offset
by a negative constant.
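
A quick numeric sanity check of those two rewrites, using plain Python (whose // and % already implement floordiv/floormod) rather than the actual RewriteSimplifier rules:

for i in range(-64, 64):
    assert (i - 2) % 8 == (i + 6) % 8        # floormod(i - 2, 8) == floormod(i + 6, 8)
    assert (i - 2) // 8 == (i + 6) // 8 - 1  # floordiv(i - 2, 8) == floordiv(i + 6, 8) - 1
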
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 7, 2023
This commit alters the lowering of FloorDiv/FloorMod to
TruncDiv/TruncMod, in cases where the denominator is positive, the
numerator is sometimes negative, and the range of the numerator is
known.  In these cases, the FloorDiv/FloorMod is now implemented by
offsetting the numerator such that it is always positive.
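
As a rough check of that rewrite (plain Python rather than the actual TIR lowering; the offset choice is only an illustration): with a positive denominator and a known lower bound on the numerator, floordiv/floormod can be recovered from truncating division on a shifted, non-negative numerator.

import math

def trunc_div(a, b):
    return int(a / b)  # C-style division, truncating toward zero

def trunc_mod(a, b):
    return a - b * trunc_div(a, b)

d = 7
lower_bound = -30                         # assumed known bound on the numerator
offset = math.ceil(-lower_bound / d) * d  # multiple of d making n + offset >= 0
for n in range(lower_bound, 31):
    assert n // d == trunc_div(n + offset, d) - offset // d
    assert n % d == trunc_mod(n + offset, d)
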
Lunderberg added a commit that referenced this pull request Jan 10, 2023
…13724)

* [Arith] Use ConstIntBound to remove negative numerator when lowering

* Add check to avoid -INT32_MIN

* Updated to use `tvm::min_value(DataType)`

* Added derivation for floordiv/floormod in terms of truncdiv/truncmod
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 6efeb40 to f96fa82 Compare January 11, 2023 19:55
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 12, 2023
The `crop_and_resize` operator uses floating-point arithmetic to
determine whether an index is within a view-box.  This can cause the
use of `extrapolation_value` to depend on target-dependent rounding
differences.

For example, this issue was initially noticed on Vulkan during
debugging of apache#13530, and was the
result of computing `0.2*223.0 + 0.8*223.0 < 223.0`.  If all
intermediates are cast to float32, the left-hand side evaluates to
`223.00002`.  If intermediates are kept at a higher precision, the
left-hand side evaluates to `223.0`.

The floating-point indexing can't be removed, because the operator
must match the API defined by TensorFlow's operator implementation.
The TensorFlow documentation for
[`CropAndResize`](https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/crop-and-resize)
does not specify behavior in these cases, nor do the current
TensorFlow unit tests check cases of rounding error.  Since the
TensorFlow unit tests only use binary fractions for the `boxes`
argument, which largely avoids the rounding issue, this commit updates
the TVM unit tests to avoid depending on floating-point precision.
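
The rounding behavior described above is easy to reproduce with plain numpy (a reproduction sketch, not TVM code):

import numpy as np

x = np.float32(223.0)
lhs = np.float32(0.2) * x + np.float32(0.8) * x  # every intermediate is float32
print(lhs)                 # 223.00002
print(float(lhs) > 223.0)  # True: enough to flip a boundary comparison
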
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 12, 2023
Parametrization helped in the debugging of
apache#13530, but is not otherwise related
to that PR.
echuraev pushed a commit that referenced this pull request Jan 13, 2023
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from e5ddc73 to ba19bf6 Compare January 24, 2023 17:16
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 25, 2023
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from ba19bf6 to 0391888 Compare January 25, 2023 14:43
fzi-peccia pushed a commit to fzi-peccia/tvm that referenced this pull request Mar 27, 2023
…pache#13724)

* [Arith] Use ConstIntBound to remove negative numerator when lowering

fzi-peccia pushed a commit to fzi-peccia/tvm that referenced this pull request Mar 27, 2023
…che#13774)

Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Mar 31, 2023
masahi pushed a commit that referenced this pull request Apr 5, 2023
…vide tests (#13773)

* [Test][Topi] Use binary fractions for crop_and_divide unit tests

* Use seeded random data for unit test

* Add epsilon offset to avoid depending on floating-point behavior

* Use randomly-generated boxes for unit tests

This mimics the example usage of `tf.image.crop_and_resize`, whose API
this operator is intended to follow.  Using any boxes with edges
representable as integer fractions has the potential for the in-bounds
check to be impacted by rounding error (e.g. `0.2*x + 0.8*x < x`).
Unfortunately, there's no way to remove this possibility altogether
without changing the API, such as accepting the box location as an
integer fraction, rather than a `float32`, but that would break
compatibility.

To avoid the risk of a flaky unit test based on the specific random
boxes used, the PRNG is seeded prior to generation.
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 0391888 to caa10fb Compare April 5, 2023 20:02
Lunderberg and others added 12 commits September 11, 2024 10:18
Relay uses `DataType::Void()` to represent unspecified data types, the
FFI converts `DataType` objects to strings, and `DataType::Void()` is
represented as the empty string.
These are passed into the topi resize library for dynamic relay
shapes, and should be supported as possible integer types.
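
A minimal sketch of the kind of handling this implies (the helper and the int64 default below are illustrative assumptions, not the code added by this PR): the empty string produced by `DataType::Void()` should be accepted and treated as an unspecified index dtype rather than rejected.

def normalize_index_dtype(dtype, default="int64"):
    """Treat Relay's unspecified dtype ("" from DataType::Void()) as the default."""
    if dtype == "":
        return default
    assert dtype.startswith("int"), f"expected an integer index dtype, got {dtype!r}"
    return dtype

print(normalize_index_dtype(""))       # "int64"
print(normalize_index_dtype("int32"))  # "int32"
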
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from caa10fb to dc07d5d Compare September 11, 2024 15:21
@Lunderberg
Contributor Author

Cleaning out old PRs, rebasing this onto main. If it passes CI, great, but if not, this clearly isn't a critical PR to land.

@Lunderberg Lunderberg closed this Sep 12, 2024