
[TOPI] Use integer arithmetic for topi.image.resize #13530

Closed

Conversation

Lunderberg
Contributor

Prior to this commit, floating-point expressions were used to map between different-sized pixel arrays. These floating-point expressions are less aggressively optimized by RewriteSimplifier, which can prevent some optimizations.

This was first noticed during investigation into issue #13508. Benchmarks of topi.image.resize showed 1000x and 50x performance improvements using the LLVM and CUDA backends, respectively, by using integer expressions instead of floating point. This performance improvement is partly driven by enabling
tir.transform.VectorizeLoops to recognize vectorizable indices, where the round-trip through floating point previously prevented that optimization.
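
As a rough illustration (a minimal sketch, not the exact expressions used in this PR): with the half_pixel coordinate transformation, the floating-point source index round((x + 0.5) * in_w / out_w - 0.5) can be rewritten purely in integer arithmetic as floordiv((2*x + 1) * in_w, 2 * out_w) (ignoring exact-.5 tie-breaking), which keeps the index visible to integer analysis.

import tvm
from tvm import te, tir

in_w, out_w = 224, 64  # example sizes only
x = te.var("x", dtype="int32")

# Float-based index: the round-trip through float32 hides the structure
# from integer analysis.
float_idx = tir.round((x.astype("float32") + 0.5) * (in_w / out_w) - 0.5).astype("int32")

# Integer-only equivalent (up to tie-breaking on exact halves).
int_idx = tir.floordiv((2 * x + 1) * in_w, 2 * out_w)

analyzer = tvm.arith.Analyzer()
print(analyzer.simplify(int_idx))    # remains a plain integer expression
print(analyzer.simplify(float_idx))  # the float cast and round are left in place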

@tvm-bot
Collaborator

tvm-bot commented Dec 1, 2022

Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from Reviewers by @-ing them in a comment.

  • No users to tag found in teams: topi. See #10317 for details

Generated by tvm-bot

@Lunderberg
Contributor Author

The performance testing script and the results on the LLVM, CUDA, and Vulkan backends are included below.

[Image: benchmark results on the LLVM, CUDA, and Vulkan backends]

Test script

First, apply the following diff to patch float-indexing back into topi.image.resize.

diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py
index 0383dd7ae..25675b614 100644
--- a/python/tvm/topi/image/resize.py
+++ b/python/tvm/topi/image/resize.py
@@ -661,6 +661,13 @@ def _resize_2d(
             roi[2],
         )
 
+    import os
+
+    use_float_indexing = "float" in os.environ.get("RESIZE_INDEXING", "")
+    if use_float_indexing:
+        in_x = in_x.astype("float32")
+        in_y = in_y.astype("float32")
+
     if method == "nearest_neighbor":
         if rounding_method == "":
             if coordinate_transformation_mode == "align_corners":

Then, run the following script.

#!/usr/bin/env python3

import os
import sys
import time

import numpy as np
import pytest

import tvm
from tvm import topi, te


pytest_plugins = [
    "pytest-benchmark",
    "tvm.testing.plugin",
]


def resize2d_dx_compute(inp, dy):
    """compute definition for resize2d_dx op"""
    size = (64, 32)
    layout = "NCHW"
    method = "cubic"
    coord_trans = "half_pixel"
    rounding_method = ""
    cubic_alpha = -0.75
    cubic_exclude = 0
    out_dtype = "float32"

    out = topi.image.resize2d(
        inp,
        (None, None, None, None),
        size,
        layout,
        method,
        coord_trans,
        rounding_method,
        bicubic_alpha=cubic_alpha,
        bicubic_exclude=cubic_exclude,
        out_dtype=out_dtype,
    )
    grads = tvm.te.gradient(out, [inp], head=dy)
    return grads


resize_indexing = tvm.testing.parameter("resize_int_indexing", "resize_float_indexing")
constrained_booleans = tvm.testing.parameter(
    by_dict={"kApplyConstraintsToBooleanBranches": "true", "": ""}
)


def _run_benchmark(
    benchmark, stage, resize_indexing, constrained_booleans="", target=None, dev=None
):
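    # RESIZE_INDEXING is read by the patched resize.py above;
    # REMOVE_NO_OP_CONSTRAINED_BOOLEANS selects the kApplyConstraintsToBooleanBranches
    # variant being benchmarked (see the parameter definitions above).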
    os.environ["RESIZE_INDEXING"] = resize_indexing
    os.environ["REMOVE_NO_OP_CONSTRAINED_BOOLEANS"] = constrained_booleans

    inp_shape = (32, 3, 32, 32)
    dy_shape = (32, 3, 64, 32)

    inp = tvm.te.placeholder(inp_shape, name="inp")
    dy = tvm.te.placeholder(dy_shape, name="dy")

    if stage == "topi":
        benchmark(resize2d_dx_compute, inp, dy)
        return
    else:
        grad = resize2d_dx_compute(inp, dy)[0]

    # if resize_indexing == "resize_float_indexing" and constrained_booleans:
    if constrained_booleans:
        pytest.skip("Runs too slowly to effectively benchmark")

    target = tvm.target.Target(target)
    with target:
        if "gpu" in target.keys:
            sch = topi.cuda.injective.schedule_injective(grad)
        else:
            sch = topi.x86.injective.schedule_injective(grad)

    if stage == "tvm.lower":
        benchmark(tvm.lower, sch, [inp, dy, grad], simple_mode=True)
        return
    else:
        tvm.lower(sch, [inp, dy, grad], simple_mode=True)

    if stage == "tvm.build":
        benchmark(tvm.build, sch, [inp, dy, grad], target=target)
        return
    else:
        func = tvm.build(sch, [inp, dy, grad], target=target)

    inp_np = np.random.uniform(size=inp_shape).astype(inp.dtype)
    dy_np = np.random.uniform(size=dy_shape).astype(inp.dtype)

    inp_tvm = tvm.nd.array(inp_np, dev)
    dy_tvm = tvm.nd.array(dy_np, dev)
    grad_tvm = tvm.nd.empty(grad.shape, grad.dtype, dev)

    def execute_read():
        func(inp_tvm, dy_tvm, grad_tvm)
        dev.sync()

    if stage == "execute":
        iterations = 10 if "llvm" == target.kind.name else 100
        benchmark.pedantic(execute_read, iterations=iterations, warmup_rounds=5, rounds=10)
        return


def test_benchmark_topi(benchmark, resize_indexing):
    _run_benchmark(benchmark, "topi", resize_indexing)


def test_benchmark_lowering(benchmark, resize_indexing, constrained_booleans, target):
    _run_benchmark(benchmark, "tvm.lower", resize_indexing, constrained_booleans, target)


def test_benchmark_build(benchmark, resize_indexing, constrained_booleans, target):
    _run_benchmark(benchmark, "tvm.build", resize_indexing, constrained_booleans, target)


def test_benchmark_execute(benchmark, resize_indexing, constrained_booleans, target, dev):
    _run_benchmark(benchmark, "execute", resize_indexing, constrained_booleans, target, dev)


if __name__ == "__main__":
    sys.exit(pytest.main(sys.argv))

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch 3 times, most recently from 4e4d4aa to 82c553f Compare December 1, 2022 17:11
@Lunderberg
Contributor Author

@comaniac @tqchen This PR should be the last step in closing issue #13508.

@huanmei9 @wrongtest-intellif If I understand #12315 correctly, this PR is effectively a stronger form of the same type of simplification. The main difference is that where #12315 applied to cases where the input index was an integer multiple of the resized index, this PR applies to any of the integer-to-integer indexing computations in topi.image.resize. Since the two PRs are closely related, any review/feedback would be appreciated.

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 82c553f to c27d266 Compare December 1, 2022 17:39
A review comment thread on the following text from the patch:

> to check if an optimization is permissible (e.g. vectorized
> computations require linear buffer access), use of integer
> expressions may provide significant performance benefits.
> However, writing the simplified form

Member

this sentence is not complete

@Lunderberg
Contributor Author

Thank you, and updated.

src/tir/transforms/remove_no_op.cc (outdated review thread, resolved)
@wrongtest-intellif
Contributor

> this PR is effectively a stronger form

Yes. Many thanks for the extension!

For #12315, it only tried to handle limited cases, due to the lack of a fraction representation.

It originally aimed to solve a tiling issue, because backend integer analysis works poorly with float values. We could not tile the resize space properly before... I think that after this PR, every mode of resize can be scheduled well :).

@Lunderberg
Contributor Author

@wrongtest-intellif Thank you, and that makes sense! I wanted to make sure that I was extending, and not removing, existing functionality.

Good point on the fraction representation. I was originally uncertain whether it would be overengineering, as compared to writing out the explicit integer formulas. The fun surprise for me was realizing that this approach would automatically handle cases where a user-provided parameter could be inferred to be a fraction (e.g. resizing with a cubic spline based on user-provided spline stiffness alpha).
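
For anyone curious what "inferred to be a fraction" can look like, here is a minimal standalone sketch (the helper below is illustrative, not code from this PR): the default cubic alpha of -0.75 is exactly a small rational, so index arithmetic involving it can stay in integers, while a value like 0.1 is not an exact binary fraction and would fall back to floating point.

from fractions import Fraction

def as_small_fraction(value, max_denominator=64):
    """Return the exact rational value of a float if its denominator is small."""
    frac = Fraction(value)  # exact rational value of the binary float
    return frac if frac.denominator <= max_denominator else None

print(as_small_fraction(-0.75))  # Fraction(-3, 4): the cubic alpha stays exact
print(as_small_fraction(0.1))    # None: 0.1 is not an exact binary fraction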

@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from c980554 to 3a98d27 Compare January 4, 2023 16:39
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 5, 2023
Negative numerators to modulo/remainder operations are not supported
by the Vulkan API.  While the SPIR-V instructions
[`OpSRem`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSRem)
and
[`OpSMod`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSMod)
have identical semantics to `tir::Mod` and `tir::FloorMod`,
respectively, use of either instruction within Vulkan results in
undefined behavior.  From the [Vulkan
spec](https://registry.khronos.org/vulkan/specs/1.3/html/chap37.html#spirvenv-op-prec):

> For the OpSRem and OpSMod instructions, if either operand is
> negative the result is undefined.
>
> Note: While the OpSRem and OpSMod instructions are supported by the
> Vulkan environment, they require non-negative values and thus do not
> enable additional functionality beyond what OpUMod provides.

This issue was first noticed in
apache#13530, where use of integer
arithmetic resulted in negative numerators.  This hadn't caused issues
previously, because most uses of div/mod have a denominator that is a
power of two.  In these cases, `tir.LowerIntrin` implements floordiv
and floormod using only bitwise operations.  When the denominator
isn't a power of two, both `tir::FloorDiv` and `tir::FloorMod` are
implemented in terms of `tir::Mod`, which triggers the undefined
behavior for negative numerators.

This commit implements additional simplification rules that
preferentially removes negative values from the numerators.  For
example, simplifying `floormod(i - 2, 8)` to `floormod(i + 6, 8)`, and
simplifying `floordiv(i - 2, 8)` to `floordiv(i + 6, 8) - 1`.  These
handle the most common case, where some index variable is being offset
by a negative constant.
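
A quick numeric sanity check of those two rewrites, using plain Python (whose // and % already implement floordiv/floormod) rather than the actual RewriteSimplifier rules:

for i in range(-64, 64):
    assert (i - 2) % 8 == (i + 6) % 8        # floormod(i - 2, 8) == floormod(i + 6, 8)
    assert (i - 2) // 8 == (i + 6) // 8 - 1  # floordiv(i - 2, 8) == floordiv(i + 6, 8) - 1
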
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 7, 2023
This commit alters the lowering of FloorDiv/FloorMod to
TruncDiv/TruncMod, in cases where the denominator is positive, the
numerator is sometimes negative, and the range of the numerator is
known.  In these cases, the FloorDiv/FloorMod is now implemented by
offsetting the numerator such that it is always positive.
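
As a rough check of that rewrite (plain Python rather than the actual TIR lowering; the offset choice is only an illustration): with a positive denominator and a known lower bound on the numerator, floordiv/floormod can be recovered from truncating division on a shifted, non-negative numerator.

import math

def trunc_div(a, b):
    return int(a / b)  # C-style division, truncating toward zero

def trunc_mod(a, b):
    return a - b * trunc_div(a, b)

d = 7
lower_bound = -30                         # assumed known bound on the numerator
offset = math.ceil(-lower_bound / d) * d  # multiple of d making n + offset >= 0
for n in range(lower_bound, 31):
    assert n // d == trunc_div(n + offset, d) - offset // d
    assert n % d == trunc_mod(n + offset, d)
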
Lunderberg added a commit that referenced this pull request Jan 10, 2023
…13724)

* [Arith] Use ConstIntBound to remove negative numerator when lowering

* Add check to avoid -INT32_MIN

* Updated to use `tvm::min_value(DataType)`

* Added derivation for floordiv/floormod in terms of truncdiv/truncmod
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 6efeb40 to f96fa82 Compare January 11, 2023 19:55
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 12, 2023
The `crop_and_resize` operator uses floating-point arithmetic to
determine whether an index is within a view-box.  This can cause the
use of `extrapolation_value` to depend on target-dependent rounding
differences.

For example, this issue was initially noticed on Vulkan during
debugging of apache#13530, and was the
result of computing `0.2*223.0 + 0.8*223.0 < 223.0`.  If all
intermediates are cast to float32, the left-hand side evaluates to
`223.00002`.  If intermediates are kept at a higher precision, the
left-hand side evaluates to `223.0`.

The floating-point indexing can't be removed, because the operator
must match the API defined by TensorFlow's operator implementation.
The TensorFlow documentation for
[`CropAndResize`](https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/crop-and-resize)
does not specify behavior in these cases, nor do the current
TensorFlow unit tests check cases of rounding error.  Since the
TensorFlow unit tests only use binary fractions for the `boxes`
argument, which largely avoids the rounding issue, this commit updates
the TVM unit tests to avoid depending on floating-point precision.
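
The rounding behavior described above is easy to reproduce with plain numpy (a reproduction sketch, not TVM code):

import numpy as np

x = np.float32(223.0)
lhs = np.float32(0.2) * x + np.float32(0.8) * x  # every intermediate is float32
print(lhs)                 # 223.00002
print(float(lhs) > 223.0)  # True: enough to flip a boundary comparison
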
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 12, 2023
Parametrization helped in the debugging of
apache#13530, but is not otherwise related
to that PR.
echuraev pushed a commit that referenced this pull request Jan 13, 2023
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from e5ddc73 to ba19bf6 Compare January 24, 2023 17:16
Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Jan 25, 2023
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from ba19bf6 to 0391888 Compare January 25, 2023 14:43
fzi-peccia pushed a commit to fzi-peccia/tvm that referenced this pull request Mar 27, 2023
…pache#13724)

* [Arith] Use ConstIntBound to remove negative numerator when lowering

fzi-peccia pushed a commit to fzi-peccia/tvm that referenced this pull request Mar 27, 2023
…che#13774)

Lunderberg added a commit to Lunderberg/tvm that referenced this pull request Mar 31, 2023
masahi pushed a commit that referenced this pull request Apr 5, 2023
…vide tests (#13773)

* [Test][Topi] Use binary fractions for crop_and_divide unit tests

* Use seeded random data for unit test

* Add epsilon offset to avoid depending on floating-point behavior

* Use randomly-generated boxes for unit tests

This mimics the example usage of `tf.image.crop_and_resize`, whose API
this operator is intended to follow.  Using any boxes with edges
representable as integer fractions has the potential for the in-bounds
check to be impacted by rounding error (e.g. `0.2*x + 0.8*x < x`).
Unfortunately, there's no way to remove this possibility altogether
without changing the API, such as accepting the box location as an
integer fraction, rather than a `float32`, but that would break
compatibility.

To avoid the risk of a flaky unit test based on the specific random
boxes used, the PRNG is seeded prior to generation.
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from 0391888 to caa10fb Compare April 5, 2023 20:02
Lunderberg and others added 12 commits September 11, 2024 10:18
Relay uses `DataType::Void()` to represent unspecified data types, the
FFI converts `DataType` objects to strings, and `DataType::Void()` is
represented as the empty string.
These are passed into the topi resize library for dynamic relay
shapes, and should be supported as possible integer types.
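
A minimal sketch of the kind of handling this implies (the helper and the int64 default below are illustrative assumptions, not the code added by this PR): the empty string produced by `DataType::Void()` should be accepted and treated as an unspecified index dtype rather than rejected.

def normalize_index_dtype(dtype, default="int64"):
    """Treat Relay's unspecified dtype ("" from DataType::Void()) as the default."""
    if dtype == "":
        return default
    assert dtype.startswith("int"), f"expected an integer index dtype, got {dtype!r}"
    return dtype

print(normalize_index_dtype(""))       # "int64"
print(normalize_index_dtype("int32"))  # "int32"
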
@Lunderberg Lunderberg force-pushed the topi_resize_performance_investigation branch from caa10fb to dc07d5d Compare September 11, 2024 15:21
@Lunderberg
Contributor Author

Cleaning out old PRs, rebasing this onto main. If it passes CI, great, but if not, this clearly isn't a critical PR to land.

@Lunderberg Lunderberg closed this Sep 12, 2024