apache · csullivan · Jul 12, 2022 · Jun 3, 2022 · Jun 15, 2022 · Jun 15, 2022
diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py
@@ -20,3 +20,9 @@
 # pylint: disable=wildcard-import
 
 from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule
+from .cast import (
+    cast_f16_f32_compute,
+    cast_f16_f32_schedule,
+    cast_f32_f16_compute,
+    cast_f32_f16_schedule,
+)
diff --git a/python/tvm/topi/hexagon/slice_ops/cast.py b/python/tvm/topi/hexagon/slice_ops/cast.py
@@ -0,0 +1,188 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Hexagon slice cast op compute and schedule"""
+
+from tvm import te
+from tvm import tir
+from tvm.tir import IndexMap
+
+# pylint: disable=invalid-name
+
+
+def layout_transform_nhwc_8h2w32c2w(n, h, w, c):
+    """Map logical (n, h, w, c) indices onto the 8h2w32c2w blocked index list.
+
+    Parameters
+    ----------
+    n, h, w, c
+        Logical indices. Only `//` and `%` are applied to them (presumably
+        TIR index variables when used through `transform_layout` -- confirm
+        against callers).
+
+    Returns
+    -------
+    list
+        `[n, h // 8, w // 4, c // 32, IndexMap.AXIS_SEPARATOR,
+        h % 8, (w % 4) // 2, c % 32, w % 2]`.
+    """
+    # Indices placed before the separator: one block spans 8 (h) x 4 (w) x 32 (c).
+    block_idx = [n, h // 8, w // 4, c // 32]
+    # Indices placed after the separator: position inside the block, with the
+    # 4-wide w chunk further split into (w % 4) // 2 and w % 2.
+    inner_idx = [h % 8, (w % 4) // 2, c % 32, w % 2]
+    return [*block_idx, IndexMap.AXIS_SEPARATOR, *inner_idx]
+
+
+def layout_transform_nc_1024c(n, c):
+    """Map logical (n, c) indices onto the 1024c blocked index list.
+
+    Returns
+    -------
+    list
+        `[n, c // 1024, IndexMap.AXIS_SEPARATOR, c % 1024]`.
+    """
+    chunk_idx = c // 1024
+    offset_in_chunk = c % 1024
+    return [n, chunk_idx, IndexMap.AXIS_SEPARATOR, offset_in_chunk]
+
+
+def layout_transform_nhwc_4h2w32c2w(n, h, w, c):
+    """Map logical (n, h, w, c) indices onto the 4h2w32c2w blocked index list.
+
+    Parameters
+    ----------
+    n, h, w, c
+        Logical indices. Only `//` and `%` are applied to them (presumably
+        TIR index variables when used through `transform_layout` -- confirm
+        against callers).
+
+    Returns
+    -------
+    list
+        `[n, h // 4, w // 4, c // 32, IndexMap.AXIS_SEPARATOR,
+        h % 4, (w % 4) // 2, c % 32, w % 2]`.
+    """
+    # Indices placed before the separator: one block spans 4 (h) x 4 (w) x 32 (c).
+    block_idx = [n, h // 4, w // 4, c // 32]
+    # Indices placed after the separator: position inside the block, with the
+    # 4-wide w chunk further split into (w % 4) // 2 and w % 2.
+    inner_idx = [h % 4, (w % 4) // 2, c % 32, w % 2]
+    return [*block_idx, IndexMap.AXIS_SEPARATOR, *inner_idx]
+
+
+def layout_transform_nc_512c(n, c):
+    """Map logical (n, c) indices onto the 512c blocked index list.
+
+    Returns
+    -------
+    list
+        `[n, c // 512, IndexMap.AXIS_SEPARATOR, c % 512]`.
+    """
+    chunk_idx = c // 512
+    offset_in_chunk = c % 512
+    return [n, chunk_idx, IndexMap.AXIS_SEPARATOR, offset_in_chunk]
+
+
+def get_layout_transform_for_f32(f32_layout_string):
+    """Look up the layout transform and split factor for an f32 layout string.
+
+    Parameters
+    ----------
+    f32_layout_string : str
+        One of "nhwc-8h2w32c2w-2d", "nhwc-4h2w32c2w-2d", "nc-1024c-2d"
+        or "nc-512c-2d".
+
+    Returns
+    -------
+    list
+        `[transform_func, split_factor]`, where `split_factor` is the
+        height factor (8 or 4) for the nhwc layouts and the channel factor
+        (1024 or 512) for the nc layouts.
+
+    Raises
+    ------
+    RuntimeError
+        If `f32_layout_string` matches none of the layouts above.
+    """
+    known_layouts = (
+        ("nhwc-8h2w32c2w-2d", layout_transform_nhwc_8h2w32c2w, 8),
+        ("nhwc-4h2w32c2w-2d", layout_transform_nhwc_4h2w32c2w, 4),
+        ("nc-1024c-2d", layout_transform_nc_1024c, 1024),
+        ("nc-512c-2d", layout_transform_nc_512c, 512),
+    )
+    for layout_name, transform_func, split_factor in known_layouts:
+        if f32_layout_string == layout_name:
+            return [transform_func, split_factor]
+    raise RuntimeError(f"Unexpected f32_layout '{f32_layout_string}'")
+
+
+def cast_f16_f32_compute(in_tensor):
+    """Build the TE compute "CastF16F32" that casts `in_tensor` to float32.
+
+    Parameters
+    ----------
+    in_tensor
+        Input tensor; the result has the same shape. Per the op name it is
+        expected to hold float16 data, but no dtype check is done here.
+
+    Returns
+    -------
+    The tensor produced by `te.compute`, with every element cast to "float32".
+    """
+
+    def _to_f32(*indices):
+        # Element-wise cast at the given index tuple.
+        return in_tensor[indices].astype("float32")
+
+    return te.compute(in_tensor.shape, _to_f32, name="CastF16F32")
+
+
+def cast_f16_f32_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor):
+    """Schedule for nhwc f16 to f32 cast: nhwc layout
+
+    Parameters
+    ----------
+    func
+        Function containing the "CastF16F32" block with four loops.
+    in_layout
+        Index map applied to the block's buffer named "A".
+    out_layout
+        Index map applied to the block's buffer named "CastF16F32".
+    h_split_factor : int
+        Inner extent used when splitting the h loop.
+
+    Returns
+    -------
+    The `tir.Schedule` after tiling, layout transformation and vectorization.
+    """
+    block = "CastF16F32"
+    sch = tir.Schedule(func, debug_mask="all")
+    ax_n, ax_h, ax_w, ax_c = sch.get_loops(sch.get_block(block))
+    # Tile h by the caller-supplied factor, w by 4 and then 2, c by 32.
+    ho, hi = sch.split(ax_h, [None, h_split_factor])
+    wo, wi = sch.split(ax_w, [None, 4])
+    co, ci = sch.split(ax_c, [None, 32])
+    wio, wii = sch.split(wi, [None, 2])
+    # Outer tile loops first, then the intra-tile loops.
+    sch.reorder(ax_n, ho, wo, co, hi, wio, ci, wii)
+    sch.transform_layout(block, "A", in_layout)
+    sch.transform_layout(block, block, out_layout)
+    # The two innermost loops (32c, 2w) are fused and vectorized together.
+    sch.vectorize(sch.fuse(ci, wii))
+    return sch
+
+
+def cast_f16_f32_stir_schedule_nc(func, in_layout, out_layout, c_split_factor):
+    """Schedule for nc f16 to f32 cast: nc layout
+
+    Parameters
+    ----------
+    func
+        Function containing the "CastF16F32" block with two loops.
+    in_layout
+        Index map applied to the block's buffer named "A".
+    out_layout
+        Index map applied to the block's buffer named "CastF16F32".
+    c_split_factor : int
+        Inner extent used when splitting the second (c) loop.
+
+    Returns
+    -------
+    The `tir.Schedule` after splitting, layout transformation and vectorization.
+    """
+    block = "CastF16F32"
+    sch = tir.Schedule(func, debug_mask="all")
+    _batch_loop, chan_loop = sch.get_loops(sch.get_block(block))
+    _chan_outer, chan_inner = sch.split(chan_loop, [None, c_split_factor])
+    sch.transform_layout(block, "A", in_layout)
+    sch.transform_layout(block, block, out_layout)
+    # Only the inner channel loop is vectorized.
+    sch.vectorize(chan_inner)
+    return sch
+
+
+def cast_f16_f32_schedule(cast_func, in_layout_str, out_layout_str):
+    """Schedule for f16 to f32 cast: top level function
+
+    Parameters
+    ----------
+    cast_func
+        Function containing the "CastF16F32" block to schedule.
+    in_layout_str : str
+        Layout of the f16 input; "nhwc-8h2w32c2w-2d" or "nc-1024c-2d".
+    out_layout_str : str
+        Layout of the f32 output; any layout accepted by
+        `get_layout_transform_for_f32`.
+
+    Returns
+    -------
+    The schedule returned by the nhwc or nc scheduling helper.
+
+    Raises
+    ------
+    RuntimeError
+        If `out_layout_str` is not a known f32 layout, or `in_layout_str`
+        is not one of the supported f16 layouts.
+    """
+    f32_layout_transform_func, split_factor = get_layout_transform_for_f32(out_layout_str)
+    if in_layout_str == "nhwc-8h2w32c2w-2d":
+        return cast_f16_f32_stir_schedule_nhwc(
+            cast_func,
+            layout_transform_nhwc_8h2w32c2w,
+            f32_layout_transform_func,
+            split_factor,
+        )
+    if in_layout_str == "nc-1024c-2d":
+        return cast_f16_f32_stir_schedule_nc(
+            cast_func, layout_transform_nc_1024c, f32_layout_transform_func, split_factor
+        )
+    # The message must format this function's own parameters: an undefined name
+    # inside the f-string would raise NameError instead of RuntimeError.
+    raise RuntimeError(
+        f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'"
+    )
+
+
+def cast_f32_f16_compute(in_tensor):
+    """Build the TE compute "CastF32F16" that casts `in_tensor` to float16.
+
+    Parameters
+    ----------
+    in_tensor
+        Input tensor; the result has the same shape. Per the op name it is
+        expected to hold float32 data, but no dtype check is done here.
+
+    Returns
+    -------
+    The tensor produced by `te.compute`, with every element cast to "float16".
+    """
+
+    def _to_f16(*indices):
+        # Element-wise cast at the given index tuple.
+        return in_tensor[indices].astype("float16")
+
+    return te.compute(in_tensor.shape, _to_f16, name="CastF32F16")
+
+
+def cast_f32_f16_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor):
+    """Schedule for nhwc f32 to f16 cast: nhwc layout
+
+    Parameters
+    ----------
+    func
+        Function containing the "CastF32F16" block with four loops.
+    in_layout
+        Index map applied to the block's buffer named "A".
+    out_layout
+        Index map applied to the block's buffer named "CastF32F16".
+    h_split_factor : int
+        Inner extent used when splitting the h loop.
+
+    Returns
+    -------
+    The `tir.Schedule` after tiling, layout transformation and vectorization.
+    """
+    block = "CastF32F16"
+    sch = tir.Schedule(func, debug_mask="all")
+    ax_n, ax_h, ax_w, ax_c = sch.get_loops(sch.get_block(block))
+    # Tile h by the caller-supplied factor, w by 4 and then 2, c by 32.
+    ho, hi = sch.split(ax_h, [None, h_split_factor])
+    wo, wi = sch.split(ax_w, [None, 4])
+    co, ci = sch.split(ax_c, [None, 32])
+    wio, wii = sch.split(wi, [None, 2])
+    # Outer tile loops first, then the intra-tile loops.
+    sch.reorder(ax_n, ho, wo, co, hi, wio, ci, wii)
+    sch.transform_layout(block, "A", in_layout)
+    sch.transform_layout(block, block, out_layout)
+    # The two innermost loops (32c, 2w) are fused and vectorized together.
+    sch.vectorize(sch.fuse(ci, wii))
+    return sch
+
+
+def cast_f32_f16_stir_schedule_nc(func, in_layout, out_layout, c_split_factor):
+    """Schedule for nc f32 to f16 cast: nc layout
+
+    Parameters
+    ----------
+    func
+        Function containing the "CastF32F16" block with two loops.
+    in_layout
+        Index map applied to the block's buffer named "A".
+    out_layout
+        Index map applied to the block's buffer named "CastF32F16".
+    c_split_factor : int
+        Inner extent used when splitting the second (c) loop.
+
+    Returns
+    -------
+    The `tir.Schedule` after splitting, layout transformation and vectorization.
+    """
+    block = "CastF32F16"
+    sch = tir.Schedule(func, debug_mask="all")
+    _batch_loop, chan_loop = sch.get_loops(sch.get_block(block))
+    _chan_outer, chan_inner = sch.split(chan_loop, [None, c_split_factor])
+    sch.transform_layout(block, "A", in_layout)
+    sch.transform_layout(block, block, out_layout)
+    # Only the inner channel loop is vectorized.
+    sch.vectorize(chan_inner)
+    return sch
+
+
+def cast_f32_f16_schedule(cast_func, in_layout_str, out_layout_str):
+    """Schedule for f32 to f16 cast: top level function
+
+    Parameters
+    ----------
+    cast_func
+        Function containing the "CastF32F16" block to schedule.
+    in_layout_str : str
+        Layout of the f32 input; any layout accepted by
+        `get_layout_transform_for_f32`.
+    out_layout_str : str
+        Layout of the f16 output; "nhwc-8h2w32c2w-2d" or "nc-1024c-2d".
+
+    Returns
+    -------
+    The schedule returned by the nhwc or nc scheduling helper.
+
+    Raises
+    ------
+    RuntimeError
+        If `in_layout_str` is not a known f32 layout, or `out_layout_str`
+        is not one of the supported f16 layouts.
+    """
+    f32_layout_transform_func, split_factor = get_layout_transform_for_f32(in_layout_str)
+    if out_layout_str == "nhwc-8h2w32c2w-2d":
+        return cast_f32_f16_stir_schedule_nhwc(
+            cast_func, f32_layout_transform_func, layout_transform_nhwc_8h2w32c2w, split_factor
+        )
+    if out_layout_str == "nc-1024c-2d":
+        return cast_f32_f16_stir_schedule_nc(
+            cast_func, f32_layout_transform_func, layout_transform_nc_1024c, split_factor
+        )
+    # The message must format this function's own parameters: an undefined name
+    # inside the f-string would raise NameError instead of RuntimeError.
+    raise RuntimeError(
+        f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'"
+    )