From 8299fe3eacf67a258fd9f682ba2605e7422b0276 Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Sun, 3 Mar 2024 21:42:19 -0800 Subject: [PATCH 001/290] Add mixed dtype linear (#2023) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2023 Add mixed dtype linear Reviewed By: mavlyutovr, manuelcandales Differential Revision: D53995591 fbshipit-source-id: f68e2fd1254cb3717f2276eef9375c944cb99d60 --- examples/models/llama2/ops/quantized.yaml | 6 + kernels/portable/cpu/vec_ops.h | 38 +++++- kernels/quantized/cpu/op_mixed_linear.cpp | 137 ++++++++++++++++++++++ kernels/quantized/cpu/targets.bzl | 6 + kernels/quantized/quantized.yaml | 6 + 5 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 kernels/quantized/cpu/op_mixed_linear.cpp diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml index 64165d17fa..f912b6ce0e 100644 --- a/examples/models/llama2/ops/quantized.yaml +++ b/examples/models/llama2/ops/quantized.yaml @@ -3,3 +3,9 @@ kernels: - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out + +- func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_mixed_linear_out diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h index db2ecb1f89..dc74523127 100644 --- a/kernels/portable/cpu/vec_ops.h +++ b/kernels/portable/cpu/vec_ops.h @@ -10,10 +10,12 @@ #include #include +#include #include +#include #include +#include #include - /** * @file * This header defines common, low-level operations that can often be @@ -103,6 +105,40 @@ inline void vec_quantized_matmul_int8( } } +static inline size_t bounds_min(size_t a, size_t b) { + return (a < b) ? a : b; +} + +/// x: m * n, y: p * n, z: m * p, s: p * groups +/// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g]) +template +inline void vec_quantized_matmul_transb_int8( + T* __restrict__ z, + const U* __restrict__ x, + const int8_t* __restrict__ y, + const V* __restrict__ s, + int64_t m, + int64_t n, + int64_t p, + int64_t g) { + int64_t n_over_g = (n + g - 1) / g; + + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < p; ++j) { + T sum = 0; + for (size_t k = 0; k < n; k += g) { + T psum = 0; + // the last group may have fewer than g elements + for (size_t k2 = k; k2 < bounds_min(k + g, n); k2++) { + psum += x[i * n + k2] * y[j * n + k2]; + } + sum += psum * s[j * n_over_g + k / g]; + } + z[i * p + j] = sum; + } + } +} + // mat1 (m x n), mat2 (n x p), out (m, p), self (m x p) // z[i][j] = sum(x[i][k] * y[k][j]), for k in range(n) // T for tensor dtype, U for scalar type diff --git a/kernels/quantized/cpu/op_mixed_linear.cpp b/kernels/quantized/cpu/op_mixed_linear.cpp new file mode 100644 index 0000000000..d00fdd05c9 --- /dev/null +++ b/kernels/quantized/cpu/op_mixed_linear.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +bool check_quantized_mixed_linear_args( + const Tensor& in, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const optional dtype, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 2)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight, 2)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight_scales, 1)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(out, 2)); + + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, 1, weight, 1)); + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(weight_scales, 0, weight, 0)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, 1, weight, 1)); + + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight_scales)); + if (dtype.has_value()) { + ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value()); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + dtype.value() == ScalarType::Float || dtype.value() == ScalarType::Half, + "dtype must be Float or Half"); + } + ET_LOG_MSG_AND_RETURN_IF_FALSE( + weight.scalar_type() == ScalarType::Char, "weight dtype must be int8"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + in.scalar_type() == ScalarType::Float || + in.scalar_type() == ScalarType::Half, + "input dtype must be Float or Half"); + + if (opt_weight_zero_points.has_value()) { + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_shape(opt_weight_zero_points.value(), weight_scales)); + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_dtype(opt_weight_zero_points.value(), in)); + } + + // Support for non-null zero points is not implemented yet. + ET_LOG_MSG_AND_RETURN_IF_FALSE( + !opt_weight_zero_points.has_value(), "zero points not supported yet."); + return true; +} + +Tensor& quantized_mixed_linear_out( + const Tensor& in, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const optional dtype, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + check_quantized_mixed_linear_args( + in, weight, weight_scales, opt_weight_zero_points, dtype, out), + InvalidArgument, + out); + + ScalarType out_dtype = dtype.has_value() ? 
dtype.value() : out.scalar_type(); + + size_t output_ndim = 2; + exec_aten::SizesType output_sizes[kTensorDimensionLimit]; + output_sizes[0] = in.size(0); + output_sizes[1] = weight.size(0); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "quantized_decomposed::mixed_linear.out"; + + ET_SWITCH_TWO_TYPES(Float, Half, in.scalar_type(), ctx, name, CTYPE, [&]() { + ET_SWITCH_FLOAT_TYPES_AND(Half, out_dtype, ctx, name, CTYPE_OUT, [&]() { + size_t m = in.size(0); + size_t n = in.size(1); + size_t p = weight.size(0); + size_t g = n; + + if (weight_scales.dim() == 2) { + g = (n + weight_scales.size(1) - 1) / weight_scales.size(1); + }; + + // FIXME: this currently ignores dtype + vec_quantized_matmul_transb_int8< + CTYPE_OUT, // T *z + CTYPE>( // U *x, U *s + out.mutable_data_ptr(), + in.const_data_ptr(), + weight.const_data_ptr(), + weight_scales.const_data_ptr(), + m, + n, + p, + g); + }); + }); + + return out; +} + +Tensor& quantized_mixed_linear_out( + RuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const optional dtype, + Tensor& out) { + // TODO(mcandales): Remove the need for this wrapper + // TODO(mkg): add support for dtype + (void)ctx; + return quantized_mixed_linear_out( + in, weight, weight_scales, opt_weight_zero_points, dtype, out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/quantized/cpu/targets.bzl b/kernels/quantized/cpu/targets.bzl index 57ffe8059a..3a6f74631a 100644 --- a/kernels/quantized/cpu/targets.bzl +++ b/kernels/quantized/cpu/targets.bzl @@ -29,6 +29,12 @@ _QUANT_OPS = ( "//executorch/kernels/portable/cpu:vec_ops", ], ), + op_target( + name = "op_mixed_linear", + deps = [ + "//executorch/kernels/portable/cpu:vec_ops", + ], + ), op_target( name = "op_quantize", deps = [ diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index dd03bf25fd..93fc1569d5 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -46,6 +46,12 @@ - arg_meta: null kernel_name: torch::executor::quantized_mixed_mm_out +- func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_mixed_linear_out + - func: quantized_decomposed::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: From e585a57bb64fd2702e6311d42a6739d3f8bf881c Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Mon, 4 Mar 2024 08:25:19 -0800 Subject: [PATCH 002/290] Refactor llama2/models.py (#2054) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2054 Splitting models.py into transformer.py and models.py Putting everything related to model architecture, like Transformer blocks into its own file (llama_transformer.py) that only depends on torch The reason is that if/when we create a separate repo, only llama_transformer.py will moved. 
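For illustration, after this split callers import the model architecture from the new module instead of from model.py; the updated model.py in this diff does exactly that:

    from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer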
Moving the original LICENSE header to llama_transfomer.py and for remaining models.py to BSD bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: larryliu0820 Differential Revision: D54118314 Privacy Context Container: L1124100 fbshipit-source-id: 2141e0e18b8d46659e1942a74ae8a5f175e44256 --- examples/models/llama2/TARGETS | 16 +- examples/models/llama2/llama_transformer.py | 505 ++++++++++++++++++++ examples/models/llama2/model.py | 503 +------------------ setup.py | 10 +- 4 files changed, 530 insertions(+), 504 deletions(-) create mode 100644 examples/models/llama2/llama_transformer.py diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS index 5492459bf6..b967a9faaf 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama2/TARGETS @@ -8,6 +8,20 @@ oncall("executorch") define_common_targets() +runtime.python_library( + name = "llama_transformer", + srcs = ["llama_transformer.py"], + _is_external_target = True, + base_module = "executorch.examples.models.llama2", + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama2/custom_ops:llama_custom_ops_aot_lib", + ], +) + runtime.python_library( name = "llama2_model", srcs = [ @@ -28,7 +42,7 @@ runtime.python_library( deps = [ "//caffe2:torch", "//executorch/examples/models:model_base", - "//executorch/examples/models/llama2/custom_ops:llama_custom_ops_aot_lib", + "//executorch/examples/models/llama2:llama_transformer", "//executorch/examples/models/llama2/ops:quantized_aot_lib", ], ) diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py new file mode 100644 index 0000000000..fdfb41830d --- /dev/null +++ b/examples/models/llama2/llama_transformer.py @@ -0,0 +1,505 @@ +# @lint-ignore-every LICENSELINT +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Llama 2 is licensed under the LLAMA 2 Community License, +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. + +# Please refer to README.md in the same folder for more information. + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F + +from torch import nn + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + """ + Apply the RMSNorm normalization to the input tensor. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The normalized tensor. + + """ + return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + """ + Forward pass through the RMSNorm layer. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor after applying RMSNorm. 
+ + """ + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + hidden_dim: Optional[int] = None + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + moe: bool = False # True to enable the MoE (Mixture of Experts) + num_experts: int = 8 # Number of experts + num_activated_experts: int = 2 # Number of experts to activate + use_kv_cache: bool = False # Use key/value cache + use_sdpa_with_kv_cache_op: bool = ( + False # Use custom sdpa op that updates kv cache in-place + ) + # Additional Model Metadata needed at runtime + bos_idx: int = 1 + eos_idx: int = 3 + bos_count: int = -1 # i.e., a single EOS is used as BOS + eos_count: int = 2 + + def __post_init__(self): + if self.n_kv_heads is None: + self.n_kv_heads = self.n_heads + + if self.use_sdpa_with_kv_cache_op: + assert self.use_kv_cache, "use_sdpa_with_kv_cache_op requires use_kv_cache" + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # pyre-ignore + freqs = torch.outer(t, freqs).float() # pyre-ignore + freqs_cos = torch.cos(freqs) + freqs_sin = torch.sin(freqs) + return freqs_cos, freqs_sin + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(shape) + + +def apply_rotary_emb( + xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + freqs_cos = reshape_for_broadcast(freqs_cos, xq_r) + freqs_sin = reshape_for_broadcast(freqs_sin, xq_r) + + xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin + xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos + xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) + + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs, layer_id: int): + super().__init__() + self.use_kv_cache = args.use_kv_cache + self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads + assert args.n_heads % self.n_kv_heads == 0 + model_parallel_size = 1 + self.n_local_heads = args.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + self.max_batch_size = args.max_batch_size + self.max_seq_len = 
args.max_seq_len + # args.dim = 4096, args.n_heads = 32, self.head_dim = 4096 / 32 = 125 + self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) + + self.use_sdpa_with_kv_cache_op = args.use_sdpa_with_kv_cache_op + self.layer_id = layer_id + + mask = torch.full( + (1, 1, args.max_seq_len, args.max_seq_len), + float("-inf"), + ) + + mask = torch.triu(mask, diagonal=1) + self.register_buffer("mask", mask) + + # This is what we would use if ExecuTorch could support mutable buffers. We can't at this time, so instead + # what is done is this module takes in the cache as io. + # self.cache_k = torch.zeros( + # (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + # ) + # self.cache_v = torch.zeros( + # (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + # ) + self.kv_cache_sizes = [ + args.max_batch_size, + args.max_seq_len, + self.n_kv_heads, + self.head_dim, + ] + + def forward( + self, + x: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + start_pos: Optional[int] = None, + cache_k: Optional[torch.Tensor] = None, + # if use_sdpa_with_kv_cache_op + # shape: (num_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + # otherwise + # shape: (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + cache_v: Optional[torch.Tensor] = None, + # if use_sdpa_with_kv_cache_op + # shape: (num_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + # otherwise + # shape: (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) + ): + bsz, seqlen, _ = x.shape + + # QKV + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + # We need view_copy elimination + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + + # RoPE relative positional embeddings + xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin) + + if self.use_kv_cache: + assert start_pos is not None + assert cache_k is not None and cache_v is not None + + # TODO(T180671810) + # Refactor this code to make custom op based + # SDPA into a separate optimized attention module + if self.use_sdpa_with_kv_cache_op: + from .custom_ops.sdpa_with_kv_cache import sdpa_with_kv_cache # noqa + + output = torch.ops.llama.sdpa_with_kv_cache( + xq, + xk, + xv, + cache_k, + cache_v, + self.layer_id, + start_pos, + seqlen, + ) + output = output.view(bsz, seqlen, -1) + output = self.wo(output) + return output, cache_k, cache_v + else: + # Replace the entry in the cache for this token + # The following lines are equivalent to: + # cache_k[:bsz, start_pos : start_pos + seqlen] = xk + # cache_v[:bsz, start_pos : start_pos + seqlen] = xv + # We use .narrow() here to make the compiler happy + narrowed_k = cache_k[:bsz].narrow(1, start_pos, seqlen) + narrowed_v = cache_v[:bsz].narrow(1, start_pos, seqlen) + + narrowed_k.copy_(xk) + narrowed_v.copy_(xv) + + keys = cache_k[:bsz].narrow(1, 0, start_pos + seqlen) + values = cache_v[:bsz].narrow(1, 0, start_pos + seqlen) + else: + keys = xk + values = xv + + # grouped multiquery attention: expand out keys and values + keys = repeat_kv(keys, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + values = 
repeat_kv(values, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) + + # make heads into a batch dimension + xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + + assert hasattr(self, "mask") + mask = self.mask[:, :, :seqlen, :seqlen] + + # this is needed to support xnnpack which requires mask shape to be 2d. + # this is a temporary workaround. once we update xnnpack we should be able to handle this. + # shape before: [1, 1, l, s], after: [l, s] + # we make sure to specify the dimensions to be squeezed [0, 1] to ensure that the output + # tensor will be 2-dimensional, regarldess of the values of l & s + mask = torch.squeeze(mask, [0, 1]) + + output = F.scaled_dot_product_attention( + xq, keys, values, attn_mask=mask, dropout_p=0.0 + ) + + output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) + + output = self.wo(output) + + if self.use_kv_cache: + return output, cache_k, cache_v + else: + return output, None, None + + +class FeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + dim = args.dim + hidden_dim = args.hidden_dim + if hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = args.multiple_of + hidden_dim = 4 * dim + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class ConditionalFeedForward(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + self.dim = args.dim + hidden_dim = args.hidden_dim + if hidden_dim is None: + # If hidden_dim is not explicitly set in the ModelArgs, + # then calculate implicitly based on dim and also multiple of `args.multiple_of` + multiple_of = args.multiple_of + hidden_dim = 4 * self.dim + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w2 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.w3 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) + self.num_experts = args.num_experts + + def forward(self, x: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor: + w1_weights = self.w1[expert_indices].transpose(-1, -2) # [T, A, D, D] + w3_weights = self.w3[expert_indices].transpose(-1, -2) # [T, A, D, D] + w2_weights = self.w2[expert_indices] # [T, A, D, D] + x1 = F.silu(torch.einsum("ti,taio -> tao", x, w1_weights)) + x3 = torch.einsum("ti, taio -> tao", x, w3_weights) + expert_outs = torch.einsum("tao, taoi -> tai", (x1 * x3), w2_weights) + return expert_outs + + +class MOEFeedForward(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.gate = nn.Linear(config.dim, config.num_experts, bias=False) + self.cond_ffn = ConditionalFeedForward(config) + self.dim = config.dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.view(-1, self.dim) + # T = num_tokens, E = num_experts, D = hidden dim, A = activated experts + # x: [T, D] + scores = self.gate(x) # [T, E] + expert_weights, expert_indices = torch.topk(scores, 2, dim=-1) # [T, A], [T, A] + expert_weights = 
expert_weights.softmax(dim=-1) # [T, A] + expert_outs = self.cond_ffn(x, expert_indices) + return torch.einsum("tai,ta -> ti", expert_outs, expert_weights) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.use_kv_cache = args.use_kv_cache + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args, layer_id) + if args.moe: + self.block_sparse_moe = MOEFeedForward(args) + else: + self.feed_forward = FeedForward(args) + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward( + self, x, freqs_cos, freqs_sin, start_pos=None, cache_k=None, cache_v=None + ): # x: 1xN + h, cache_k, cache_v = self.attention.forward( + self.attention_norm(x), + freqs_cos, + freqs_sin, + start_pos, + cache_k, + cache_v, + ) + + h = x + h + if hasattr(self, "block_sparse_moe"): + out = h + self.block_sparse_moe(self.ffn_norm(h)) + else: + out = h + self.feed_forward(self.ffn_norm(h)) + return out, cache_k, cache_v + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = nn.Linear(params.dim, params.vocab_size, bias=False) + self.use_kv_cache = params.use_kv_cache + + freqs_cos, freqs_sin = precompute_freqs_cis( + self.params.dim // self.params.n_heads, self.params.max_seq_len + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + + def forward( + self, + tokens: torch.Tensor, + start_pos: Optional[ + torch.Tensor + ] = None, # Scalar tensor indicating size of window of the caches + cache_k: Optional[ + torch.Tensor + ] = None, # n_layers long, it should be a list of tensors to accommodate the potential size difference among attention layers. The current implementation is overly simplified. + cache_v: Optional[torch.Tensor] = None, # n_layers long + ) -> Union[ + torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]] + ]: + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + + if self.use_kv_cache: + assert ( + cache_k is not None and cache_v is not None and start_pos is not None + ), "Caches and start_pos must be provided when use_kv_cache is True" + assert ( + cache_k.size(0) == self.n_layers + ), f"{cache_k.size(0)} != {self.n_layers}" + assert ( + cache_v.size(0) == self.n_layers + ), f"{cache_v.size(0)} != {self.n_layers}" + + sp = start_pos.item() + # self.params.max_seq_len - 1 because of 0 based indexing, and - 1 again because our input seq len is 1 and its added to the cache before accessing the cache + torch._constrain_as_size(sp, min=0, max=self.params.max_seq_len - 2) + torch._constrain_as_value( + cache_k.shape[0], + max=self.n_layers, + min=self.n_layers, + ) + torch._constrain_as_value( + cache_v.shape[0], min=self.n_layers, max=self.n_layers + ) + # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. 
+ freqs_cos = self.freqs_cos[sp : sp + seqlen] + freqs_sin = self.freqs_sin[sp : sp + seqlen] + else: + assert ( + start_pos is None and cache_k is None and cache_v is None + ), "Caches and start_pos are unused when use_kv_cache is False" + freqs_cos = self.freqs_cos[:seqlen] + freqs_sin = self.freqs_sin[:seqlen] + + for index, layer in enumerate(self.layers): + if self.use_kv_cache: + if self.params.use_sdpa_with_kv_cache_op: + h, updated_cache_k, updated_cache_v = layer( + h, + freqs_cos, + freqs_sin, + sp, # pyre-ignore[61] + cache_k, + cache_v, + ) + else: + h, updated_cache_k, updated_cache_v = layer( + h, + freqs_cos, + freqs_sin, + sp, # pyre-ignore[61] + cache_k[index], # pyre-ignore[16] + cache_v[index], + ) + cache_k[index] = updated_cache_k # pyre-ignore[16] + cache_v[index] = updated_cache_v + + else: + h, _, _ = layer(h, freqs_cos, freqs_sin) + + h = self.norm(h) + + logits = self.output(h) + if self.use_kv_cache: + return (logits, cache_k, cache_v) # pyre-ignore + else: + # 'None' is not a valid return for export so have to split the return into if else + return logits + + # For each layer return the sizes of the needed caches + def get_cache_sizes(self): + # cache_k and cache_v have the same shape so could pick either here. + return [self.n_layers, *self.layers[0].attention.kv_cache_sizes] diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py index 221a6fbfd8..c22ae8ad35 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama2/model.py @@ -1,23 +1,15 @@ -# @lint-ignore-every LICENSELINT # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# Llama 2 is licensed under the LLAMA 2 Community License, -# Copyright (c) Meta Platforms, Inc. All Rights Reserved. - -# Please refer to README.md in the same folder for more information. - +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. import json -import math -from dataclasses import dataclass from pathlib import Path -from typing import List, Optional, Tuple, Union import torch -import torch.nn.functional as F -from torch import nn +from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer try: from .fairseq2 import convert_to_llama_checkpoint @@ -32,495 +24,6 @@ def convert_to_llama_checkpoint(**kwargs): from ..model_base import EagerModelBase -from .custom_ops.sdpa_with_kv_cache import * - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - """ - Initialize the RMSNorm normalization layer. - - Args: - dim (int): The dimension of the input tensor. - eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. - - Attributes: - eps (float): A small value added to the denominator for numerical stability. - weight (nn.Parameter): Learnable scaling parameter. - - """ - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - """ - Apply the RMSNorm normalization to the input tensor. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The normalized tensor. - - """ - return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - """ - Forward pass through the RMSNorm layer. - - Args: - x (torch.Tensor): The input tensor. - - Returns: - torch.Tensor: The output tensor after applying RMSNorm. 
- - """ - output = self._norm(x.float()).type_as(x) - return output * self.weight - - -@dataclass -class ModelArgs: - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - hidden_dim: Optional[int] = None - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 2048 - moe: bool = False # True to enable the MoE (Mixture of Experts) - num_experts: int = 8 # Number of experts - num_activated_experts: int = 2 # Number of experts to activate - use_kv_cache: bool = False # Use key/value cache - use_sdpa_with_kv_cache_op: bool = ( - False # Use custom sdpa op that updates kv cache in-place - ) - # Additional Model Metadata needed at runtime - bos_idx: int = 1 - eos_idx: int = 3 - bos_count: int = -1 # i.e., a single EOS is used as BOS - eos_count: int = 2 - - def __post_init__(self): - if self.n_kv_heads is None: - self.n_kv_heads = self.n_heads - - if self.use_sdpa_with_kv_cache_op: - assert self.use_kv_cache, "use_sdpa_with_kv_cache_op requires use_kv_cache" - - -def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: - """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" - bs, slen, n_kv_heads, head_dim = x.shape - if n_rep == 1: - return x - return ( - x[:, :, :, None, :] - .expand(bs, slen, n_kv_heads, n_rep, head_dim) - .reshape(bs, slen, n_kv_heads * n_rep, head_dim) - ) - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # pyre-ignore - freqs = torch.outer(t, freqs).float() # pyre-ignore - freqs_cos = torch.cos(freqs) - freqs_sin = torch.sin(freqs) - return freqs_cos, freqs_sin - - -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(shape) - - -def apply_rotary_emb( - xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - freqs_cos = reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = reshape_for_broadcast(freqs_sin, xq_r) - - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - - -class Attention(nn.Module): - def __init__(self, args: ModelArgs, layer_id: int): - super().__init__() - self.use_kv_cache = args.use_kv_cache - self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - assert args.n_heads % self.n_kv_heads == 0 - model_parallel_size = 1 - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size - self.n_rep = self.n_local_heads // self.n_local_kv_heads - self.head_dim = args.dim // args.n_heads - self.max_batch_size = args.max_batch_size - self.max_seq_len = 
args.max_seq_len - # args.dim = 4096, args.n_heads = 32, self.head_dim = 4096 / 32 = 125 - self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False) - self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) - - self.use_sdpa_with_kv_cache_op = args.use_sdpa_with_kv_cache_op - self.layer_id = layer_id - - mask = torch.full( - (1, 1, args.max_seq_len, args.max_seq_len), - float("-inf"), - ) - - mask = torch.triu(mask, diagonal=1) - self.register_buffer("mask", mask) - - # This is what we would use if ExecuTorch could support mutable buffers. We can't at this time, so instead - # what is done is this module takes in the cache as io. - # self.cache_k = torch.zeros( - # (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - # ) - # self.cache_v = torch.zeros( - # (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - # ) - self.kv_cache_sizes = [ - args.max_batch_size, - args.max_seq_len, - self.n_kv_heads, - self.head_dim, - ] - - def forward( - self, - x: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - start_pos: Optional[int] = None, - cache_k: Optional[torch.Tensor] = None, - # if use_sdpa_with_kv_cache_op - # shape: (num_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - # otherwise - # shape: (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - cache_v: Optional[torch.Tensor] = None, - # if use_sdpa_with_kv_cache_op - # shape: (num_layers, args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - # otherwise - # shape: (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim) - ): - bsz, seqlen, _ = x.shape - - # QKV - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - # We need view_copy elimination - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - - # RoPE relative positional embeddings - xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin) - - if self.use_kv_cache: - assert start_pos is not None - assert cache_k is not None and cache_v is not None - - # TODO(T180671810) - # Refactor this code to make custom op based - # SDPA into a separate optimized attention module - if self.use_sdpa_with_kv_cache_op: - output = torch.ops.llama.sdpa_with_kv_cache( - xq, - xk, - xv, - cache_k, - cache_v, - self.layer_id, - start_pos, - seqlen, - ) - output = output.view(bsz, seqlen, -1) - output = self.wo(output) - return output, cache_k, cache_v - else: - # Replace the entry in the cache for this token - # The following lines are equivalent to: - # cache_k[:bsz, start_pos : start_pos + seqlen] = xk - # cache_v[:bsz, start_pos : start_pos + seqlen] = xv - # We use .narrow() here to make the compiler happy - narrowed_k = cache_k[:bsz].narrow(1, start_pos, seqlen) - narrowed_v = cache_v[:bsz].narrow(1, start_pos, seqlen) - - narrowed_k.copy_(xk) - narrowed_v.copy_(xv) - - keys = cache_k[:bsz].narrow(1, 0, start_pos + seqlen) - values = cache_v[:bsz].narrow(1, 0, start_pos + seqlen) - else: - keys = xk - values = xv - - # grouped multiquery attention: expand out keys and values - keys = repeat_kv(keys, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - values = repeat_kv(values, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - - # 
make heads into a batch dimension - xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - keys = keys.transpose(1, 2) - values = values.transpose(1, 2) - - assert hasattr(self, "mask") - mask = self.mask[:, :, :seqlen, :seqlen] - - # this is needed to support xnnpack which requires mask shape to be 2d. - # this is a temporary workaround. once we update xnnpack we should be able to handle this. - # shape before: [1, 1, l, s], after: [l, s] - # we make sure to specify the dimensions to be squeezed [0, 1] to ensure that the output - # tensor will be 2-dimensional, regarldess of the values of l & s - mask = torch.squeeze(mask, [0, 1]) - - output = F.scaled_dot_product_attention( - xq, keys, values, attn_mask=mask, dropout_p=0.0 - ) - - output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) - - output = self.wo(output) - - if self.use_kv_cache: - return output, cache_k, cache_v - else: - return output, None, None - - -class FeedForward(nn.Module): - def __init__(self, args: ModelArgs): - super().__init__() - dim = args.dim - hidden_dim = args.hidden_dim - if hidden_dim is None: - # If hidden_dim is not explicitly set in the ModelArgs, - # then calculate implicitly based on dim and also multiple of `args.multiple_of` - multiple_of = args.multiple_of - hidden_dim = 4 * dim - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = nn.Linear(dim, hidden_dim, bias=False) - self.w2 = nn.Linear(hidden_dim, dim, bias=False) - self.w3 = nn.Linear(dim, hidden_dim, bias=False) - - def forward(self, x): - return self.w2(F.silu(self.w1(x)) * self.w3(x)) - - -class ConditionalFeedForward(nn.Module): - def __init__(self, args: ModelArgs): - super().__init__() - self.dim = args.dim - hidden_dim = args.hidden_dim - if hidden_dim is None: - # If hidden_dim is not explicitly set in the ModelArgs, - # then calculate implicitly based on dim and also multiple of `args.multiple_of` - multiple_of = args.multiple_of - hidden_dim = 4 * self.dim - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - self.w1 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) - self.w2 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) - self.w3 = nn.Parameter(torch.randn(args.num_experts, hidden_dim, self.dim)) - self.num_experts = args.num_experts - - def forward(self, x: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor: - w1_weights = self.w1[expert_indices].transpose(-1, -2) # [T, A, D, D] - w3_weights = self.w3[expert_indices].transpose(-1, -2) # [T, A, D, D] - w2_weights = self.w2[expert_indices] # [T, A, D, D] - x1 = F.silu(torch.einsum("ti,taio -> tao", x, w1_weights)) - x3 = torch.einsum("ti, taio -> tao", x, w3_weights) - expert_outs = torch.einsum("tao, taoi -> tai", (x1 * x3), w2_weights) - return expert_outs - - -class MOEFeedForward(nn.Module): - def __init__(self, config) -> None: - super().__init__() - self.gate = nn.Linear(config.dim, config.num_experts, bias=False) - self.cond_ffn = ConditionalFeedForward(config) - self.dim = config.dim - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.view(-1, self.dim) - # T = num_tokens, E = num_experts, D = hidden dim, A = activated experts - # x: [T, D] - scores = self.gate(x) # [T, E] - expert_weights, expert_indices = torch.topk(scores, 2, dim=-1) # [T, A], [T, A] - expert_weights = expert_weights.softmax(dim=-1) # [T, A] - expert_outs = self.cond_ffn(x, 
expert_indices) - return torch.einsum("tai,ta -> ti", expert_outs, expert_weights) - - -class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): - super().__init__() - self.use_kv_cache = args.use_kv_cache - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args, layer_id) - if args.moe: - self.block_sparse_moe = MOEFeedForward(args) - else: - self.feed_forward = FeedForward(args) - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) - - def forward( - self, x, freqs_cos, freqs_sin, start_pos=None, cache_k=None, cache_v=None - ): # x: 1xN - h, cache_k, cache_v = self.attention.forward( - self.attention_norm(x), - freqs_cos, - freqs_sin, - start_pos, - cache_k, - cache_v, - ) - - h = x + h - if hasattr(self, "block_sparse_moe"): - out = h + self.block_sparse_moe(self.ffn_norm(h)) - else: - out = h + self.feed_forward(self.ffn_norm(h)) - return out, cache_k, cache_v - - -class Transformer(nn.Module): - def __init__(self, params: ModelArgs): - super().__init__() - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - - self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) - self.layers = torch.nn.ModuleList() - for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) - self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = nn.Linear(params.dim, params.vocab_size, bias=False) - self.use_kv_cache = params.use_kv_cache - - freqs_cos, freqs_sin = precompute_freqs_cis( - self.params.dim // self.params.n_heads, self.params.max_seq_len - ) - self.register_buffer("freqs_cos", freqs_cos, persistent=False) - self.register_buffer("freqs_sin", freqs_sin, persistent=False) - - def forward( - self, - tokens: torch.Tensor, - start_pos: Optional[ - torch.Tensor - ] = None, # Scalar tensor indicating size of window of the caches - cache_k: Optional[ - torch.Tensor - ] = None, # n_layers long, it should be a list of tensors to accommodate the potential size difference among attention layers. The current implementation is overly simplified. - cache_v: Optional[torch.Tensor] = None, # n_layers long - ) -> Union[ - torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]] - ]: - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens) - - if self.use_kv_cache: - assert ( - cache_k is not None and cache_v is not None and start_pos is not None - ), "Caches and start_pos must be provided when use_kv_cache is True" - assert ( - cache_k.size(0) == self.n_layers - ), f"{cache_k.size(0)} != {self.n_layers}" - assert ( - cache_v.size(0) == self.n_layers - ), f"{cache_v.size(0)} != {self.n_layers}" - - sp = start_pos.item() - # self.params.max_seq_len - 1 because of 0 based indexing, and - 1 again because our input seq len is 1 and its added to the cache before accessing the cache - torch._constrain_as_size(sp, min=0, max=self.params.max_seq_len - 2) - torch._constrain_as_value( - cache_k.shape[0], - max=self.n_layers, - min=self.n_layers, - ) - torch._constrain_as_value( - cache_v.shape[0], min=self.n_layers, max=self.n_layers - ) - # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. 
- freqs_cos = self.freqs_cos[sp : sp + seqlen] - freqs_sin = self.freqs_sin[sp : sp + seqlen] - else: - assert ( - start_pos is None and cache_k is None and cache_v is None, - ), "Caches and start_pos are unused when use_kv_cache is False" - freqs_cos = self.freqs_cos[:seqlen] - freqs_sin = self.freqs_sin[:seqlen] - - for index, layer in enumerate(self.layers): - if self.use_kv_cache: - if self.params.use_sdpa_with_kv_cache_op: - h, updated_cache_k, updated_cache_v = layer( - h, - freqs_cos, - freqs_sin, - sp, # pyre-ignore[61] - cache_k, - cache_v, - ) - else: - h, updated_cache_k, updated_cache_v = layer( - h, - freqs_cos, - freqs_sin, - sp, # pyre-ignore[61] - cache_k[index], # pyre-ignore[16] - cache_v[index], - ) - cache_k[index] = updated_cache_k # pyre-ignore[16] - cache_v[index] = updated_cache_v - - else: - h, _, _ = layer(h, freqs_cos, freqs_sin) - - h = self.norm(h) - - logits = self.output(h) - if self.use_kv_cache: - return (logits, cache_k, cache_v) # pyre-ignore - else: - # 'None' is not a valid return for export so have to split the return into if else - return logits - - # For each layer return the sizes of the needed caches - def get_cache_sizes(self): - # cache_k and cache_v have the same shape so could pick either here. - return [self.n_layers, *self.layers[0].attention.kv_cache_sizes] - class Llama2Model(EagerModelBase): def __init__(self, **kwargs): diff --git a/setup.py b/setup.py index 5cc1772882..3544960fb0 100644 --- a/setup.py +++ b/setup.py @@ -236,14 +236,18 @@ def run(self): setup( package_dir={ "executorch/backends": "backends", + # TODO(mnachin T180504136): Do not put examples/models + # into core pip packages. Refactor out the necessary utils + # or core models files into a separate package. + "executorch/examples/models": "examples/models", "executorch/exir": "exir", + "executorch/extension": "extension", "executorch/schema": "schema", "executorch/sdk": "sdk", - "executorch/util": "util", - "executorch/extension": "extension", "executorch/sdk/bundled_program": "sdk/bundled_program", - "tosa": "backends/arm/third-party/serialization_lib/python/tosa", + "executorch/util": "util", "serializer": "backends/arm/third-party/serialization_lib/python/serializer", + "tosa": "backends/arm/third-party/serialization_lib/python/tosa", }, cmdclass=cmdclass, ext_modules=ext_modules, From a6d71e275fb780ba7436a1380048e4215703f3d2 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Mon, 4 Mar 2024 08:25:19 -0800 Subject: [PATCH 003/290] Be able to set RopE.Freq_Base explicitly (#2064) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2064 Currently it is set to 10K all the time. Let's make it possible so that one can set it explicitly. This should be a no-op for existing models and CI jobs. 
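As a usage sketch (the vocab size and the 500000.0 base below are illustrative values, not part of this change), the new ModelArgs field is picked up when the RoPE tables are precomputed in Transformer.__init__:

    from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer

    args = ModelArgs(vocab_size=32000, rope_freq_base=500000.0)  # rope_freq_base defaults to 10000.0
    model = Transformer(args)  # freqs_cos/freqs_sin buffers are built with args.rope_freq_base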
bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: larryliu0820 Differential Revision: D54131552 fbshipit-source-id: 46e9eb98bc7bb99c7225d3d7b02ab4bd2ecaed1c --- examples/models/llama2/llama_transformer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py index fdfb41830d..1e4a7386e7 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama2/llama_transformer.py @@ -82,6 +82,7 @@ class ModelArgs: use_sdpa_with_kv_cache_op: bool = ( False # Use custom sdpa op that updates kv cache in-place ) + rope_freq_base: float = 10000.0 # The base frequency for RoPE # Additional Model Metadata needed at runtime bos_idx: int = 1 eos_idx: int = 3 @@ -108,7 +109,7 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: ) -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): +def precompute_freqs_cis(dim: int, end: int, theta: float): freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) t = torch.arange(end, device=freqs.device) # pyre-ignore freqs = torch.outer(t, freqs).float() # pyre-ignore @@ -411,7 +412,9 @@ def __init__(self, params: ModelArgs): self.use_kv_cache = params.use_kv_cache freqs_cos, freqs_sin = precompute_freqs_cis( - self.params.dim // self.params.n_heads, self.params.max_seq_len + params.dim // params.n_heads, + params.max_seq_len, + params.rope_freq_base, ) self.register_buffer("freqs_cos", freqs_cos, persistent=False) self.register_buffer("freqs_sin", freqs_sin, persistent=False) From 9283e506c822b8f388493243d32aad1234e1741c Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Mon, 4 Mar 2024 08:25:19 -0800 Subject: [PATCH 004/290] Skeleton for GGUF conversion (#2018) Summary: Starting a skeleton implementation - Only llama for now. Will add new architecture inside gguf_util/converters/ - Only fp32. Will figure out the quantization. - Reusing the existing llama code in examples to reduce duplication. For other architectures, there won't be much duplication. - Currently converting to PyTorch, and then going through export, to_edge, to_executorch. But that's an implementation detail. 
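A minimal sketch of the flow this skeleton wires up (the .gguf path below is a placeholder, and the actual export/save steps are still TODOs in this change):

    from executorch.extension.gguf_util.load_gguf import load_file
    from executorch.extension.gguf_util.converter import convert_to_pte

    gguf_model_args, gguf_weights = load_file("OpenHermes-2.5-Mistral-7B-fp16.gguf")  # parse GGUF metadata and tensors
    pte_program = convert_to_pte(gguf_model_args, gguf_weights)  # build the llama nn.Module, load weights, then export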
Pull Request resolved: https://github.com/pytorch/executorch/pull/2018 Test Plan: `python extension/gguf_util/convert_main.py --gguf_file="/Users/mnachin/models_gguf/OpenHermes-2.5-Mistral-7B-fp16.gguf"` Reviewed By: shoumikhin Differential Revision: D53982833 Pulled By: mergennachin fbshipit-source-id: 5402c0de3e729e434763a5d6a390448603e77429 --- extension/gguf_util/README.md | 6 + extension/gguf_util/convert_main.py | 53 ++++++++ extension/gguf_util/converter.py | 27 ++++ .../gguf_util/converters/llama_converter.py | 121 ++++++++++++++++++ extension/gguf_util/install_requirements.sh | 8 ++ extension/gguf_util/load_gguf.py | 102 +++++++++++++++ 6 files changed, 317 insertions(+) create mode 100644 extension/gguf_util/README.md create mode 100644 extension/gguf_util/convert_main.py create mode 100644 extension/gguf_util/converter.py create mode 100644 extension/gguf_util/converters/llama_converter.py create mode 100644 extension/gguf_util/install_requirements.sh create mode 100644 extension/gguf_util/load_gguf.py diff --git a/extension/gguf_util/README.md b/extension/gguf_util/README.md new file mode 100644 index 0000000000..72a49e1b93 --- /dev/null +++ b/extension/gguf_util/README.md @@ -0,0 +1,6 @@ +# Summary +This is an experimental feature to convert [GGUF format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) to PTE file, which can be executed directly on ExecuTorch. + +## Usage: + + python executorch/extension/gguf_util/convert_main.py --gguf_file= --pte_file= diff --git a/extension/gguf_util/convert_main.py b/extension/gguf_util/convert_main.py new file mode 100644 index 0000000000..038c104f69 --- /dev/null +++ b/extension/gguf_util/convert_main.py @@ -0,0 +1,53 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +from executorch.extension.gguf_util.converter import convert_to_pte +from executorch.extension.gguf_util.load_gguf import load_file + + +def save_pte_program(_, pte_file) -> None: + # TODO (mnachin): Save the PTE program to a file + print(f"Saving PTE program to {pte_file}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "--gguf_file", + type=str, + help="The GGUF file to load.", + ) + parser.add_argument( + "--pte_file", + type=str, + help="The path to save the PTE file.", + ) + args = parser.parse_args() + + # Step 1: Load the GGUF file + gguf_model_args, gguf_weights = load_file(args.gguf_file) + + # Step 2: Convert the GGUF model to PTE + # Currently, underneath the hood, it is first converting the GGUF model + # to a PyTorch model (nn.Module), then exporting to ET. + # + # NOTE: In the future, it may makes sense to refactor out the conversion from GGUF to nn.Module + # into its own package that can be shared between ExecuTorch and PyTorch core. I can + # imagine that there will be a need to do load GGUF file directly into PyTorch core, and + # use torch.compile/AOTInductor to accelerate on server, without ever touching ExecuTorch. + # + # TODO(mnachin): Add a knob to delegate to various backends. 
+ pte_program = convert_to_pte(gguf_model_args, gguf_weights) + + # Step 3: Save the PTE program so that + # it can be used by the ExecuTorch runtime + save_pte_program(pte_program, args.pte_file) + + +if __name__ == "__main__": + main() diff --git a/extension/gguf_util/converter.py b/extension/gguf_util/converter.py new file mode 100644 index 0000000000..58b8afdca5 --- /dev/null +++ b/extension/gguf_util/converter.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights + + +def convert_to_pte(model_args: GGUFModelArgs, weights: GGUFWeights) -> None: + """Convert a GGUF model into a PTE file, an ExecuTorch program. + + Args: + model_args: The arguments for the GGUF model. + weights: The weights of the GGUF model. + """ + + # Switch statement based on the architecture enum. + # Each enum has its own converter function. + if model_args.arch == "llama": + from executorch.extension.gguf_util.converters.llama_converter import ( + convert_to_pte as llama_convert_to_pte, + ) + + return llama_convert_to_pte(model_args, weights) + else: + raise NotImplementedError("Unsupported architecture.") diff --git a/extension/gguf_util/converters/llama_converter.py b/extension/gguf_util/converters/llama_converter.py new file mode 100644 index 0000000000..a2107ad605 --- /dev/null +++ b/extension/gguf_util/converters/llama_converter.py @@ -0,0 +1,121 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import copy +from typing import Any, Mapping + +import torch +import torch.nn as nn +from executorch.examples.models.llama2.llama_transformer import ( + ModelArgs as LlamaModelArgs, + Transformer as LlamaTransformer, +) +from executorch.extension.gguf_util.load_gguf import GGUFModelArgs, GGUFWeights + + +def _create_pt_model( + gguf_model_args: GGUFModelArgs, +) -> nn.Module: + llama_model_args = LlamaModelArgs( + dim=gguf_model_args.embedding_length, + n_layers=gguf_model_args.block_count, + n_heads=gguf_model_args.attention.head_count, + n_kv_heads=gguf_model_args.attention.head_count_kv, + vocab_size=gguf_model_args.vocab_size, + norm_eps=gguf_model_args.attention.layer_norm_rms_epsilon, + hidden_dim=gguf_model_args.feed_forward_length, + rope_freq_base=gguf_model_args.rope.freq_base, + ) + pt_model = LlamaTransformer(llama_model_args) + pt_model.eval() + return pt_model + + +_name_replacements = [ + ("blk", "layers"), + ("token_embd", "tok_embeddings"), + ("attn_q", "attention.wq"), + ("attn_k", "attention.wk"), + ("attn_v", "attention.wv"), + ("attn_output", "attention.wo"), + ("attn_norm", "attention_norm"), + ("output_norm.weight", "norm.weight"), + ("ffn_down", "feed_forward.w2"), + ("ffn_gate", "feed_forward.w1"), + ("ffn_up", "feed_forward.w3"), +] + + +def _convert_gguf_tensor_name_to_llama_nn(gguf_name: str) -> str: + result = copy.deepcopy(gguf_name) + for gguf_string, replacement in _name_replacements: + result = result.replace(gguf_string, replacement) + return result + + +def _convert_to_state_dict(gguf_weights: GGUFWeights) -> Mapping[str, Any]: + + state_dict = {} + for tensor in gguf_weights.tensors: + gguf_tensor_name = tensor.name + nn_tensor_name = _convert_gguf_tensor_name_to_llama_nn(gguf_tensor_name) + new_tensor = tensor.data.reshape(tensor.shape).transpose() + state_dict[nn_tensor_name] = torch.from_numpy(new_tensor) + + return state_dict + + +def _load_weights_into_nn( + pt_model: nn.Module, gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights +): + + state_dict: Mapping[str, Any] = _convert_to_state_dict(gguf_weights) + + # We need to fake initialize the mask, to match with the llama_transformer.py + for id in range(gguf_model_args.block_count): + mask_name = f"layers.{id}.attention.mask" + mask = torch.full( + (1, 1, pt_model.params.max_seq_len, pt_model.params.max_seq_len), + float("-inf"), + ) + mask = torch.triu(mask, diagonal=1) + state_dict[mask_name] = mask + + pt_model.load_state_dict(state_dict) + return + + +def _create_pte_program(pt_model: nn.Module) -> bytes: + # TODO (mnachin): Export + return + + +def convert_to_pte(gguf_model_args: GGUFModelArgs, gguf_weights: GGUFWeights) -> bytes: + """Convert a GGUF model into an ExecuTorch program. + + Args: + model_args: The arguments for the GGUF model. + weights: The weights of the GGUF model. + """ + + assert ( + gguf_model_args.arch == "llama" + ), "Only LLaMa models are supported by this converter." 
+ + # Step 1: Create the PyTorch model + print("Create the PyTorch model") + pt_model = _create_pt_model( + gguf_model_args, + ) + + # Step 2: Load the weights into the PyTorch model + print("Load the weights into the PyTorch model") + _load_weights_into_nn(pt_model, gguf_model_args, gguf_weights) + + # Step 3: Export to ExecuTorch + print("Exporting to ExecuTorch.") + pte_program = _create_pte_program(pt_model) + return pte_program diff --git a/extension/gguf_util/install_requirements.sh b/extension/gguf_util/install_requirements.sh new file mode 100644 index 0000000000..4221b4c16f --- /dev/null +++ b/extension/gguf_util/install_requirements.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +pip install gguf==0.6.0 diff --git a/extension/gguf_util/load_gguf.py b/extension/gguf_util/load_gguf.py new file mode 100644 index 0000000000..c258fdca19 --- /dev/null +++ b/extension/gguf_util/load_gguf.py @@ -0,0 +1,102 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import gguf +from gguf import GGUFValueType, ReaderTensor + + +@dataclass +class AttentionArgs: + head_count: int + head_count_kv: int + layer_norm_rms_epsilon: float + + +@dataclass +class RopeArgs: + freq_base: float + + +@dataclass +class GGUFModelArgs: + arch: str + embedding_length: int + block_count: int + feed_forward_length: int + vocab_size: int + attention: AttentionArgs + rope: RopeArgs + + +@dataclass +class GGUFWeights: + tensors: list[ReaderTensor] + + +def _get_metadata(reader: gguf.GGUFReader) -> dict[str, Any]: + metadata: dict[str, Any] = {} + + for idx, field in enumerate(reader.fields.values()): + val = None + if field.types[:1] == [GGUFValueType.ARRAY]: + itype = field.types[-1] + if itype == GGUFValueType.STRING: + val = [ + str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data + ] + else: + val = [pv for idx in field.data for pv in field.parts[idx].tolist()] + elif field.types[0] == GGUFValueType.STRING: + val = str(bytes(field.parts[-1]), encoding="utf-8") + else: + val = field.parts[-1].tolist()[0] + + metadata[field.name] = val + + return metadata + + +def _build_model_args(metadata: dict[str, Any]) -> GGUFModelArgs: + arch = metadata["general.architecture"] + + return GGUFModelArgs( + arch=arch, + embedding_length=metadata[f"{arch}.embedding_length"], + block_count=metadata[f"{arch}.block_count"], + feed_forward_length=metadata[f"{arch}.feed_forward_length"], + vocab_size=len(metadata["tokenizer.ggml.tokens"]), + attention=AttentionArgs( + head_count=metadata[f"{arch}.attention.head_count"], + head_count_kv=metadata[f"{arch}.attention.head_count_kv"], + layer_norm_rms_epsilon=metadata[f"{arch}.attention.layer_norm_rms_epsilon"], + ), + rope=RopeArgs( + freq_base=metadata[f"{arch}.rope.freq_base"], + ), + ) + + +def load_file(gguf_file: str) -> (GGUFModelArgs, GGUFWeights): + """ + Load a GGUF file and return the model arguments and weights. 
+ """ + if not Path(gguf_file).is_file(): + raise ValueError(f"Could not find file {gguf_file}") + + reader = gguf.GGUFReader(gguf_file, "r") + + # Step 1: Build GGUFModelArgs + metadata = _get_metadata(reader) + model_args = _build_model_args(metadata) + + # Step 2: Build GGUFWeights + gguf_weights = GGUFWeights(tensors=reader.tensors) + + return (model_args, gguf_weights) From 3ff0f77ee71a2719a6262982edfb437a42f0581a Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 4 Mar 2024 09:44:48 -0800 Subject: [PATCH 005/290] Fix llama quantize_per_token numerics Summary: The existing implementation can produce quantized values outside the quant range, since we add the zero points after clamping. This was not a problem for symmetric quantization since zero points are 0 there, but causes dqlinear numerics to diverge significantly from the lowered implementation for asymmetric quantization. Reviewed By: digantdesai Differential Revision: D54320424 fbshipit-source-id: e8d9136354b0dac1993ef7825fc331f68d0d4c05 --- examples/models/llama2/quantize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index d7e024ad10..2e5e57ffb7 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -234,8 +234,9 @@ def quantize_per_token( """ _quant_min_max_bounds_check(quant_min, quant_max, dtype) _per_token_quant_qparam_dim_check(input, scales, zero_points) - input = torch.round(input / scales).clamp(quant_min, quant_max).to(dtype) - input = input + zero_points + input = ( + torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype) + ) return input From 2a42737622386546734f6e3a6f40b90a3d48be38 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 4 Mar 2024 09:53:05 -0800 Subject: [PATCH 006/290] Add choose_qparams_per_token_asymmetric for llama on XNNPACK Summary: XNNPACK uses asymmetric activation quantizations, but the existing `choose_qparams_per_token` assumed symmetric quantization (zero point is always 0). This caused significant numerical discrepancies between eager and lowered models. This commit adds a new asymmetric version of `choose_qparams_per_token` for this purpose. Reviewed By: digantdesai Differential Revision: D54323650 fbshipit-source-id: afd1e8f8b582bc8c07d4b03752ab71caa30c2bb0 --- examples/models/llama2/quantize.py | 68 ++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index 2e5e57ffb7..1b34593b08 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -189,6 +189,74 @@ def choose_qparams_per_token_meta( ) +# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py +quantized_decomposed_lib.define( + "choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" +) + + +@impl( + quantized_decomposed_lib, + "choose_qparams_per_token_asymmetric", + "CompositeExplicitAutograd", +) +def choose_qparams_per_token_asymmetric( + input: torch.Tensor, + dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Choose quantization parameters for per token quantization. This means for a N dimension Tensor + (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize + every N elements with the same quantization parameter. The dimension for scales/zero_points + will be (M1 * M2 ... 
* Mn) + + Args: + input (torch.Tensor): original float32/float16 Tensor + dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor + + Returns: + scales and zero_points, both float32 Tensors + """ + # Based on https://github.com/google/XNNPACK/blob/df156f0cf3db5a4576cc711123eeb54915f82ffc/src/xnnpack/quantization.h#L18 + qmin, qmax = -128, 127 + min_val, max_val = torch.aminmax(input, dim=-1, keepdim=True) + min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + eps = torch.finfo(torch.float32).eps # use xnnpack eps? + + # scale + scale = (max_val_pos - min_val_neg) / float(qmax - qmin) + scale = scale.clamp(min=eps) + + # zero point + descaled_min = min_val_neg / scale + descaled_max = max_val_pos / scale + zero_point_from_min_error = qmin + descaled_min + zero_point_from_max_error = qmax + descaled_max + zero_point = torch.where( + zero_point_from_min_error + zero_point_from_max_error > 0, + qmin - descaled_min, + qmax - descaled_max, + ) + zero_point = torch.clamp(zero_point, qmin, qmax).round() + + return scale.to(torch.float32), zero_point.to(torch.float32) + + +@impl( + quantized_decomposed_lib, + "choose_qparams_per_token_asymmetric", + "Meta", +) +def choose_qparams_per_token_asymmetric_meta( + input: torch.Tensor, + dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor]: + size = (1, input.size(-1)) + return torch.empty(size, dtype=torch.double, device=input.device), torch.empty( + size, dtype=torch.int64, device=input.device + ) + + def _per_token_quant_qparam_dim_check(input, scales, zero_points): num_tokens = math.prod(list(input.size())[:-1]) assert ( From d25b57b7141b4469e8ec045cbe754b0e5f387b35 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 10:43:08 -0800 Subject: [PATCH 007/290] Back out "Enable embedding_byte output dtype be different than scales/zp dtype" (#2210) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2210 Original commit changeset: f79754770ddc Original Phabricator Diff: D54141337 Reviewed By: kjweng, mikekgfb Differential Revision: D54454388 fbshipit-source-id: 13381d5e14f53edfaa5b57997e4b8b9ac57a27f4 --- examples/models/llama2/ops/quantized.yaml | 2 +- examples/models/llama2/ops/quantized_ops.py | 11 ++--- examples/models/llama2/quantize.py | 4 +- .../_quant_patterns_and_replacements.py | 49 +------------------ kernels/quantized/cpu/op_embedding.cpp | 45 ++++++----------- kernels/quantized/quantized.yaml | 2 +- kernels/quantized/test/op_embedding_test.cpp | 9 ---- 7 files changed, 23 insertions(+), 99 deletions(-) diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml index f912b6ce0e..3c5376ceef 100644 --- a/examples/models/llama2/ops/quantized.yaml +++ b/examples/models/llama2/ops/quantized.yaml @@ -1,4 +1,4 @@ -- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!) 
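For intuition on the quantize_per_token fix above, here is a minimal eager-mode sketch using plain tensor math rather than the registered quantized_decomposed ops (the values are illustrative only): with an asymmetric zero point, clamping before adding the zero point can land outside the int8 range, while the corrected ordering (round, add zero point, then clamp) stays in range.

import torch

x = torch.tensor([[-3.0, 0.0, 5.0]])    # one token
scale = torch.tensor([[0.02]])          # per-token scale
zero_point = torch.tensor([[100.0]])    # asymmetric zero point
qmin, qmax = -128, 127

# old ordering: clamp first, then add the zero point
old = torch.round(x / scale).clamp(qmin, qmax) + zero_point
# new ordering from the patch: add the zero point, then clamp
new = torch.round(x / scale + zero_point).clamp(qmin, qmax)

print(old)  # tensor([[-28., 100., 227.]])  227 would overflow int8 when cast
print(new)  # tensor([[-50., 100., 127.]])  stays inside [qmin, qmax]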
variants: function kernels: - arg_meta: null diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py index 726416c5ff..f316aa42b4 100644 --- a/examples/models/llama2/ops/quantized_ops.py +++ b/examples/models/llama2/ops/quantized_ops.py @@ -14,12 +14,12 @@ ) # to not be confused with torch.ops.quantized.* ops. quantized_lib.define( "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", + "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor", ) quantized_lib.define( "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", + "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)", ) @@ -31,8 +31,6 @@ def embedding_byte_meta( weight_quant_min, weight_quant_max, indices, - *, - dtype, ): assert weight.dtype in [ torch.int8, @@ -73,7 +71,7 @@ def embedding_byte_meta( weight_quant_max, weight.dtype, ) - return torch.ops.aten.embedding.default(weight, indices).to(dtype) + return torch.ops.aten.embedding.default(weight, indices) @impl_abstract("llama_quantized::embedding_byte.out") @@ -84,8 +82,6 @@ def embedding_byte_out_meta( weight_quant_min, weight_quant_max, indices, - *, - dtype, out, ): return embedding_byte_meta( @@ -95,5 +91,4 @@ def embedding_byte_out_meta( weight_quant_min, weight_quant_max, indices, - dtype=dtype, ) diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index 1b34593b08..eac11678e6 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -887,8 +887,8 @@ def __init__( @torch.no_grad() def forward(self, indices: torch.Tensor) -> torch.Tensor: return torch.ops.llama_quantized.embedding_byte.default( - self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype - ) + self.weight, self.scales, None, 0, 0, indices + ).to(self.dtype) # result_weights = self.weight.index_select(0, indices.view(-1)) diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 4ed8eb5dea..17287dc2cf 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -27,7 +27,7 @@ quantized_decomposed_lib.define( "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " - "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? 
dtype=None) -> Tensor", + "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor", ) quantized_decomposed_lib.define( @@ -482,48 +482,6 @@ def replacement( ) return out - @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") - def pattern_with_dtype( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indicies, - dtype, - ): - weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( - weight, - weight_scales, - weight_zero_points, - 0, - weight_quant_min, - weight_quant_max, - torch.uint8, - ) - out = torch.ops.aten.embedding.default(weight, indicies).to(dtype) - return out - - def replacement_with_dtype( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indicies, - dtype, - ): - out = torch.ops.quantized_decomposed.embedding_byte.default( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indicies, - dtype=dtype, - ) - return out - @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") def pattern_with_padding_idx( weight, @@ -571,11 +529,6 @@ def replacement_with_padding_idx( _trace_and_lower_to_edge_ops(replacement), [], ), - ( - _trace_and_lower_to_edge_ops(pattern_with_dtype), - _trace_and_lower_to_edge_ops(replacement_with_dtype), - [], - ), ( _trace_and_lower_to_edge_ops(pattern_with_padding_idx), _trace_and_lower_to_edge_ops(replacement_with_padding_idx), diff --git a/kernels/quantized/cpu/op_embedding.cpp b/kernels/quantized/cpu/op_embedding.cpp index 640a6a22e1..2964ecbab5 100644 --- a/kernels/quantized/cpu/op_embedding.cpp +++ b/kernels/quantized/cpu/op_embedding.cpp @@ -31,7 +31,6 @@ void check_embedding_byte_args( const int64_t weight_quant_min, const int64_t weight_quant_max, const Tensor& indices, - exec_aten::optional out_dtype, Tensor& out) { ET_CHECK_MSG( weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim()); @@ -76,9 +75,8 @@ void check_embedding_byte_args( static_cast(out.scalar_type())); ET_CHECK_MSG( - weight_scales.scalar_type() == ScalarType::Float || - weight_scales.scalar_type() == ScalarType::Half, - "weight_scales.scalar_type() %" PRId8 " is not supported:", + weight_scales.scalar_type() == out.scalar_type(), + "weight scales scalar type %" PRId8 " does not match out.scalar_type()", static_cast(weight_scales.scalar_type())); if (opt_weight_zero_points.has_value()) { @@ -118,19 +116,13 @@ void check_embedding_byte_args( " is greater than weight quant max: %" PRId64, weight_quant_min, weight_quant_max); - - if (out_dtype.has_value()) { - ET_CHECK_MSG( - out.scalar_type() == out_dtype.value(), - "output_dtype must match the dtype of the out tensor"); - } } /** * Retrieves the embeddings specified by indices, dequantizes them, and stores * them in out */ -template +template void embedding_byte_per_channel( const Tensor& weight, const Tensor& weight_scales, @@ -150,19 +142,19 @@ void embedding_byte_per_channel( CTYPE_OUT* out_data = out.mutable_data_ptr(); const int64_t* indices_ptr = indices.const_data_ptr(); - const CTYPE_PARAMS* scales = weight_scales.const_data_ptr(); - const CTYPE_PARAMS* zero_points = nullptr; + const CTYPE_OUT* scales = weight_scales.const_data_ptr(); + const CTYPE_OUT* zero_points = nullptr; if (opt_weight_zero_points.has_value()) { - zero_points = opt_weight_zero_points.value().const_data_ptr(); + zero_points = opt_weight_zero_points.value().const_data_ptr(); } for (int i = 0; i < indices.numel(); i++) { int64_t index = indices_ptr[i]; // If using 
groupwise embedding int32_t qparams_index = index * num_groups_per_channel; - CTYPE_PARAMS zp = 0.0; - const CTYPE_PARAMS* scale_ptr = scales + qparams_index; - const CTYPE_PARAMS* zero_points_ptr = nullptr; + CTYPE_OUT zp = 0.0; + const CTYPE_OUT* scale_ptr = scales + qparams_index; + const CTYPE_OUT* zero_points_ptr = nullptr; if (opt_weight_zero_points.has_value()) { zero_points_ptr = zero_points + qparams_index; } @@ -172,7 +164,7 @@ void embedding_byte_per_channel( for (int j = 0; j < embedding_dim; ++j) { int32_t group_id = j / group_size; - const CTYPE_PARAMS scale = scale_ptr[group_id]; + const CTYPE_OUT scale = scale_ptr[group_id]; if (opt_weight_zero_points.has_value()) { zp = zero_points_ptr[group_id]; } @@ -227,7 +219,6 @@ Tensor& quantized_embedding_byte_out( const int64_t weight_quant_min, const int64_t weight_quant_max, const Tensor& indices, - exec_aten::optional out_dtype, Tensor& out) { // TODO (jakeszwe): improve these to account for the size of out in relation // to weight and indices accounting for a possible batch dimension @@ -238,20 +229,16 @@ Tensor& quantized_embedding_byte_out( weight_quant_min, weight_quant_max, indices, - out_dtype, out); - ScalarType weight_type = weight.scalar_type(); - ScalarType params_type = weight_scales.scalar_type(); + ScalarType w_type = weight.scalar_type(); ScalarType out_type = out.scalar_type(); constexpr auto name = "quantized_decomposed::embedding_byte.out"; - ET_SWITCH_TWO_TYPES(Byte, Char, weight_type, ctx, name, CTYPE_W, [&]() { - ET_SWITCH_TWO_TYPES(Float, Half, params_type, ctx, name, CTYPE_P, [&]() { - ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { - embedding_byte_per_channel( - weight, weight_scales, opt_weight_zero_points, indices, out); - }); + ET_SWITCH_TWO_TYPES(Byte, Char, w_type, ctx, name, CTYPE_W, [&]() { + ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { + embedding_byte_per_channel( + weight, weight_scales, opt_weight_zero_points, indices, out); }); }); @@ -266,7 +253,6 @@ Tensor& quantized_embedding_byte_out( int64_t weight_quant_min, int64_t weight_quant_max, const Tensor& indices, - exec_aten::optional out_dtype, Tensor& out) { // TODO(larryliu): Add a context arg to the real op function and remove this // wrapper @@ -279,7 +265,6 @@ Tensor& quantized_embedding_byte_out( weight_quant_min, weight_quant_max, indices, - out_dtype, out); } diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index 93fc1569d5..34830b01bd 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -34,7 +34,7 @@ - arg_meta: null kernel_name: torch::executor::dequantize_per_channel_out -- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!) 
variants: function kernels: - arg_meta: null diff --git a/kernels/quantized/test/op_embedding_test.cpp b/kernels/quantized/test/op_embedding_test.cpp index 20d4a04c71..49605977cc 100644 --- a/kernels/quantized/test/op_embedding_test.cpp +++ b/kernels/quantized/test/op_embedding_test.cpp @@ -76,7 +76,6 @@ void test_dtype() { quant_min, quant_max, indices, - out.scalar_type(), out); // (8 - 1) * 0.5 = 3.5 @@ -140,7 +139,6 @@ TEST(OpQuantizedEmbeddingTest, ConsitencyWithReferencePattern) { quant_min, quant_max, indices, - out.scalar_type(), out); // Do Q DQ embedding @@ -198,7 +196,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbedding) { quant_min, quant_max, indices, - out.scalar_type(), out); EXPECT_TENSOR_EQ(out, expected); @@ -223,7 +220,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbedding) { quant_min, quant_max, indices, - out.scalar_type(), out); EXPECT_TENSOR_EQ(out, expected); @@ -255,7 +251,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath1) { quant_min, quant_max, indices, - out.scalar_type(), out), ""); } @@ -286,7 +281,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath2) { quant_min, quant_max, indices, - out.scalar_type(), out), ""); } @@ -316,7 +310,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath3) { quant_min, quant_max, indices, - out.scalar_type(), out), ""); } @@ -346,7 +339,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath4) { quant_min, quant_max, indices, - out.scalar_type(), out), ""); } @@ -376,7 +368,6 @@ TEST(OpQuantizedEmbeddingTest, TestGroupWiseQuantizedEmbeddingDeath5) { quant_min, quant_max, indices, - out.scalar_type(), out), ""); } From dc77c852e306c36b8f495b704b4178ba9aa90a1c Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Mon, 4 Mar 2024 11:15:05 -0800 Subject: [PATCH 008/290] Update black linter in OSS lintrunner (#2229) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2229 Black linter is upgraded to v24.2.0 in fbsource. Let's also upgrade the OSS linter to match this to reduce inconsistency between internal linter and OSS linters. Currently lintrunner is failing as is: https://hud.pytorch.org/hud/pytorch/executorch/main/1?per_page=50 Context: https://fb.workplace.com/groups/pyfmt/posts/1391116614859184/?fbclid=IwAR1Gag0Bkq2OE_4EeH5XY_iOFgwF6VE-7OAr9kmHeyB3QzkfaGvsWWY3nCo Reviewed By: kimishpatel Differential Revision: D54487606 fbshipit-source-id: 82d5eb011e6e96dd2b369299cd4a74724564b94c --- backends/apple/mps/operators/node_visitor.py | 1 - backends/arm/arm_backend.py | 8 +++-- backends/arm/arm_vela.py | 1 + .../decompose_scaled_dot_product_attention.py | 12 ++++---- backends/qualcomm/passes/i64_to_i32.py | 8 +++-- examples/arm/aot_arm_compiler.py | 1 + examples/models/llama2/quantize.py | 3 +- .../_quant_patterns_and_replacements.py | 30 +++++++++---------- exir/serde/export_serialize.py | 6 ++-- requirements-lintrunner.txt | 2 +- 10 files changed, 39 insertions(+), 33 deletions(-) diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py index 2c1f029912..ed2afea772 100644 --- a/backends/apple/mps/operators/node_visitor.py +++ b/backends/apple/mps/operators/node_visitor.py @@ -235,7 +235,6 @@ def get_serialized_data( def get_serialized_id( self, node: Union[torch.fx.Node, float, int], mps_graph: MPSGraph ) -> int: - """ Map a tensor to a unique id. If the tensor was already mapped, return the existent id. 
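The remaining hunks in this patch are mechanical reformatting from the black upgrade; as a quick orientation (paraphrasing one of the Qualcomm-pass hunks below), black 24.x prefers parenthesizing a long right-hand side over splitting the subscripted assignment target:

# formatted by black 22.12.0
decomposed_node_to_subgraph_node[
    decomposed_node
] = name_to_input_tensor_map[decomposed_node.name]

# formatted by black 24.2.0
decomposed_node_to_subgraph_node[decomposed_node] = (
    name_to_input_tensor_map[decomposed_node.name]
)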
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index f67877366d..c1bc9ed920 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -128,9 +128,11 @@ def preprocess( # noqa: C901 # Add output to TOSA graph tosa_graph.currRegion.currBasicBlock.addTensor( output.name, - inputs[0].shape - if is_permute_node_before_addmm(node) - else output.shape, + ( + inputs[0].shape + if is_permute_node_before_addmm(node) + else output.shape + ), ts.DType.INT8 if is_quant_node(node) else output.dtype, ) diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index 1f1d72cc7c..0d0f0eb037 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -12,6 +12,7 @@ import numpy as np + # Pack either input or output tensor block, compose the related arrays into # per-io structs to simplify runtime use. def vela_bin_pack_io(prefix, data): diff --git a/backends/qualcomm/passes/decompose_scaled_dot_product_attention.py b/backends/qualcomm/passes/decompose_scaled_dot_product_attention.py index 84e63db666..9fd3c8fb46 100644 --- a/backends/qualcomm/passes/decompose_scaled_dot_product_attention.py +++ b/backends/qualcomm/passes/decompose_scaled_dot_product_attention.py @@ -42,9 +42,9 @@ def call(self, graph_module: torch.fx.GraphModule): # In decomposed module, there are only input tensors for placeholder op. for decomposed_node in decomposed_module.graph.nodes: if decomposed_node.op == "placeholder": - decomposed_node_to_subgraph_node[ - decomposed_node - ] = name_to_input_tensor_map[decomposed_node.name] + decomposed_node_to_subgraph_node[decomposed_node] = ( + name_to_input_tensor_map[decomposed_node.name] + ) if decomposed_node.op == "output": last_decomposed_node = decomposed_node.args[0] @@ -76,9 +76,9 @@ def call(self, graph_module: torch.fx.GraphModule): subgraph_node.meta["source_fn_stack"] = [ (subgraph_node, subgraph_node.target) ] - decomposed_node_to_subgraph_node[ - decomposed_node - ] = subgraph_node + decomposed_node_to_subgraph_node[decomposed_node] = ( + subgraph_node + ) graph.erase_node(node) diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py index 6673741a91..76ba2924e1 100644 --- a/backends/qualcomm/passes/i64_to_i32.py +++ b/backends/qualcomm/passes/i64_to_i32.py @@ -21,9 +21,11 @@ def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] if isinstance(meta_val, tuple): node.meta["val"] = ( - fake_tensor.to(torch.int32) - if fake_tensor.dtype == torch.int64 - else fake_tensor + ( + fake_tensor.to(torch.int32) + if fake_tensor.dtype == torch.int64 + else fake_tensor + ) for fake_tensor in meta_val ) else: diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 7ed65c1c8b..253aa644cb 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -26,6 +26,7 @@ # quantization step in our example. This will take the models # from examples/models/ and quantize then export to delegate. + # Two simple models class AddModule(torch.nn.Module): def __init__(self): diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index eac11678e6..c3bf88a627 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -64,7 +64,8 @@ def dynamically_quantize_per_channel( with a final group of a size less than group size. 
Assumptions: - This function assumes symmetric quantization, axis ==0 and a dense memory format.""" + This function assumes symmetric quantization, axis ==0 and a dense memory format. + """ # assumes symmetric quantization # assumes axis == 0 diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 17287dc2cf..692fe8d1dd 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -337,9 +337,9 @@ def binary_relu_op_replacement( ] -def _get_binary_ops_patterns_and_replacements() -> List[ - Tuple[Callable, Callable, List[Callable]] -]: +def _get_binary_ops_patterns_and_replacements() -> ( + List[Tuple[Callable, Callable, List[Callable]]] +): # TODO: replace qbinary op with the ops implemented in lean mode binary_op_to_qbinary_ops = { @@ -360,9 +360,9 @@ def _get_binary_ops_patterns_and_replacements() -> List[ return pattern_and_replacements -def _get_reshape_patterns_and_replacements() -> List[ - Tuple[Callable, Callable, List[Callable]] -]: +def _get_reshape_patterns_and_replacements() -> ( + List[Tuple[Callable, Callable, List[Callable]]] +): def pattern( x, arg0, @@ -413,9 +413,9 @@ def replacement( ] -def _get_slice_patterns_and_replacements() -> List[ - Tuple[Callable, Callable, List[Callable]] -]: +def _get_slice_patterns_and_replacements() -> ( + List[Tuple[Callable, Callable, List[Callable]]] +): def pattern(x, dim, start, end, x_scale, x_zero_point, x_qmin, x_qmax): x = torch.ops.quantized_decomposed.dequantize_per_tensor.default( x, x_scale, x_zero_point, x_qmin, x_qmax, torch.uint8 @@ -439,9 +439,9 @@ def replacement(x, dim, start, end, x_scale, x_zero_point, x_qmin, x_qmax): ] -def _get_embedding_ops_patterns_and_replacements() -> List[ - Tuple[Callable, Callable, List[Callable]] -]: +def _get_embedding_ops_patterns_and_replacements() -> ( + List[Tuple[Callable, Callable, List[Callable]]] +): def get_pattern_and_replacement(): @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte") def pattern( @@ -569,9 +569,9 @@ def replacement(x, x_scale, x_zero_point, x_qmin, x_qmax): """ -def get_quant_patterns_and_replacements() -> List[ - Tuple[Callable, Callable, List[Callable]] -]: +def get_quant_patterns_and_replacements() -> ( + List[Tuple[Callable, Callable, List[Callable]]] +): return copy.copy( [ diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index d24f456389..7d07b20bfd 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -1728,9 +1728,9 @@ def deserialize( symbol_name_to_range, res.names_to_symbols, ) - model_opset_version: Optional[ - Dict[str, int] - ] = serialized_artifact.exported_program.opset_version # pyre-ignore + model_opset_version: Optional[Dict[str, int]] = ( + serialized_artifact.exported_program.opset_version # pyre-ignore + ) self._validate_model_opset_version(model_opset_version) upgrader = GraphModuleOpUpgrader( diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index e5caf38c59..85d7e8e13d 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -13,7 +13,7 @@ pycodestyle==2.10.0 torchfix==0.1.1 # UFMT -black==22.12.0 +black==24.2.0 ufmt==2.0.1 usort==1.0.5 From 3609c7231b3e89d27d396f3e38c3176f4d51c618 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 4 Mar 2024 12:30:44 -0800 Subject: [PATCH 009/290] Add copy_ to aten op lib. 
(#2202) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2202 title bypass-github-export-checks neither failure in oss looks related to this diff so skipping Reviewed By: lucylq Differential Revision: D54432221 fbshipit-source-id: 0bff99ecaa6c7c2e403166e3b338cef157830b0d --- exir/passes/__init__.py | 1 + kernels/aten/functions.yaml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py index cc43f51948..2611d6a154 100644 --- a/exir/passes/__init__.py +++ b/exir/passes/__init__.py @@ -249,6 +249,7 @@ def callWithLoggerEnabled(self, graph_module: torch.fx.GraphModule) -> None: # it's retraced after running to_out_variant with the first trace. memory.alloc, executorch_call_delegate, + torch.ops.aten.copy_.default, } to_out_var_skiplist.update(_EXECUTORCH_SYM_OPS) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index f5451593f8..a4237458fd 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -15,6 +15,8 @@ - op: _softmax.out +- op: copy_ + - op: _to_copy.out - op: _unique2.out From a03912603f14f1c91e0414cec39b73af1c9aae9d Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 010/290] Add op: clamp.Tensor_out (#2212) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2212 bypass-github-pytorch-ci-checks ghstack-source-id: 217233091 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624733 fbshipit-source-id: eafef06cadd68fb1c226b76ac8bb4dbb9857fce1 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_clamp.cpp | 81 +++++++++++++++++++++++++++++++ kernels/portable/cpu/targets.bzl | 1 + kernels/portable/functions.yaml | 5 ++ kernels/test/op_clamp_test.cpp | 25 ++++++++++ 5 files changed, 114 insertions(+) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index a4237458fd..eded965647 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -96,6 +96,8 @@ - op: clamp.out cpp_no_default_args: ['min'] +- op: clamp.Tensor_out + - op: clone.out - op: constant_pad_nd.out diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index 05b19c4ba2..4db4730edb 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -160,6 +161,86 @@ Tensor& clamp_out( return out; } +Tensor& clamp_tensor_out( + RuntimeContext& ctx, + const Tensor& in, + const exec_aten::optional& min_opt, + const exec_aten::optional& max_opt, + Tensor& out) { + (void)ctx; + + bool has_min = min_opt.has_value(); + bool has_max = max_opt.has_value(); + + ET_KERNEL_CHECK_MSG( + ctx, + has_min || has_max, + InvalidArgument, + out, + "At least one of 'min' or 'max' must not be None"); + + const Tensor& min = has_min ? min_opt.value() : in; + const Tensor& max = has_max ? 
max_opt.value() : in; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(in, min, max, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType in_type = in.scalar_type(); + ScalarType min_type = min.scalar_type(); + ScalarType max_type = max.scalar_type(); + ScalarType common_type = in_type; + ScalarType out_type = out.scalar_type(); + + if (has_min) { + common_type = promoteTypes(common_type, min_type); + } + if (has_max) { + common_type = promoteTypes(common_type, max_type); + } + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "clamp.Tensor_out"; + + ET_SWITCH_REALB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { + ET_SWITCH_REALB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { + ET_SWITCH_REALB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + apply_ternary_elementwise_fn< + CTYPE_IN, + CTYPE_MIN, + CTYPE_MAX, + CTYPE_OUT>( + [has_min, has_max]( + const CTYPE_IN val_in, + const CTYPE_MIN val_min, + const CTYPE_MAX val_max) { + CTYPE_OUT val_out = static_cast(val_in); + if (has_min) { + val_out = utils::max_override( + val_out, static_cast(val_min)); + } + if (has_max) { + val_out = utils::min_override( + val_out, static_cast(val_max)); + } + return val_out; + }, + in, + min, + max, + out); + }); + }); + }); + }); + + return out; +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 2b8805d248..07a7d680eb 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -198,6 +198,7 @@ _ATEN_OPS = ( name = "op_clamp", deps = [ ":scalar_utils", + "//executorch/kernels/portable/cpu/util:broadcast_util", "//executorch/kernels/portable/cpu/util:functional_util", "//executorch/kernels/portable/cpu/util:math_util", ], diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index c81e21c2b1..cac457171a 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -208,6 +208,11 @@ - arg_meta: null kernel_name: torch::executor::clamp_out +- op: clamp.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::clamp_tensor_out + - op: clone.out kernels: - arg_meta: null diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 08d898733e..3ad6574a87 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -40,6 +40,15 @@ Tensor& op_clamp_out( return torch::executor::aten::clamp_outf(context, self, min, max, out); } +Tensor& op_clamp_tensor_out( + const Tensor& self, + const optional& min, + const optional& max, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::clamp_outf(context, self, min, max, out); +} + /// Describes a test case, using tensors of the specified DTYPE. 
template struct ClampTestCase { @@ -436,3 +445,19 @@ TEST(OpClampOutTest, DynamicShapeUnbound) { Tensor ret = op_clamp_out(x, y, z, out); EXPECT_TENSOR_CLOSE(out, expected_result); } + +TEST(OpClampTensorOutTest, SmokeTest) { + TensorFactory tf_in; + TensorFactory tf_min; + TensorFactory tf_max; + TensorFactory tf_out; + + Tensor in = tf_in.make({1, 1}, {3}); + Tensor min = tf_min.make({1, 3}, {0, 1, 4}); + Tensor max = tf_max.make({2, 1}, {2, 5}); + Tensor out = tf_out.zeros({2, 3}); + Tensor expected = tf_out.make({2, 3}, {2, 2, 2, 3, 3, 4}); + + op_clamp_tensor_out(in, min, max, out); + EXPECT_TENSOR_EQ(out, expected); +} From 09d3318290f5fa38120c258f3a67679adcbe3f09 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 011/290] Add op: var.correction_out (#2213) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2213 bypass-github-pytorch-ci-checks ghstack-source-id: 217233086 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624736 fbshipit-source-id: a6b27eb1d72fc0491087116f7005298bb6ed293c --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_var.cpp | 125 +++++++++++++++++++++++-------- kernels/portable/cpu/targets.bzl | 1 + kernels/portable/functions.yaml | 5 ++ kernels/test/op_var_test.cpp | 25 +++++++ 5 files changed, 125 insertions(+), 33 deletions(-) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index eded965647..b127e32246 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -368,6 +368,8 @@ - op: upsample_nearest2d.vec_out +- op: var.correction_out + - op: var.out - op: view_copy.out diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index 9d4d8f87f6..7b8debaca0 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -15,9 +16,45 @@ namespace torch { namespace executor { namespace native { +namespace { -using Tensor = exec_aten::Tensor; -using ScalarType = exec_aten::ScalarType; +template +void compute_variance( + const Tensor& in, + Tensor& out, + optional> dim_list, + const size_t num, + const double denominator) { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + if (num == 0 || denominator == 0) { + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + out_data[out_ix] = NAN; + } + } else { + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + CTYPE_OUT sum = map_reduce_over_dim_list( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + in, + dim_list, + out_ix); + CTYPE_OUT mean = sum / num; + CTYPE_OUT sum2 = map_reduce_over_dim_list( + [mean](CTYPE_IN v) { + return ( + (static_cast(v) - mean) * + (static_cast(v) - mean)); + }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + in, + dim_list, + out_ix); + out_data[out_ix] = sum2 / denominator; + } + } +} + +} // namespace Tensor& var_out( RuntimeContext& ctx, @@ -40,37 +77,59 @@ Tensor& var_out( InvalidArgument, out); - ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, "var.out", CTYPE_IN, [&] { - ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, "var.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr(); - const size_t num = get_reduced_dim_product(in, dim_list); - const size_t denominator = unbiased ? 
num - 1 : num; - if (num == 0 || denominator == 0) { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - out_data[out_ix] = NAN; - } - } else { - for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { - CTYPE_OUT sum = map_reduce_over_dim_list( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); - CTYPE_OUT mean = sum / num; - CTYPE_OUT sum2 = map_reduce_over_dim_list( - [mean](CTYPE_IN v) { - return ( - (static_cast(v) - mean) * - (static_cast(v) - mean)); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - in, - dim_list, - out_ix); - out_data[out_ix] = sum2 / denominator; - } - } + const size_t num = get_reduced_dim_product(in, dim_list); + const size_t denom = unbiased ? num - 1 : num; + + constexpr auto name = "var.out"; + + ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { + ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { + compute_variance(in, out, dim_list, num, denom); + }); + }); + + return out; +} + +Tensor& var_correction_out( + RuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + const optional& correction, + bool keepdim, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_reduction_args(in, dim_list, keepdim, {}, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + resize_reduction_out(in, dim_list, keepdim, out) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "var.correction_out"; + + double correction_val = 1; + if (correction.has_value()) { + ScalarType corr_type = utils::get_scalar_dtype(correction.value()); + ET_SWITCH_SCALAR_OBJ_TYPES(corr_type, ctx, name, CTYPE_CORR, [&]() { + CTYPE_CORR corr_val = 0; + utils::extract_scalar(correction.value(), &corr_val); + correction_val = static_cast(corr_val); + }); + } + + const size_t num = get_reduced_dim_product(in, dim_list); + const double denom = num - correction_val; + + ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { + ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { + compute_variance(in, out, dim_list, num, denom); }); }); diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 07a7d680eb..621cf09462 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -918,6 +918,7 @@ _ATEN_OPS = ( op_target( name = "op_var", deps = [ + ":scalar_utils", "//executorch/runtime/core/exec_aten/util:scalar_type_util", "//executorch/runtime/core/exec_aten/util:tensor_util", "//executorch/kernels/portable/cpu/util:reduce_util", diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index cac457171a..9532b5b668 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -812,6 +812,11 @@ - arg_meta: null kernel_name: torch::executor::unsqueeze_copy_out +- op: var.correction_out + kernels: + - arg_meta: null + kernel_name: torch::executor::var_correction_out + - op: var.out kernels: - arg_meta: null diff --git a/kernels/test/op_var_test.cpp b/kernels/test/op_var_test.cpp index 175a50928e..f7505c2e8d 100644 --- a/kernels/test/op_var_test.cpp +++ b/kernels/test/op_var_test.cpp @@ -20,6 +20,7 @@ using namespace ::testing; using exec_aten::ArrayRef; using exec_aten::optional; +using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; using torch::executor::testing::TensorFactory; @@ -35,6 +36,17 @@ Tensor& op_var_out( context, self, dim, unbiased, keepdim, out); } +Tensor& 
op_var_correction_out( + const Tensor& self, + optional> dim, + optional& correction, + bool keepdim, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::var_outf( + context, self, dim, correction, keepdim, out); +} + template void test_var_out_invalid_dimensions() { TensorFactory tf_in; @@ -363,3 +375,16 @@ TEST(OpVarOutTest, DynamicShapeUnbound) { x, ArrayRef{1}, /*unbiased=*/true, /*keepdim=*/false, out); EXPECT_TENSOR_CLOSE(out, expected_result); } + +TEST(OpVarCorrectionOutTest, SmokeTest) { + TensorFactory tf; + + Tensor x = tf.make({2, 3}, {4.9, 4.0, 5.6, 3.8, 4.9, 5.6}); + Tensor expected = tf.make({2}, {0.72693, 0.93032}); + optional correction(1.23); + Tensor out = tf.zeros({2}); + + op_var_correction_out( + x, ArrayRef{1}, correction, /*keepdim=*/false, out); + EXPECT_TENSOR_CLOSE(out, expected); +} From 1b22459c5f4c519288bfc1a8e0bdebc209a68363 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 012/290] Add op: roll.out (#2214) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2214 bypass-github-pytorch-ci-checks ghstack-source-id: 217233084 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624734 fbshipit-source-id: a5c871bc7a3775357ede1dd5d7ef5bab88564877 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_roll.cpp | 91 ++++++++++++++++++++++++++++++++ kernels/portable/cpu/targets.bzl | 3 ++ kernels/portable/functions.yaml | 5 ++ kernels/test/op_roll_test.cpp | 54 +++++++++++++++++++ kernels/test/targets.bzl | 1 + 6 files changed, 156 insertions(+) create mode 100644 kernels/portable/cpu/op_roll.cpp create mode 100644 kernels/test/op_roll_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index b127e32246..73b201ee63 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -292,6 +292,8 @@ - op: replication_pad3d.out +- op: roll.out + - op: round.out - op: rsqrt.out diff --git a/kernels/portable/cpu/op_roll.cpp b/kernels/portable/cpu/op_roll.cpp new file mode 100644 index 0000000000..e6600850cc --- /dev/null +++ b/kernels/portable/cpu/op_roll.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace { + +bool check_roll_args( + const Tensor& in, + IntArrayRef shifts, + IntArrayRef dims, + const Tensor& out) { + for (const auto& d : dims) { + if (in.dim() == 0) { + ET_LOG_AND_RETURN_IF_FALSE(d == 0 || d == -1); + } else { + ET_LOG_AND_RETURN_IF_FALSE(dim_is_valid(d, in.dim())); + } + } + ET_LOG_AND_RETURN_IF_FALSE(!shifts.empty()); + ET_LOG_AND_RETURN_IF_FALSE(shifts.size() == dims.size()); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + return true; +} + +size_t unshift_flat_ix(size_t ix, const Tensor& in, IntArrayRef dim_shifts) { + size_t ix_coord[kTensorDimensionLimit]; + indexToCoordinate(in, ix, ix_coord); + + size_t shifted_coord[kTensorDimensionLimit]; + for (size_t d = 0; d < in.dim(); d++) { + shifted_coord[d] = (ix_coord[d] - dim_shifts[d]) % in.size(d); + } + + return coordinateToIndex(in, shifted_coord); +} + +} // namespace + +Tensor& roll_out( + RuntimeContext& ctx, + const Tensor& in, + IntArrayRef shifts, + IntArrayRef dims, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, check_roll_args(in, shifts, dims, out), InvalidArgument, out); + + constexpr auto name = "roll.out"; + + int64_t dim_shift_array[kTensorDimensionLimit]; + for (size_t i = 0; i < in.dim(); i++) { + dim_shift_array[i] = 0; + } + for (size_t i = 0; i < dims.size(); i++) { + const auto d = dims[i] < 0 ? dims[i] + in.dim() : dims[i]; + dim_shift_array[d] += shifts[i]; + } + + size_t dim_shift_array_length = static_cast(in.dim()); // NOLINT + IntArrayRef dim_shifts(dim_shift_array, dim_shift_array_length); + + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { + const CTYPE* in_data = in.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + for (size_t ix = 0; ix < out.numel(); ++ix) { + out_data[ix] = in_data[unshift_flat_ix(ix, in, dim_shifts)]; + } + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 621cf09462..2eb9eefa42 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -722,6 +722,9 @@ _ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:padding_util", ], ), + op_target( + name = "op_roll", + ), op_target( name = "op_round", deps = [ diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 9532b5b668..38e4fddf81 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -662,6 +662,11 @@ - arg_meta: null kernel_name: torch::executor::replication_pad3d_out +- op: roll.out + kernels: + - arg_meta: null + kernel_name: torch::executor::roll_out + - op: round.out kernels: - arg_meta: null diff --git a/kernels/test/op_roll_test.cpp b/kernels/test/op_roll_test.cpp new file mode 100644 index 0000000000..dc7b23ca50 --- /dev/null +++ b/kernels/test/op_roll_test.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +Tensor& op_roll_out( + const Tensor& input, + ArrayRef shifts, + ArrayRef dims, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::roll_outf(context, input, shifts, dims, out); +} + +class OpRollOutTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +TEST_F(OpRollOutTest, SmokeTest) { + TensorFactory tfFloat; + + Tensor input = tfFloat.make({4, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); + int64_t shifts_data[2] = {2, 1}; + ArrayRef shifts = ArrayRef(shifts_data, 2); + int64_t dims_data[2] = {0, 1}; + ArrayRef dims = ArrayRef(dims_data, 2); + Tensor out = tfFloat.zeros({4, 2}); + Tensor out_expected = tfFloat.make({4, 2}, {6, 5, 8, 7, 2, 1, 4, 3}); + op_roll_out(input, shifts, dims, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 071a3504a4..e76007a94a 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -258,6 +258,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_replication_pad1d_test", ["aten", "portable"]) _common_op_test("op_replication_pad2d_test", ["aten", "portable"]) _common_op_test("op_replication_pad3d_test", ["aten", "portable"]) + _common_op_test("op_roll_test", ["aten", "portable"]) _common_op_test("op_round_test", ["aten", "portable"]) _common_op_test("op_rsqrt_test", ["aten", "portable"]) _common_op_test("op_rsub_test", ["aten", "portable"]) From 21ab4e7d11b29d13316d460bf32b934f3b4d09c1 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 013/290] Add op: flip.out (#2215) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2215 bypass-github-pytorch-ci-checks ghstack-source-id: 217233087 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624739 fbshipit-source-id: 23491cc422bf8b863b609a37766a2ed61cad7e5f --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_flip.cpp | 77 ++++++++++++++++++++++++++++++++ kernels/portable/cpu/targets.bzl | 6 +++ kernels/portable/functions.yaml | 5 +++ kernels/test/op_flip_test.cpp | 64 ++++++++++++++++++++++++++ kernels/test/targets.bzl | 1 + 6 files changed, 155 insertions(+) create mode 100644 kernels/portable/cpu/op_flip.cpp create mode 100644 kernels/test/op_flip_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 73b201ee63..6a7bcad557 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -148,6 +148,8 @@ - op: fill.Tensor_out +- op: flip.out + - op: floor_divide.out - op: floor.out diff --git a/kernels/portable/cpu/op_flip.cpp b/kernels/portable/cpu/op_flip.cpp new file mode 100644 index 0000000000..6304237e10 --- /dev/null +++ b/kernels/portable/cpu/op_flip.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace { + +bool check_flip_args(const Tensor& in, IntArrayRef dims, const Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + return check_dim_list_is_valid(in, dims); +} + +size_t unflip_flat_ix(size_t ix, const Tensor& in, ArrayRef flip_dim) { + size_t ix_coord[kTensorDimensionLimit]; + indexToCoordinate(in, ix, ix_coord); + + size_t unflip_coord[kTensorDimensionLimit]; + for (size_t d = 0; d < in.dim(); d++) { + if (flip_dim[d]) { + unflip_coord[d] = in.size(d) - ix_coord[d] - 1; + } else { + unflip_coord[d] = ix_coord[d]; + } + } + + return coordinateToIndex(in, unflip_coord); +} + +} // namespace + +Tensor& +flip_out(RuntimeContext& ctx, const Tensor& in, IntArrayRef dims, Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK(ctx, check_flip_args(in, dims, out), InvalidArgument, out); + + bool flip_dim_data[kTensorDimensionLimit]; + for (size_t i = 0; i < in.dim(); i++) { + flip_dim_data[i] = false; + } + for (size_t i = 0; i < dims.size(); i++) { + const auto d = dims[i] < 0 ? dims[i] + in.dim() : dims[i]; + flip_dim_data[d] = true; + } + size_t flip_dim_length = static_cast(in.dim()); // NOLINT + ArrayRef flip_dim(flip_dim_data, flip_dim_length); + + constexpr auto name = "flip.out"; + + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { + const CTYPE* in_data = in.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + for (size_t ix = 0; ix < out.numel(); ++ix) { + out_data[ix] = in_data[unflip_flat_ix(ix, in, flip_dim)]; + } + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 2eb9eefa42..14387c8ccd 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -318,6 +318,12 @@ _ATEN_OPS = ( "//executorch/runtime/core/exec_aten/util:tensor_util", ], ), + op_target( + name = "op_flip", + deps = [ + "//executorch/kernels/portable/cpu/util:reduce_util", + ], + ), op_target( name = "op_floor", deps = [ diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 38e4fddf81..c08bb6732a 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -324,6 +324,11 @@ - arg_meta: null kernel_name: torch::executor::fill_tensor_out +- op: flip.out + kernels: + - arg_meta: null + kernel_name: torch::executor::flip_out + - op: floor.out kernels: - arg_meta: null diff --git a/kernels/test/op_flip_test.cpp b/kernels/test/op_flip_test.cpp new file mode 100644 index 0000000000..36d85d8a1f --- /dev/null +++ b/kernels/test/op_flip_test.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
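For reference, the portable roll.out and flip.out kernels added in these two patches follow the semantics of the corresponding eager PyTorch ops, as their smoke tests exercise; a small illustrative check (values arbitrary):

import torch

x = torch.tensor([[1., 2., 3.],
                  [4., 5., 6.]])

# roll: shift elements along the given dims with wrap-around
print(torch.roll(x, shifts=(1, 1), dims=(0, 1)))
# tensor([[6., 4., 5.],
#         [3., 1., 2.]])

# flip: reverse the order of elements along the given dims
print(torch.flip(x, dims=(0, 1)))
# tensor([[6., 5., 4.],
#         [3., 2., 1.]])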
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::IntArrayRef; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +Tensor& op_flip_out(const Tensor& input, IntArrayRef dims, Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::flip_outf(context, input, dims, out); +} + +class OpFlipOutTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +TEST_F(OpFlipOutTest, SmokeTest1Dim) { + TensorFactory tfFloat; + + Tensor input = + tfFloat.make({4, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + int64_t dims_data[1] = {-1}; + IntArrayRef dims = IntArrayRef(dims_data, 1); + Tensor out = tfFloat.zeros({4, 1, 3}); + Tensor out_expected = + tfFloat.make({4, 1, 3}, {3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10}); + op_flip_out(input, dims, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} + +TEST_F(OpFlipOutTest, SmokeTest2Dims) { + TensorFactory tfFloat; + + Tensor input = + tfFloat.make({4, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + int64_t dims_data[2] = {-1, 0}; + IntArrayRef dims = IntArrayRef(dims_data, 2); + Tensor out = tfFloat.zeros({4, 1, 3}); + Tensor out_expected = + tfFloat.make({4, 1, 3}, {12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); + op_flip_out(input, dims, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index e76007a94a..4e8a8ffa04 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -200,6 +200,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_expand_copy_test", ["aten", "portable"]) _common_op_test("op_expm1_test", ["aten", "portable"]) _common_op_test("op_fill_test", ["aten", "portable"]) + _common_op_test("op_flip_test", ["aten", "portable"]) _common_op_test("op_floor_divide_test", ["aten", "portable"]) _common_op_test("op_floor_test", ["aten", "portable"]) _common_op_test("op_fmod_test", ["aten", "portable"]) From ace5bc3a1f3671d74e82104163b01eb837bf93e2 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 014/290] Move as_strided_copy template to copy_ops_util (#2216) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2216 bypass-github-pytorch-ci-checks ghstack-source-id: 217233089 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624735 fbshipit-source-id: b04b3835e0683f4e414a8ecba435fecfc007fe8d --- kernels/portable/cpu/op_as_strided_copy.cpp | 50 +++------------------ kernels/portable/cpu/util/copy_ops_util.h | 49 ++++++++++++++++++++ 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/kernels/portable/cpu/op_as_strided_copy.cpp b/kernels/portable/cpu/op_as_strided_copy.cpp index 619604b7c1..17ebc41087 100644 --- a/kernels/portable/cpu/op_as_strided_copy.cpp +++ b/kernels/portable/cpu/op_as_strided_copy.cpp @@ -16,42 +16,9 @@ namespace native { using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; -namespace { - -/** - * Copy input_data to output_data according to the stride and shape recursively - */ -template -void as_strided_copy( - CTYPE_IN* input_data, - CTYPE_IN* output_data, - Tensor& out, - ArrayRef size, - ArrayRef stride, - int64_t dim) { - // the last dimension, copy data - 
if (dim == size.size() - 1) { - for (size_t i = 0; i < size.at(dim); ++i) { - output_data[i] = *input_data; - input_data += stride.at(dim); - } - return; - } - size_t trailing_dims = getTrailingDims(out, dim); - // recursively set data for the next dimension - for (size_t i = 0; i < size.at(dim); ++i) { - as_strided_copy( - input_data, output_data, out, size, stride, dim + 1); - input_data += stride.at(dim); - output_data += trailing_dims; - } -} - -} // namespace - Tensor& as_strided_copy_out( RuntimeContext& ctx, - const Tensor& self, + const Tensor& in, ArrayRef size, ArrayRef stride, optional storage_offset, @@ -60,7 +27,7 @@ Tensor& as_strided_copy_out( ET_KERNEL_CHECK( ctx, - check_as_strided_copy_args(self, size, stride, storage_offset, out), + check_as_strided_copy_args(in, size, stride, storage_offset, out), InvalidArgument, out); @@ -70,21 +37,14 @@ Tensor& as_strided_copy_out( InvalidArgument, out); - if (self.numel() == 0) { + if (in.numel() == 0) { return out; } size_t offset = storage_offset.has_value() ? storage_offset.value() : 0; - ET_SWITCH_ALL_TYPES(self.scalar_type(), ctx, __func__, CTYPE, [&] { - CTYPE* self_data = self.mutable_data_ptr() + offset; - CTYPE* out_data = out.mutable_data_ptr(); - - if (size.empty()) { - out_data[0] = self_data[0]; - } else { - as_strided_copy(self_data, out_data, out, size, stride, 0); - } + ET_SWITCH_ALL_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { + as_strided_copy(in, size, stride, offset, out); }); return out; diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index dc8e5902ae..7690e84de1 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -12,6 +12,38 @@ namespace torch { namespace executor { +namespace { + +/** + * Copy input_data to output_data according to the stride and shape recursively + */ +template +void _as_strided_copy( + CTYPE* input_data, + CTYPE* output_data, + Tensor& out, + ArrayRef size, + ArrayRef stride, + int64_t dim) { + // the last dimension, copy data + if (dim == size.size() - 1) { + for (size_t i = 0; i < size.at(dim); ++i) { + output_data[i] = *input_data; + input_data += stride.at(dim); + } + return; + } + size_t trailing_dims = getTrailingDims(out, dim); + // recursively set data for the next dimension + for (size_t i = 0; i < size.at(dim); ++i) { + _as_strided_copy( + input_data, output_data, out, size, stride, dim + 1); + input_data += stride.at(dim); + output_data += trailing_dims; + } +} + +} // namespace bool check_as_strided_copy_args( const Tensor& in, @@ -20,6 +52,23 @@ bool check_as_strided_copy_args( optional storage_offset, Tensor& out); +template +void as_strided_copy( + const Tensor& in, + ArrayRef size, + ArrayRef stride, + int64_t offset, + Tensor& out) { + CTYPE* in_data = in.mutable_data_ptr() + offset; + CTYPE* out_data = out.mutable_data_ptr(); + + if (size.empty()) { + out_data[0] = in_data[0]; + } else { + _as_strided_copy(in_data, out_data, out, size, stride, 0); + } +} + bool check_cat_args( exec_aten::ArrayRef tensors, int64_t dim, From 1ae981913a5d7cb6ea050673a191724e3ec083cc Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 015/290] Add op: diagonal_copy.out (#2217) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2217 bypass-github-pytorch-ci-checks ghstack-source-id: 217233085 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53624738 
fbshipit-source-id: 04b52f7b2b64839414a864f77b0cadda82d9ce6a --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_diagonal_copy.cpp | 101 ++++++++++++++++++++ kernels/portable/cpu/targets.bzl | 6 ++ kernels/portable/cpu/util/copy_ops_util.cpp | 54 +++++++++++ kernels/portable/cpu/util/copy_ops_util.h | 14 +++ kernels/portable/functions.yaml | 5 + kernels/test/op_diagonal_copy_test.cpp | 74 ++++++++++++++ kernels/test/targets.bzl | 1 + 8 files changed, 257 insertions(+) create mode 100644 kernels/portable/cpu/op_diagonal_copy.cpp create mode 100644 kernels/test/op_diagonal_copy_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 6a7bcad557..1c84ff7cc3 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -118,6 +118,8 @@ - op: detach_copy.out +- op: diagonal_copy.out + - op: div.out - op: div.Scalar_mode_out diff --git a/kernels/portable/cpu/op_diagonal_copy.cpp b/kernels/portable/cpu/op_diagonal_copy.cpp new file mode 100644 index 0000000000..77a663ca68 --- /dev/null +++ b/kernels/portable/cpu/op_diagonal_copy.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace { + +template +void diagonal_copy_impl( + const Tensor& in, + int64_t offset, + int64_t dim1, + int64_t dim2, + Tensor& out) { + int64_t storage_offset = 0; + size_t diag_size = out.size(out.dim() - 1); + + if (diag_size == 0) { + // skip + } else if (offset >= 0) { + storage_offset += offset * in.strides().at(dim2); + } else { + storage_offset -= offset * in.strides().at(dim1); + } + + size_t new_ndim = out.dim(); + int64_t new_sizes[kTensorDimensionLimit]; + for (size_t i = 0; i < new_ndim; ++i) { + new_sizes[i] = out.size(i); + } + + int64_t new_strides[kTensorDimensionLimit]; + size_t shift = 0; + for (size_t d = 0; d < in.dim(); ++d) { + if (d == dim1 || d == dim2) { + shift++; + } else { + new_strides[d - shift] = in.strides().at(d); + } + } + new_strides[in.dim() - 2] = in.strides().at(dim1) + in.strides().at(dim2); + + as_strided_copy( + in, {new_sizes, new_ndim}, {new_strides, new_ndim}, storage_offset, out); +} + +} // namespace + +Tensor& diagonal_copy_out( + RuntimeContext& ctx, + const Tensor& in, + int64_t offset, + int64_t dim1, + int64_t dim2, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, check_diagonal_copy_args(in, dim1, dim2, out), InvalidArgument, out); + + if (dim1 < 0) { + dim1 += nonzero_dim(in); + } + if (dim2 < 0) { + dim2 += nonzero_dim(in); + } + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_diagonal_copy_out_target_size( + in, offset, dim1, dim2, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "diagonal_copy.out"; + + ET_SWITCH_REAL_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { + diagonal_copy_impl(in, offset, dim1, dim2, out); + }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 14387c8ccd..6d6517d476 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -255,6 
+255,12 @@ _ATEN_OPS = ( "//executorch/runtime/core/exec_aten/util:tensor_util", ], ), + op_target( + name = "op_diagonal_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), op_target( name = "op_div", deps = [ diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 69f7dc94c3..ae48dee0fb 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -845,5 +845,59 @@ bool get_view_copy_target_size( return true; } +bool check_diagonal_copy_args( + const Tensor& in, + int64_t dim1, + int64_t dim2, + Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(in, 2)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim1)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(in, dim2)); + if (dim1 < 0) { + dim1 += nonzero_dim(in); + } + if (dim2 < 0) { + dim2 += nonzero_dim(in); + } + ET_LOG_AND_RETURN_IF_FALSE(dim1 != dim2); + return true; +} + +void get_diagonal_copy_out_target_size( + const Tensor& in, + int64_t offset, + int64_t dim1, + int64_t dim2, + Tensor::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = in.dim() - 1; + + if (dim1 < 0) { + dim1 += nonzero_dim(in); + } + if (dim2 < 0) { + dim2 += nonzero_dim(in); + } + + size_t diagonal_size = 0; + if (offset >= 0) { + diagonal_size = std::min(in.size(dim1), in.size(dim2) - offset); + } else { + diagonal_size = std::min(in.size(dim1) + offset, in.size(dim2)); + } + diagonal_size = std::max(diagonal_size, 0); + + size_t shift = 0; + for (size_t d = 0; d < in.dim(); ++d) { + if (d == dim1 || d == dim2) { + shift++; + } else { + out_sizes[d - shift] = in.size(d); + } + } + out_sizes[in.dim() - 2] = diagonal_size; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index 7690e84de1..db2161ec0e 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -214,5 +214,19 @@ bool get_view_copy_target_size( int64_t dim, exec_aten::SizesType* out_sizes); +bool check_diagonal_copy_args( + const Tensor& in, + int64_t dim1, + int64_t dim2, + Tensor& out); + +void get_diagonal_copy_out_target_size( + const Tensor& in, + int64_t offset, + int64_t dim1, + int64_t dim2, + Tensor::SizesType* out_sizes, + size_t* out_ndim); + } // namespace executor } // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index c08bb6732a..f7e50cb7a4 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -253,6 +253,11 @@ - arg_meta: null kernel_name: torch::executor::detach_copy_out +- op: diagonal_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::diagonal_copy_out + - op: div.out kernels: - arg_meta: null diff --git a/kernels/test/op_diagonal_copy_test.cpp b/kernels/test/op_diagonal_copy_test.cpp new file mode 100644 index 0000000000..71f2839db7 --- /dev/null +++ b/kernels/test/op_diagonal_copy_test.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::IntArrayRef; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +Tensor& op_diagonal_copy_out( + const Tensor& input, + int64_t offset, + int64_t dim1, + int64_t dim2, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::diagonal_copy_outf( + context, input, offset, dim1, dim2, out); +} + +class OpDiagonalCopyOutTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +TEST_F(OpDiagonalCopyOutTest, SmokeTest2D) { + TensorFactory tfFloat; + + Tensor input = tfFloat.make({3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor out = tfFloat.zeros({2}); + Tensor out_expected = tfFloat.make({2}, {5, 10}); + op_diagonal_copy_out(input, 1, 1, 0, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} + +TEST_F(OpDiagonalCopyOutTest, SmokeTest3D) { + TensorFactory tfFloat; + + Tensor input = + tfFloat.make({2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor out = tfFloat.zeros({3, 1}); + Tensor out_expected = tfFloat.make({3, 1}, {7, 9, 11}); + op_diagonal_copy_out(input, -1, 0, -1, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} + +TEST_F(OpDiagonalCopyOutTest, SmokeTest4D) { + TensorFactory tfFloat; + + Tensor input = + tfFloat.make({2, 1, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + Tensor out = tfFloat.zeros({1, 3, 2}); + Tensor out_expected = tfFloat.make({1, 3, 2}, {1, 10, 2, 11, 3, 12}); + op_diagonal_copy_out(input, 0, 0, 2, out); + EXPECT_TENSOR_CLOSE(out, out_expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 4e8a8ffa04..2c183348d2 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -191,6 +191,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_cosh_test", ["aten", "portable"]) _common_op_test("op_cumsum_test", ["aten", "portable"]) _common_op_test("op_detach_copy_test", ["aten", "portable"]) + _common_op_test("op_diagonal_copy_test", ["aten", "portable"]) _common_op_test("op_div_test", ["aten", "portable", "optimized"]) _common_op_test("op_embedding_test", ["aten", "portable"]) _common_op_test("op_empty_test", ["aten", "portable"]) From a810f346e16359933b0392e0449057deca72feb2 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 016/290] Add ops: _native_batch_norm_legit variants (#2218) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2218 bypass-github-pytorch-ci-checks ghstack-source-id: 217233093 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53719158 fbshipit-source-id: c11b3119cd96c215dd0787d6b3761774b8d7011e --- kernels/aten/functions.yaml | 4 + kernels/portable/cpu/op_native_batch_norm.cpp | 147 +++++++++++++----- kernels/portable/functions.yaml | 10 ++ kernels/test/op_native_batch_norm_test.cpp | 117 ++++++++++++++ 4 files changed, 242 insertions(+), 36 deletions(-) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 1c84ff7cc3..d928c60ced 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -9,6 +9,10 @@ - op: _log_softmax.out +- op: _native_batch_norm_legit.out + +- op: _native_batch_norm_legit.no_stats_out + - op: 
_native_batch_norm_legit_no_training.out - op: _reshape_alias_copy.out diff --git a/kernels/portable/cpu/op_native_batch_norm.cpp b/kernels/portable/cpu/op_native_batch_norm.cpp index a04b49a1d8..26eb5d90a7 100644 --- a/kernels/portable/cpu/op_native_batch_norm.cpp +++ b/kernels/portable/cpu/op_native_batch_norm.cpp @@ -30,10 +30,10 @@ std::tuple _native_batch_norm_legit_no_training_out( double eps, Tensor& out, Tensor& mean_out, - Tensor& var_out) { + Tensor& invstd_out) { (void)ctx; - std::tuple ret_val(out, mean_out, var_out); + std::tuple ret_val(out, mean_out, invstd_out); ET_KERNEL_CHECK( ctx, @@ -45,7 +45,10 @@ std::tuple _native_batch_norm_legit_no_training_out( ctx, resize_tensor(mean_out, {0}) == Error::Ok, InvalidArgument, ret_val); ET_KERNEL_CHECK( - ctx, resize_tensor(var_out, {0}) == Error::Ok, InvalidArgument, ret_val); + ctx, + resize_tensor(invstd_out, {0}) == Error::Ok, + InvalidArgument, + ret_val); ET_KERNEL_CHECK( ctx, @@ -59,7 +62,7 @@ std::tuple _native_batch_norm_legit_no_training_out( eps, out, mean_out, - var_out), + invstd_out), InvalidArgument, ret_val); @@ -75,39 +78,111 @@ std::tuple _native_batch_norm_legit_no_training_out( size_t outer = getLeadingDims(in, C_dim); size_t inner = getTrailingDims(in, C_dim); - ET_SWITCH_FLOAT_TYPES( - in.scalar_type(), - ctx, - "native_batch_norm_legit_no_training.out", - CTYPE, - [&] { - const CTYPE* in_data = in.const_data_ptr(); - CTYPE* out_data = out.mutable_data_ptr(); - - const CTYPE* const mean_data = running_mean.const_data_ptr(); - const CTYPE* const var_data = running_var.const_data_ptr(); - - for (size_t i = 0; i < outer; ++i) { - for (size_t c = 0; c < C; ++c) { - CTYPE mean = mean_data[c]; - CTYPE var = var_data[c]; - CTYPE invstd = 1.0 / std::sqrt(var + eps); - CTYPE weight_val = 1; - if (weight.has_value()) { - weight_val = weight.value().const_data_ptr()[c]; - } - CTYPE bias_val = 0; - if (bias.has_value()) { - bias_val = bias.value().const_data_ptr()[c]; - } - for (size_t j = 0; j < inner; ++j) { - *out_data = (*in_data - mean) * invstd * weight_val + bias_val; - out_data++; - in_data++; - } - } + constexpr auto name = "native_batch_norm_legit_no_training.out"; + + ET_SWITCH_FLOAT_TYPES(in.scalar_type(), ctx, name, CTYPE, [&] { + const CTYPE* in_data = in.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + const CTYPE* const mean_data = running_mean.const_data_ptr(); + const CTYPE* const var_data = running_var.const_data_ptr(); + + for (size_t i = 0; i < outer; ++i) { + for (size_t c = 0; c < C; ++c) { + CTYPE mean = mean_data[c]; + CTYPE var = var_data[c]; + CTYPE invstd = 1.0 / std::sqrt(var + eps); + CTYPE weight_val = 1; + if (weight.has_value()) { + weight_val = weight.value().const_data_ptr()[c]; + } + CTYPE bias_val = 0; + if (bias.has_value()) { + bias_val = bias.value().const_data_ptr()[c]; + } + for (size_t j = 0; j < inner; ++j) { + *out_data = (*in_data - mean) * invstd * weight_val + bias_val; + out_data++; + in_data++; } - }); + } + } + }); + + return ret_val; +} + +std::tuple _native_batch_norm_legit_out( + RuntimeContext& ctx, + const Tensor& in, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + Tensor& running_mean, + Tensor& running_var, + bool training, + double momentum, + double eps, + Tensor& out, + Tensor& mean_out, + Tensor& invstd_out) { + (void)ctx; + + std::tuple ret_val(out, mean_out, invstd_out); + + ET_KERNEL_CHECK_MSG( + ctx, + training == false, + InvalidArgument, + ret_val, + "Portable kernels only support inference mode!"); + + return 
_native_batch_norm_legit_no_training_out( + ctx, + in, + weight, + bias, + running_mean, + running_var, + momentum, + eps, + out, + mean_out, + invstd_out); +} + +std::tuple _native_batch_norm_legit_no_stats_out( + RuntimeContext& ctx, + const Tensor& in, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + bool training, + double momentum, + double eps, + Tensor& out, + Tensor& mean_out, + Tensor& invstd_out) { + (void)ctx; + (void)in; + (void)weight; + (void)bias; + (void)momentum; + (void)eps; + + std::tuple ret_val(out, mean_out, invstd_out); + + ET_KERNEL_CHECK_MSG( + ctx, + training == false, + InvalidArgument, + ret_val, + "Portable kernels only support inference mode!"); + + ET_KERNEL_CHECK_MSG( + ctx, + training == true, + InvalidArgument, + ret_val, + "running_mean & running_var must be provided during inference!"); return ret_val; } diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index f7e50cb7a4..92d44ed599 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -22,6 +22,16 @@ - arg_meta: null kernel_name: torch::executor::log_softmax_out +- op: _native_batch_norm_legit.out + kernels: + - arg_meta: null + kernel_name: torch::executor::_native_batch_norm_legit_out + +- op: _native_batch_norm_legit.no_stats_out + kernels: + - arg_meta: null + kernel_name: torch::executor::_native_batch_norm_legit_no_stats_out + - op: _native_batch_norm_legit_no_training.out kernels: - arg_meta: null diff --git a/kernels/test/op_native_batch_norm_test.cpp b/kernels/test/op_native_batch_norm_test.cpp index 1ce7f28eed..10ad74dd88 100644 --- a/kernels/test/op_native_batch_norm_test.cpp +++ b/kernels/test/op_native_batch_norm_test.cpp @@ -44,6 +44,35 @@ op_native_batch_norm_legit_no_training_out( out2); } +::std::tuple +op_native_batch_norm_legit_out( + const exec_aten::Tensor& input, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + exec_aten::Tensor& running_mean, + exec_aten::Tensor& running_var, + bool training, + double momentum, + double eps, + exec_aten::Tensor& out0, + exec_aten::Tensor& out1, + exec_aten::Tensor& out2) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::_native_batch_norm_legit_outf( + context, + input, + weight, + bias, + running_mean, + running_var, + training, + momentum, + eps, + out0, + out1, + out2); +} + TEST(OpNativeBatchNormLegitNoTrainingOutTest, SampleAtomicTest2D) { torch::executor::testing::TensorFactory tfFloat; @@ -825,3 +854,91 @@ TEST(OpNativeBatchNormLegitNoTrainingOutTest, SampleAtomicTestNoWeightNoBias) { EXPECT_TENSOR_CLOSE(out1, out1_expected); EXPECT_TENSOR_CLOSE(out2, out2_expected); } + +TEST(OpNativeBatchNormLegitOutTest, SampleAtomicTest2D) { + torch::executor::testing::TensorFactory tfFloat; + + exec_aten::Tensor input = tfFloat.make( + {4, 7}, {2.876736640930176, 7.67944860458374, 5.701690196990967, + 9.299789428710938, 3.023690700531006, 5.315116882324219, + 7.185585021972656, 6.911304473876953, 7.61051082611084, + 1.4963287115097046, 0.7381612062454224, 8.588483810424805, + 6.583977699279785, 8.831110000610352, 0.8165055513381958, + 7.087201118469238, 5.572513580322266, 4.446897983551025, + 4.444573402404785, 6.254056930541992, 5.906398296356201, + 9.971039772033691, 3.5423521995544434, 7.452159881591797, + 9.93700122833252, 1.8560808897018433, 1.524025797843933, + 7.3222975730896}); + exec_aten::optional weight = + exec_aten::optional(tfFloat.make( + {7}, + {8.287437438964844, + 8.227645874023438, + 6.65926456451416, + 
9.436124801635742, + 4.119281768798828, + 8.593960762023926, + 2.3760855197906494})); + exec_aten::optional bias = + exec_aten::optional(tfFloat.make( + {7}, + {7.824275970458984, + 6.84327507019043, + 8.354326248168945, + 8.773970603942871, + 3.89609694480896, + 3.0753469467163086, + 3.1105971336364746})); + exec_aten::Tensor running_mean = tfFloat.make( + {7}, + {9.700226783752441, + 0.1234668493270874, + 7.527220249176025, + 8.993252754211426, + 0.4736626148223877, + 7.7135701179504395, + 5.12320613861084}); + exec_aten::Tensor running_var = tfFloat.make( + {7}, + {3.585531234741211, + 6.615292549133301, + 0.24084866046905518, + 5.175800323486328, + 0.5886000394821167, + 6.23909854888916, + 1.5029621124267578}); + bool training = false; + double momentum = 0.1; + double eps = 0; + exec_aten::Tensor out0 = tfFloat.zeros({4, 7}); + exec_aten::Tensor out1 = tfFloat.zeros({0}); + exec_aten::Tensor out2 = tfFloat.zeros({0}); + exec_aten::Tensor out0_expected = tfFloat.make( + {4, 7}, {-22.039867401123047, 31.014127731323242, -16.416650772094727, + 10.04538631439209, 17.5877628326416, -5.17673921585083, + 7.1078033447265625, -4.381907939910889, 30.793603897094727, + -73.48003387451172, -25.46548080444336, 47.46636962890625, + -0.8111140131950378, 10.29708194732666, -31.056814193725586, + 29.119586944580078, -18.16947364807129, -10.082839965820312, + 25.216796875, -1.9462348222732544, 4.628543376922607, + 9.00953483581543, 17.779958724975586, 7.335818767547607, + 12.688335418701172, 11.318607330322266, -18.22031593322754, + 7.372773170471191}); + exec_aten::Tensor out1_expected = tfFloat.make({0}, {}); + exec_aten::Tensor out2_expected = tfFloat.make({0}, {}); + op_native_batch_norm_legit_out( + input, + weight, + bias, + running_mean, + running_var, + training, + momentum, + eps, + out0, + out1, + out2); + EXPECT_TENSOR_CLOSE(out0, out0_expected); + EXPECT_TENSOR_CLOSE(out1, out1_expected); + EXPECT_TENSOR_CLOSE(out2, out2_expected); +} From 38532e44112fa9b89cd363c432f27df4bdd2bc5f Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 017/290] Add op: native_group_norm.out (#2219) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2219 bypass-github-pytorch-ci-checks ghstack-source-id: 217233090 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53719157 fbshipit-source-id: 4fb5714662ebf4942bedfc98dbc661a71c4d4784 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_native_group_norm.cpp | 173 ++++++++++++++++++ kernels/portable/cpu/targets.bzl | 7 + .../cpu/util/normalization_ops_util.cpp | 40 ++++ .../cpu/util/normalization_ops_util.h | 12 ++ kernels/portable/functions.yaml | 5 + kernels/test/op_native_group_norm_test.cpp | 129 +++++++++++++ kernels/test/targets.bzl | 1 + 8 files changed, 369 insertions(+) create mode 100644 kernels/portable/cpu/op_native_group_norm.cpp create mode 100644 kernels/test/op_native_group_norm_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index d928c60ced..5bb1dd535e 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -252,6 +252,8 @@ - op: native_batch_norm.out +- op: native_group_norm.out + - op: native_layer_norm.out - op: ne.Scalar_out diff --git a/kernels/portable/cpu/op_native_group_norm.cpp b/kernels/portable/cpu/op_native_group_norm.cpp new file mode 100644 index 0000000000..f9213fdeb1 --- /dev/null +++ b/kernels/portable/cpu/op_native_group_norm.cpp @@ -0,0 +1,173 @@ +/* + 
* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +namespace { + +template +void group_norm( + const Tensor& input, + const optional& weight, + const optional& bias, + int64_t sN, + int64_t sC, + int64_t sHxW, + int64_t group, + CTYPE eps, + Tensor& out, + Tensor& mean, + Tensor& rstd) { + size_t N = static_cast(sN); // NOLINT + size_t C = static_cast(sC); // NOLINT + size_t HxW = static_cast(sHxW); // NOLINT + size_t G = static_cast(group); // NOLINT + + size_t leading = N * G; + size_t D = C / G; + size_t inner_size = D * HxW; + + if (leading == 0) { + return; + } + + CTYPE* out_data = out.mutable_data_ptr(); + CTYPE* mean_data = mean.mutable_data_ptr(); + CTYPE* rstd_data = rstd.mutable_data_ptr(); + + if (inner_size == 0) { + for (int i = 0; i < leading; ++i) { + mean_data[i] = static_cast(0); + rstd_data[i] = static_cast(NAN); + } + return; + } + + const CTYPE* input_data = input.const_data_ptr(); + const CTYPE* weight_data; + if (weight.has_value()) { + weight_data = weight.value().const_data_ptr(); + } else { + weight_data = nullptr; + } + const CTYPE* bias_data; + if (bias.has_value()) { + bias_data = bias.value().const_data_ptr(); + } else { + bias_data = nullptr; + } + + for (int i = 0; i < leading; ++i) { + const CTYPE* x = input_data + i * inner_size; + + // compute E[X] and Var[x] = E[x^2] - E[x]^2 + CTYPE sum = reduce_add(x, inner_size); + CTYPE sq_sum = vec_powerf(x, inner_size); + CTYPE mean_value = sum / inner_size; + CTYPE variance = sq_sum / inner_size - mean_value * mean_value; + CTYPE std = std::sqrt(variance + eps); + CTYPE rstd_value = 1.0 / std; + + // Calculate the elements of output + if (weight_data == nullptr && bias_data == nullptr) { + CTYPE* y = out_data + i * inner_size; + for (size_t j = 0; j < inner_size; j++) { + y[j] = (x[j] - mean_value) * rstd_value; + } + } else { + const size_t g = i % G; + for (size_t j = 0; j < D; j++) { + const size_t ch = g * D + j; + const CTYPE scale = + rstd_value * (weight_data == nullptr ? 1.0 : weight_data[ch]); + const CTYPE beta = + -scale * mean_value + (bias_data == nullptr ? 
0.0 : bias_data[ch]); + x = input_data + (i * D + j) * HxW; + CTYPE* y = out_data + (i * D + j) * HxW; + for (size_t k = 0; k < HxW; k++) { + y[k] = scale * x[k] + beta; + } + } + } + + mean_data[i] = mean_value; + rstd_data[i] = rstd_value; + } +} + +} // namespace + +std::tuple native_group_norm_out( + RuntimeContext& ctx, + const Tensor& input, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + double eps, + Tensor& out, + Tensor& mean_out, + Tensor& rstd_out) { + (void)ctx; + + std::tuple ret_val(out, mean_out, rstd_out); + + ET_KERNEL_CHECK( + ctx, + check_group_norm_args( + input, weight, bias, N, C, HxW, group, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; + mean_rstd_sizes[0] = N; + mean_rstd_sizes[1] = group; + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, input.sizes()) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(mean_out, {mean_rstd_sizes, 2}) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(rstd_out, {mean_rstd_sizes, 2}) == Error::Ok, + InvalidArgument, + ret_val); + + constexpr auto name = "native_group_norm.out"; + + ET_SWITCH_FLOAT_TYPES(input.scalar_type(), ctx, name, CTYPE, [&]() { + group_norm( + input, weight, bias, N, C, HxW, group, eps, out, mean_out, rstd_out); + }); + + return ret_val; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 6d6517d476..3caad95343 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -608,6 +608,13 @@ _ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:normalization_ops_util", ], ), + op_target( + name = "op_native_group_norm", + deps = [ + ":vec_ops", + "//executorch/kernels/portable/cpu/util:normalization_ops_util", + ], + ), op_target( name = "op_native_layer_norm", deps = [ diff --git a/kernels/portable/cpu/util/normalization_ops_util.cpp b/kernels/portable/cpu/util/normalization_ops_util.cpp index c748878656..6b2b12bf14 100644 --- a/kernels/portable/cpu/util/normalization_ops_util.cpp +++ b/kernels/portable/cpu/util/normalization_ops_util.cpp @@ -120,5 +120,45 @@ void get_layer_norm_out_target_size( } } +bool check_group_norm_args( + const Tensor& in, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + Tensor& out, + Tensor& mean_out, + Tensor& rstd_out) { + ET_LOG_AND_RETURN_IF_FALSE(in.size(0) == N); + ET_LOG_AND_RETURN_IF_FALSE(in.size(1) == C); + ET_LOG_AND_RETURN_IF_FALSE(in.numel() == N * C * HxW); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + group > 0, "Expected number of groups to be greater than 0"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + C % group == 0, + "Expected number of channels in input to be divisible by number of groups"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + !weight.has_value() || + (weight.value().dim() == 1 && weight.value().size(0) == C), + "Expected weight to be a vector of size equal to the number of channels in input"); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + !bias.has_value() || + (bias.value().dim() == 1 && bias.value().size(0) == C), + "Expected bias to be a vector of size equal to the number of channels in input"); + + if (weight.has_value()) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight.value())); + } + if (bias.has_value()) { + 
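+    // As with weight, an optional bias must share the input's floating-point dtype.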
ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, bias.value())); + } + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, mean_out)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, rstd_out)); + return true; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/normalization_ops_util.h b/kernels/portable/cpu/util/normalization_ops_util.h index 2f2b9b8a89..59d43d700c 100644 --- a/kernels/portable/cpu/util/normalization_ops_util.h +++ b/kernels/portable/cpu/util/normalization_ops_util.h @@ -40,5 +40,17 @@ void get_layer_norm_out_target_size( Tensor::SizesType* mean_rstd_sizes, size_t* mean_rstd_ndim); +bool check_group_norm_args( + const Tensor& input, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + Tensor& out, + Tensor& mean_out, + Tensor& rstd_out); + } // namespace executor } // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 92d44ed599..69480b4474 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -562,6 +562,11 @@ - arg_meta: null kernel_name: torch::executor::mul_scalar_out +- op: native_group_norm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::native_group_norm_out + - op: native_layer_norm.out kernels: - arg_meta: null diff --git a/kernels/test/op_native_group_norm_test.cpp b/kernels/test/op_native_group_norm_test.cpp new file mode 100644 index 0000000000..6bc4785ce4 --- /dev/null +++ b/kernels/test/op_native_group_norm_test.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +::std::tuple op_native_group_norm_out( + const Tensor& input, + const optional& weight, + const optional& bias, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + double eps, + Tensor& out0, + Tensor& out1, + Tensor& out2) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::native_group_norm_outf( + context, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2); +} + +TEST(OpNativeGroupNormOutTest, SmokeTest) { + TensorFactory tfFloat; + + Tensor input = tfFloat.make( + {5, 6, 2, 2}, + {-0.8125, 0.0625, -2.7500, -3.0625, -1.1250, -2.1250, -1.3125, -4.0625, + 2.8125, -2.0625, 4.2500, 3.5000, -0.3750, 1.6250, 4.3125, -1.0625, + -2.8750, 3.3750, 4.9375, 4.0625, -3.0625, -1.8750, -2.7500, -2.5625, + -0.1875, -3.0000, -2.7500, 0.6875, -3.2500, -3.1875, 1.0000, -4.6250, + -0.1875, -1.7500, 4.5000, -1.8750, -2.6875, 4.8125, -3.8125, -2.9375, + -1.1875, 2.8750, 0.7500, 2.8750, 1.1250, -0.6250, -2.2500, -3.7500, + 3.2500, -0.3750, -2.0625, -4.7500, 2.0625, 3.0000, -3.1875, -4.1250, + -3.7500, 1.2500, -2.3125, 1.5625, 3.1250, 0.3125, 3.2500, -2.7500, + -3.8125, -4.2500, -4.3125, -0.5625, -0.4375, 2.9375, -1.3750, -0.6250, + -2.5625, -4.5625, 0.1250, -3.5000, -5.0000, -1.0000, -4.6875, -0.6875, + 1.1250, 1.8750, -4.5000, 4.3125, 4.5625, 0.2500, -3.6250, 4.5625, + -3.5000, -2.1250, -3.6250, -2.9375, 3.6875, 3.9375, 4.3750, 3.0625, + 2.4375, 2.0625, -2.4375, -3.9375, 3.6875, 2.7500, -0.8750, -0.9375, + 2.7500, -2.4375, -2.3750, -0.9375, -4.8750, 0.1875, 3.5000, -2.0000, + -0.2500, -2.7500, 0.3125, 1.2500, -0.5625, 0.0000, 1.8125, 1.0625}); + optional weight = + tfFloat.make({6}, {4.5625, -2.8750, -0.6875, 0.5625, -2.0625, -2.7500}); + optional bias = + tfFloat.make({6}, {-0.5000, -2.7500, 1.1875, 3.6875, 3.8125, 4.6875}); + double eps = 1e-5; + Tensor out0 = tfFloat.zeros({5, 6, 2, 2}); + Tensor out1 = tfFloat.zeros({5, 3}); + Tensor out2 = tfFloat.zeros({5, 3}); + Tensor out0_expected = tfFloat.make( + {5, 6, 2, 2}, + {3.419882, 6.578348, -3.573864, -4.701888, -4.509254, -2.234663, + -4.082768, 2.172355, 0.838826, 2.270225, 0.416747, 0.636962, + 3.207030, 3.687500, 4.333131, 3.041869, 5.547079, 1.649148, + 0.674665, 1.220376, 7.156189, 6.168714, 6.896327, 6.740410, + 3.509863, -3.022041, -2.441427, 5.542011, -0.794903, -0.886369, + -7.014627, 1.217361, 1.120617, 1.463606, 0.091652, 1.491045, + 3.293219, 4.640229, 3.091168, 3.248319, 4.895990, 1.114683, + 3.092597, 1.114683, 3.262238, 5.434066, 7.450763, 9.312329, + 5.570122, 0.101119, -2.444796, -6.499403, -5.446074, -6.337338, + -0.454995, 0.436269, 2.228491, 0.871598, 1.838385, 0.786793, + 4.362284, 3.737805, 4.390039, 3.057817, 5.814659, 6.202621, + 6.258044, 2.932658, 3.366583, -0.623879, 4.475045, 3.588276, + -0.082914, -4.936279, 6.438795, -2.357929, 0.714463, -5.402106, + 0.236606, -5.879963, 1.176247, 1.021916, 2.333727, 0.520341, + 4.275447, 3.549392, 2.896994, 4.275447, 6.120910, 5.298480, + 6.195676, 5.784461, 2.033296, 1.833920, 1.485010, 2.531738, + 3.193988, 2.532378, -5.406940, -8.053379, -6.467402, -5.425139, + -1.395059, -1.325575, 0.266062, 1.622680, 1.606336, 1.230405, + 2.809896, 3.893110, 4.601880, 3.425055, 4.374411, 8.283354, + 3.494898, 2.029045, 6.088204, 4.915522, 1.136877, 2.700454}); + Tensor out1_expected = tfFloat.make( + {5, 3}, 
+ {-1.89843750, + 1.62500000, + -0.09375000, + -1.91406250, + -0.49218744, + -0.02343750, + -0.77343756, + 0.08593753, + -1.55468738, + -2.73437500, + 1.07031238, + 0.35937503, + 0.34374997, + -0.77343750, + 0.10937499}); + Tensor out2_expected = tfFloat.make( + {5, 3}, + {0.79116172, + 0.42708409, + 0.30238494, + 0.50903118, + 0.31929117, + 0.45128885, + 0.33067191, + 0.39473253, + 0.42994878, + 0.53187561, + 0.29930803, + 0.29000264, + 0.38669431, + 0.38038814, + 0.75809801}); + op_native_group_norm_out( + input, weight, bias, 5, 6, 4, 3, eps, out0, out1, out2); + EXPECT_TENSOR_CLOSE(out0, out0_expected); + EXPECT_TENSOR_CLOSE(out1, out1_expected); + EXPECT_TENSOR_CLOSE(out2, out2_expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 2c183348d2..32a7c731cd 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -242,6 +242,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_mul_test", ["aten", "portable", "optimized"]) _common_op_test("op_pow_test", ["aten", "portable"]) _common_op_test("op_native_batch_norm_test", ["aten", "portable"]) + _common_op_test("op_native_group_norm_test", ["aten", "portable"]) _common_op_test("op_native_layer_norm_test", ["aten", "portable", "optimized"]) _common_op_test("op_ne_test", ["aten", "portable"]) _common_op_test("op_neg_test", ["aten", "portable", "optimized"]) From 205cfeafac5ef9d9fb4e33f7aadfd53e4e58df69 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 018/290] Add op: _pdist_forward.out (#2220) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2220 bypass-github-pytorch-ci-checks ghstack-source-id: 217233088 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53733572 fbshipit-source-id: 45e82efff4db6b13a0fd42539da311f0efe1daf9 --- kernels/aten/functions.yaml | 2 + kernels/portable/cpu/op_pdist_forward.cpp | 48 ++++++++ kernels/portable/cpu/targets.bzl | 6 + kernels/portable/cpu/util/distance_util.cpp | 32 +++++ kernels/portable/cpu/util/distance_util.h | 127 ++++++++++++++++++++ kernels/portable/cpu/util/targets.bzl | 13 ++ kernels/portable/functions.yaml | 5 + kernels/test/op_pdist_forward_test.cpp | 77 ++++++++++++ kernels/test/targets.bzl | 1 + 9 files changed, 311 insertions(+) create mode 100644 kernels/portable/cpu/op_pdist_forward.cpp create mode 100644 kernels/portable/cpu/util/distance_util.cpp create mode 100644 kernels/portable/cpu/util/distance_util.h create mode 100644 kernels/test/op_pdist_forward_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 5bb1dd535e..b72ba13c0b 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -15,6 +15,8 @@ - op: _native_batch_norm_legit_no_training.out +- op: _pdist_forward.out + - op: _reshape_alias_copy.out - op: _softmax.out diff --git a/kernels/portable/cpu/op_pdist_forward.cpp b/kernels/portable/cpu/op_pdist_forward.cpp new file mode 100644 index 0000000000..88b5e88194 --- /dev/null +++ b/kernels/portable/cpu/op_pdist_forward.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; + +Tensor& _pdist_forward_out( + RuntimeContext& ctx, + const Tensor& in, + double p, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK(ctx, check_pdist_args(in, p, out), InvalidArgument, out); + + Tensor::SizesType target_sizes[kTensorDimensionLimit]; + size_t target_ndim = 0; + get_pdist_out_target_size(in, target_sizes, &target_ndim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {target_sizes, target_ndim}) == Error::Ok, + InvalidArgument, + out); + + ScalarType in_type = in.scalar_type(); + constexpr auto name = "_pdist_forward.out"; + + ET_SWITCH_FLOAT_TYPES( + in_type, ctx, name, CTYPE, [&] { pdist(in, out, p); }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 3caad95343..b86d880f76 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -650,6 +650,12 @@ _ATEN_OPS = ( "//executorch/runtime/core/exec_aten/util:tensor_util", ], ), + op_target( + name = "op_pdist_forward", + deps = [ + "//executorch/kernels/portable/cpu/util:distance_util", + ], + ), op_target( name = "op_permute_copy", deps = [ diff --git a/kernels/portable/cpu/util/distance_util.cpp b/kernels/portable/cpu/util/distance_util.cpp new file mode 100644 index 0000000000..e740e69b3a --- /dev/null +++ b/kernels/portable/cpu/util/distance_util.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace torch { +namespace executor { + +bool check_pdist_args(const Tensor& in, double p, const Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 2)); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + p >= 0, "pdist only supports non-negative p values"); + return true; +} + +void get_pdist_out_target_size( + const Tensor& in, + Tensor::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = 1; + size_t n = in.size(0); + out_sizes[0] = n * (n - 1) / 2; +} + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/distance_util.h b/kernels/portable/cpu/util/distance_util.h new file mode 100644 index 0000000000..b04bbf2c92 --- /dev/null +++ b/kernels/portable/cpu/util/distance_util.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +bool check_pdist_args(const Tensor& in, double p, const Tensor& out); + +void get_pdist_out_target_size( + const Tensor& in, + Tensor::SizesType* out_sizes, + size_t* out_ndim); + +template +void pdist(const Tensor& in, Tensor& out, double p) { + const CTYPE* in_data = in.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + size_t n = in.size(0); + size_t m = in.size(1); + + size_t out_ix = 0; + for (size_t i = 0; i < n; ++i) { + for (size_t j = i + 1; j < n; ++j) { + const CTYPE* row_i = in_data + i * m; + const CTYPE* row_j = in_data + j * m; + CTYPE agg = 0; + for (size_t k = 0; k < m; ++k) { + CTYPE diff = std::abs(row_i[k] - row_j[k]); + agg = Norm::reduce(agg, Norm::map(diff, p)); + } + out_data[out_ix++] = Norm::finish(agg, p); + } + } +} + +template +struct L0 { + static inline CTYPE map(const CTYPE& diff, const CTYPE&) { + return diff == 0 ? 0 : 1; + } + static inline CTYPE reduce(const CTYPE& agg, const CTYPE& up) { + return agg + up; + } + static inline CTYPE finish(const CTYPE& agg, const CTYPE&) { + return agg; + } +}; + +template +struct L1 { + static inline CTYPE map(const CTYPE& diff, const CTYPE&) { + return diff; + } + static inline CTYPE reduce(const CTYPE& agg, const CTYPE& up) { + return agg + up; + } + static inline CTYPE finish(const CTYPE& agg, const CTYPE&) { + return agg; + } +}; + +template +struct L2 { + static inline CTYPE map(const CTYPE& diff, const CTYPE&) { + return diff * diff; + } + static inline CTYPE reduce(const CTYPE& agg, const CTYPE& up) { + return agg + up; + } + static inline CTYPE finish(const CTYPE& agg, const CTYPE&) { + return std::sqrt(agg); + } +}; + +template +struct Lp { + static inline CTYPE map(const CTYPE& diff, const CTYPE& p) { + return std::pow(diff, p); + } + static inline CTYPE reduce(const CTYPE& agg, const CTYPE& up) { + return agg + up; + } + static inline CTYPE finish(const CTYPE& agg, const CTYPE& p) { + return std::pow(agg, 1.0 / p); + } +}; + +template +struct Linf { + static inline CTYPE map(const CTYPE& diff, const CTYPE&) { + return diff; + } + static inline CTYPE reduce(const CTYPE& agg, const CTYPE& up) { + return std::max(agg, up); + } + static inline CTYPE finish(const CTYPE& agg, const CTYPE&) { + return agg; + } +}; + +template +void pdist(const Tensor& in, Tensor& out, double p) { + if (p == 0.0) { + pdist>(in, out, p); + } else if (p == 1.0) { + pdist>(in, out, p); + } else if (p == 2.0) { + pdist>(in, out, p); + } else if (p == INFINITY) { + pdist>(in, out, p); + } else { + pdist>(in, out, p); + } +} + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 4f4c039c80..135b8af5af 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -77,6 +77,19 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], ) + runtime.cxx_library( + name = "distance_util", + srcs = ["distance_util.cpp"], + exported_headers = [ + "distance_util.h", + ], + compiler_flags = ["-Wno-missing-prototypes"], + deps = [ + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], + ) + runtime.cxx_library( name = "kernel_ops_util", srcs = ["kernel_ops_util.cpp"], diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 
69480b4474..0593a67c2b 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -37,6 +37,11 @@ - arg_meta: null kernel_name: torch::executor::_native_batch_norm_legit_no_training_out +- op: _pdist_forward.out + kernels: + - arg_meta: null + kernel_name: torch::executor::_pdist_forward_out + - op: _softmax.out kernels: - arg_meta: null diff --git a/kernels/test/op_pdist_forward_test.cpp b/kernels/test/op_pdist_forward_test.cpp new file mode 100644 index 0000000000..a21c1eb825 --- /dev/null +++ b/kernels/test/op_pdist_forward_test.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +Tensor& op_pdist_forward_out(const Tensor& input, double p, Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::_pdist_forward_outf(context, input, p, out); +} + +class OpPdistForwardOutTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +TEST_F(OpPdistForwardOutTest, SmokeTest) { + TensorFactory tfFloat; + + Tensor in = tfFloat.make( + {4, 5}, {0, 1, 2, 3, 5, 4, 3, 2, -1, 5, 1, 1, -2, 1, 5, 4, 3, 2, -1, 5}); + Tensor out = tfFloat.zeros({6}); + + Tensor l0 = tfFloat.make({6}, {3., 3., 3., 4., 0., 4.}); + op_pdist_forward_out(in, 0.0, out); + EXPECT_TENSOR_CLOSE(out, l0); + + Tensor l0p5 = tfFloat.make( + {6}, + {29.31370926, 19.48528290, 29.31370926, 43.03986740, 0.0, 43.03986740}); + op_pdist_forward_out(in, 0.5, out); + EXPECT_TENSOR_CLOSE(out, l0p5); + + Tensor l1 = tfFloat.make({6}, {10., 7., 10., 11., 0., 11.}); + op_pdist_forward_out(in, 1.0, out); + EXPECT_TENSOR_CLOSE(out, l1); + + Tensor l1p5 = tfFloat.make( + {6}, {7.07743692, 5.19140196, 7.07743692, 7.08359480, 0.0, 7.08359480}); + op_pdist_forward_out(in, 1.5, out); + EXPECT_TENSOR_CLOSE(out, l1p5); + + Tensor l2 = + tfFloat.make({6}, {6.0, 4.58257580, 6.0, 5.74456263, 0.0, 5.74456263}); + op_pdist_forward_out(in, 2.0, out); + EXPECT_TENSOR_CLOSE(out, l2); + + Tensor l3 = tfFloat.make( + {6}, {5.14256334, 4.17933941, 5.14256334, 4.74745941, 0.0, 4.74745941}); + op_pdist_forward_out(in, 3.0, out); + EXPECT_TENSOR_CLOSE(out, l3); + + Tensor linf = tfFloat.make({6}, {4., 4., 4., 4., 0., 4.}); + op_pdist_forward_out(in, INFINITY, out); + EXPECT_TENSOR_CLOSE(out, linf); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 32a7c731cd..c839c734a7 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -248,6 +248,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_neg_test", ["aten", "portable", "optimized"]) _common_op_test("op_nonzero_test", ["aten", "portable"]) _common_op_test("op_ones_test", ["aten", "portable"]) + _common_op_test("op_pdist_forward_test", ["aten", "portable"]) _common_op_test("op_permute_copy_test", ["aten", "portable"]) _common_op_test("op_pixel_shuffle_test", ["aten", "portable"]) _common_op_test("op_prod_test", ["aten", "portable"]) From 14b00d0ac5bd85c0fcef5c2e32aab0843dd1e19d Mon Sep 17 00:00:00 2001 From: 
Manuel Candales Date: Mon, 4 Mar 2024 12:55:40 -0800 Subject: [PATCH 019/290] Add op: _cdist_forward.out (#2221) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2221 bypass-github-pytorch-ci-checks ghstack-source-id: 217233092 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D53767328 fbshipit-source-id: 3a05a0f95c52869ed84640094cf088722c3fd1a7 --- kernels/aten/functions.yaml | 3 + kernels/portable/cpu/op_cdist_forward.cpp | 154 ++++++++++++++++++++ kernels/portable/cpu/targets.bzl | 7 + kernels/portable/cpu/util/distance_util.cpp | 24 +++ kernels/portable/cpu/util/distance_util.h | 7 + kernels/portable/functions.yaml | 5 + kernels/test/op_cdist_forward_test.cpp | 128 ++++++++++++++++ kernels/test/targets.bzl | 1 + 8 files changed, 329 insertions(+) create mode 100644 kernels/portable/cpu/op_cdist_forward.cpp create mode 100644 kernels/test/op_cdist_forward_test.cpp diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index b72ba13c0b..f95169a068 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -1,6 +1,9 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # # This yaml file contains operators that are defined by the ATen library. + +- op: _cdist_forward.out + - op: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out - op: _linalg_det.result diff --git a/kernels/portable/cpu/op_cdist_forward.cpp b/kernels/portable/cpu/op_cdist_forward.cpp new file mode 100644 index 0000000000..fa80daadb1 --- /dev/null +++ b/kernels/portable/cpu/op_cdist_forward.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using exec_aten::optional; +using exec_aten::Tensor; + +namespace { + +inline ArrayRef get_batch_sizes(const Tensor& tensor) { + return {tensor.sizes().data(), tensor.sizes().size() - 2}; +} + +template +void cdist(const Tensor& x1, const Tensor& x2, Tensor& out, double p) { + const CTYPE* x1_data = x1.const_data_ptr(); + const CTYPE* x2_data = x2.const_data_ptr(); + CTYPE* out_data = out.mutable_data_ptr(); + + const ArrayRef x1_batch_sizes = get_batch_sizes(x1); + const ArrayRef x2_batch_sizes = get_batch_sizes(x2); + const ArrayRef out_batch_sizes = get_batch_sizes(out); + + const bool x1_is_broadcasted = !out_batch_sizes.equals(x1_batch_sizes); + const bool x2_is_broadcasted = !out_batch_sizes.equals(x2_batch_sizes); + const bool any_is_broadcasted = (x1_is_broadcasted || x2_is_broadcasted); + + size_t out_batch_numel = 1; + for (auto i : out_batch_sizes) { + out_batch_numel *= i; + } + + size_t P = static_cast(x1.size(x1.dim() - 2)); // NOLINT + size_t R = static_cast(x2.size(x2.dim() - 2)); // NOLINT + size_t M = static_cast(x1.size(x1.dim() - 1)); // NOLINT + + size_t x1_inner_size = P * M; + size_t x2_inner_size = R * M; + size_t out_inner_size = P * R; + + for (size_t b = 0; b < out_batch_numel; ++b) { + size_t x1_base_ix = b * x1_inner_size; + size_t x2_base_ix = b * x2_inner_size; + size_t out_base_ix = b * out_inner_size; + + if (any_is_broadcasted) { + size_t out_base_coord[kTensorDimensionLimit]; + delinearize_index( + out_base_ix, out, out_base_coord, kTensorDimensionLimit); + + if (x1_is_broadcasted) { + x1_base_ix = linearize_access_indexes(out_base_coord, out.dim(), x1); + } + if (x2_is_broadcasted) { + x2_base_ix = linearize_access_indexes(out_base_coord, out.dim(), x2); + } + } + + size_t out_ix = 0; + for (size_t i = 0; i < P; ++i) { + const CTYPE* row_i = x1_data + x1_base_ix + i * M; + for (size_t j = 0; j < R; ++j) { + const CTYPE* row_j = x2_data + x2_base_ix + j * M; + CTYPE agg = 0; + for (size_t k = 0; k < M; ++k) { + CTYPE diff = std::abs(row_i[k] - row_j[k]); + agg = Norm::reduce(agg, Norm::map(diff, p)); + } + out_data[out_base_ix + out_ix++] = Norm::finish(agg, p); + } + } + } +} + +template +void cdist(const Tensor& x1, const Tensor& x2, Tensor& out, double p) { + if (p == 0.0) { + cdist>(x1, x2, out, p); + } else if (p == 1.0) { + cdist>(x1, x2, out, p); + } else if (p == 2.0) { + cdist>(x1, x2, out, p); + } else if (p == INFINITY) { + cdist>(x1, x2, out, p); + } else { + cdist>(x1, x2, out, p); + } +} + +} // namespace + +Tensor& _cdist_forward_out( + RuntimeContext& ctx, + const Tensor& x1, + const Tensor& x2, + double p, + optional compute_mode, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + check_cdist_args(x1, x2, p, compute_mode, out), + InvalidArgument, + out); + + Tensor::SizesType target_sizes[kTensorDimensionLimit]; + size_t target_ndim = 0; + + ET_KERNEL_CHECK( + ctx, + get_broadcast_target_size( + {x1.sizes().data(), x1.sizes().size() - 2}, + {x2.sizes().data(), x2.sizes().size() - 2}, + target_sizes, + kTensorDimensionLimit, + &target_ndim) == Error::Ok, + InvalidArgument, + out); + + target_ndim += 2; + target_sizes[target_ndim - 2] = x1.size(x1.dim() - 2); + target_sizes[target_ndim - 1] = x2.size(x2.dim() - 2); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {target_sizes, target_ndim}) == Error::Ok, + InvalidArgument, + out); + + ScalarType out_type = out.scalar_type(); + constexpr auto name = "_cdist_forward.out"; + + 
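+  // The cdist<CTYPE> wrapper defined above picks the L0/L1/L2/Linf/Lp specialization
+  // from p, then walks the (possibly broadcast) batch dimensions of x1 and x2.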
ET_SWITCH_FLOAT_TYPES( + out_type, ctx, name, CTYPE, [&] { cdist(x1, x2, out, p); }); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index b86d880f76..b80e343f17 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -188,6 +188,13 @@ _ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op_cdist_forward", + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/kernels/portable/cpu/util:distance_util", + ], + ), op_target( name = "op_ceil", deps = [ diff --git a/kernels/portable/cpu/util/distance_util.cpp b/kernels/portable/cpu/util/distance_util.cpp index e740e69b3a..f8dc2f7121 100644 --- a/kernels/portable/cpu/util/distance_util.cpp +++ b/kernels/portable/cpu/util/distance_util.cpp @@ -28,5 +28,29 @@ void get_pdist_out_target_size( out_sizes[0] = n * (n - 1) / 2; } +bool check_cdist_args( + const Tensor& x1, + const Tensor& x2, + double p, + optional compute_mode, + const Tensor& out) { + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(x1, x2)); + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(x1, out)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(x1, 2)); + ET_LOG_AND_RETURN_IF_FALSE(tensor_has_rank_greater_or_equal_to(x2, 2)); + ET_LOG_AND_RETURN_IF_FALSE( + tensors_have_same_size_at_dims(x1, x1.dim() - 1, x2, x2.dim() - 1)); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + p >= 0, "cdist only supports non-negative p values"); + if (compute_mode.has_value()) { + int64_t mode = compute_mode.value(); + ET_LOG_MSG_AND_RETURN_IF_FALSE( + mode >= 0 && mode <= 2, + "possible modes: 0, 1, 2, but was: %" PRId64, + mode); + } + return true; +} + } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/distance_util.h b/kernels/portable/cpu/util/distance_util.h index b04bbf2c92..05406e3548 100644 --- a/kernels/portable/cpu/util/distance_util.h +++ b/kernels/portable/cpu/util/distance_util.h @@ -123,5 +123,12 @@ void pdist(const Tensor& in, Tensor& out, double p) { } } +bool check_cdist_args( + const Tensor& x1, + const Tensor& x2, + double p, + optional compute_mode, + const Tensor& out); + } // namespace executor } // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 0593a67c2b..e302eb5805 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -17,6 +17,11 @@ # See the README.md file in this directory for a description of the syntax used # by this file. +- op: _cdist_forward.out + kernels: + - arg_meta: null + kernel_name: torch::executor::_cdist_forward_out + - op: _log_softmax.out kernels: - arg_meta: null diff --git a/kernels/test/op_cdist_forward_test.cpp b/kernels/test/op_cdist_forward_test.cpp new file mode 100644 index 0000000000..04ccb6d34a --- /dev/null +++ b/kernels/test/op_cdist_forward_test.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +Tensor& op_cdist_forward_out( + const Tensor& x1, + const Tensor& x2, + double p, + optional compute_mode, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return torch::executor::aten::_cdist_forward_outf( + context, x1, x2, p, compute_mode, out); +} + +class OpCdistForwardOutTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +TEST_F(OpCdistForwardOutTest, SmokeTest) { + TensorFactory tfFloat; + + Tensor x1 = + tfFloat.make({2, 1, 4, 3}, {0, 1, 2, 3, 5, 4, 3, -3, 7, 1, 6, 2, + -1, 5, 1, 1, -2, 1, 5, 4, 3, 2, -1, 5}); + Tensor x2 = tfFloat.make( + {1, 2, 5, 3}, {0, 1, 2, 3, 5, -3, 7, 1, 6, 2, -1, 5, 1, 1, -2, + 4, 3, 2, -1, 5, 1, 1, -2, 1, 5, 4, 3, 2, -1, 5}); + optional compute_mode = optional(); + + Tensor out = tfFloat.zeros({2, 2, 4, 5}); + + Tensor l0 = tfFloat.make( + {2, 2, 4, 5}, + {0., 3., 2., 3., 2., 3., 1., 3., 3., 3., 3., 2., 3., 3., 3., 2., + 3., 3., 3., 2., 2., 3., 3., 3., 3., 3., 2., 3., 3., 3., 3., 3., + 3., 3., 3., 2., 3., 2., 3., 3., 3., 2., 3., 3., 3., 3., 3., 3., + 3., 2., 3., 3., 3., 3., 3., 3., 3., 3., 0., 3., 3., 0., 2., 3., + 3., 3., 2., 0., 3., 3., 3., 3., 3., 0., 3., 3., 3., 3., 3., 0.}); + op_cdist_forward_out(x1, x2, 0.0, compute_mode, out); + EXPECT_TENSOR_CLOSE(out, l0); + + Tensor l1 = tfFloat.make( + {2, 2, 4, 5}, + {0., 12., 11., 7., 5., 9., 7., 10., 8., 12., 12., 18., 9., 5., + 15., 6., 8., 15., 11., 9., 6., 6., 5., 9., 7., 5., 7., 12., + 4., 8., 12., 18., 9., 13., 5., 6., 4., 9., 7., 11., 6., 8., + 17., 13., 9., 5., 13., 14., 6., 6., 9., 9., 8., 10., 12., 7., + 15., 8., 0., 10., 8., 0., 9., 9., 13., 9., 9., 0., 12., 6., + 3., 9., 12., 0., 10., 9., 13., 6., 10., 0.}); + op_cdist_forward_out(x1, x2, 1.0, compute_mode, out); + EXPECT_TENSOR_CLOSE(out, l1); + + Tensor l2 = tfFloat.make( + {2, 2, 4, 5}, + {0.00000000, 7.07106781, 8.06225777, 4.12310553, 4.12310553, + 5.38516474, 7.00000000, 6.00000000, 6.16441393, 7.48331499, + 7.07106781, 12.80624866, 5.74456263, 3.00000000, 10.04987526, + 5.09901953, 5.47722578, 8.77496433, 7.68114567, 6.40312433, + 4.47213602, 4.24264050, 3.31662488, 5.91608000, 4.12310553, + 3.00000000, 5.00000000, 7.87400770, 2.44948983, 6.16441393, + 7.87400770, 10.77032948, 6.40312433, 8.30662346, 3.00000000, + 4.24264050, 2.44948983, 8.06225777, 4.58257580, 7.68114567, + 4.24264050, 5.65685415, 10.24695110, 7.81024981, 5.38516474, + 3.31662488, 8.30662346, 8.36660004, 4.24264050, 4.24264050, + 5.91608000, 6.40312433, 4.69041586, 6.16441393, 7.07106781, + 4.12310553, 10.04987526, 5.47722578, 0.00000000, 7.34846926, + 5.47722578, 0.00000000, 7.28010988, 6.40312433, 7.81024981, + 5.91608000, 7.28010988, 0.00000000, 7.48331499, 4.24264050, + 1.73205078, 6.40312433, 7.48331499, 0.00000000, 6.16441393, + 5.38516474, 7.81024981, 4.24264050, 6.16441393, 0.00000000}); + op_cdist_forward_out(x1, x2, 2.0, compute_mode, out); + EXPECT_TENSOR_CLOSE(out, l2); + + Tensor l3 = tfFloat.make( + {2, 2, 4, 5}, + {0.00000000, 6.00000000, 7.41079521, 3.50339794, 4.02072573, 4.62606478, + 7.00000000, 5.14256334, 6.01846170, 6.60385466, 6.00000000, 11.47758675, + 5.05277443, 2.57128167, 9.28704357, 
5.01329803, 5.11722994, 7.39863634, + 7.18551636, 5.73879337, 4.16016769, 4.04124022, 3.07231688, 5.34848118, + 3.50339794, 2.57128167, 4.49794149, 7.23042679, 2.15443468, 6.01846170, + 6.99319077, 9.25212955, 6.08220196, 7.45903587, 2.57128167, 3.77976322, + 2.15443468, 8.00520515, 4.17933941, 7.18551636, 4.04124022, 5.03968430, + 8.88326645, 6.74599648, 4.62606478, 3.07231688, 7.45903587, 7.16609573, + 4.04124022, 3.77976322, 5.34848118, 6.08220196, 3.95789170, 5.42883539, + 6.00000000, 3.50339794, 9.00000000, 5.11722994, 0.00000000, 7.06069660, + 5.11722994, 0.00000000, 7.05400419, 6.08220196, 6.74599648, 5.34848118, + 7.05400419, 0.00000000, 6.60385466, 4.04124022, 1.44224954, 6.08220196, + 6.60385466, 0.00000000, 5.42883539, 4.62606478, 6.74599648, 4.04124022, + 5.42883539, 0.00000000}); + op_cdist_forward_out(x1, x2, 3.0, compute_mode, out); + EXPECT_TENSOR_CLOSE(out, l3); + + Tensor linf = tfFloat.make( + {2, 2, 4, 5}, + {0., 5., 7., 3., 4., 4., 7., 4., 6., 6., 5., 10., 4., 2., 9., 5., + 5., 6., 7., 5., 4., 4., 3., 5., 3., 2., 4., 7., 2., 6., 6., 8., + 6., 7., 2., 3., 2., 8., 4., 7., 4., 4., 8., 6., 4., 3., 7., 6., + 4., 3., 5., 6., 3., 5., 5., 3., 8., 5., 0., 7., 5., 0., 7., 6., + 6., 5., 7., 0., 6., 4., 1., 6., 6., 0., 5., 4., 6., 4., 5., 0.}); + op_cdist_forward_out(x1, x2, INFINITY, compute_mode, out); + EXPECT_TENSOR_CLOSE(out, linf); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index c839c734a7..789179c4ca 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -181,6 +181,7 @@ def define_common_targets(is_fbcode = False): _common_op_test("op_bitwise_xor_test", ["aten", "portable"]) _common_op_test("op_bmm_test", ["aten", "portable", "optimized"]) _common_op_test("op_cat_test", ["aten", "portable"]) + _common_op_test("op_cdist_forward_test", ["aten", "portable"]) _common_op_test("op_ceil_test", ["aten", "portable"]) _common_op_test("op_clamp_test", ["aten", "portable"]) _common_op_test("op_clone_test", ["aten", "portable"]) From bff4e0ccd181dcc927ca71666d09c55436e62daa Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Mon, 4 Mar 2024 13:35:28 -0800 Subject: [PATCH 020/290] Add embedding_byte.dtype to enable output dtype be different than scales/zp dtype (#2236) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2236 Reviewed By: mikekgfb Differential Revision: D54454644 fbshipit-source-id: 500e8864ad5f23ba5b18238c67f909b103f45f87 --- examples/models/llama2/ops/quantized.yaml | 6 + examples/models/llama2/ops/quantized_ops.py | 83 ++++++++++++-- examples/models/llama2/quantize.py | 6 +- .../_quant_patterns_and_replacements.py | 52 +++++++++ kernels/quantized/cpu/op_embedding.cpp | 104 +++++++++++++++--- kernels/quantized/quantized.yaml | 6 + 6 files changed, 228 insertions(+), 29 deletions(-) diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml index 3c5376ceef..bc3e857665 100644 --- a/examples/models/llama2/ops/quantized.yaml +++ b/examples/models/llama2/ops/quantized.yaml @@ -4,6 +4,12 @@ - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out +- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_embedding_byte_dtype_out + - func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py index f316aa42b4..2ab8df3080 100644 --- a/examples/models/llama2/ops/quantized_ops.py +++ b/examples/models/llama2/ops/quantized_ops.py @@ -22,16 +22,18 @@ "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)", ) +quantized_lib.define( + "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", +) -@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd") -def embedding_byte_meta( - weight, - weight_scales, - weight_zero_points, - weight_quant_min, - weight_quant_max, - indices, -): +quantized_lib.define( + "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", +) + + +def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points): assert weight.dtype in [ torch.int8, torch.uint8, @@ -45,8 +47,8 @@ def embedding_byte_meta( torch.float32, ], f"Expecting weight_scales to be of dtype in [torch.float16, torch.float32], but got {weight_scales.dtype}" assert ( - weight_scales.dim() == 1 - ), f"Expecting weight_scales tensor to have dim()==1, but found {weight_scales.dim()}" + weight_scales.dim() == 1 or weight_scales.dim() == 2 + ), f"Expecting weight_scales tensor to have rank 1 or 2, but found {weight_scales.dim()}" assert weight_scales.size(0) == weight.size( 0 ), f"Expecting weight and scale tensor to have same number of rows, but found {weight.size()} and {weight_scales.size()}" @@ -62,6 +64,18 @@ def embedding_byte_meta( ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}" if not weight_zero_points: weight_zero_points = torch.zeros(weight.size(0)) + + +@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd") +def embedding_byte_meta( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, +): + embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( weight, weight_scales, @@ -92,3 +106,50 @@ def embedding_byte_out_meta( weight_quant_max, indices, ) + + +@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd") +def embedding_byte_dtype_meta( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + *, + dtype, +): + embedding_byte_weight_checks(weight, weight_scales, weight_zero_points) + weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + weight, + weight_scales, + weight_zero_points, + 0, + weight_quant_min, + weight_quant_max, + weight.dtype, + ) + return torch.ops.aten.embedding.default(weight, indices).to(dtype) + + +@impl_abstract("llama_quantized::embedding_byte.dtype_out") +def embedding_byte_dtype_out_meta( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + 
weight_quant_max, + indices, + *, + dtype, + out, +): + return embedding_byte_dtype_meta( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + dtype=dtype, + ) diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index c3bf88a627..4c49a9de75 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -887,9 +887,9 @@ def __init__( @torch.no_grad() def forward(self, indices: torch.Tensor) -> torch.Tensor: - return torch.ops.llama_quantized.embedding_byte.default( - self.weight, self.scales, None, 0, 0, indices - ).to(self.dtype) + return torch.ops.llama_quantized.embedding_byte.dtype( + self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype + ) # result_weights = self.weight.index_select(0, indices.view(-1)) diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py index 692fe8d1dd..0cb9707774 100644 --- a/exir/passes/_quant_patterns_and_replacements.py +++ b/exir/passes/_quant_patterns_and_replacements.py @@ -30,6 +30,11 @@ "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor", ) +quantized_decomposed_lib.define( + "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " + "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor", +) + quantized_decomposed_lib.define( "mixed_mm(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points) -> Tensor", ) @@ -523,6 +528,48 @@ def replacement_with_padding_idx( ) return out + @bind_pattern_to_op(quantized_decomposed_lib, "embedding_byte.dtype") + def pattern_with_dtype( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indicies, + dtype, + ): + weight = torch.ops.quantized_decomposed.dequantize_per_channel.default( + weight, + weight_scales, + weight_zero_points, + 0, + weight_quant_min, + weight_quant_max, + torch.uint8, + ) + out = torch.ops.aten.embedding.default(weight, indicies).to(dtype) + return out + + def replacement_with_dtype( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indicies, + dtype, + ): + out = torch.ops.quantized_decomposed.embedding_byte.dtype( + weight, + weight_scales, + weight_zero_points, + weight_quant_min, + weight_quant_max, + indicies, + dtype=dtype, + ) + return out + return [ ( _trace_and_lower_to_edge_ops(pattern), @@ -534,6 +581,11 @@ def replacement_with_padding_idx( _trace_and_lower_to_edge_ops(replacement_with_padding_idx), [], ), + ( + _trace_and_lower_to_edge_ops(pattern_with_dtype), + _trace_and_lower_to_edge_ops(replacement_with_dtype), + [], + ), ] patterns_and_replacements = [] diff --git a/kernels/quantized/cpu/op_embedding.cpp b/kernels/quantized/cpu/op_embedding.cpp index 2964ecbab5..514125ed46 100644 --- a/kernels/quantized/cpu/op_embedding.cpp +++ b/kernels/quantized/cpu/op_embedding.cpp @@ -31,12 +31,13 @@ void check_embedding_byte_args( const int64_t weight_quant_min, const int64_t weight_quant_max, const Tensor& indices, + exec_aten::optional out_dtype, Tensor& out) { ET_CHECK_MSG( weight.dim() == 2, "weight must be 2D but got() %zd dims", weight.dim()); ET_CHECK_MSG( - weight_scales.dim() <= 2, + weight_scales.dim() == 1 || weight_scales.dim() == 2, "weight_scales must be 1D or 2D but got() %zd dims", weight_scales.dim()); @@ -75,8 +76,9 @@ void check_embedding_byte_args( static_cast(out.scalar_type())); ET_CHECK_MSG( - 
weight_scales.scalar_type() == out.scalar_type(), - "weight scales scalar type %" PRId8 " does not match out.scalar_type()", + weight_scales.scalar_type() == ScalarType::Float || + weight_scales.scalar_type() == ScalarType::Half, + "weight_scales.scalar_type() %" PRId8 " is not supported:", static_cast(weight_scales.scalar_type())); if (opt_weight_zero_points.has_value()) { @@ -116,13 +118,19 @@ void check_embedding_byte_args( " is greater than weight quant max: %" PRId64, weight_quant_min, weight_quant_max); + + if (out_dtype.has_value()) { + ET_CHECK_MSG( + out.scalar_type() == out_dtype.value(), + "output_dtype must match the dtype of the out tensor"); + } } /** * Retrieves the embeddings specified by indices, dequantizes them, and stores * them in out */ -template +template void embedding_byte_per_channel( const Tensor& weight, const Tensor& weight_scales, @@ -142,19 +150,19 @@ void embedding_byte_per_channel( CTYPE_OUT* out_data = out.mutable_data_ptr(); const int64_t* indices_ptr = indices.const_data_ptr(); - const CTYPE_OUT* scales = weight_scales.const_data_ptr(); - const CTYPE_OUT* zero_points = nullptr; + const CTYPE_PARAMS* scales = weight_scales.const_data_ptr(); + const CTYPE_PARAMS* zero_points = nullptr; if (opt_weight_zero_points.has_value()) { - zero_points = opt_weight_zero_points.value().const_data_ptr(); + zero_points = opt_weight_zero_points.value().const_data_ptr(); } for (int i = 0; i < indices.numel(); i++) { int64_t index = indices_ptr[i]; // If using groupwise embedding int32_t qparams_index = index * num_groups_per_channel; - CTYPE_OUT zp = 0.0; - const CTYPE_OUT* scale_ptr = scales + qparams_index; - const CTYPE_OUT* zero_points_ptr = nullptr; + CTYPE_PARAMS zp = 0.0; + const CTYPE_PARAMS* scale_ptr = scales + qparams_index; + const CTYPE_PARAMS* zero_points_ptr = nullptr; if (opt_weight_zero_points.has_value()) { zero_points_ptr = zero_points + qparams_index; } @@ -164,7 +172,7 @@ void embedding_byte_per_channel( for (int j = 0; j < embedding_dim; ++j) { int32_t group_id = j / group_size; - const CTYPE_OUT scale = scale_ptr[group_id]; + const CTYPE_PARAMS scale = scale_ptr[group_id]; if (opt_weight_zero_points.has_value()) { zp = zero_points_ptr[group_id]; } @@ -220,6 +228,9 @@ Tensor& quantized_embedding_byte_out( const int64_t weight_quant_max, const Tensor& indices, Tensor& out) { + ScalarType w_type = weight.scalar_type(); + ScalarType out_type = out.scalar_type(); + // TODO (jakeszwe): improve these to account for the size of out in relation // to weight and indices accounting for a possible batch dimension check_embedding_byte_args( @@ -229,15 +240,13 @@ Tensor& quantized_embedding_byte_out( weight_quant_min, weight_quant_max, indices, + out_type, out); - ScalarType w_type = weight.scalar_type(); - ScalarType out_type = out.scalar_type(); - constexpr auto name = "quantized_decomposed::embedding_byte.out"; ET_SWITCH_TWO_TYPES(Byte, Char, w_type, ctx, name, CTYPE_W, [&]() { ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { - embedding_byte_per_channel( + embedding_byte_per_channel( weight, weight_scales, opt_weight_zero_points, indices, out); }); }); @@ -268,6 +277,71 @@ Tensor& quantized_embedding_byte_out( out); } +Tensor& quantized_embedding_byte_dtype_out( + // TODO Evaluate whether this name is appropriate for an operator that takes + // non quant input and returns fp output + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + const int64_t weight_quant_min, + const int64_t 
weight_quant_max, + const Tensor& indices, + exec_aten::optional out_dtype, + Tensor& out) { + // TODO (jakeszwe): improve these to account for the size of out in relation + // to weight and indices accounting for a possible batch dimension + check_embedding_byte_args( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out_dtype, + out); + + ScalarType weight_type = weight.scalar_type(); + ScalarType params_type = weight_scales.scalar_type(); + ScalarType out_type = out.scalar_type(); + + constexpr auto name = "quantized_decomposed::embedding_byte.dtype_out"; + ET_SWITCH_TWO_TYPES(Byte, Char, weight_type, ctx, name, CTYPE_W, [&]() { + ET_SWITCH_TWO_TYPES(Float, Half, params_type, ctx, name, CTYPE_P, [&]() { + ET_SWITCH_TWO_TYPES(Float, Half, out_type, ctx, name, CTYPE_OUT, [&]() { + embedding_byte_per_channel( + weight, weight_scales, opt_weight_zero_points, indices, out); + }); + }); + }); + + return out; +} + +Tensor& quantized_embedding_byte_dtype_out( + RuntimeContext& context, + const Tensor& weight, + const Tensor& weight_scales, + const optional& opt_weight_zero_points, + int64_t weight_quant_min, + int64_t weight_quant_max, + const Tensor& indices, + exec_aten::optional out_dtype, + Tensor& out) { + // TODO(larryliu): Add a context arg to the real op function and remove this + // wrapper + (void)context; + resize_out_tensor(weight, indices, out); + return quantized_embedding_byte_dtype_out( + weight, + weight_scales, + opt_weight_zero_points, + weight_quant_min, + weight_quant_max, + indices, + out_dtype, + out); +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index 34830b01bd..9f802e33c6 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -40,6 +40,12 @@ - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out +- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: torch::executor::quantized_embedding_byte_dtype_out + - func: quantized_decomposed::mixed_mm.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, *, Tensor(a!) out) -> Tensor(a!) 
variants: function kernels: From 568673ed8a8ecd607b74e27d7e7cb7c30306d977 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Mon, 4 Mar 2024 13:46:22 -0800 Subject: [PATCH 021/290] Update ufmt version (#2237) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2237 `pyfmt -V` says that ufmt version is 2.5.1 Context: https://www.internalfb.com/intern/wiki/Python/code_formatting/pyfmt/#replicating-pyfmt-in-ope Reviewed By: kimishpatel Differential Revision: D54501609 fbshipit-source-id: c1d694d27d1603b927f41c07c648f664218c0e87 --- requirements-lintrunner.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 85d7e8e13d..c3d550fc07 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -14,7 +14,7 @@ torchfix==0.1.1 # UFMT black==24.2.0 -ufmt==2.0.1 +ufmt==2.5.1 usort==1.0.5 # Other linters From 566209d6ecd4ad6038935cd7a073e573dcea2ecb Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Mon, 4 Mar 2024 14:15:49 -0800 Subject: [PATCH 022/290] Implementation thread parallel with threadpool (#2173) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2173 Use the ET threadpool (with underlying pthreadpool) to provide `parallel_for` functionality Reviewed By: kimishpatel Differential Revision: D54335940 fbshipit-source-id: 0865d0c76d1f16c325da8c13656fa955d6a48ade --- extension/parallel/TARGETS | 8 + extension/parallel/targets.bzl | 28 +++ extension/parallel/test/TARGETS | 8 + extension/parallel/test/targets.bzl | 18 ++ .../parallel/test/thread_parallel_test.cpp | 202 ++++++++++++++++++ extension/parallel/thread_parallel.cpp | 62 ++++++ extension/parallel/thread_parallel.h | 37 ++++ 7 files changed, 363 insertions(+) create mode 100644 extension/parallel/TARGETS create mode 100644 extension/parallel/targets.bzl create mode 100644 extension/parallel/test/TARGETS create mode 100644 extension/parallel/test/targets.bzl create mode 100644 extension/parallel/test/thread_parallel_test.cpp create mode 100644 extension/parallel/thread_parallel.cpp create mode 100644 extension/parallel/thread_parallel.h diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS new file mode 100644 index 0000000000..2341af9282 --- /dev/null +++ b/extension/parallel/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl new file mode 100644 index 0000000000..46029d9d5e --- /dev/null +++ b/extension/parallel/targets.bzl @@ -0,0 +1,28 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + for aten_mode in (True, False): + aten_suffix = ("_aten" if aten_mode else "") + + runtime.cxx_library( + name = "thread_parallel" + aten_suffix, + srcs = [ + "thread_parallel.cpp", + ], + exported_headers = [ + "thread_parallel.h", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/runtime/core:core", + ], + ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS new file mode 100644 index 0000000000..2341af9282 --- /dev/null +++ b/extension/parallel/test/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl new file mode 100644 index 0000000000..ad2e3feb5f --- /dev/null +++ b/extension/parallel/test/targets.bzl @@ -0,0 +1,18 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/parallel:thread_parallel", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/parallel/test/thread_parallel_test.cpp new file mode 100644 index 0000000000..9e45523937 --- /dev/null +++ b/extension/parallel/test/thread_parallel_test.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include +#include + +using namespace ::testing; + +namespace torch::executor { + +class ParallelTest : public ::testing::Test { + protected: + void SetUp() override { + data_.fill(0); + sum_of_all_elements_ = 0; + } + + void RunTask(int64_t begin, int64_t end) { + for (int64_t j = begin; j < end; ++j) { + // Check that we haven't written to this index before + EXPECT_EQ(data_[j], 0); + data_[j] = j; + } + } + + void RunExclusiveTask(int64_t begin, int64_t end) { + for (int64_t j = begin; j < end; ++j) { + // Check that we haven't written to this index before + EXPECT_EQ(data_[j], 0); + std::lock_guard lock(mutex_); + data_[j] = j; + sum_of_all_elements_ += data_[j]; + } + } + + std::array data_; + std::mutex mutex_; + int sum_of_all_elements_; +}; + +TEST_F(ParallelTest, TestAllInvoked) { + parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +TEST_F(ParallelTest, TestAllInvokedWithMutex) { + parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { + this->RunExclusiveTask(begin, end); + }); + + int expected_sum = 0; + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + expected_sum += i; + } + EXPECT_EQ(sum_of_all_elements_, expected_sum); +} + +TEST_F(ParallelTest, TestInvalidRange) { + ET_EXPECT_DEATH( + { + parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { + this->RunExclusiveTask(begin, end); + }); + }, + ""); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], 0); + } + EXPECT_EQ(sum_of_all_elements_, 0); +} + +TEST_F(ParallelTest, TestInvalidRange2) { + ET_EXPECT_DEATH( + { + parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { + this->RunExclusiveTask(begin, end); + }); + }, + ""); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], 0); + } + EXPECT_EQ(sum_of_all_elements_, 0); +} + +TEST_F(ParallelTest, TestInvokePartialFromBeginning) { + parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 5; ++i) { + EXPECT_EQ(data_[i], i); + } + for (int64_t i = 5; i < 10; ++i) { + EXPECT_EQ(data_[i], 0); + } +} + +TEST_F(ParallelTest, TestInvokePartialToEnd) { + parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 5; ++i) { + EXPECT_EQ(data_[i], 0); + } + for (int64_t i = 5; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +TEST_F(ParallelTest, TestInvokePartialMiddle) { + parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 2; ++i) { + EXPECT_EQ(data_[i], 0); + } + for (int64_t i = 2; i < 8; ++i) { + EXPECT_EQ(data_[i], i); + } + for (int64_t i = 8; i < 10; ++i) { + EXPECT_EQ(data_[i], 0); + } +} + +TEST_F(ParallelTest, TestChunkSize2) { + parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +TEST_F(ParallelTest, TestChunkSize2Middle) { + parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 3; ++i) { + EXPECT_EQ(data_[i], 0); + } + for (int64_t i = 3; i < 8; ++i) { + EXPECT_EQ(data_[i], i); + } + for (int64_t i = 8; i < 10; ++i) { + EXPECT_EQ(data_[i], 0); + } +} + +TEST_F(ParallelTest, TestChunkSize3) { + parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + 
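+  // Grain size 3 does not divide the range [0, 10) evenly, so the last task
+  // covers a smaller slice; every index should still be written exactly once.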
for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +TEST_F(ParallelTest, TestChunkSize6) { + parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +TEST_F(ParallelTest, TestChunkSizeTooLarge) { + parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { + this->RunTask(begin, end); + }); + + for (int64_t i = 0; i < 10; ++i) { + EXPECT_EQ(data_[i], i); + } +} + +} // namespace torch::executor diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp new file mode 100644 index 0000000000..14ee87b98e --- /dev/null +++ b/extension/parallel/thread_parallel.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +namespace torch::executor { + +using namespace torch::executorch::threadpool; + +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +inline std::tuple +calc_num_tasks_and_chunk_size(int64_t begin, int64_t end, int64_t grain_size) { + if ((end - begin) < grain_size) { + return std::make_tuple(1, std::max((int64_t)0, end - begin)); + } + // Choose number of tasks based on grain size and number of threads. + int64_t chunk_size = + divup((end - begin), get_threadpool()->get_thread_count()); + // Make sure each task is at least grain_size size. + chunk_size = std::max(grain_size, chunk_size); + int64_t num_tasks = divup((end - begin), chunk_size); + return std::make_tuple(num_tasks, chunk_size); +} + +void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f) { + ET_CHECK_MSG(begin >= 0 && end >= 0, "Begin and end should be non-negative"); + ET_CHECK_MSG(end >= begin, "end should be greater than or equal to begin"); + ET_CHECK_MSG(grain_size > 0, "grain_size should be positive"); + int64_t num_tasks = 0, chunk_size = 0; + std::tie(num_tasks, chunk_size) = + calc_num_tasks_and_chunk_size(begin, end, grain_size); + + auto task = [f, begin, end, chunk_size](size_t task_id) { + int64_t local_start = begin + static_cast(task_id) * chunk_size; + if (local_start < end) { + int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start)); + f(local_start, local_end); + } + }; + + // Per protocol from threadpool (pthreadpool), when this returns, all tasks + // are executed, so this is synchronous. + get_threadpool()->run(task, num_tasks); +} + +} // namespace torch::executor diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h new file mode 100644 index 0000000000..ccfec1da66 --- /dev/null +++ b/extension/parallel/thread_parallel.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +// @nolint PATTERNLINT Ok to use stdlib for this optional library +#include + +namespace torch::executor { + +/** + * A helper to run function in parallel. 
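+ *
+ * Usage sketch (the buffers `src`/`dst` and the size `n` are hypothetical and
+ * not part of this API):
+ *
+ *   parallel_for(0, n, 64, [&](int64_t begin, int64_t end) {
+ *     for (int64_t i = begin; i < end; ++i) {
+ *       dst[i] = 2 * src[i];
+ *     }
+ *   });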
+ * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. Users need to protect the access to captured + * data if they mutate them in f. + */ +void parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +} // namespace torch::executor From eb2d8b69fa0ad678e29b2e37746679f29a0cb558 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Mon, 4 Mar 2024 14:31:27 -0800 Subject: [PATCH 023/290] Allow using files from local storage (#2200) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2200 This helps us decouple models from the app, so we can use language model. ``` adb push language_llama_xnnpack_kv.pte /data/local/tmp/language.pte adb push flores200sacrebleuspm.bin /data/local/tmp/language.bin ``` Reviewed By: mcr229 Differential Revision: D54427423 fbshipit-source-id: 35ebe58c04c8f817750cee72bdc51834db0d2866 --- .../main/java/com/example/executorchdemo/MainActivity.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java index a9e1a32018..c24c367860 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java @@ -68,6 +68,10 @@ private void setModel(String modelPath, String tokenizerPath) { } } + private void setLocalModel(String modelPath, String tokenizerPath) { + mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + } + private void modelDialog() { AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle("Select a Model"); @@ -81,7 +85,7 @@ public void onClick(android.content.DialogInterface dialog, int item) { setModel("stories110M.pte", "tokenizer.bin"); break; case 1: - setModel("language.pte", "language.bin"); + setLocalModel("/data/local/tmp/language.pte", "/data/local/tmp/language.bin"); break; } mEditTextMessage.setText(""); From 6c9880c4dffdc5cd1b5aa02c1fe2fe4c527d786d Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 4 Mar 2024 19:46:32 -0800 Subject: [PATCH 024/290] Implement global shader registry (#2222) Summary: X-link: https://github.com/pytorch/pytorch/pull/121088 Pull Request resolved: https://github.com/pytorch/executorch/pull/2222 ## Context This changeset updates Vulkan SPIR-V codegen to introduce a global SPIR-V shader registry and register shaders dynamically at static initialization time. This change makes it possible to define and link custom shader libraries to the ATen-Vulkan runtime. Before: * `gen_vulkan_spv.py` generated two files, `spv.h` and `spv.cpp` which would contain the definition and initialization of Vulkan shader registry variables. 
After: * Introduce the `ShaderRegistry` class in `api/`, which encapsulates functionality of the `ShaderRegistry` class previously defined in the generated `spv.h` file * Introduce a global shader registry (defined as a static variable in the `api::shader_registry() function` * Define a `ShaderRegisterInit` class (taking inspiration from `TorchLibraryInit`) that allows for dynamic shader registration * `gen_vulkan_spv.py` now only generates `spv.cpp`, which defines a static `ShaderRegisterInit` instance that triggers registration of the compiled shaders to the global shader registry. Benefits: * Cleaner code base; we no longer have `ShaderRegistry` defined in a generated file, and don't need a separate implementation file (`impl/Registry.*`) to handle shader lookup. All that logic now lives under `api/ShaderRegistry.*` * Makes it possible to compile and link separate shader libraries, providing similar flexibility as defining and linking custom ATen operators bypass-github-pytorch-ci-checks Reviewed By: jorgep31415 Differential Revision: D54447700 fbshipit-source-id: 95b971ea82378a373124fad94bfbb1ef6d51b31f --- backends/vulkan/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index f189e44c1a..345f18801f 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -53,8 +53,8 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], exported_deps = [ - "//caffe2:torch_vulkan_api", "//caffe2:torch_vulkan_ops", + "//caffe2:torch_vulkan_spv", ], define_static_target = False, ) From 829a2bd7e016586bb8e9790ad0d3d1aaca9ad5d9 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 4 Mar 2024 20:06:31 -0800 Subject: [PATCH 025/290] Set up Vulkan executor_runner (#2239) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2239 imported-using-ghimport Test Plan: Imported from OSS Reviewed By: kirklandsign Differential Revision: D54512106 Pulled By: SS-JIA fbshipit-source-id: 3a52d2051b62c4dc9bcf1d252b9636fcae182fe4 --- backends/vulkan/CMakeLists.txt | 26 +++++++++++++++++++ backends/vulkan/__init__.py | 14 ++++++++++ .../serialization/vulkan_graph_serialize.py | 5 ++-- 3 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 backends/vulkan/__init__.py diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 203923f83e..834f2704fc 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -32,6 +32,9 @@ if(NOT FLATC_EXECUTABLE) set(FLATC_EXECUTABLE flatc) endif() +# Include this file to access target_link_options_shared_lib +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + # ATen Vulkan Libs set(PYTORCH_PATH ${EXECUTORCH_ROOT}/third-party/pytorch) @@ -92,6 +95,29 @@ target_link_libraries(vulkan_backend PRIVATE executorch) target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) +# This is required to ensure that vulkan_backend gets linked with +# --whole-archive since backends are registered via static variables that would +# otherwise be discarded +target_link_options_shared_lib(vulkan_backend) + +# Executor Runner + +if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*iOS\.cmake$") + set(VULKAN_RUNNER_SRCS ${_executor_runner__srcs}) + list(TRANSFORM VULKAN_RUNNER_SRCS PREPEND "${EXECUTORCH_ROOT}/") + add_executable(vulkan_executor_runner ${VULKAN_RUNNER_SRCS}) + target_link_libraries(vulkan_executor_runner ${_executor_runner_libs}) + target_link_libraries(vulkan_executor_runner vulkan_schema) + 
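+  # vulkan_backend registers itself via static variables; the
+  # target_link_options_shared_lib(vulkan_backend) call above keeps those
+  # registration symbols from being discarded when the runner links it.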
target_link_libraries(vulkan_executor_runner vulkan_backend) + target_compile_options(vulkan_executor_runner PUBLIC ${VULKAN_CXX_FLAGS}) + + add_library(vulkan_executor_runner_lib STATIC ${VULKAN_RUNNER_SRCS}) + target_link_libraries(vulkan_executor_runner_lib ${_executor_runner_libs}) + target_link_libraries(vulkan_executor_runner_lib vulkan_schema) + target_link_libraries(vulkan_executor_runner_lib vulkan_backend) + target_compile_options(vulkan_executor_runner_lib PUBLIC ${VULKAN_CXX_FLAGS}) +endif() + # Test targets if(EXECUTORCH_BUILD_GTESTS) diff --git a/backends/vulkan/__init__.py b/backends/vulkan/__init__.py new file mode 100644 index 0000000000..6c25e56115 --- /dev/null +++ b/backends/vulkan/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .partitioner.vulkan_partitioner import VulkanPartitioner + +from .vulkan_preprocess import VulkanBackend + +__all__ = [ + "VulkanPartitioner", + "VulkanBackend", +] diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py index e10e24da50..83a9e75f6c 100644 --- a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ b/backends/vulkan/serialization/vulkan_graph_serialize.py @@ -13,7 +13,6 @@ from typing import ClassVar, List # pyre-ignore[21]: Could not find module `executorch.exir._serialize._bindings`. -import executorch.exir._serialize._bindings as bindings # @manual=//executorch/exir/_serialize:_bindings import pkg_resources import torch @@ -23,6 +22,8 @@ ) from executorch.exir._serialize._dataclass import _DataclassEncoder +from executorch.exir._serialize._flatbuffer import _flatc_compile + def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: vk_graph_json = json.dumps(vk_graph, cls=_DataclassEncoder) @@ -35,7 +36,7 @@ def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: with open(json_path, "wb") as json_file: json_file.write(vk_graph_json.encode("ascii")) # pyre-ignore - bindings.flatc_compile(d, schema_path, json_path) + _flatc_compile(d, schema_path, json_path) output_path = os.path.join(d, "schema.bin") with open(output_path, "rb") as output_file: return output_file.read() From 10e251009b578e925b1c52bb95ae3b3ace56bac7 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Mon, 4 Mar 2024 21:20:45 -0800 Subject: [PATCH 026/290] Disable exported_program.__call__ (#1994) Summary: X-link: https://github.com/pytorch/pytorch/pull/120019 Pull Request resolved: https://github.com/pytorch/executorch/pull/1994 Reland of D53075378 / https://github.com/pytorch/pytorch/pull/119466 Reviewed By: suo Differential Revision: D53827930 fbshipit-source-id: 83554c86ea320a4d00c781c10bcfa71d6c3e6471 --- exir/backend/test/test_backends_lifted.py | 16 ++++++++-------- exir/capture/_capture.py | 10 ++++++---- exir/tests/test_verification.py | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/exir/backend/test/test_backends_lifted.py b/exir/backend/test/test_backends_lifted.py index 86653fa96e..905ce1a7f2 100644 --- a/exir/backend/test/test_backends_lifted.py +++ b/exir/backend/test/test_backends_lifted.py @@ -626,7 +626,7 @@ def forward(self, x_raw, h, c): ), ) - new_res = program_with_delegates.exported_program()(*inputs) + new_res = program_with_delegates.exported_program().module()(*inputs) for t1, t2 in zip(new_res, orig_res, strict=True): 
self.assertTrue(torch.allclose(t1, t2, atol=1e-03, rtol=1e-03)) @@ -745,7 +745,7 @@ def forward(self, x_raw, h, c): HTAPartitionerOnePatternDemo() ) - new_res = traced_with_delegate.exported_program()(*inputs) + new_res = traced_with_delegate.exported_program().module()(*inputs) for t1, t2 in zip(new_res, orig_res, strict=True): self.assertTrue(torch.allclose(t1, t2, atol=1e-03, rtol=1e-03)) @@ -768,7 +768,7 @@ def forward(self, x_raw, h, c): # config=exir.ExecutorchBackendConfig(extract_delegate_segments=extract_delegate_segments), # ) - new_res = program_with_delegates.exported_program()(*inputs) + new_res = program_with_delegates.exported_program().module()(*inputs) for t1, t2 in zip(new_res, orig_res, strict=True): self.assertTrue(torch.allclose(t1, t2, atol=1e-03, rtol=1e-03)) @@ -1029,7 +1029,7 @@ def f(x, y): partitioned = orig partitioned = partitioned.to_backend(AddMulPartitionerDemo()) - new_res = partitioned.exported_program()(*inputs) + new_res = partitioned.exported_program().module()(*inputs) self.assertTrue(torch.allclose(orig_res, new_res[0])) toplevel_lowered = get_lowered_submodules( @@ -1102,7 +1102,7 @@ def f(xs, y): map_fn_lowered[0][1].original_module.graph_module.code ) - new_res = partitioned.exported_program()(*inputs) + new_res = partitioned.exported_program().module()(*inputs) self.assertTrue(torch.allclose(orig_res, new_res[0])) @@ -1153,7 +1153,7 @@ def f(xs, pred1, pred2, y): partitioned = orig partitioned = partitioned.to_backend(AddMulPartitionerDemo()) - new_res = partitioned.exported_program()(*inputs) + new_res = partitioned.exported_program().module()(*inputs) self.assertTrue(torch.allclose(orig_res, new_res[0])) toplevel_lowered = get_lowered_submodules( @@ -1224,7 +1224,7 @@ def forward(self, x: List[torch.Tensor]): return self.lowered(x) gm = to_edge(export(ComposedM(), inputs)) - gm.exported_program()(*inputs) + gm.exported_program().module()(*inputs) def test_dict_input(self): def f(x: Dict[str, torch.Tensor]): @@ -1246,4 +1246,4 @@ def forward(self, x: List[torch.Tensor]): return self.lowered(x) gm = to_edge(export(ComposedM(), inputs)) - gm.exported_program()(*inputs) + gm.exported_program().module()(*inputs) diff --git a/exir/capture/_capture.py b/exir/capture/_capture.py index 93281aa317..b9e051f2cb 100644 --- a/exir/capture/_capture.py +++ b/exir/capture/_capture.py @@ -240,10 +240,12 @@ def capture( # noqa: C901 ) if out_spec is None: - out_spec = ( - graph_module.graph._codegen.pytree_info.out_spec - or pytree.tree_flatten(f(*args))[1] - ) + if isinstance(graph_module.graph._codegen, torch.fx.graph._PyTreeCodeGen): + out_spec = graph_module.graph._codegen.pytree_info.out_spec + elif hasattr(graph_module, "_out_spec"): + out_spec = graph_module._out_spec + else: + out_spec = pytree.tree_flatten(f(*args))[1] # NOTE (tmanlaibaatar) # torchdynamo.export adds extra kwarg into the graph module diff --git a/exir/tests/test_verification.py b/exir/tests/test_verification.py index 232de306fb..33623d7f6c 100644 --- a/exir/tests/test_verification.py +++ b/exir/tests/test_verification.py @@ -136,7 +136,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: exec_prog = to_edge(export(model2, (inputs,))).to_executorch() exported_prog = exec_prog.exported_program() - res = exported_prog(inputs)[0] # noqa + res = exported_prog.module()(inputs)[0] # noqa # Verifiers are run internally in to_edge, export, and to_executorch. 
# If we make it this far then no errors were thrown in verification From a1293b22f564eafda3c4109e44ef1c93ae536f97 Mon Sep 17 00:00:00 2001 From: Songhao Jia Date: Mon, 4 Mar 2024 22:16:03 -0800 Subject: [PATCH 027/290] using exec_aten::SizeType to replace Tensor::SizesType in copy_ops_utils (#2250) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2250 as title Reviewed By: vmpuri Differential Revision: D54518327 fbshipit-source-id: ec0e0b7f185571fb5a10eb97d4e1732a1d03d017 --- kernels/portable/cpu/util/copy_ops_util.cpp | 2 +- kernels/portable/cpu/util/copy_ops_util.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index ae48dee0fb..bea40261ed 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -869,7 +869,7 @@ void get_diagonal_copy_out_target_size( int64_t offset, int64_t dim1, int64_t dim2, - Tensor::SizesType* out_sizes, + exec_aten::SizesType* out_sizes, size_t* out_ndim) { *out_ndim = in.dim() - 1; diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index db2161ec0e..5f341b0c2b 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -225,7 +225,7 @@ void get_diagonal_copy_out_target_size( int64_t offset, int64_t dim1, int64_t dim2, - Tensor::SizesType* out_sizes, + exec_aten::SizesType* out_sizes, size_t* out_ndim); } // namespace executor From 9260d7bc6e27b7118055eff82bedd5eb2ef5b3e5 Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Tue, 5 Mar 2024 06:28:41 -0800 Subject: [PATCH 028/290] Add mixed dtype linear test (#2206) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2206 Add mixed dtype linear test Reviewed By: manuelcandales Differential Revision: D54180116 fbshipit-source-id: ec841b2a0b8817a662624efefb8c1beb228331cf --- kernels/quantized/cpu/op_mixed_linear.cpp | 3 +- .../quantized/test/op_mixed_linear_test.cpp | 142 ++++++++++++++++++ kernels/quantized/test/targets.bzl | 6 + 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 kernels/quantized/test/op_mixed_linear_test.cpp diff --git a/kernels/quantized/cpu/op_mixed_linear.cpp b/kernels/quantized/cpu/op_mixed_linear.cpp index d00fdd05c9..a9d945b887 100644 --- a/kernels/quantized/cpu/op_mixed_linear.cpp +++ b/kernels/quantized/cpu/op_mixed_linear.cpp @@ -24,7 +24,8 @@ bool check_quantized_mixed_linear_args( Tensor& out) { ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 2)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight, 2)); - ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight_scales, 1)); + ET_LOG_AND_RETURN_IF_FALSE( + tensor_is_rank(weight_scales, 1) || tensor_is_rank(weight_scales, 2)); ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(out, 2)); ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, 1, weight, 1)); diff --git a/kernels/quantized/test/op_mixed_linear_test.cpp b/kernels/quantized/test/op_mixed_linear_test.cpp new file mode 100644 index 0000000000..df141cb1cb --- /dev/null +++ b/kernels/quantized/test/op_mixed_linear_test.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include // Declares the aten operator +#include // Declares the quantized operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::optional; +using exec_aten::RuntimeContext; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::native::quantized_mixed_linear_out; +using torch::executor::testing::TensorFactory; + +class OpQuantizedMixedDtypeLinearTest : public ::testing::Test { + protected: + void SetUp() override { + // Since these tests cause ET_LOG to be called, the PAL must be initialized + // first. + torch::executor::runtime_init(); + } +}; + +template +void test_dtype() { + TensorFactory tf; + TensorFactory tf_char; + TensorFactory tf_out; + + Tensor input = tf.make( + /*sizes=*/{1, 3}, + /*data=*/{1.0, 1.5, 2.0}); + Tensor weight = tf_char.make( + /*sizes=*/{2, 3}, + /*data=*/{5, 3, 1, 4, 2, 1}); + Tensor weight_scales = tf.make( + /*sizes=*/{2}, + /*data=*/{0.2, 0.4}); + const optional opt_weight_zp{}; + const optional opt_dtype_out{}; + + Tensor out = tf_out.zeros({1, 2}); + + Tensor expected = tf_out.make( + /*sizes=*/{1, 2}, + /*data=*/{2.3, 3.6}); + + RuntimeContext ctx{}; + + quantized_mixed_linear_out( + ctx, input, weight, weight_scales, opt_weight_zp, opt_dtype_out, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +TEST_F(OpQuantizedMixedDtypeLinearTest, FloatInputFloatOutput) { + test_dtype(); +} + +#if 0 +// need << +TEST_F(OpQuantizedMixedDtypeLinearTest, FloatInputHalfOutput) { + test_dtype(); +} + +// need to relax tolerance +TEST_F(OpQuantizedMixedDtypeLinearTest, HalfInputFloatOutput) { + test_dtype(); +} + +// need << +TEST_F(OpQuantizedMixedDtypeLinearTest, HalfInputHalfOutput) { + test_dtype(); +} +#endif + +template +void test_dtype_partials() { + TensorFactory tf; + TensorFactory tf_char; + TensorFactory tf_out; + + Tensor input = tf.make( + /*sizes=*/{1, 3}, + /*data=*/{1.0, 1.5, 2.0}); + Tensor weight = tf_char.make( + /*sizes=*/{2, 3}, + /*data=*/{5, 3, 1, 4, 2, 1}); + Tensor weight_scales = tf.make( + /*sizes=*/{2, 2}, + /*data=*/{0.2, 1, 0.4, 0.5}); + const optional opt_weight_zp{}; + const optional opt_dtype_out{}; + + Tensor out = tf_out.zeros({1, 2}); + + Tensor expected = tf_out.make( + /*sizes=*/{1, 2}, + /*data=*/ + {(1.0 * 5 + 1.5 * 3) * 0.2 + 2.0 * 1 * 1, + (1.0 * 4 + 1.5 * 2) * 0.4 + 2.0 * 1 * 0.5}); + + RuntimeContext ctx{}; + + quantized_mixed_linear_out( + ctx, input, weight, weight_scales, opt_weight_zp, opt_dtype_out, out); + + EXPECT_TENSOR_CLOSE(out, expected); +} + +TEST_F(OpQuantizedMixedDtypeLinearTest, FloatInputFloatOutput_Partials) { + test_dtype_partials(); +} + +#if 0 +// need << +TEST_F(OpQuantizedMixedDtypeLinearTest, FloatInputHalfOutput_Partials) { + test_dtype_partials(); +} + +// need to relax tolerance +TEST_F(OpQuantizedMixedDtypeLinearTest, HalfInputFloatOutput_Partials) { + test_dtype_partials(); +} + +// need << +TEST_F(OpQuantizedMixedDtypeLinearTest, HalfInputHalfOutput_Partials) { + test_dtype_partials(); +} +#endif diff --git a/kernels/quantized/test/targets.bzl b/kernels/quantized/test/targets.bzl index 157bcefb89..e06090cae9 100644 --- a/kernels/quantized/test/targets.bzl +++ b/kernels/quantized/test/targets.bzl @@ -31,3 +31,9 @@ def define_common_targets(): "//executorch/kernels/portable:generated_lib_headers", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ]) + op_test("op_mixed_linear_test", kernel_name = "quantized", deps = [ + "//executorch/kernels/quantized/cpu:op_mixed_linear", + 
"//executorch/kernels/quantized:generated_lib_headers", + "//executorch/kernels/portable:generated_lib_headers", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + ]) From 65f970198deb1d72c6d33e4e2df9666a072b32d3 Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Tue, 5 Mar 2024 07:15:23 -0800 Subject: [PATCH 029/290] Build ARM V8 ONLY (no native FP16) (#2207) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2207 Build ARM V8 ONLY (no native FP16) Reviewed By: manuelcandales Differential Revision: D54447524 fbshipit-source-id: 855c6dd14d9e903401c54ea59a24eb95cc39bd17 --- runtime/core/portable_type/half.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index c0f4d6e0d9..0bdf388adc 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -15,7 +15,9 @@ #if defined(__GNUC__) || defined(__clang__) #if defined(__aarch64__) +#ifndef __ARM_V8_ONLY__ #define NATIVE_FP16 1 +#endif // __ARM_V8_ONLY__ #endif // __aarch64__ #endif // GNUC or clang From 911094eb46cd852298d14cf4729c037d3b1fe317 Mon Sep 17 00:00:00 2001 From: shewu-quic Date: Tue, 5 Mar 2024 09:48:59 -0800 Subject: [PATCH 030/290] Qualcomm AI Engine Direct - enable per-tensor dump mechanism (#1294) Summary: - Add "tensor_dump_output_path" option into compiler spec - Add "output_" for output tensors in the AOT phase. In the runtime, we fill in the output tensor based on the order of the output tensor in the context cache. If tensor_dump_output_path is given, Delegate would write outputs of each OP there in runtime. In ALL cases, we don't recommend setting this option. This option exists just for debugging some accuracy issues. Pull Request resolved: https://github.com/pytorch/executorch/pull/1294 Reviewed By: kirklandsign Differential Revision: D53947214 Pulled By: cccclai fbshipit-source-id: 64cb2a0e998da87c66cc16249a54264ae3fcf046 --- backends/qualcomm/CMakeLists.txt | 6 +++ .../aot/python/PyQnnManagerAdaptor.cpp | 3 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 4 ++ .../qualcomm/aot/wrappers/TensorWrapper.cpp | 13 ++++++ .../qualcomm/aot/wrappers/TensorWrapper.h | 2 + backends/qualcomm/builders/node_visitor.py | 21 ++++++++-- backends/qualcomm/qnn_preprocess.py | 5 ++- backends/qualcomm/runtime/CMakeLists.txt | 8 ++++ .../qualcomm/runtime/QnnExecuTorchBackend.cpp | 17 ++++---- backends/qualcomm/runtime/QnnManager.cpp | 40 +++++++++++++++++++ backends/qualcomm/runtime/QnnManager.h | 5 +++ backends/qualcomm/runtime/Utils.cpp | 36 +++++++++++++++++ backends/qualcomm/runtime/Utils.h | 20 ++++++++++ .../serialization/qnn_compile_spec_schema.py | 1 + backends/qualcomm/serialization/schema.fbs | 6 +++ backends/qualcomm/tests/test_qnn_delegate.py | 4 ++ backends/qualcomm/utils/utils.py | 8 ++++ 17 files changed, 186 insertions(+), 13 deletions(-) create mode 100644 backends/qualcomm/runtime/Utils.cpp create mode 100644 backends/qualcomm/runtime/Utils.h diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index ff15e25575..163842b4e9 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -139,6 +139,7 @@ add_library(qnn_backend STATIC) add_library(qnn_factory STATIC) add_library(qnn_header INTERFACE) add_library(wrappers STATIC) +add_library(utils STATIC) # # declare dependency @@ -228,6 +229,7 @@ target_link_libraries(qnn_manager qnn_factory wrappers qnn_schema + utils ) target_link_libraries(qnn_executorch_backend PRIVATE @@ 
-237,6 +239,10 @@ target_link_libraries(qnn_executorch_backend executorch qcir_utils ) +target_link_libraries(utils + PRIVATE + qnn_executorch_logging +) # # add linker option diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index 4749f32403..77f6a63f62 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -30,7 +30,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) .def("Compile", &PyQnnManager::Compile) .def("Destroy", &PyQnnManager::Destroy) - .def("IsAvailable", &PyQnnManager::IsAvailable); + .def("IsAvailable", &PyQnnManager::IsAvailable) + .def("IsTensorDump", &PyQnnManager::IsTensorDump); } } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index a474e95e14..5bde58687f 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -137,6 +137,10 @@ class PyQnnManager { return qnn_manager_->IsAvailable(); } + bool IsTensorDump() { + return qnn_manager_->IsTensorDump(); + } + private: // Store the bytes object instead of a raw pointer so that this module will // keep the bytes alive. diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp index a99d5d3023..2a2cda84c5 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.cpp +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.cpp @@ -121,6 +121,19 @@ Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) { return Error::Ok; } +Error TensorWrapper::AllocateDataBuffer() { + char* static_data_buffer = new (std::nothrow) char[bytes_]; // NOLINT + if (static_data_buffer == nullptr) { + return Error::Internal; + } + owned_data_ = std::unique_ptr(static_data_buffer); + QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW; + QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_; + QNN_VER_PTR(tensor_)->clientBuf.data = owned_data_.get(); + + return Error::Ok; +} + void TensorWrapper::UpdateQnnTensorMeta(const Qnn_Tensor_t& tensor_src) { QNN_VER_PTR(tensor_)->id = QNN_VER_PTR(tensor_src)->id; } diff --git a/backends/qualcomm/aot/wrappers/TensorWrapper.h b/backends/qualcomm/aot/wrappers/TensorWrapper.h index 1ac3882078..5c2be69348 100644 --- a/backends/qualcomm/aot/wrappers/TensorWrapper.h +++ b/backends/qualcomm/aot/wrappers/TensorWrapper.h @@ -35,6 +35,8 @@ class TensorWrapper { Error FillDataBuffer(const void* data, bool copy_data = false); + Error AllocateDataBuffer(); + // update qnn tensor meta // this function is used to recover metadata from QNN context binary. 
void UpdateQnnTensorMeta(const Qnn_Tensor_t& tensor_src); diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index f4ccb6a7a4..c6c7b47b38 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -57,10 +57,14 @@ class NodeVisitor: """ def __init__( - self, external_ids, edge_program: torch.export.ExportedProgram + self, + external_ids, + edge_program: torch.export.ExportedProgram, + enable_tensor_dump, ) -> None: self.external_ids = external_ids or {} self.edge_program = edge_program + self.enable_tensor_dump = enable_tensor_dump def get_tensor(self, input_node, op_node, idx=None): """ @@ -176,6 +180,9 @@ def get_tensor_type( if is_parameter(node, self.edge_program): return PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC + # dump all tensor, set to app read + if self.enable_tensor_dump: + return PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_APP_READ return tensor_type def get_data_type( @@ -250,13 +257,16 @@ def define_value( if node_name in nodes_to_wrappers: return nodes_to_wrappers[node_name] + tensor_name = node.name + if is_graph_output(node): + tensor_name = "output_" + tensor_name dims = [1] if len(tensor.size()) == 0 else tensor.size() tensor_type = self.get_tensor_type(node, tensor_type) quant_encoding, quant_configs = self.get_quant_encoding_conf(node) dtype = self.get_data_type(tensor, quant_configs, is_tensor) if isinstance(tensor, torch._subclasses.fake_tensor.FakeTensor): tensor_wrapper = PyQnnWrapper.TensorWrapper( - node_name, + tensor_name, tensor_type, dtype, quant_encoding, @@ -270,7 +280,7 @@ def define_value( if quant_configs: tensor = self.get_quant_tensor_value(node, tensor, dtype) tensor_wrapper = PyQnnWrapper.TensorWrapper( - node_name, + tensor_name, tensor_type, dtype, quant_encoding, @@ -372,6 +382,7 @@ def generate_node_to_external_map( def get_node_visitors( edge_program: torch.export.ExportedProgram, + enable_tensor_dump=False, ) -> Dict[str, NodeVisitor]: """Create a new class instance at runtime, and put them in a dict""" node_to_external_map = generate_node_to_external_map(edge_program) @@ -380,5 +391,7 @@ def get_node_visitors( assert callable( visitor ), f"Expeting a callable class, but got {visitor} of type {type(visitor)}" - node_visitors[target] = visitor(node_to_external_map, edge_program) + node_visitors[target] = visitor( + node_to_external_map, edge_program, enable_tensor_dump + ) return node_visitors diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index f6e7918bbd..20b54d1db8 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -53,8 +53,11 @@ def preprocess( pass_result = qnn_compiler_passes(edge_program.graph_module) assert pass_result is not None + enable_tensor_dump = qnn_manager.IsTensorDump() nodes_to_wrappers = {} - node_visitors = get_node_visitors(edge_program) + node_visitors = get_node_visitors( + edge_program, enable_tensor_dump=enable_tensor_dump + ) py_op_wrapper_list = [] for node in pass_result.graph_module.graph.nodes: if node.op == "call_function": diff --git a/backends/qualcomm/runtime/CMakeLists.txt b/backends/qualcomm/runtime/CMakeLists.txt index c46f5f9222..615c6320b5 100644 --- a/backends/qualcomm/runtime/CMakeLists.txt +++ b/backends/qualcomm/runtime/CMakeLists.txt @@ -39,3 +39,11 @@ target_sources(qnn_executorch_logging PRIVATE ${CMAKE_CURRENT_LIST_DIR}/Logging.cpp ) + +# utils +target_sources(utils + PUBLIC + 
${CMAKE_CURRENT_LIST_DIR}/Utils.h + PRIVATE + ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp +) diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 7aa814be44..0e2a0c9bfd 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -194,13 +194,16 @@ Error QnnExecuTorchBackend::execute( input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct()); } - for (int i = input_tensors.size(); - i < input_tensors.size() + output_tensors.size(); - ++i) { - output_tensors[i - input_tensors.size()]->FillDataBuffer( - args[i]->toTensor().mutable_data_ptr(), false /* copy_data */); - output_tensor_structs.push_back( - output_tensors[i - input_tensors.size()]->CloneTensorStruct()); + int output_index = input_tensors.size(); + for (const auto& output_tensor : output_tensors) { + // pos=0 limits the search to the prefix + if (output_tensor->GetName().rfind("output_", 0) == 0) { + output_tensor->FillDataBuffer( + args[output_index]->toTensor().mutable_data_ptr(), + false /* copy_data */); + output_index++; + } + output_tensor_structs.push_back(output_tensor->CloneTensorStruct()); } ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 1d7ba55648..edfb5a1947 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,10 +6,12 @@ * LICENSE file in the root directory of this source tree. */ #include +#include #include #include #include +#include namespace torch { namespace executor { namespace qnn { @@ -25,6 +27,7 @@ QnnManager::QnnManager( : backend_type_(options->backend_type()), library_path_(options->library_path()->c_str()), skel_library_dir_(options->skel_library_dir()->c_str()), + tensor_dump_output_path_(options->tensor_dump_output_path()->c_str()), graph_name_(options->graph_name()->c_str()), soc_info_(options->soc_info()), htp_options_(options->htp_options()), @@ -41,6 +44,9 @@ QnnManager::QnnManager( "library_path: %s", options->library_path()->c_str()); QNN_EXECUTORCH_LOG_INFO( "skel_library_dir: %s", options->skel_library_dir()->c_str()); + QNN_EXECUTORCH_LOG_INFO( + "tensor_dump_output_path: %s", + options->tensor_dump_output_path()->c_str()); QNN_EXECUTORCH_LOG_INFO( "log_level: %s", EnumNameQnnExecuTorchLogLevel(options->log_level())); QNN_EXECUTORCH_LOG_INFO( @@ -144,6 +150,9 @@ Error QnnManager::AllocateTensor() { for (auto& tensor : output_tensors) { std::shared_ptr tensor_wrapper = CreateTensorWrapper(tensor); tensor_wrapper->UpdateQnnTensorMeta(tensor); + if (!tensor_dump_output_path_.empty()) { + tensor_wrapper->AllocateDataBuffer(); + } output_tensors_.emplace_back(std::move(tensor_wrapper)); } return Error::Ok; @@ -153,6 +162,11 @@ Error QnnManager::AllocateTensor( std::vector>& inputs, std::vector>& outputs) { input_tensors_ = std::move(inputs); + for (auto& output_tensor : outputs) { + if (!tensor_dump_output_path_.empty()) { + output_tensor->AllocateDataBuffer(); + } + } output_tensors_ = std::move(outputs); return Error::Ok; } @@ -171,6 +185,32 @@ Error QnnManager::Execute( return Error::Internal; } + if (!tensor_dump_output_path_.empty()) { + // TODO: Need to handle the graph which is partitioned. + // Maybe we could use graph name. 
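+    // Each output tensor is dumped verbatim below as
+    // <tensor_dump_output_path>/Result/<tensor_name>_tensor.raw, using the
+    // name stored in the QNN tensor metadata.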
+ std::string dir = tensor_dump_output_path_ + "/Result/"; + CreateDirectory(dir); + QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str()); + for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size(); + ++out_idx) { + const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx]; + + std::string output_path = + dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw"; + + std::ofstream fout(output_path, std::ios::binary); + if (fout.fail()) { + QNN_EXECUTORCH_LOG_ERROR( + "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name); + return Error::Internal; + } + + fout.write( + static_cast(QNN_VER_PTR(output_tensor)->clientBuf.data), + QNN_VER_PTR(output_tensor)->clientBuf.dataSize); + } + } + return Error::Ok; } diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index e510ec05f1..b71a087d98 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -42,6 +42,10 @@ class QnnManager { bool IsAvailable(); + bool IsTensorDump() { + return !tensor_dump_output_path_.empty(); + } + bool IsOnlinePrepare(); bool IsNodeSupportedByBackend( @@ -68,6 +72,7 @@ class QnnManager { QnnExecuTorchBackendType backend_type_; std::string library_path_; std::string skel_library_dir_; + std::string tensor_dump_output_path_; std::string graph_name_; const SocInfo* soc_info_; const QnnExecuTorchHtpBackendOptions* htp_options_; diff --git a/backends/qualcomm/runtime/Utils.cpp b/backends/qualcomm/runtime/Utils.cpp new file mode 100644 index 0000000000..c049d3720e --- /dev/null +++ b/backends/qualcomm/runtime/Utils.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +namespace torch { +namespace executor { +namespace qnn { + +void CreateDirectory(const std::string& path) { + // Create any recursive directory + if (path.empty()) { + QNN_EXECUTORCH_LOG_ERROR("Create folder shouldn't be empty"); + return; + } + std::size_t pos = path.find_last_of('/'); + std::string subdir = (std::string::npos == pos) ? "" : path.substr(0, pos); + if (subdir.empty() || subdir == "." || subdir == "..") { + return; + } + CreateDirectory(subdir); + int mkdir_err = mkdir(subdir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); + if (mkdir_err != 0 && errno != EEXIST) { + std::string err_msg = "Failed to create " + subdir + " folder\n"; + QNN_EXECUTORCH_LOG_ERROR(err_msg.c_str()); + } +} + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/runtime/Utils.h b/backends/qualcomm/runtime/Utils.h new file mode 100644 index 0000000000..84a7da2388 --- /dev/null +++ b/backends/qualcomm/runtime/Utils.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include + +namespace torch { +namespace executor { +namespace qnn { +// Create Directory +void CreateDirectory(const std::string& path); + +} // namespace qnn +} // namespace executor +} // namespace torch diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index 21ff10e1af..dbe23e5785 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -114,3 +114,4 @@ class QnnExecuTorchOptions: htp_options: QnnExecuTorchHtpBackendOptions = QnnExecuTorchHtpBackendOptions() soc_info: SocInfo = SocInfo() online_prepare: bool = False + tensor_dump_output_path: str = "" diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index 2e74877c31..948fb5773c 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -140,6 +140,12 @@ table QnnExecuTorchOptions { /// Check if on-device graph construction. Default is false. online_prepare:bool; + + /// Tensor dump output path. If a path is given, Delegate would write + /// outputs of each OP there. + /// In ALL cases, we don't recommend to set this option. + /// This option exist just for debugging some accuracy issues. + tensor_dump_output_path:string; } root_type QnnExecuTorchOptions; diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4a410eb72c..25f24dacf7 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -44,6 +44,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, + tensor_dump_output_path="", ) def test_qnn_backend_arange(self): @@ -373,6 +374,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, + tensor_dump_output_path="", ) def test_qnn_backend_conv1d_relu_log_softmax(self): @@ -464,6 +466,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, + tensor_dump_output_path="", ) def test_qnn_backend_arange(self): @@ -840,6 +843,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, + tensor_dump_output_path="", ) def test_qnn_backend_conv1d_relu_log_softmax(self): diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 64b4c9c02a..dc8ca85fdc 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -122,6 +122,7 @@ def generate_qnn_executorch_compiler_spec( debug: bool = False, saver: bool = False, online_prepare: bool = False, + tensor_dump_output_path: str = "", ) -> List[CompileSpec]: """ Helper function generating compiler specs for Qualcomm AI Engine Direct @@ -142,6 +143,10 @@ def generate_qnn_executorch_compiler_spec( saver: Instead of compiling the model, run QNN Saver. Please check documents of Qualcomm AI Engine Direct SDK. This feature is usually for debugging purpose. + tensor_dump_output_path: If a path is given, Delegate would write + outputs of each OP there in runtime. In ALL cases, + we don't recommend to set this option. This option exist just + for debugging some accuracy issues. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. 
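Note for users of this option: at execution time the delegate writes each output tensor verbatim to <tensor_dump_output_path>/Result/<tensor_name>_tensor.raw, mirroring the ofstream write in QnnManager::Execute above. A minimal host-side sketch for reading one dump back is below; the file name and the float32 element type are illustrative assumptions that depend on the exported model.

  #include <fstream>
  #include <iostream>
  #include <vector>

  int main() {
    // Hypothetical dump produced with tensor_dump_output_path="/tmp/qnn_dump".
    const char* path = "/tmp/qnn_dump/Result/output_add_0_tensor.raw";
    std::ifstream fin(path, std::ios::binary | std::ios::ate);
    if (!fin) {
      std::cerr << "Could not open " << path << std::endl;
      return 1;
    }
    const std::streamsize nbytes = fin.tellg();
    fin.seekg(0, std::ios::beg);
    // Assumes float32 output data; use the dtype of the tensor being debugged.
    std::vector<float> values(static_cast<size_t>(nbytes) / sizeof(float));
    fin.read(reinterpret_cast<char*>(values.data()), nbytes);
    std::cout << "Read " << values.size() << " values from " << path << std::endl;
    return 0;
  }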
@@ -187,6 +192,9 @@ def generate_qnn_executorch_compiler_spec( if saver: qnn_executorch_options.library_path = "libQnnSaver.so" + if len(tensor_dump_output_path.strip()) != 0: + qnn_executorch_options.tensor_dump_output_path = tensor_dump_output_path + if online_prepare: qnn_executorch_options.online_prepare = True return [ From 46036135fe82502f382b743ce742be3bbf46b0e1 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 10:15:18 -0800 Subject: [PATCH 031/290] Remove CopyNode (#2244) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2244 `CopyNode` simply copies data from one image texture to another image texture of the same size. During inception of the graph runtime, it was useful to validate that components were working, but won't serve us any use in the future. ghstack-source-id: 217394065 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54369109 fbshipit-source-id: bdc63ca302e552bed742da0b59c8a848f44d1273 --- .../vulkan/runtime/graph/ops/impl/Copy.cpp | 61 ------------------- backends/vulkan/runtime/graph/ops/impl/Copy.h | 33 ---------- 2 files changed, 94 deletions(-) delete mode 100644 backends/vulkan/runtime/graph/ops/impl/Copy.cpp delete mode 100644 backends/vulkan/runtime/graph/ops/impl/Copy.h diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index fdd0124781..0000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace at { -namespace native { -namespace vulkan { - -void add_copy_node( - ComputeGraph& graph, - const ValueRef from, - const ValueRef to) { - graph.execute_nodes().emplace_back(new CopyNode(from, to)); -} - -ValueRef add_copy_node(ComputeGraph& graph, const ValueRef from) { - std::vector out_sizes = graph.get_val_sizes(from); - api::ScalarType out_dtype = graph.get_val_dtype(from); - ValueRef to = graph.add_tensor(out_sizes, out_dtype); - add_copy_node(graph, from, to); - return to; -} - -CopyNode::CopyNode(const ValueRef from, const ValueRef to) - : ExecuteNode(from, to) {} - -void CopyNode::encode(ComputeGraph* graph) const { - api::PipelineBarrier pipeline_barrier{}; - - vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor(); - vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor(); - - graph->context()->submit_copy( - // pipeline barrier - pipeline_barrier, - // resources - from_tensor.image( - pipeline_barrier, - api::PipelineStage::TRANSFER, - api::MemoryAccessType::READ), - to_tensor.image( - pipeline_barrier, - api::PipelineStage::TRANSFER, - api::MemoryAccessType::WRITE), - // copy details - from_tensor.extents(), - {0u, 0u, 0u}, - {0u, 0u, 0u}, - // fence handle - VK_NULL_HANDLE); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index b686dac57e..0000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#ifdef USE_VULKAN_API - -#include - -namespace at { -namespace native { -namespace vulkan { - -void add_copy_node(ComputeGraph& graph, const ValueRef from, const ValueRef to); -ValueRef add_copy_node(ComputeGraph& graph, const ValueRef from); - -class CopyNode : public virtual ExecuteNode { - public: - explicit CopyNode(const ValueRef from, const ValueRef to); - - void encode(ComputeGraph* graph) const override; -}; - -} // namespace vulkan -} // namespace native -} // namespace at - -#endif /* USE_VULKAN_API */ From dfb5f510565fd869ce44cdfba734ca66a65c8f00 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 10:15:18 -0800 Subject: [PATCH 032/290] Remove Functions.h/cpp (#2245) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2245 In D53982443, OperatorRegistry.h/cpp and Functions.h/cpp were both introduced, as they were split across the PT and ET repos, but now both are in ET. ## OperatorRegistry.cpp Here, we see all our operators. OPERATOR_ENTRY maps from Vulkan Dialect op name to the OpFunction, which have an op-specific name. Note that all OpFunction carry the same function signature. ## Functions.h/cpp -> Arithmetic.h/cpp We don't need another place to see all our operators. They will each reference one ops/impl file, so we group them accordingly in their ops/impl file. ## Nit Also, sort `add_arithmetic_node()` declarations according to their execution order. ghstack-source-id: 217394062 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54370467 fbshipit-source-id: d9e82896577610d5dcee3e0bf7f662e69d59e1db --- .../vulkan/runtime/graph/ops/Functions.cpp | 40 ---------------- .../runtime/graph/ops/OperatorRegistry.cpp | 2 +- .../graph/ops/{Functions.h => Utils.h} | 13 ++--- .../runtime/graph/ops/impl/Arithmetic.cpp | 48 +++++++++++++------ .../runtime/graph/ops/impl/Arithmetic.h | 21 +++++--- 5 files changed, 53 insertions(+), 71 deletions(-) delete mode 100644 backends/vulkan/runtime/graph/ops/Functions.cpp rename backends/vulkan/runtime/graph/ops/{Functions.h => Utils.h} (68%) diff --git a/backends/vulkan/runtime/graph/ops/Functions.cpp b/backends/vulkan/runtime/graph/ops/Functions.cpp deleted file mode 100644 index fb3d9eca67..0000000000 --- a/backends/vulkan/runtime/graph/ops/Functions.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include - -namespace at { -namespace native { -namespace vulkan { - -#define DEFINE_ARITHMETIC_FN(function, op_type) \ - ValueRef function(ComputeGraph& graph, const std::vector& args) { \ - return add_arithmetic_node( \ - graph, \ - args[0], \ - args[1], \ - args[2], \ - arithmetic::OpType::op_type, \ - args[3]); \ - } - -DEFINE_ARITHMETIC_FN(add, ADD); -DEFINE_ARITHMETIC_FN(sub, SUB); -DEFINE_ARITHMETIC_FN(mul, MUL); -DEFINE_ARITHMETIC_FN(div, DIV); -DEFINE_ARITHMETIC_FN(floor_div, FLOOR_DIV); -DEFINE_ARITHMETIC_FN(pow, POW); - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp index 989e54dd5d..0d46e5b351 100644 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp +++ b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp @@ -8,7 +8,7 @@ #include -#include +#include namespace at { namespace native { diff --git a/backends/vulkan/runtime/graph/ops/Functions.h b/backends/vulkan/runtime/graph/ops/Utils.h similarity index 68% rename from backends/vulkan/runtime/graph/ops/Functions.h rename to backends/vulkan/runtime/graph/ops/Utils.h index 156b821481..f962c17bcc 100644 --- a/backends/vulkan/runtime/graph/ops/Functions.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -10,21 +10,16 @@ #ifdef USE_VULKAN_API +#include + #include namespace at { namespace native { namespace vulkan { -#define DEFINE_OP_FN(name) \ - ValueRef name(ComputeGraph& graph, const std::vector& args); - -DEFINE_OP_FN(add); -DEFINE_OP_FN(sub); -DEFINE_OP_FN(mul); -DEFINE_OP_FN(div); -DEFINE_OP_FN(floor_div); -DEFINE_OP_FN(pow); +#define DECLARE_OP_FN(function) \ + ValueRef function(ComputeGraph& graph, const std::vector& args); } // namespace vulkan } // namespace native diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index cf05030d01..665d8f37af 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -16,6 +16,39 @@ namespace at { namespace native { namespace vulkan { +#define DEFINE_ARITHMETIC_FN(function, op_type) \ + ValueRef function(ComputeGraph& graph, const std::vector& args) { \ + return add_arithmetic_node( \ + graph, \ + args[0], \ + args[1], \ + args[2], \ + arithmetic::OpType::op_type, \ + args[3]); \ + } + +DEFINE_ARITHMETIC_FN(add, ADD); +DEFINE_ARITHMETIC_FN(sub, SUB); +DEFINE_ARITHMETIC_FN(mul, MUL); +DEFINE_ARITHMETIC_FN(div, DIV); +DEFINE_ARITHMETIC_FN(floor_div, FLOOR_DIV); +DEFINE_ARITHMETIC_FN(pow, POW); + +ValueRef add_arithmetic_node( + ComputeGraph& graph, + const ValueRef t1, + const ValueRef t2, + const float alpha, + const arithmetic::OpType optype, + const int64_t shared_object_idx) { + std::vector t1_sizes = graph.get_val_sizes(t1); + api::ScalarType t1_dtype = graph.get_val_dtype(t1); + + ValueRef out = graph.add_tensor(t1_sizes, t1_dtype, shared_object_idx); + add_arithmetic_node(graph, t1, t2, out, alpha, optype); + return out; +} + void add_arithmetic_node( ComputeGraph& graph, const ValueRef t1, @@ -46,21 +79,6 @@ void add_arithmetic_node( new ArithmeticNode(arg1, arg2, out, alpha, optype)); } -ValueRef add_arithmetic_node( - ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2, - const float alpha, - const arithmetic::OpType optype, - const int64_t shared_object_idx) { - std::vector t1_sizes = graph.get_val_sizes(t1); - api::ScalarType 
t1_dtype = graph.get_val_dtype(t1); - - ValueRef out = graph.add_tensor(t1_sizes, t1_dtype, shared_object_idx); - add_arithmetic_node(graph, t1, t2, out, alpha, optype); - return out; -} - ArithmeticPrepack::ArithmeticPrepack(const ValueRef tref, const ValueRef packed) : PrepackNode(tref, packed) {} diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 69b1004a01..26ccd931c7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -14,25 +14,34 @@ #include +#include + namespace at { namespace native { namespace vulkan { -void add_arithmetic_node( +DECLARE_OP_FN(add); +DECLARE_OP_FN(sub); +DECLARE_OP_FN(mul); +DECLARE_OP_FN(div); +DECLARE_OP_FN(floor_div); +DECLARE_OP_FN(pow); + +ValueRef add_arithmetic_node( ComputeGraph& graph, const ValueRef t1, const ValueRef t2, - const ValueRef out, const float alpha, - const arithmetic::OpType optype); + const arithmetic::OpType optype, + const int64_t shared_object_idx = -1); -ValueRef add_arithmetic_node( +void add_arithmetic_node( ComputeGraph& graph, const ValueRef t1, const ValueRef t2, + const ValueRef out, const float alpha, - const arithmetic::OpType optype, - const int64_t shared_object_idx = -1); + const arithmetic::OpType optype); class ArithmeticPrepack : public virtual PrepackNode { public: From fae9ef0e0a1c64bb4a6e1384ccfcd81eebb36ccb Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 10:15:18 -0800 Subject: [PATCH 033/290] Nit Arithmetic cleanup (#2246) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2246 Facilitate code review before the big refactoring. Create a `maybe_prepack()` helper and improve variable naming. ghstack-source-id: 217394066 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54400674 fbshipit-source-id: 1912eabeaa9882d56b3b2a7a30b1ceb28a377559 --- .../runtime/graph/ops/impl/Arithmetic.cpp | 54 +++++++++---------- .../runtime/graph/ops/impl/Arithmetic.h | 12 ++--- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index 665d8f37af..8d316e9f48 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -36,44 +36,40 @@ DEFINE_ARITHMETIC_FN(pow, POW); ValueRef add_arithmetic_node( ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const float alpha, const arithmetic::OpType optype, const int64_t shared_object_idx) { - std::vector t1_sizes = graph.get_val_sizes(t1); - api::ScalarType t1_dtype = graph.get_val_dtype(t1); + std::vector in1_sizes = graph.get_val_sizes(in1); + api::ScalarType in1_dtype = graph.get_val_dtype(in1); - ValueRef out = graph.add_tensor(t1_sizes, t1_dtype, shared_object_idx); - add_arithmetic_node(graph, t1, t2, out, alpha, optype); + ValueRef out = graph.add_tensor(in1_sizes, in1_dtype, shared_object_idx); + add_arithmetic_node(graph, in1, in2, out, alpha, optype); return out; } +// TODO(T181006464): Move to Utils when we remove ArithmeticPrepack. 
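+// Tensors are passed through unchanged; a TensorRef is materialized into a
+// new tensor and a prepack node is recorded to fill it at prepack time.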
+ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.get_val(v).isTensor()) { + return v; + } else { + TensorRef& tRef = graph.get_val(v).toTensorRef(); + ValueRef vTen = graph.add_tensor(tRef.sizes, tRef.dtype); + graph.prepack_nodes().emplace_back(new ArithmeticPrepack(v, vTen)); + return vTen; + } +} + void add_arithmetic_node( ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const ValueRef out, const float alpha, const arithmetic::OpType optype) { - // Prepacking first arg (if needed) - ValueRef arg1 = t1; - if (graph.get_val(t1).isTensorRef()) { - TensorRef& t1_asref = graph.get_val(t1).toTensorRef(); - ValueRef t1_vten = graph.add_tensor(t1_asref.sizes, t1_asref.dtype); - graph.prepack_nodes().emplace_back(new ArithmeticPrepack(t1, t1_vten)); - arg1 = t1_vten; - } - VK_CHECK_COND(graph.get_val(arg1).isTensor()); - // Prepacking second arg (if needed) - ValueRef arg2 = t2; - if (graph.get_val(t2).isTensorRef()) { - TensorRef& t2_asref = graph.get_val(t2).toTensorRef(); - ValueRef t2_vten = graph.add_tensor(t2_asref.sizes, t2_asref.dtype); - graph.prepack_nodes().emplace_back(new ArithmeticPrepack(t2, t2_vten)); - arg2 = t2_vten; - } - VK_CHECK_COND(graph.get_val(arg2).isTensor()); + ValueRef arg1 = prepack_if_tensor_ref(graph, in1); + ValueRef arg2 = prepack_if_tensor_ref(graph, in2); graph.execute_nodes().emplace_back( new ArithmeticNode(arg1, arg2, out, alpha, optype)); @@ -97,12 +93,12 @@ void ArithmeticPrepack::encode(ComputeGraph* graph) const { } ArithmeticNode::ArithmeticNode( - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const ValueRef out, const float alpha, const arithmetic::OpType optype) - : ExecuteNode({t1, t2}, {out}), alpha_(alpha), optype_(optype) {} + : ExecuteNode({in1, in2}, {out}), alpha_(alpha), optype_(optype) {} void ArithmeticNode::encode(ComputeGraph* graph) const { vTensor& in1 = graph->get_val(inputs_[0]).toTensor(); diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 26ccd931c7..767517043b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -29,16 +29,16 @@ DECLARE_OP_FN(pow); ValueRef add_arithmetic_node( ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const float alpha, const arithmetic::OpType optype, const int64_t shared_object_idx = -1); void add_arithmetic_node( ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const ValueRef out, const float alpha, const arithmetic::OpType optype); @@ -53,8 +53,8 @@ class ArithmeticPrepack : public virtual PrepackNode { class ArithmeticNode : public virtual ExecuteNode { public: explicit ArithmeticNode( - const ValueRef t1, - const ValueRef t2, + const ValueRef in1, + const ValueRef in2, const ValueRef out, const float alpha, const arithmetic::OpType optype); From 862f755716d4a31984686c5214fd06581b469cce Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 10:15:18 -0800 Subject: [PATCH 034/290] Merge ArithmeticNode into ExecuteNode (#2247) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2247 This diff moves the logic of `ArithmeticNode` into its corresponding OpFunction `add_arithmetic_node()` and the `ExecuteNode` class. 
Our aim is to remove all derived classes of `ExecuteNode`, i.e., to make `ExecuteNode` a final class. All operator-specific logic will be handled in the OpFunction. Note the next change will move `StagingNode` into its OpFunction + this new ExecuteNode implementation. Until then, we can't tidy up the `ExecuteNode` class fully. Finally, we leave a few task TODOs. ghstack-source-id: 217439330 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D53982441 fbshipit-source-id: b8a51eee538b679e4168864a4870f3921c9ba333 --- .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 51 ++++++++++++++ .../vulkan/runtime/graph/ops/ExecuteNode.h | 27 ++++++-- backends/vulkan/runtime/graph/ops/Utils.cpp | 63 +++++++++++++++++ backends/vulkan/runtime/graph/ops/Utils.h | 19 +++++- .../runtime/graph/ops/impl/Arithmetic.cpp | 67 +++++++++---------- .../runtime/graph/ops/impl/Arithmetic.h | 27 +++----- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 2 +- .../vulkan/runtime/graph/ops/impl/Staging.h | 2 +- .../vulkan/test/vulkan_compute_api_test.cpp | 31 +++++---- 9 files changed, 212 insertions(+), 77 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/ExecuteNode.cpp create mode 100644 backends/vulkan/runtime/graph/ops/Utils.cpp diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp new file mode 100644 index 0000000000..6bdb07e719 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +void ExecuteNode::encode(ComputeGraph* graph) { + api::Context* const context = graph->context(); + api::PipelineBarrier pipeline_barrier{}; + + std::unique_lock cmd_lock = context->dispatch_lock(); + + api::DescriptorSet descriptor_set = + context->get_descriptor_set(shader_, local_workgroup_size_); + + uint32_t idx = 0; + idx = bind_values_to_descriptor_set( + graph, + outputs_, + pipeline_barrier, + api::MemoryAccessType::WRITE, + descriptor_set, + idx); + idx = bind_values_to_descriptor_set( + graph, + inputs_, + pipeline_barrier, + api::MemoryAccessType::READ, + descriptor_set, + idx); + descriptor_set.bind(idx, params_.buffer()); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 2b8fb04cbd..1b726e73d4 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -33,20 +33,37 @@ class ExecuteNode { public: ExecuteNode(ValueRef input, ValueRef output) - : inputs_{input}, outputs_{output} {} + : outputs_{output}, inputs_{input} {} + ExecuteNode( + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const std::vector& outputs, const std::vector& inputs, - const std::vector& outputs) - : inputs_(inputs), outputs_(outputs) {} + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + outputs_(outputs), + inputs_(inputs), + 
params_(std::move(params)) {} virtual ~ExecuteNode() = default; protected: - std::vector inputs_; + // TODO: Consider making members const after we remove StagingNode. + api::ShaderInfo shader_; + api::utils::uvec3 global_workgroup_size_; + api::utils::uvec3 local_workgroup_size_; std::vector outputs_; + std::vector inputs_; + // TODO(T180906086): pass multiple buffers and index with ValueRef. + // TODO(T180906457): allow re-computing param buffers. + api::UniformParamsBuffer params_; public: - virtual void encode(ComputeGraph* graph) const = 0; + virtual void encode(ComputeGraph* graph); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/Utils.cpp new file mode 100644 index 0000000000..579eac54e3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/Utils.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +api::utils::ivec4 get_size_as_ivec4(const vTensor& t) { + return api::utils::make_ivec4( + {dim_at(t), + dim_at(t), + dim_at(t), + dim_at(t)}); +} + +void bind_tensor_to_descriptor_set( + vTensor& tensor, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t idx) { + if (tensor.buffer()) { + api::VulkanBuffer& buffer = tensor.buffer( + pipeline_barrier, api::PipelineStage::COMPUTE, accessType); + descriptor_set.bind(idx, buffer); + } else { + api::VulkanImage& image = + tensor.image(pipeline_barrier, api::PipelineStage::COMPUTE, accessType); + descriptor_set.bind(idx, image); + } +} + +uint32_t bind_values_to_descriptor_set( + ComputeGraph* graph, + const std::vector& args, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx) { + uint32_t idx = base_idx; + for (auto& arg : args) { + Value& val = graph->get_val(arg); + if (val.isTensor()) { + vTensor& tensor = val.toTensor(); + bind_tensor_to_descriptor_set( + tensor, pipeline_barrier, accessType, descriptor_set, idx++); + } else { + VK_THROW("Unsupported type: ", val.type()); + } + } + return idx; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index f962c17bcc..9cf214ca87 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -10,7 +10,7 @@ #ifdef USE_VULKAN_API -#include +#include #include @@ -21,6 +21,23 @@ namespace vulkan { #define DECLARE_OP_FN(function) \ ValueRef function(ComputeGraph& graph, const std::vector& args); +api::utils::ivec4 get_size_as_ivec4(const vTensor& t); + +void bind_tensor_to_descriptor_set( + vTensor& tensor, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t idx); + +uint32_t bind_values_to_descriptor_set( + ComputeGraph* graph, + const std::vector& args, + api::PipelineBarrier& pipeline_barrier, + const api::MemoryAccessType accessType, + api::DescriptorSet& descriptor_set, + const uint32_t base_idx); + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp 
b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index 8d316e9f48..ce43005384 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -8,44 +8,39 @@ #include -#include - #include namespace at { namespace native { namespace vulkan { -#define DEFINE_ARITHMETIC_FN(function, op_type) \ +#define DEFINE_ARITHMETIC_FN(function, shader) \ ValueRef function(ComputeGraph& graph, const std::vector& args) { \ return add_arithmetic_node( \ - graph, \ - args[0], \ - args[1], \ - args[2], \ - arithmetic::OpType::op_type, \ - args[3]); \ + graph, args[0], args[1], args[2], VK_KERNEL(shader), args[3]); \ } -DEFINE_ARITHMETIC_FN(add, ADD); -DEFINE_ARITHMETIC_FN(sub, SUB); -DEFINE_ARITHMETIC_FN(mul, MUL); -DEFINE_ARITHMETIC_FN(div, DIV); -DEFINE_ARITHMETIC_FN(floor_div, FLOOR_DIV); -DEFINE_ARITHMETIC_FN(pow, POW); +DEFINE_ARITHMETIC_FN(add, add); +DEFINE_ARITHMETIC_FN(sub, sub); +DEFINE_ARITHMETIC_FN(mul, mul); +DEFINE_ARITHMETIC_FN(div, div); +DEFINE_ARITHMETIC_FN(floor_div, floor_divide); +DEFINE_ARITHMETIC_FN(pow, pow); +// TODO(T180908843): Bypass this entrypoint function by creating `ValueRef out` +// ahead of time. ValueRef add_arithmetic_node( ComputeGraph& graph, const ValueRef in1, const ValueRef in2, const float alpha, - const arithmetic::OpType optype, + const api::ShaderInfo& shader, const int64_t shared_object_idx) { std::vector in1_sizes = graph.get_val_sizes(in1); api::ScalarType in1_dtype = graph.get_val_dtype(in1); ValueRef out = graph.add_tensor(in1_sizes, in1_dtype, shared_object_idx); - add_arithmetic_node(graph, in1, in2, out, alpha, optype); + add_arithmetic_node(graph, in1, in2, out, alpha, shader); return out; } @@ -67,12 +62,27 @@ void add_arithmetic_node( const ValueRef in2, const ValueRef out, const float alpha, - const arithmetic::OpType optype) { + const api::ShaderInfo& shader) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = prepack_if_tensor_ref(graph, in2); - graph.execute_nodes().emplace_back( - new ArithmeticNode(arg1, arg2, out, alpha, optype)); + vTensor& t_in1 = graph.get_val(arg1).toTensor(); + vTensor& t_in2 = graph.get_val(arg2).toTensor(); + vTensor& t_out = graph.get_val(out).toTensor(); + + api::utils::uvec3 global_size = t_out.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + ArithmeticParams block{ + get_size_as_ivec4(t_out), + get_size_as_ivec4(t_in1), + get_size_as_ivec4(t_in2), + 1.0, + }; + api::UniformParamsBuffer params(graph.context(), block); + + graph.execute_nodes().emplace_back(new ExecuteNode( + shader, global_size, local_size, {out}, {arg1, arg2}, std::move(params))); } ArithmeticPrepack::ArithmeticPrepack(const ValueRef tref, const ValueRef packed) @@ -92,23 +102,6 @@ void ArithmeticPrepack::encode(ComputeGraph* graph) const { encode_copy_to_vtensor(graph->context(), staging, packed); } -ArithmeticNode::ArithmeticNode( - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const arithmetic::OpType optype) - : ExecuteNode({in1, in2}, {out}), alpha_(alpha), optype_(optype) {} - -void ArithmeticNode::encode(ComputeGraph* graph) const { - vTensor& in1 = graph->get_val(inputs_[0]).toTensor(); - vTensor& in2 = graph->get_val(inputs_[1]).toTensor(); - vTensor& out = graph->get_val(outputs_[0]).toTensor(); - - api::ShaderInfo kernel = arithmetic::get_shader(optype_); - arithmetic::record_op(graph->context(), kernel, in1, in2, out, alpha_); -} - } // namespace vulkan } // namespace native 
} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 767517043b..82e2aa2cdf 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -32,7 +32,7 @@ ValueRef add_arithmetic_node( const ValueRef in1, const ValueRef in2, const float alpha, - const arithmetic::OpType optype, + const api::ShaderInfo& shader, const int64_t shared_object_idx = -1); void add_arithmetic_node( @@ -41,29 +41,20 @@ void add_arithmetic_node( const ValueRef in2, const ValueRef out, const float alpha, - const arithmetic::OpType optype); + const api::ShaderInfo& shader); -class ArithmeticPrepack : public virtual PrepackNode { - public: - explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); - - void encode(ComputeGraph* graph) const override; +struct ArithmeticParams final { + api::utils::ivec4 outputSizes; + api::utils::ivec4 input1Sizes; + api::utils::ivec4 input2Sizes; + float alpha; }; -class ArithmeticNode : public virtual ExecuteNode { +class ArithmeticPrepack : public virtual PrepackNode { public: - explicit ArithmeticNode( - const ValueRef in1, - const ValueRef in2, - const ValueRef out, - const float alpha, - const arithmetic::OpType optype); + explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); void encode(ComputeGraph* graph) const override; - - private: - float alpha_; - arithmetic::OpType optype_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 459d5edf1b..5b16780777 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -100,7 +100,7 @@ void encode_copy_from_vtensor( StagingNode::StagingNode(ValueRef from, ValueRef to) : ExecuteNode(from, to) {} -void StagingNode::encode(ComputeGraph* graph) const { +void StagingNode::encode(ComputeGraph* graph) { Value& in_val = graph->get_val(inputs_[0]); Value& out_val = graph->get_val(outputs_[0]); diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index bb9671d4e9..be57a9817f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -88,7 +88,7 @@ class StagingNode : public virtual ExecuteNode { public: explicit StagingNode(ValueRef from, ValueRef to); - void encode(ComputeGraph* graph) const override; + void encode(ComputeGraph* graph) override; }; } // namespace vulkan diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index b36e2352eb..0692d8c709 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -10,7 +10,7 @@ #include -#include +#include #include #include @@ -431,8 +431,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { IOValueRef out = {}; - out.value = add_arithmetic_node( - graph, a.value, b.value, 1.0, arithmetic::OpType::ADD); + out.value = add_arithmetic_node(graph, a.value, b.value, 1.0, VK_KERNEL(add)); out.staging = graph.set_output_tensor(out.value); @@ -478,9 +477,8 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { IOValueRef a = graph.add_input_tensor(size_big, api::kFloat); - ValueRef c = - add_arithmetic_node(graph, a.value, w1, 1.0, arithmetic::OpType::ADD); - ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, 
arithmetic::OpType::MUL); + ValueRef c = add_arithmetic_node(graph, a.value, w1, 1.0, VK_KERNEL(add)); + ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, VK_KERNEL(mul)); IOValueRef out = {}; out.value = e; @@ -528,7 +526,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 4); - // Allocation count will be 2 (1 staging buffer for each input tensor) + // Allocation count will be 2: + // 1 staging buffer for each input tensor EXPECT_TRUE(get_vma_allocation_count() == 2); ValueRef c = add_arithmetic_node( @@ -536,7 +535,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { a.value, b.value, 1.0, - arithmetic::OpType::ADD, + VK_KERNEL(add), /*shared_object_idx = */ 6); IOValueRef d = graph.add_input_tensor( @@ -544,29 +543,33 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 2); - // Allocation count will be 3 (1 staging buffer for each input tensor) - EXPECT_TRUE(get_vma_allocation_count() == 3); + // Allocation count will be 4, two are new: + // 1 uniform buffer for arithmetic shader params + // 1 staging buffer for the input tensor + EXPECT_TRUE(get_vma_allocation_count() == 4); ValueRef e = add_arithmetic_node( graph, c, d.value, 1.0, - arithmetic::OpType::MUL, + VK_KERNEL(mul), /*shared_object_idx = */ 4); IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // Allocation count will be 4 (1 staging buffer for each I/O tensor) - EXPECT_TRUE(get_vma_allocation_count() == 4); + // Allocation count will be 6, three are new: + // 1 uniform buffer for arithmetic shader params + // 1 staging buffer for the input tensor + EXPECT_TRUE(get_vma_allocation_count() == 6); graph.encode_execute(); // Allocation count will be 13: // 4 staging buffers for each I/O tensor - // 6 uniform buffers to store args for each shader dispatch + // 6 uniform buffers to store params for each shader dispatch // 3 shared objects to back tensor memory EXPECT_TRUE(get_vma_allocation_count() == 13); From 69bf18b4f2ae315c356d67695fdf4abd6ede1687 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Tue, 5 Mar 2024 10:43:02 -0800 Subject: [PATCH 035/290] Enable operator<< for Half (#1733) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/1733 Reviewed By: mikekgfb Differential Revision: D53139440 fbshipit-source-id: e371ea15516677dfc726757047f4c5fb2b793d21 --- .../exec_aten/testing_util/tensor_util.cpp | 2 +- runtime/core/portable_type/half.cpp | 28 +++++++++++++++++++ runtime/core/portable_type/half.h | 3 ++ runtime/core/portable_type/targets.bzl | 1 + 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 runtime/core/portable_type/half.cpp diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 9be3db6051..815e86bcb8 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -251,7 +251,7 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { break; switch (t.scalar_type()) { - ET_FORALL_REAL_TYPES_AND(Bool, PRINT_CASE) + ET_FORALL_REAL_TYPES_AND2(Half, Bool, PRINT_CASE) default: ET_CHECK_MSG( false, diff --git a/runtime/core/portable_type/half.cpp b/runtime/core/portable_type/half.cpp new file mode 100644 index 0000000000..5062d38923 --- /dev/null +++ b/runtime/core/portable_type/half.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { + +static_assert( + std::is_standard_layout_v, + "Half must be standard layout."); + +std::ostream& operator<<( + std::ostream& out, + const torch::executor::Half& value) { + out << (float)value; + return out; +} + +} // namespace executor +} // namespace torch diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index 0bdf388adc..448114b5ef 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -12,6 +12,7 @@ #include #include #include +#include #if defined(__GNUC__) || defined(__clang__) #if defined(__aarch64__) @@ -673,6 +674,8 @@ inline Half operator/(int64_t a, Half b) { /// NOTE: we do not define comparisons directly and instead rely on the implicit /// conversion Half to float. +std::ostream& operator<<(std::ostream& out, const Half& value); + } // namespace executor } // namespace torch diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 4062611d1e..1d65039241 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -40,6 +40,7 @@ def define_common_targets(): # Set up a specific exported library for scalar_type to avoid circle dependency in ScalarTypeUtil.h runtime.cxx_library( name = "scalar_type", + srcs = ["half.cpp"], exported_headers = [ "bfloat16.h", "complex.h", From a5c18907c4565449a0bc88ac712b51ca6cd3e444 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Tue, 5 Mar 2024 11:15:02 -0800 Subject: [PATCH 036/290] parallel_for should return true if precondition fails (#2240) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2240 Don't crash, but log error and return false. 
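For callers, the visible change is that parallel_for now reports failure through its return value rather than aborting, so the result should be checked. A minimal caller-side sketch is below; the include path and the surrounding function are illustrative assumptions rather than part of this patch.

  #include <executorch/extension/parallel/thread_parallel.h>

  #include <cstdint>
  #include <vector>

  bool double_all(std::vector<float>& data) {
    // Returns false (after logging) when begin/end/grain_size are invalid,
    // instead of crashing the process.
    return torch::executor::parallel_for(
        0,
        static_cast<int64_t>(data.size()),
        /*grain_size=*/64,
        [&data](int64_t begin, int64_t end) {
          for (int64_t i = begin; i < end; ++i) {
            data[i] *= 2.0f;
          }
        });
  }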
Reviewed By: SS-JIA Differential Revision: D54505636 fbshipit-source-id: 1cdd2861fbae5bb355ef7bd61eb34c03f35418c5 --- extension/parallel/targets.bzl | 1 + extension/parallel/test/targets.bzl | 1 + .../parallel/test/thread_parallel_test.cpp | 64 +++++++++---------- extension/parallel/thread_parallel.cpp | 10 +-- extension/parallel/thread_parallel.h | 3 +- 5 files changed, 39 insertions(+), 40 deletions(-) diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl index 46029d9d5e..e4603aa08e 100644 --- a/extension/parallel/targets.bzl +++ b/extension/parallel/targets.bzl @@ -24,5 +24,6 @@ def define_common_targets(): deps = [ "//executorch/backends/xnnpack/threadpool:threadpool", "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], ) diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl index ad2e3feb5f..791c072747 100644 --- a/extension/parallel/test/targets.bzl +++ b/extension/parallel/test/targets.bzl @@ -14,5 +14,6 @@ def define_common_targets(): ], deps = [ "//executorch/extension/parallel:thread_parallel", + "//executorch/runtime/platform:platform", ], ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/parallel/test/thread_parallel_test.cpp index 9e45523937..1eea87beb0 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/parallel/test/thread_parallel_test.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include using namespace ::testing; @@ -49,9 +49,9 @@ class ParallelTest : public ::testing::Test { }; TEST_F(ParallelTest, TestAllInvoked) { - parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], i); @@ -59,9 +59,9 @@ TEST_F(ParallelTest, TestAllInvoked) { } TEST_F(ParallelTest, TestAllInvokedWithMutex) { - parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); - }); + })); int expected_sum = 0; for (int64_t i = 0; i < 10; ++i) { @@ -72,13 +72,10 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { } TEST_F(ParallelTest, TestInvalidRange) { - ET_EXPECT_DEATH( - { - parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { - this->RunExclusiveTask(begin, end); - }); - }, - ""); + et_pal_init(); + EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { + this->RunExclusiveTask(begin, end); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], 0); @@ -87,13 +84,10 @@ TEST_F(ParallelTest, TestInvalidRange) { } TEST_F(ParallelTest, TestInvalidRange2) { - ET_EXPECT_DEATH( - { - parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { - this->RunExclusiveTask(begin, end); - }); - }, - ""); + et_pal_init(); + EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { + this->RunExclusiveTask(begin, end); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], 0); @@ -102,9 +96,9 @@ TEST_F(ParallelTest, TestInvalidRange2) { } TEST_F(ParallelTest, TestInvokePartialFromBeginning) { - parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 5; ++i) { EXPECT_EQ(data_[i], i); @@ -115,9 +109,9 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } TEST_F(ParallelTest, 
TestInvokePartialToEnd) { - parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 5; ++i) { EXPECT_EQ(data_[i], 0); @@ -128,9 +122,9 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } TEST_F(ParallelTest, TestInvokePartialMiddle) { - parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 2; ++i) { EXPECT_EQ(data_[i], 0); @@ -144,9 +138,9 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } TEST_F(ParallelTest, TestChunkSize2) { - parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], i); @@ -154,9 +148,9 @@ TEST_F(ParallelTest, TestChunkSize2) { } TEST_F(ParallelTest, TestChunkSize2Middle) { - parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 3; ++i) { EXPECT_EQ(data_[i], 0); @@ -170,9 +164,9 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } TEST_F(ParallelTest, TestChunkSize3) { - parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], i); @@ -180,9 +174,9 @@ TEST_F(ParallelTest, TestChunkSize3) { } TEST_F(ParallelTest, TestChunkSize6) { - parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], i); @@ -190,9 +184,9 @@ TEST_F(ParallelTest, TestChunkSize6) { } TEST_F(ParallelTest, TestChunkSizeTooLarge) { - parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { + EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); - }); + })); for (int64_t i = 0; i < 10; ++i) { EXPECT_EQ(data_[i], i); diff --git a/extension/parallel/thread_parallel.cpp b/extension/parallel/thread_parallel.cpp index 14ee87b98e..c765c1c9e0 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/parallel/thread_parallel.cpp @@ -10,6 +10,7 @@ #include #include +#include #include namespace torch::executor { @@ -34,14 +35,14 @@ calc_num_tasks_and_chunk_size(int64_t begin, int64_t end, int64_t grain_size) { return std::make_tuple(num_tasks, chunk_size); } -void parallel_for( +bool parallel_for( const int64_t begin, const int64_t end, const int64_t grain_size, const std::function& f) { - ET_CHECK_MSG(begin >= 0 && end >= 0, "Begin and end should be non-negative"); - ET_CHECK_MSG(end >= begin, "end should be greater than or equal to begin"); - ET_CHECK_MSG(grain_size > 0, "grain_size should be positive"); + ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); + ET_LOG_AND_RETURN_IF_FALSE(end >= begin); + ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); @@ -57,6 +58,7 @@ void parallel_for( // Per protocol from threadpool (pthreadpool), when this returns, all tasks // are executed, so this is synchronous. 
get_threadpool()->run(task, num_tasks); + return true; } } // namespace torch::executor diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index ccfec1da66..e7caa2f3d6 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -23,12 +23,13 @@ namespace torch::executor { * described below * f: user function applied in parallel to the chunks, signature: * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise * * Warning: parallel_for does NOT copy thread local states from the current * thread to the worker threads. Users need to protect the access to captured * data if they mutate them in f. */ -void parallel_for( +bool parallel_for( const int64_t begin, const int64_t end, const int64_t grain_size, From b2862eac22a6134aad71cc8ec44644c41a65c6f4 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 11:59:13 -0800 Subject: [PATCH 037/290] Merge StagingNode into ExecuteNode (#2260) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2260 We dispose of `StagingNode` in favor of functions `add_staging_to_tensor_node()` and `add_tensor_to_staging_node()`, which each create an `ExecuteNode`. Hence, we fulfill our goal of making `ExecuteNode` a final class. These `add_X_node()` are not an `OpFunction` since staging is not an operator; its purpose is specific to starting and ending Vulkan execution. Note that we can't remove `encode_copy_to_vtensor()` as it's still used in ArithmeticPrepack. The prepack refactor is next. ghstack-source-id: 217439329 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54445787 fbshipit-source-id: f455327630de2873be85d035f42efedda2810047 --- .../vulkan/runtime/graph/ComputeGraph.cpp | 4 +- .../vulkan/runtime/graph/ops/ExecuteNode.h | 23 +- backends/vulkan/runtime/graph/ops/Utils.cpp | 12 +- backends/vulkan/runtime/graph/ops/Utils.h | 5 + .../vulkan/runtime/graph/ops/impl/Staging.cpp | 210 +++++++++++++++--- .../vulkan/runtime/graph/ops/impl/Staging.h | 32 +-- .../vulkan/test/vulkan_compute_api_test.cpp | 21 +- 7 files changed, 237 insertions(+), 70 deletions(-) diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 5adb5691e3..c78431b50b 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -81,7 +81,7 @@ ValueRef ComputeGraph::set_input_tensor( if (use_staging) { vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); - execute_nodes_.emplace_back(new StagingNode(staging_idx, idx)); + add_staging_to_tensor_node(*this, staging_idx, idx); inputs_.push_back(staging_idx); return staging_idx; } @@ -95,7 +95,7 @@ ValueRef ComputeGraph::set_output_tensor( if (use_staging) { vTensor& tensor = get_val(idx).toTensor(); ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel()); - execute_nodes_.emplace_back(new StagingNode(idx, staging_idx)); + add_tensor_to_staging_node(*this, idx, staging_idx); outputs_.push_back(staging_idx); return staging_idx; } diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 1b726e73d4..94b5c0d5de 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -28,13 +28,10 @@ class ComputeGraph; * encoding of the shader corresponding 
to the op into the command buffer of a * ComputeGraph. */ -class ExecuteNode { +class ExecuteNode final { friend class ComputeGraph; public: - ExecuteNode(ValueRef input, ValueRef output) - : outputs_{output}, inputs_{input} {} - ExecuteNode( const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, @@ -49,21 +46,19 @@ class ExecuteNode { inputs_(inputs), params_(std::move(params)) {} - virtual ~ExecuteNode() = default; + ~ExecuteNode() = default; + + void encode(ComputeGraph* graph); protected: - // TODO: Consider making members const after we remove StagingNode. - api::ShaderInfo shader_; - api::utils::uvec3 global_workgroup_size_; - api::utils::uvec3 local_workgroup_size_; - std::vector outputs_; - std::vector inputs_; + const api::ShaderInfo shader_; + const api::utils::uvec3 global_workgroup_size_; + const api::utils::uvec3 local_workgroup_size_; + const std::vector outputs_; + const std::vector inputs_; // TODO(T180906086): pass multiple buffers and index with ValueRef. // TODO(T180906457): allow re-computing param buffers. api::UniformParamsBuffer params_; - - public: - virtual void encode(ComputeGraph* graph); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/Utils.cpp index 579eac54e3..c3dbb0b37a 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.cpp +++ b/backends/vulkan/runtime/graph/ops/Utils.cpp @@ -37,6 +37,13 @@ void bind_tensor_to_descriptor_set( } } +void bind_staging_to_descriptor_set( + api::StorageBuffer& staging, + api::DescriptorSet& descriptor_set, + const uint32_t idx) { + descriptor_set.bind(idx, staging.buffer()); +} + uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, const std::vector& args, @@ -48,9 +55,10 @@ uint32_t bind_values_to_descriptor_set( for (auto& arg : args) { Value& val = graph->get_val(arg); if (val.isTensor()) { - vTensor& tensor = val.toTensor(); bind_tensor_to_descriptor_set( - tensor, pipeline_barrier, accessType, descriptor_set, idx++); + val.toTensor(), pipeline_barrier, accessType, descriptor_set, idx++); + } else if (val.isStaging()) { + bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++); } else { VK_THROW("Unsupported type: ", val.type()); } diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index 9cf214ca87..1d2d4bdede 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -30,6 +30,11 @@ void bind_tensor_to_descriptor_set( api::DescriptorSet& descriptor_set, const uint32_t idx); +void bind_staging_to_descriptor_set( + api::StorageBuffer& staging, + api::DescriptorSet& descriptor_set, + const uint32_t idx); + uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, const std::vector& args, diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 5b16780777..aeb3d6d7b3 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -8,6 +8,7 @@ #include +#include #include namespace at { @@ -72,7 +73,7 @@ void encode_copy_to_vtensor( api::Context* context, api::StorageBuffer& staging, vTensor& tensor) { - api::ShaderInfo shader = packing::get_nchw_to_image_shader(tensor); + api::ShaderInfo shader = get_nchw_to_image_shader(tensor); api::PipelineBarrier pipeline_barrier{}; packing::record_nchw_to_image_op( context, @@ -83,41 +84,190 @@ void encode_copy_to_vtensor( VK_NULL_HANDLE); } -void 
encode_copy_from_vtensor( - api::Context* context, - vTensor& tensor, - api::StorageBuffer& staging) { - api::ShaderInfo shader = packing::get_image_to_nchw_shader(tensor); - api::PipelineBarrier pipeline_barrier{}; - packing::record_image_to_nchw_op( - context, +struct StagingParams final { + api::utils::ivec3 extents; + int32_t plane_size; + api::utils::ivec2 channel_info; +}; + +StagingParams create_staging_params(const vTensor& t) { + int32_t height = api::utils::safe_downcast(dim_at(t)); + int32_t width = api::utils::safe_downcast(dim_at(t)); + int32_t channels = + api::utils::safe_downcast(dim_at(t)); + + int32_t plane_size = height * width; + int32_t c_depth = api::utils::div_up(channels, 4); + + return { + api::utils::make_ivec3(t.extents()), + plane_size, + {c_depth, channels}, + }; +} + +void add_staging_to_tensor_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef out_tensor) { + vTensor& t_out = graph.get_val(out_tensor).toTensor(); + VK_CHECK_COND(graph.get_val(in_staging).isStaging()); + + api::ShaderInfo shader = get_nchw_to_image_shader(t_out); + + api::utils::uvec3 global_size = t_out.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + api::UniformParamsBuffer params( + graph.context(), create_staging_params(t_out)); + + graph.execute_nodes().emplace_back(new ExecuteNode( shader, - tensor, - staging.buffer(), - pipeline_barrier, - VK_NULL_HANDLE); + global_size, + local_size, + {out_tensor}, + {in_staging}, + std::move(params))); } -StagingNode::StagingNode(ValueRef from, ValueRef to) : ExecuteNode(from, to) {} +void add_tensor_to_staging_node( + ComputeGraph& graph, + const ValueRef in_tensor, + const ValueRef out_staging) { + vTensor& t_in = graph.get_val(in_tensor).toTensor(); + VK_CHECK_COND(graph.get_val(out_staging).isStaging()); -void StagingNode::encode(ComputeGraph* graph) { - Value& in_val = graph->get_val(inputs_[0]); - Value& out_val = graph->get_val(outputs_[0]); + api::ShaderInfo shader = get_image_to_nchw_shader(t_in); + + api::utils::uvec3 global_size = t_in.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + + StagingParams sp = create_staging_params(t_in); + api::UniformParamsBuffer params(graph.context(), sp); + + // TODO(T181194784): These are workgroup sizes for special cases. Refactor the + // calculation of workgroup sizes to a standalone function. We should use + // scalar type to get the shader name, and use the shader name to get the + // workgroup size. 
+ if (t_in.dtype() == api::ScalarType::QUInt8 || + t_in.dtype() == api::ScalarType::QInt8 || t_in.dtype() == api::kBool) { + if (sp.plane_size % 4 == 0) { + global_size.data[0u] = sp.plane_size / 4; + global_size.data[1u] = 1; + local_size.data[0u] *= local_size.data[1u]; + local_size.data[1u] = 1; + } else { + uint32_t numel = t_in.numel(); + global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u}; + local_size = {64u, 1u, 1u}; + } + } + + graph.execute_nodes().emplace_back(new ExecuteNode( + shader, + global_size, + local_size, + {in_tensor}, + {out_staging}, + std::move(params))); +} + +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { + if (v_dst.is_quantized()) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + switch (v_dst.dtype()) { + case api::ScalarType::QUInt8: + return VK_KERNEL(nchw_to_image_uint8); + case api::ScalarType::QInt8: + return VK_KERNEL(nchw_to_image_int8); + case api::ScalarType::QInt32: + return VK_KERNEL(nchw_to_image_int32); + default: + VK_THROW( + "Vulkan quantization currently not supported for dtype ", + v_dst.dtype()); + } + case api::StorageType::TEXTURE_2D: + switch (v_dst.dtype()) { + case api::ScalarType::QUInt8: + return VK_KERNEL(nchw_to_image2d_uint8); + case api::ScalarType::QInt8: + return VK_KERNEL(nchw_to_image2d_int8); + case api::ScalarType::QInt32: + return VK_KERNEL(nchw_to_image2d_int32); + default: + VK_THROW( + "Vulkan quantization currently not supported for dtype ", + v_dst.dtype()); + } + default: + VK_THROW("No kernel available!"); + case api::StorageType::BUFFER: + case api::StorageType::UNKNOWN: + VK_THROW("Requested storage type must be a texture type."); + } + } + + if (v_dst.dtype() == api::kFloat) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(nchw_to_image); + case api::StorageType::TEXTURE_2D: + return VK_KERNEL(nchw_to_image2d); + default: + VK_THROW("No kernel available!"); + } + } else if (v_dst.dtype() == api::kBool) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(nchw_to_image_bool); + default: + VK_THROW("No kernel available!"); + } + } else { + VK_THROW("Unsupported dtype!"); + } +} + +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) { + if (v_src.is_quantized() || v_src.dtype() == api::kBool) { + auto plane_size = + dim_at(v_src) * dim_at(v_src); + switch (v_src.storage_type()) { + case api::StorageType::TEXTURE_3D: + switch (v_src.dtype()) { + case api::ScalarType::QUInt8: + case api::ScalarType::QInt8: + case api::kBool: + return plane_size % 4 == 0 ? 
VK_KERNEL(image_to_nchw_quantized_mul4) + : VK_KERNEL(image_to_nchw_uint); + case api::ScalarType::QInt32: + return VK_KERNEL(image_to_nchw_int32); + default: + VK_THROW( + "Vulkan quantization currently not supported for dtype ", + v_src.dtype()); + } + default: + VK_THROW("No kernel available!"); + case api::StorageType::BUFFER: + case api::StorageType::UNKNOWN: + VK_THROW("Requested storage type must be a texture type."); + } + } - if (in_val.isStaging() && out_val.isTensor()) { - api::StorageBuffer& from_staging = graph->get_val(inputs_[0]).toStaging(); - vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor(); - encode_copy_to_vtensor(graph->context(), from_staging, to_tensor); - } else if (in_val.isTensor() && out_val.isStaging()) { - vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor(); - api::StorageBuffer& to_staging = graph->get_val(outputs_[0]).toStaging(); - encode_copy_from_vtensor(graph->context(), from_tensor, to_staging); + if (v_src.dtype() == api::kFloat) { + switch (v_src.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(image_to_nchw); + case api::StorageType::TEXTURE_2D: + return VK_KERNEL(image2d_to_nchw); + default: + VK_THROW("No kernel available!"); + } } else { - VK_THROW( - "Unexpected input value type ", - in_val.type(), - " and output value type ", - out_val.type()); + VK_THROW("Unsupported dtype!"); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index be57a9817f..51cb6fb7f2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -12,7 +12,7 @@ #include -#include +#include namespace at { namespace native { @@ -76,20 +76,26 @@ void encode_copy_to_vtensor( api::Context* context, api::StorageBuffer& staging, vTensor& tensor); -void encode_copy_from_vtensor( - api::Context* context, - vTensor& tensor, - api::StorageBuffer& staging); -/* - * OpNode that allows copying data into and out of a staging buffer. 
- */ -class StagingNode : public virtual ExecuteNode { - public: - explicit StagingNode(ValueRef from, ValueRef to); +// +// Functions to initialize ExecuteNode +// + +void add_staging_to_tensor_node( + ComputeGraph& graph, + const ValueRef in_staging, + const ValueRef out_tensor); +void add_tensor_to_staging_node( + ComputeGraph& graph, + const ValueRef in_tensor, + const ValueRef out_staging); + +// +// Functions to get shaders +// - void encode(ComputeGraph* graph) override; -}; +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst); +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src); } // namespace vulkan } // namespace native diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 0692d8c709..3f9dad087f 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -526,9 +526,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 4); - // Allocation count will be 2: + // Allocation count will be 4: + // 1 uniform buffer for each staging shader args // 1 staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 2); + EXPECT_TRUE(get_vma_allocation_count() == 4); ValueRef c = add_arithmetic_node( graph, @@ -543,10 +544,11 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { api::kFloat, /*shared_object_idx = */ 2); - // Allocation count will be 4, two are new: - // 1 uniform buffer for arithmetic shader params + // Allocation count will be 7, three are new: + // 1 uniform buffer for arithmetic shader args + // 1 uniform buffer for staging shader args // 1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + EXPECT_TRUE(get_vma_allocation_count() == 7); ValueRef e = add_arithmetic_node( graph, @@ -560,14 +562,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { out.value = e; out.staging = graph.set_output_tensor(out.value); - // Allocation count will be 6, three are new: - // 1 uniform buffer for arithmetic shader params + // Allocation count will be 10, three are new: + // 1 uniform buffer for arithmetic shader + // 1 uniform buffer for staging shader // 1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 6); + EXPECT_TRUE(get_vma_allocation_count() == 10); graph.encode_execute(); - // Allocation count will be 13: + // Allocation count will be 13, three shared objects are allocated for total: // 4 staging buffers for each I/O tensor // 6 uniform buffers to store params for each shader dispatch // 3 shared objects to back tensor memory From 4a975ea542a102978a8c8f2b0e9c99cfadc832b3 Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 11:59:13 -0800 Subject: [PATCH 038/290] Merge ArithmeticPrepack into PrepackNode (#2261) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2261 There's a lot of shared logic between - `add_staging_to_tensor_node()`, which handles I/O data on execute(), and - `ArithmeticPrepack`'s simple prepacking, on prepack(). Both just copy data to and from GPU, without any manipulation. Hence, I've decided to consolidate shared logic in this diff as well. Here are the final results: + Make `PrepackNode` a final class. + Remove all references of `impl/Packing.h`. - Extract shared util functions to new `StagingUtils.h/cpp`. 
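For reference, the consolidated prepack path now amounts to constructing a plain `PrepackNode` with a shader, workgroup sizes and a params buffer. A condensed sketch of the `prepack()` helper added in the diff below (all names are taken from the diff; nothing new is introduced):

```cpp
// Condensed from the new prepack() in Staging.cpp: a TensorRef is packed into
// a vTensor by a generic PrepackNode; no ArithmeticPrepack subclass or virtual
// encode() override is needed anymore.
ValueRef prepack(ComputeGraph& graph, const ValueRef vref) {
  TensorRef& tref = graph.get_val(vref).toTensorRef();
  ValueRef v = graph.add_tensor(tref.sizes, tref.dtype);
  vTensor t = graph.get_val(v).toTensor();

  api::ShaderInfo shader = get_nchw_to_image_shader(t);
  api::utils::uvec3 global_size = t.extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
  api::UniformParamsBuffer params(graph.context(), create_staging_params(t));

  graph.prepack_nodes().emplace_back(new PrepackNode(
      shader, global_size, local_size, vref, v, std::move(params)));
  return v;
}
```

The copy of `tref.data` into a staging buffer, which used to live in `ArithmeticPrepack::encode()`, now happens once inside `PrepackNode::encode()`.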
ghstack-source-id: 217439331 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54504449 fbshipit-source-id: 358f2f5acb396a05bf7758cce1f3314d3a85ba55 --- .../vulkan/runtime/graph/ComputeGraph.cpp | 2 + .../vulkan/runtime/graph/ops/PrepackNode.cpp | 55 +++++ .../vulkan/runtime/graph/ops/PrepackNode.h | 33 ++- .../vulkan/runtime/graph/ops/StagingUtils.cpp | 174 ++++++++++++++++ .../vulkan/runtime/graph/ops/StagingUtils.h | 82 ++++++++ .../runtime/graph/ops/impl/Arithmetic.cpp | 29 --- .../runtime/graph/ops/impl/Arithmetic.h | 7 - .../vulkan/runtime/graph/ops/impl/Staging.cpp | 191 +++--------------- .../vulkan/runtime/graph/ops/impl/Staging.h | 74 +------ .../vulkan/test/vulkan_compute_api_test.cpp | 2 + 10 files changed, 369 insertions(+), 280 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/PrepackNode.cpp create mode 100644 backends/vulkan/runtime/graph/ops/StagingUtils.cpp create mode 100644 backends/vulkan/runtime/graph/ops/StagingUtils.h diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index c78431b50b..647371424c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -8,6 +8,8 @@ #include +#include + #include namespace at { diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp new file mode 100644 index 0000000000..d16c671ba4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +namespace at { +namespace native { +namespace vulkan { + +void PrepackNode::encode(ComputeGraph* graph) { + api::Context* const context = graph->context(); + api::PipelineBarrier pipeline_barrier{}; + + TensorRef tref = graph->get_val(tref_).toTensorRef(); + vTensor packed = graph->get_val(packed_).toTensor(); + + // TODO: Extract to standalone function, to support other types of prepacking. + api::StorageBuffer staging( + graph->context(), packed.dtype(), packed.gpu_nbytes()); + size_t numel = api::utils::multiply_integers(tref.sizes); + size_t nbytes = numel * api::element_size(tref.dtype); + copy_ptr_to_staging(tref.data, staging, nbytes); + + std::unique_lock cmd_lock = context->dispatch_lock(); + + api::DescriptorSet descriptor_set = + context->get_descriptor_set(shader_, local_workgroup_size_); + + uint32_t idx = 0; + bind_tensor_to_descriptor_set( + packed, + pipeline_barrier, + api::MemoryAccessType::WRITE, + descriptor_set, + idx++); + bind_staging_to_descriptor_set(staging, descriptor_set, idx++); + descriptor_set.bind(idx, params_.buffer()); + + context->register_shader_dispatch( + descriptor_set, pipeline_barrier, shader_, global_workgroup_size_); +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index 6f581eb931..b3a5fd0086 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -28,20 +28,37 @@ class ComputeGraph; * encoding of shaders transferring necessary data (such as weights and biases) * to the GPU. 
*/ -class PrepackNode { +class PrepackNode final { friend class ComputeGraph; public: - PrepackNode(ValueRef tref, ValueRef packed) : tref_{tref}, packed_{packed} {} + PrepackNode( + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const ValueRef tref, + const ValueRef packed, + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + tref_(tref), + packed_(packed), + params_(std::move(params)) {} - virtual ~PrepackNode() = default; + ~PrepackNode() = default; - protected: - ValueRef tref_; - ValueRef packed_; + void encode(ComputeGraph* graph); - public: - virtual void encode(ComputeGraph* graph) const = 0; + protected: + const api::ShaderInfo shader_; + const api::utils::uvec3 global_workgroup_size_; + const api::utils::uvec3 local_workgroup_size_; + const ValueRef tref_; + const ValueRef packed_; + // TODO(T180906086): pass multiple buffers and index with ValueRef. + // TODO(T180906457): allow re-computing param buffers. + api::UniformParamsBuffer params_; }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/StagingUtils.cpp new file mode 100644 index 0000000000..18e2159980 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/StagingUtils.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +void memcpy_to_mapping( + const void* src, + api::MemoryMap& dst_mapping, + const size_t nbytes, + const api::ScalarType dtype) { +#define DTYPE_CASE(ctype, vkformat, name) \ + case api::ScalarType::name: \ + memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ + break; + + switch (dtype) { + VK_FORALL_SCALAR_TYPES(DTYPE_CASE) + default: + VK_THROW("Unrecognized dtype!"); + } +#undef DTYPE_CASE +} + +void memcpy_from_mapping( + api::MemoryMap& src_mapping, + void* dst, + const size_t nbytes, + const api::ScalarType dtype) { +#define DTYPE_CASE(ctype, vkformat, name) \ + case api::ScalarType::name: \ + memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ + break; + + switch (dtype) { + VK_FORALL_SCALAR_TYPES(DTYPE_CASE) + default: + VK_THROW("Unrecognized dtype!"); + } +#undef DTYPE_CASE +} + +void copy_ptr_to_staging( + const void* src, + api::StorageBuffer& staging, + const size_t nbytes) { + api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE); + mapping.invalidate(); + memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); +} + +void copy_staging_to_ptr( + api::StorageBuffer& staging, + void* dst, + const size_t nbytes) { + api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::READ); + mapping.invalidate(); + memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); +} + +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { + if (v_dst.is_quantized()) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + switch (v_dst.dtype()) { + case api::ScalarType::QUInt8: + return VK_KERNEL(nchw_to_image_uint8); + case api::ScalarType::QInt8: + return VK_KERNEL(nchw_to_image_int8); + case api::ScalarType::QInt32: + return VK_KERNEL(nchw_to_image_int32); + default: + VK_THROW( + "Vulkan quantization currently 
not supported for dtype ", + v_dst.dtype()); + } + case api::StorageType::TEXTURE_2D: + switch (v_dst.dtype()) { + case api::ScalarType::QUInt8: + return VK_KERNEL(nchw_to_image2d_uint8); + case api::ScalarType::QInt8: + return VK_KERNEL(nchw_to_image2d_int8); + case api::ScalarType::QInt32: + return VK_KERNEL(nchw_to_image2d_int32); + default: + VK_THROW( + "Vulkan quantization currently not supported for dtype ", + v_dst.dtype()); + } + default: + VK_THROW("No kernel available!"); + case api::StorageType::BUFFER: + case api::StorageType::UNKNOWN: + VK_THROW("Requested storage type must be a texture type."); + } + } + + if (v_dst.dtype() == api::kFloat) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(nchw_to_image); + case api::StorageType::TEXTURE_2D: + return VK_KERNEL(nchw_to_image2d); + default: + VK_THROW("No kernel available!"); + } + } else if (v_dst.dtype() == api::kBool) { + switch (v_dst.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(nchw_to_image_bool); + default: + VK_THROW("No kernel available!"); + } + } else { + VK_THROW("Unsupported dtype!"); + } +} + +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) { + if (v_src.is_quantized() || v_src.dtype() == api::kBool) { + auto plane_size = + dim_at(v_src) * dim_at(v_src); + switch (v_src.storage_type()) { + case api::StorageType::TEXTURE_3D: + switch (v_src.dtype()) { + case api::ScalarType::QUInt8: + case api::ScalarType::QInt8: + case api::kBool: + return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4) + : VK_KERNEL(image_to_nchw_uint); + case api::ScalarType::QInt32: + return VK_KERNEL(image_to_nchw_int32); + default: + VK_THROW( + "Vulkan quantization currently not supported for dtype ", + v_src.dtype()); + } + default: + VK_THROW("No kernel available!"); + case api::StorageType::BUFFER: + case api::StorageType::UNKNOWN: + VK_THROW("Requested storage type must be a texture type."); + } + } + + if (v_src.dtype() == api::kFloat) { + switch (v_src.storage_type()) { + case api::StorageType::TEXTURE_3D: + return VK_KERNEL(image_to_nchw); + case api::StorageType::TEXTURE_2D: + return VK_KERNEL(image2d_to_nchw); + default: + VK_THROW("No kernel available!"); + } + } else { + VK_THROW("Unsupported dtype!"); + } +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/StagingUtils.h b/backends/vulkan/runtime/graph/ops/StagingUtils.h new file mode 100644 index 0000000000..c101581a77 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/StagingUtils.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +#include + +namespace at { +namespace native { +namespace vulkan { + +// +// Functions to memcpy data into staging buffer +// + +void memcpy_to_mapping( + const void* src, + api::MemoryMap& dst_mapping, + const size_t nbytes, + const api::ScalarType dtype); +void memcpy_from_mapping( + const api::MemoryMap& src_mapping, + void* dst, + const size_t nbytes, + const api::ScalarType dtype); + +// +// Utility functions for memcpy +// + +template +void memcpy_to_mapping_impl( + const void* src, + api::MemoryMap& dst_mapping, + const size_t nbytes) { + T* data_ptr = dst_mapping.template data(); + memcpy(data_ptr, reinterpret_cast(src), nbytes); +} + +template +void memcpy_from_mapping_impl( + api::MemoryMap& src_mapping, + void* dst, + const size_t nbytes) { + T* data_ptr = src_mapping.template data(); + memcpy(reinterpret_cast(dst), data_ptr, nbytes); +} + +// +// Functions to copy data into and out of a staging buffer +// + +void copy_ptr_to_staging( + const void* src, + api::StorageBuffer& staging, + const size_t nbytes); +void copy_staging_to_ptr( + api::StorageBuffer& staging, + void* dst, + const size_t nbytes); + +// +// Functions to get shaders +// + +api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst); +api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src); + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index ce43005384..3e999b5fe1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -44,18 +44,6 @@ ValueRef add_arithmetic_node( return out; } -// TODO(T181006464): Move to Utils when we remove ArithmeticPrepack. 
-ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { - if (graph.get_val(v).isTensor()) { - return v; - } else { - TensorRef& tRef = graph.get_val(v).toTensorRef(); - ValueRef vTen = graph.add_tensor(tRef.sizes, tRef.dtype); - graph.prepack_nodes().emplace_back(new ArithmeticPrepack(v, vTen)); - return vTen; - } -} - void add_arithmetic_node( ComputeGraph& graph, const ValueRef in1, @@ -85,23 +73,6 @@ void add_arithmetic_node( shader, global_size, local_size, {out}, {arg1, arg2}, std::move(params))); } -ArithmeticPrepack::ArithmeticPrepack(const ValueRef tref, const ValueRef packed) - : PrepackNode(tref, packed) {} - -void ArithmeticPrepack::encode(ComputeGraph* graph) const { - TensorRef tref = graph->get_val(tref_).toTensorRef(); - vTensor packed = graph->get_val(packed_).toTensor(); - - api::StorageBuffer staging( - graph->context(), packed.dtype(), packed.gpu_nbytes()); - - size_t numel = api::utils::multiply_integers(tref.sizes); - size_t nbytes = numel * api::element_size(tref.dtype); - copy_ptr_to_staging(tref.data, staging, nbytes); - - encode_copy_to_vtensor(graph->context(), staging, packed); -} - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 82e2aa2cdf..06fe57a26a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -50,13 +50,6 @@ struct ArithmeticParams final { float alpha; }; -class ArithmeticPrepack : public virtual PrepackNode { - public: - explicit ArithmeticPrepack(const ValueRef tref, const ValueRef packed); - - void encode(ComputeGraph* graph) const override; -}; - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index aeb3d6d7b3..9dab1f7e2f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -8,88 +8,15 @@ #include +#include +#include + #include -#include namespace at { namespace native { namespace vulkan { -void memcpy_to_mapping( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes, - const api::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case api::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const api::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case api::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes) { - api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} - -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes) { - api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} - -void encode_copy_to_vtensor( - 
api::Context* context, - api::StorageBuffer& staging, - vTensor& tensor) { - api::ShaderInfo shader = get_nchw_to_image_shader(tensor); - api::PipelineBarrier pipeline_barrier{}; - packing::record_nchw_to_image_op( - context, - shader, - staging.buffer(), - tensor, - pipeline_barrier, - VK_NULL_HANDLE); -} - -struct StagingParams final { - api::utils::ivec3 extents; - int32_t plane_size; - api::utils::ivec2 channel_info; -}; - StagingParams create_staging_params(const vTensor& t) { int32_t height = api::utils::safe_downcast(dim_at(t)); int32_t width = api::utils::safe_downcast(dim_at(t)); @@ -172,102 +99,30 @@ void add_tensor_to_staging_node( std::move(params))); } -api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { - if (v_dst.is_quantized()) { - switch (v_dst.storage_type()) { - case api::StorageType::TEXTURE_3D: - switch (v_dst.dtype()) { - case api::ScalarType::QUInt8: - return VK_KERNEL(nchw_to_image_uint8); - case api::ScalarType::QInt8: - return VK_KERNEL(nchw_to_image_int8); - case api::ScalarType::QInt32: - return VK_KERNEL(nchw_to_image_int32); - default: - VK_THROW( - "Vulkan quantization currently not supported for dtype ", - v_dst.dtype()); - } - case api::StorageType::TEXTURE_2D: - switch (v_dst.dtype()) { - case api::ScalarType::QUInt8: - return VK_KERNEL(nchw_to_image2d_uint8); - case api::ScalarType::QInt8: - return VK_KERNEL(nchw_to_image2d_int8); - case api::ScalarType::QInt32: - return VK_KERNEL(nchw_to_image2d_int32); - default: - VK_THROW( - "Vulkan quantization currently not supported for dtype ", - v_dst.dtype()); - } - default: - VK_THROW("No kernel available!"); - case api::StorageType::BUFFER: - case api::StorageType::UNKNOWN: - VK_THROW("Requested storage type must be a texture type."); - } - } +ValueRef prepack(ComputeGraph& graph, const ValueRef vref) { + TensorRef& tref = graph.get_val(vref).toTensorRef(); + ValueRef v = graph.add_tensor(tref.sizes, tref.dtype); + vTensor t = graph.get_val(v).toTensor(); - if (v_dst.dtype() == api::kFloat) { - switch (v_dst.storage_type()) { - case api::StorageType::TEXTURE_3D: - return VK_KERNEL(nchw_to_image); - case api::StorageType::TEXTURE_2D: - return VK_KERNEL(nchw_to_image2d); - default: - VK_THROW("No kernel available!"); - } - } else if (v_dst.dtype() == api::kBool) { - switch (v_dst.storage_type()) { - case api::StorageType::TEXTURE_3D: - return VK_KERNEL(nchw_to_image_bool); - default: - VK_THROW("No kernel available!"); - } - } else { - VK_THROW("Unsupported dtype!"); - } -} + api::ShaderInfo shader = get_nchw_to_image_shader(t); -api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) { - if (v_src.is_quantized() || v_src.dtype() == api::kBool) { - auto plane_size = - dim_at(v_src) * dim_at(v_src); - switch (v_src.storage_type()) { - case api::StorageType::TEXTURE_3D: - switch (v_src.dtype()) { - case api::ScalarType::QUInt8: - case api::ScalarType::QInt8: - case api::kBool: - return plane_size % 4 == 0 ? 
VK_KERNEL(image_to_nchw_quantized_mul4) - : VK_KERNEL(image_to_nchw_uint); - case api::ScalarType::QInt32: - return VK_KERNEL(image_to_nchw_int32); - default: - VK_THROW( - "Vulkan quantization currently not supported for dtype ", - v_src.dtype()); - } - default: - VK_THROW("No kernel available!"); - case api::StorageType::BUFFER: - case api::StorageType::UNKNOWN: - VK_THROW("Requested storage type must be a texture type."); - } - } + api::utils::uvec3 global_size = t.extents(); + api::utils::uvec3 local_size = adaptive_work_group_size(global_size); - if (v_src.dtype() == api::kFloat) { - switch (v_src.storage_type()) { - case api::StorageType::TEXTURE_3D: - return VK_KERNEL(image_to_nchw); - case api::StorageType::TEXTURE_2D: - return VK_KERNEL(image2d_to_nchw); - default: - VK_THROW("No kernel available!"); - } + StagingParams sp = create_staging_params(t); + api::UniformParamsBuffer params(graph.context(), sp); + + graph.prepack_nodes().emplace_back(new PrepackNode( + shader, global_size, local_size, vref, v, std::move(params))); + + return v; +} + +ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.get_val(v).isTensorRef()) { + return prepack(graph, v); } else { - VK_THROW("Unsupported dtype!"); + return v; } } diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index 51cb6fb7f2..2a49026e8e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -18,69 +18,6 @@ namespace at { namespace native { namespace vulkan { -// -// Functions to memcpy data into staging buffer -// - -void memcpy_to_mapping( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes, - const api::ScalarType dtype); -void memcpy_from_mapping( - const api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const api::ScalarType dtype); - -// -// Utility functions for memcpy -// - -template -void memcpy_to_mapping_impl( - const void* src, - api::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - api::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes); - -// -// Functions to record copying data between a staging buffer and a vTensor -// - -void encode_copy_to_vtensor( - api::Context* context, - api::StorageBuffer& staging, - vTensor& tensor); - -// -// Functions to initialize ExecuteNode -// - void add_staging_to_tensor_node( ComputeGraph& graph, const ValueRef in_staging, @@ -90,12 +27,13 @@ void add_tensor_to_staging_node( const ValueRef in_tensor, const ValueRef out_staging); -// -// Functions to get shaders -// +struct StagingParams final { + api::utils::ivec3 extents; + int32_t plane_size; + api::utils::ivec2 channel_info; +}; -api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst); -api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src); +ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); } // namespace vulkan } // namespace native diff --git 
a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 3f9dad087f..2ac0f1a8b1 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -13,6 +13,8 @@ #include #include +#include + #include #include From 49fb74b5afbabe89284da2668904e2f0c57429dc Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 11:59:13 -0800 Subject: [PATCH 039/290] Generalize ExecuteNode args with ArgGroup (#2262) Summary: bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/executorch/pull/2262 Leftover from Op Redesign 5/n - D54445787. --- Typically, we specify outputs first and inputs second in the shader layout, but not always. In `image_to_nchw.glsl`, this is flipped: https://www.internalfb.com/code/fbsource/[d303d229f22616bfba32e5bb5d4d27dc656f41a7]/fbcode/caffe2/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl?lines=8-19 Hence, we generalize our `ExecuteNode` specification to take a vector of args (image, buffer, etc.), with specification of access type. Since typically we will group args of the same access together, we correspond one access specification to multiple args. We reuse `api::MemoryAccessType` for access specification. ghstack-source-id: 217489419 exported-using-ghexport Reviewed By: SS-JIA Differential Revision: D54518840 fbshipit-source-id: f5f7e881a95ed937931c56ddcf03eb6b4755981b --- .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 14 +--------- .../vulkan/runtime/graph/ops/ExecuteNode.h | 26 ++++++++++++++----- backends/vulkan/runtime/graph/ops/Utils.cpp | 25 +++++++++++------- backends/vulkan/runtime/graph/ops/Utils.h | 3 +-- .../runtime/graph/ops/impl/Arithmetic.cpp | 7 ++++- .../vulkan/runtime/graph/ops/impl/Staging.cpp | 8 +++--- 6 files changed, 47 insertions(+), 36 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 6bdb07e719..7c1f0fe807 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -27,19 +27,7 @@ void ExecuteNode::encode(ComputeGraph* graph) { uint32_t idx = 0; idx = bind_values_to_descriptor_set( - graph, - outputs_, - pipeline_barrier, - api::MemoryAccessType::WRITE, - descriptor_set, - idx); - idx = bind_values_to_descriptor_set( - graph, - inputs_, - pipeline_barrier, - api::MemoryAccessType::READ, - descriptor_set, - idx); + graph, args_, pipeline_barrier, descriptor_set, idx); descriptor_set.bind(idx, params_.buffer()); context->register_shader_dispatch( diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index 94b5c0d5de..ddd50c1f67 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -22,6 +22,23 @@ namespace vulkan { class ComputeGraph; +/* + * Represents a group of shader arguments (images and/or buffers), with a common + * access permission. + */ +struct ArgGroup { + ArgGroup(const ValueRef ref, const api::MemoryAccessType access) + : refs{ref}, access(access) {} + + ArgGroup( + const std::vector& refs, + const api::MemoryAccessType access) + : refs(refs), access(access) {} + + const std::vector refs; + const api::MemoryAccessType access; +}; + /* * Represents a single execution op in a ML model. 
In graph mode, ops will be * implemented in a derived class that implements encode, which will implement @@ -36,14 +53,12 @@ class ExecuteNode final { const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, - const std::vector& outputs, - const std::vector& inputs, + const std::vector& args, api::UniformParamsBuffer&& params) : shader_(shader), global_workgroup_size_(global_workgroup_size), local_workgroup_size_(local_workgroup_size), - outputs_(outputs), - inputs_(inputs), + args_(args), params_(std::move(params)) {} ~ExecuteNode() = default; @@ -54,8 +69,7 @@ class ExecuteNode final { const api::ShaderInfo shader_; const api::utils::uvec3 global_workgroup_size_; const api::utils::uvec3 local_workgroup_size_; - const std::vector outputs_; - const std::vector inputs_; + const std::vector args_; // TODO(T180906086): pass multiple buffers and index with ValueRef. // TODO(T180906457): allow re-computing param buffers. api::UniformParamsBuffer params_; diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/Utils.cpp index c3dbb0b37a..868b5b7068 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.cpp +++ b/backends/vulkan/runtime/graph/ops/Utils.cpp @@ -46,21 +46,26 @@ void bind_staging_to_descriptor_set( uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, - const std::vector& args, + const std::vector& args, api::PipelineBarrier& pipeline_barrier, - const api::MemoryAccessType accessType, api::DescriptorSet& descriptor_set, const uint32_t base_idx) { uint32_t idx = base_idx; for (auto& arg : args) { - Value& val = graph->get_val(arg); - if (val.isTensor()) { - bind_tensor_to_descriptor_set( - val.toTensor(), pipeline_barrier, accessType, descriptor_set, idx++); - } else if (val.isStaging()) { - bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++); - } else { - VK_THROW("Unsupported type: ", val.type()); + for (auto& ref : arg.refs) { + Value& val = graph->get_val(ref); + if (val.isTensor()) { + bind_tensor_to_descriptor_set( + val.toTensor(), + pipeline_barrier, + arg.access, + descriptor_set, + idx++); + } else if (val.isStaging()) { + bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++); + } else { + VK_THROW("Unsupported type: ", val.type()); + } } } return idx; diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index 1d2d4bdede..ee8a32f77b 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -37,9 +37,8 @@ void bind_staging_to_descriptor_set( uint32_t bind_values_to_descriptor_set( ComputeGraph* graph, - const std::vector& args, + const std::vector& args, api::PipelineBarrier& pipeline_barrier, - const api::MemoryAccessType accessType, api::DescriptorSet& descriptor_set, const uint32_t base_idx); diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index 3e999b5fe1..cbc1a5600b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -70,7 +70,12 @@ void add_arithmetic_node( api::UniformParamsBuffer params(graph.context(), block); graph.execute_nodes().emplace_back(new ExecuteNode( - shader, global_size, local_size, {out}, {arg1, arg2}, std::move(params))); + shader, + global_size, + local_size, + {{out, api::MemoryAccessType::WRITE}, + {{arg1, arg2}, api::MemoryAccessType::READ}}, + 
std::move(params))); } } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 9dab1f7e2f..205496555b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -52,8 +52,8 @@ void add_staging_to_tensor_node( shader, global_size, local_size, - {out_tensor}, - {in_staging}, + {{out_tensor, api::MemoryAccessType::WRITE}, + {in_staging, api::MemoryAccessType::READ}}, std::move(params))); } @@ -94,8 +94,8 @@ void add_tensor_to_staging_node( shader, global_size, local_size, - {in_tensor}, - {out_staging}, + {{in_tensor, api::MemoryAccessType::READ}, + {out_staging, api::MemoryAccessType::WRITE}}, std::move(params))); } From aaba33d6bc4c3a1afeb3d250a2da6fa3d08cde7f Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Tue, 5 Mar 2024 12:22:14 -0800 Subject: [PATCH 040/290] Fix op schema and avoid re-registering. Summary: Looks like in op schema declaration the special `*` arg should separate in- and out-args, so rearranging them a bit. Additionally, skip lininking the runner with `examples/models/llama/ops`in favor of `kernels/quantized`. bypass-github-export-checks Reviewed By: mikekgfb Differential Revision: D54523778 fbshipit-source-id: ed6750c75369f90d4a1469d18f2a9554d93f806a --- examples/models/llama2/ops/quantized.yaml | 2 +- kernels/quantized/quantized.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml index bc3e857665..e3b6e2bbbe 100644 --- a/examples/models/llama2/ops/quantized.yaml +++ b/examples/models/llama2/ops/quantized.yaml @@ -4,7 +4,7 @@ - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out -- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null diff --git a/kernels/quantized/quantized.yaml b/kernels/quantized/quantized.yaml index 9f802e33c6..b6103343f6 100644 --- a/kernels/quantized/quantized.yaml +++ b/kernels/quantized/quantized.yaml @@ -40,7 +40,7 @@ - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_out -- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) 
variants: function kernels: - arg_meta: null From e7197a14a0b9b2861a5e80d3c6535bc9bb252dd7 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Tue, 5 Mar 2024 12:41:35 -0800 Subject: [PATCH 041/290] Fix .module() calls (#2258) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2258 D53827930 Reviewed By: Jack-Khuu, aakhundov Differential Revision: D54542173 fbshipit-source-id: 91999605ba9fcf850a9c3f997cc6b6557a41cae8 --- exir/emit/test/test_emit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index cc7022f0d6..b14111c35a 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -215,7 +215,7 @@ def forward(self, x): return [((1, 3, 1.2), True, [x + x, x * x])] ep = torch.export.export(M(), (torch.ones(2, 3),)) - res = ep(torch.ones(2, 3)) + res = ep.module()(torch.ones(2, 3)) self.assertEqual(res[0][0], (1, 3, 1.2)) program = to_edge(ep).to_executorch().executorch_program outputs = program.execution_plan[0].outputs @@ -235,7 +235,7 @@ def forward(self, x, y, z): return x + y, x + x, x + y + z ep = torch.export.export(M(), (torch.ones(2, 3), 2, True)) - ep(torch.ones(2, 3), 2, True) + ep.module()(torch.ones(2, 3), 2, True) program = to_edge(ep).to_executorch().executorch_program inputs = program.execution_plan[0].inputs self.assertEqual(len(inputs), 3) From 80f3b1bec473ced4517f6b4c1c2fa72e69193993 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Tue, 5 Mar 2024 13:58:59 -0800 Subject: [PATCH 042/290] Introduce instrumentation test and custom shader library test for Compute API test (#2259) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2259 ## Context Add an Android instrumentation test for `vulkan_compute_api_test` so that changes to the Vulkan Compute API and the Vulkan graph runtime can be tested on Android devices. Also add a test for retrieving a shader from a linked custom shader library. 
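For reference, the custom shader library check boils down to the usual `VK_KERNEL` lookup; a minimal sketch mirroring the new `retrieve_custom_shader_test` in the diff below (the shader name matches the `test_shader.glsl` file added alongside it, which simply adds 5 to every input texel):

```cpp
// Sketch mirroring the new test: a kernel compiled into the linked custom
// shader library resolves by name exactly like a built-in kernel.
const api::ShaderInfo& kernel = VK_KERNEL(test_shader);
EXPECT_TRUE(kernel.kernel_name == "test_shader");
```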
ghstack-source-id: 217501490 bypass-github-pytorch-ci-checks bypass-github-export-checks Reviewed By: jorgep31415 Differential Revision: D54520213 fbshipit-source-id: 111f2c8954c9c5e0107a9f630aac8a23b605692c --- backends/vulkan/test/glsl/test_shader.glsl | 27 +++++++++++++++++++ .../vulkan/test/vulkan_compute_api_test.cpp | 7 +++++ 2 files changed, 34 insertions(+) create mode 100644 backends/vulkan/test/glsl/test_shader.glsl diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl new file mode 100644 index 0000000000..897b8a4548 --- /dev/null +++ b/backends/vulkan/test/glsl/test_shader.glsl @@ -0,0 +1,27 @@ +#version 450 core +#define PRECISION ${PRECISION} +#define FORMAT ${FORMAT} + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec4 intex = texelFetch(uInput, pos, 0); + imageStore( + uOutput, + pos, + intex + 5); + } +} diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 2ac0f1a8b1..271a0ffeaa 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -163,6 +163,13 @@ class VulkanComputeAPITest : public ::testing::Test { // Compute API Tests // +TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { + // Try to get shader from custom shader library + const api::ShaderInfo& kernel = VK_KERNEL(test_shader); + + EXPECT_TRUE(kernel.kernel_name == "test_shader"); +} + TEST_F(VulkanComputeAPITest, buffer_copy_sanity_check) { // Simple test that copies data into a and reads from a std::vector sizes = {4, 4, 1}; From 61d6393516d80c1efdf010c610c270bf7bfab8cf Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Tue, 5 Mar 2024 15:21:44 -0800 Subject: [PATCH 043/290] Fix groupwise quantization when group size divides channel size (#2264) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2264 Fix groupwise quantization when group size divides channel size Reviewed By: manuelcandales Differential Revision: D54549436 fbshipit-source-id: 859a419b43be0c120795396c7d34641ed311a34d --- examples/models/llama2/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index 4c49a9de75..ec699fcb8e 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -76,7 +76,7 @@ def dynamically_quantize_per_channel( if group_size is None or group_size == 0: items = x_shape_1 - elif not enable_non_multiple_groups: + elif ((x_shape_1 % group_size) == 0) or not enable_non_multiple_groups: assert group_size > 0, "group size must be positive" assert ( x_shape_1 % group_size From c11924742cb34477f952144efe6d53ecc06ef477 Mon Sep 17 00:00:00 2001 From: Kush Rastogi Date: Tue, 5 Mar 2024 17:17:36 -0800 Subject: [PATCH 044/290] Updating Media Enhancement codebase to support ET (#2168) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2168 - Loads Executorch models into Media Enhancement Library - Brings 
in new operators for saliency CPU model - Resizes image and model results to and from 320hx240w Differential Revision: D51361309 fbshipit-source-id: b42034aa63c46748b4acc43b28c975e7e2e9defa --- sdk/bundled_program/bundled_program.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/bundled_program/bundled_program.cpp b/sdk/bundled_program/bundled_program.cpp index 9ab94e146f..1b1769e7f7 100644 --- a/sdk/bundled_program/bundled_program.cpp +++ b/sdk/bundled_program/bundled_program.cpp @@ -282,7 +282,7 @@ __ET_NODISCARD Error LoadBundledInput( false, NotSupported, "Data type %hhu not supported", - bundled_input->val_type()); + static_cast(bundled_input->val_type())); break; } } @@ -291,7 +291,7 @@ __ET_NODISCARD Error LoadBundledInput( status == Error::Ok, NotSupported, "set_input failed during load bundled inputs with status %" PRIu32, - status); + static_cast(status)); } internal::event_tracer_set_bundled_input_index( @@ -352,7 +352,7 @@ __ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( false, NotSupported, "Data type %hhd not supported", - bundled_expected_output->val_type()); + static_cast(bundled_expected_output->val_type())); break; } } From aef3a7c211934ae9019938c5ae6789c4f593b913 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 5 Mar 2024 17:17:50 -0800 Subject: [PATCH 045/290] emit programs with mutable buffers (#2233) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2233 Meaningful changes to the emitter logic here. Before we would ignore the tensor spec passed in and try to decide if the placeholder was a constant and if it was we would create a new spec from the actual value for that constant. That drops meta data on the input spec which is not great. Now instead of that we just look up the storage of the concrete tensor and hook it up to the spec. Also added some logic to seperate out behavior for mutable buffers specifically. While working on this I also discovered a bug that memory planning is planning space for parameters and constant buffers if its told to allocate space for inputs which is really bad lol. Oh one big assumption this diff makes is that the buffer does not have a meaningful initial state. I should probably throw out a warning during emission about this in the short term. Long term we will handle them properly. 
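For context, the pattern this diff unblocks is condensed below, mirroring the new `test_mutable_buffers` case added to `test_emit.py` in this diff (all names come from that test; nothing new is introduced):

```python
import torch
from torch.export import export
from executorch.exir import to_edge

# A module that mutates a registered buffer in forward(). Emission used to
# reject this with "Buffers cannot be modified in executorch."; now it plans
# memory for the mutated buffer and warns that its initial state is assumed
# meaningless (only shape and dtype are serialized).
class MutableStateModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("state", torch.zeros(1))

    def forward(self, x):
        y = x + self.state
        self.state.add_(1)  # in-place buffer mutation
        return y

program = to_edge(export(MutableStateModule(), (torch.zeros(1),))).to_executorch()
```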
bypass-github-export-checks Reviewed By: tarun292, Jack-Khuu Differential Revision: D53713544 fbshipit-source-id: b0bd8abd89e2d0e2006f0f1d885b1eaa02653afa --- exir/emit/_emit_program.py | 46 ++++++++++++---- exir/emit/_emitter.py | 105 +++++++++++++++++++++++++----------- exir/emit/test/TARGETS | 2 +- exir/emit/test/test_emit.py | 40 ++++++++++++++ 4 files changed, 151 insertions(+), 42 deletions(-) diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index a22cac53e4..fc3e446af9 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -32,7 +32,8 @@ ) from executorch.exir.tensor import layout_enum, scalar_type_enum from executorch.exir.version import EXECUTORCH_SCHEMA_VERSION -from torch.export.exported_program import ExportedProgram +from torch.export.exported_program import ExportedProgram, OutputKind +from torch.utils import _pytree as pytree def _emit_prim_getters(prim_getters: Dict[str, Any]) -> List[ExecutionPlan]: @@ -122,6 +123,36 @@ class EmitterOutput: ] +def _remove_non_user_outputs(exported_program: ExportedProgram) -> torch.fx.GraphModule: + gm = exported_program.graph_module + output_node = None + for node in gm.graph.nodes: + if node.op == "output": + output_node = node + assert output_node is not None + + mutated_outputs: List[Optional[str]] = [ + out_spec.target if out_spec.kind in (OutputKind.BUFFER_MUTATION,) else None + for out_spec in exported_program.graph_signature.output_specs + ] + outputs = pytree.tree_flatten(output_node.args)[0] + + user_output_nodes = [] + for return_node, mutated_node_name in zip(outputs, mutated_outputs): + if mutated_node_name is None: + user_output_nodes.append(return_node) + continue + + with gm.graph.inserting_before(output_node): + # Only return user outputs + new_output = gm.graph.output(tuple(user_output_nodes)) + new_output.meta = output_node.meta.copy() + output_node.replace_all_uses_with(new_output) + gm.graph.erase_node(output_node) + + return gm + + def emit_program( methods: Union[ExportedProgram, Dict[str, ExportedProgram]], emit_stacktrace: bool = False, @@ -163,13 +194,6 @@ def emit_program( # emit each entry point in order according to name. 
for name, exported_program in sorted(methods.items()): - if ( - exported_program.graph_signature.buffers_to_mutate - ): # see if we are mutating any state - raise ExportError( - ExportErrorType.INVALID_INPUT_TYPE, - "Buffers cannot be modified in executorch.", - ) # create empty state emitter_state = _EmitterState( values=[], @@ -180,7 +204,11 @@ def emit_program( emit_stacktrace=emit_stacktrace, ) - emitter = _TopLevelEmitter(name, exported_program, program_state, emitter_state) + gm = _remove_non_user_outputs(exported_program) + + emitter = _TopLevelEmitter( + name, exported_program, gm, program_state, emitter_state + ) emitter.run() plans.append(emitter.plan()) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index e3e655a1bf..2c667690ce 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -32,8 +32,9 @@ import hashlib import operator import typing +import warnings from dataclasses import dataclass, field -from typing import Callable, cast, Dict, List, Mapping, Optional, Tuple, Union +from typing import Any, Callable, cast, Dict, List, Mapping, Optional, Tuple, Union import executorch.exir.memory as memory import executorch.extension.pytree as ex_pytree @@ -1266,15 +1267,17 @@ def __init__( self, name: str, exported_program: ExportedProgram, + graph_module: torch.fx.GraphModule, program_state: _ProgramState, emitter_state: _EmitterState, ) -> None: - super().__init__(exported_program.graph_module, emitter_state, program_state) + super().__init__(graph_module, emitter_state, program_state) self.name = name self.exported_program = exported_program self.inputs: List[int] = [] self.outputs: List[int] = [] + self.given_mutable_buffer_warning = False def create_container_str(spec: Optional[pytree.TreeSpec]) -> str: if spec is None: @@ -1293,6 +1296,42 @@ def create_container_str(spec: Optional[pytree.TreeSpec]) -> str: inp_container_str, out_container_str ) + def _find_fqn_for_placeholder( + self, target: _Target, spec: Any # pyre-ignore[2] + ) -> Tuple[Optional[str], bool]: + # Find the fully qualified name + fqn = None + is_mutable_buffer = False + if target in self.exported_program.graph_signature.inputs_to_parameters: + fqn = self.exported_program.graph_signature.inputs_to_parameters[target] + + elif target in self.exported_program.graph_signature.inputs_to_buffers: + fqn = self.exported_program.graph_signature.inputs_to_buffers[target] + + # if the buffer is mutated then record that + if fqn in self.exported_program.graph_signature.buffers_to_mutate.values(): + is_mutable_buffer = True + if not self.given_mutable_buffer_warning: + warnings.warn( + "Mutation on a buffer in the model is detected. 
ExecuTorch assumes " + "buffers that are mutated in the graph have a meaningless initial state, " + "only the shape and dtype will be serialized.", + UserWarning, + stacklevel=1, + ) + self.given_mutable_buffer_warning = True + + elif ( + target + in self.exported_program.graph_signature.inputs_to_lifted_tensor_constants + ): + fqn = ( + self.exported_program.graph_signature.inputs_to_lifted_tensor_constants[ + target + ] + ) + return fqn, is_mutable_buffer + def placeholder( self, target: _Target, args: Tuple[_Argument, ...], kwargs: Dict[str, _Argument] ) -> _AbstractValue: @@ -1302,40 +1341,27 @@ def placeholder( https://pytorch.org/docs/stable/fx.html#torch.fx.Graph.placeholder """ spec = self.node.meta["spec"] - const_tensor = False - if isinstance(target, str) and ( - target in self.exported_program.graph_signature.inputs_to_parameters - or target in self.exported_program.graph_signature.inputs_to_buffers - or target - in self.exported_program.graph_signature.inputs_to_lifted_tensor_constants - ): - if ( - target - in self.exported_program.graph_signature.inputs_to_lifted_tensor_constants - ): - fqn = self.exported_program.graph_signature.inputs_to_lifted_tensor_constants[ - target - ] - elif target in self.exported_program.graph_signature.inputs_to_buffers: - fqn = self.exported_program.graph_signature.inputs_to_buffers[target] - else: - fqn = self.exported_program.graph_signature.inputs_to_parameters[target] + is_user_input = True + + if isinstance(target, str) and isinstance(spec, TensorSpec): + + fqn, is_mutable_buffer = self._find_fqn_for_placeholder(target, spec) + + # From the fqn find the corresponding tensor + real_tensor = None if fqn in self.exported_program.state_dict: - spec = TensorSpec.from_tensor( - self.exported_program.state_dict[fqn], const=True - ) - const_tensor = True + real_tensor = self.exported_program.state_dict[fqn] + is_user_input = False + elif fqn in self.exported_program.constants: - spec = TensorSpec.from_tensor( - self.exported_program.constants[fqn], const=True - ) - const_tensor = True - else: + real_tensor = self.exported_program.constants[fqn] + is_user_input = False + elif fqn is not None: buffers = self.exported_program.named_buffers() buf = next((x[1] for x in buffers if x[0] == fqn), None) if buf is not None: - spec = TensorSpec.from_tensor(buf, const=True) - const_tensor = True + real_tensor = buf + is_user_input = False else: raise InternalError( self._emit_node_specific_error( @@ -1344,13 +1370,28 @@ def placeholder( ) ) + # assign the storage of the placeholder spec to the storage of the real tensor if there is one + if real_tensor is not None: + # for non-contigous tensors, convert to a contiguous one + real_tensor = real_tensor.contiguous() + # Weights cannot be views during emission or serialization + if real_tensor.nbytes != real_tensor.untyped_storage().nbytes(): + real_tensor = real_tensor.clone() + + spec.storage = real_tensor.untyped_storage() + + # User inputs and mutable buffers are not constants, other buffers or parameters are. + spec.const = not (is_user_input or is_mutable_buffer) + evalue = ( self._tensor_spec_to_evalue(spec) if isinstance(spec, TensorSpec) else self._constant_to_evalue(spec, None) ) value = self._emit_evalue(evalue) - if not const_tensor: + + # Only user inputs should remain as inputs. 
+ if is_user_input: self.inputs.append(value.id) return value diff --git a/exir/emit/test/TARGETS b/exir/emit/test/TARGETS index 4fb30b220b..06119a696e 100644 --- a/exir/emit/test/TARGETS +++ b/exir/emit/test/TARGETS @@ -21,6 +21,6 @@ python_unittest( "//executorch/exir/passes:constant_prop_pass", "//executorch/exir/tests:lib", "//executorch/exir/tests:models", - "//executorch/extension/pybindings:portable_lib", # @manual + "//executorch/extension/pybindings:aten_lib", ], ) diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index b14111c35a..47e05999f5 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -19,6 +19,7 @@ from executorch.exir import EdgeCompileConfig, ExecutorchProgramManager, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.emit import emit_program # noqa from executorch.exir.passes.constant_prop_pass import constant_prop_pass from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass @@ -42,6 +43,7 @@ ) from executorch.exir.tests.common import register_additional_test_aten_ops from executorch.exir.tests.models import Mul +from executorch.extension.pybindings.aten_lib import _load_for_executorch_from_buffer from functorch.experimental import control_flow from torch import nn @@ -1393,3 +1395,41 @@ def forward(self, x): self.assertEqual(len(exec_plan.inputs), 1) self.assertEqual(len(program.constant_buffer), 2) self.assertEqual(len(program.constant_buffer[1].storage), 24) + + def test_mutable_buffers(self) -> None: + def count_copies(gm: torch.fx.GraphModule) -> int: + return sum( + ( + node.target == torch.ops.aten.copy_ + or node.target == exir_ops.edge.aten.copy_.default + ) + for node in gm.graph.nodes + ) + + class MutableStateModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("state", torch.zeros(1)) + + def forward(self, x): + y = x + self.state + self.state.add_(1) + return y + + model = to_edge( + export( + MutableStateModule(), + (torch.zeros(1),), + ) + ) + model = model.to_executorch() + model.dump_executorch_program(True) + self.assertTrue( + model.executorch_program.execution_plan[0] # pyre-ignore[16] + .values[0] + .val.allocation_info + is not None + ) + executorch_module = _load_for_executorch_from_buffer(model.buffer) + self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1)) + self.assertEqual(executorch_module(torch.zeros(1))[0], torch.zeros(1) + 1) From 0de3a97360ae9283283e86782825b6b181917d5f Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Tue, 5 Mar 2024 18:19:29 -0800 Subject: [PATCH 046/290] rename original original module to orginal exported program (#2263) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2263 This is exported program but not module, rename them to avoid confusion Reviewed By: angelayi Differential Revision: D54527107 fbshipit-source-id: e00eaf7f46ac90acc6ae44cf64ef7c476627b67b --- exir/backend/test/test_backends.py | 2 +- exir/backend/test/test_backends_lifted.py | 4 +-- exir/lowered_backend_module.py | 34 ++++++++++++++--------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/exir/backend/test/test_backends.py b/exir/backend/test/test_backends.py index 95f63f44f3..db3e806f13 100644 --- a/exir/backend/test/test_backends.py +++ b/exir/backend/test/test_backends.py @@ -99,7 +99,7 @@ def 
check_delegate_input( self, delegate: LoweredBackendModule, input_len: int ) -> None: counter = 0 - for node in delegate._original_module.graph.nodes: + for node in delegate.original_module.graph.nodes: if node.op == "placeholder": counter += 1 self.assertEqual(counter, input_len) diff --git a/exir/backend/test/test_backends_lifted.py b/exir/backend/test/test_backends_lifted.py index 905ce1a7f2..c712ab19c3 100644 --- a/exir/backend/test/test_backends_lifted.py +++ b/exir/backend/test/test_backends_lifted.py @@ -98,7 +98,7 @@ def check_delegate_input( self, delegate: LoweredBackendModule, input_len: int ) -> None: counter = 0 - for node in delegate._original_module.graph.nodes: + for node in delegate.original_module.graph.nodes: if node.op == "placeholder": counter += 1 self.assertEqual(counter, input_len) @@ -913,7 +913,7 @@ def forward(self, x, y): ) self.assertEqual(len(lowered_backends), 2) for backend in lowered_backends: - original_program = backend._original_module + original_program = backend.original_module # check that program has the lowered attributes self.assertEqual(len(original_program.state_dict), 1) # check backend has one placeholder input one placeholder parameter diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 32f9813fc2..09b504e40e 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -58,7 +58,7 @@ class LoweredBackendModule(torch.nn.Module): _compile_specs: List[ CompileSpec ] # A list of backend-specific objects with static metadata to configure the "compilation" process. - _original_module: ExportedProgram # The original EXIR module + _original_exported_program: ExportedProgram # The original EXIR module def __init__( self, @@ -68,7 +68,7 @@ def __init__( compile_specs: List[CompileSpec], ) -> None: super().__init__() - self._original_module = edge_program + self._original_exported_program = edge_program self._backend_id = backend_id self._processed_bytes = processed_bytes self._compile_specs = compile_specs @@ -77,14 +77,20 @@ def __init__( def __deepcopy__(self, memo: Optional[Dict[int, Any]]) -> "LoweredBackendModule": # Copy exported program copied_program = ExportedProgram( - root=copy.deepcopy(self._original_module.graph_module), - graph=copy.deepcopy(self._original_module.graph), - graph_signature=copy.deepcopy(self._original_module.graph_signature), - state_dict=self._original_module.state_dict, - range_constraints=copy.deepcopy(self._original_module.range_constraints), - module_call_graph=copy.deepcopy(self._original_module.module_call_graph), - verifier=copy.deepcopy(self._original_module.verifier), - constants=self._original_module.constants, + root=copy.deepcopy(self._original_exported_program.graph_module), + graph=copy.deepcopy(self._original_exported_program.graph), + graph_signature=copy.deepcopy( + self._original_exported_program.graph_signature + ), + state_dict=self._original_exported_program.state_dict, + range_constraints=copy.deepcopy( + self._original_exported_program.range_constraints + ), + module_call_graph=copy.deepcopy( + self._original_exported_program.module_call_graph + ), + verifier=copy.deepcopy(self._original_exported_program.verifier), + constants=self._original_exported_program.constants, ) res = LoweredBackendModule( @@ -122,7 +128,7 @@ def original_module(self) -> ExportedProgram: """ Returns the original EXIR module """ - return self._original_module + return self._original_exported_program # TODO(chenlai): consolidate the seriailization config with 
serialize_to_flatbuffer api def buffer( @@ -185,7 +191,7 @@ def program(self, emit_stacktrace: bool = False) -> Program: # We'll remove all call_function nodes, insert an call_delegate node, inserting getitems nodes to get the result for call_delegate node # and return the list of getitems as the output - lowered_exported_program = copy.deepcopy(self.original_module) + lowered_exported_program = copy.deepcopy(self._original_exported_program) # The real input nodes are the ones not buffer or parameter all_input_nodes = [ @@ -237,7 +243,9 @@ def program(self, emit_stacktrace: bool = False) -> Program: # Get the output list. Since the output node is a tuple of list, like ([aten_mul_tensor, aten_add_tensor],) # We add some handling logic to get the list `[aten_mul_tensor, aten_add_tensor]` properly original_output_nodes = [ - node for node in self.original_module.graph.nodes if node.op == "output" + node + for node in self._original_exported_program.graph.nodes + if node.op == "output" ][0].args[0] delegate_node.meta["spec"] = tuple( From bcba73924547df8dc7520a1248d394145f7db98e Mon Sep 17 00:00:00 2001 From: Jorge Pineda Date: Tue, 5 Mar 2024 19:16:50 -0800 Subject: [PATCH 047/290] Remove runtime dependency on ATen/native/vulkan/impl (#2270) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2270 bypass-github-export-checks The only missing logic is copied from ``` ATen/native/vulkan/impl/Common.h/cpp ``` to ``` executorch/backends/vulkan/runtime/graph/ops/OpUtils.h/cpp ``` We can create a utils directory and improve their file organization, in a follow up change. Reviewed By: SS-JIA Differential Revision: D54555273 fbshipit-source-id: 3281391ee60623382b9eece2d6c9cf26678e9342 --- backends/vulkan/runtime/graph/ops/OpUtils.cpp | 34 ++++++++ backends/vulkan/runtime/graph/ops/OpUtils.h | 87 +++++++++++++++++++ .../vulkan/runtime/graph/ops/StagingUtils.cpp | 3 +- backends/vulkan/runtime/graph/ops/Utils.cpp | 2 + backends/vulkan/runtime/graph/ops/Utils.h | 2 - .../runtime/graph/ops/impl/Arithmetic.cpp | 2 + .../runtime/graph/ops/impl/Arithmetic.h | 2 - .../vulkan/runtime/graph/ops/impl/Staging.cpp | 3 +- backends/vulkan/targets.bzl | 1 - .../vulkan/test/vulkan_compute_api_test.cpp | 1 + 10 files changed, 128 insertions(+), 9 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/OpUtils.cpp create mode 100644 backends/vulkan/runtime/graph/ops/OpUtils.h diff --git a/backends/vulkan/runtime/graph/ops/OpUtils.cpp b/backends/vulkan/runtime/graph/ops/OpUtils.cpp new file mode 100644 index 0000000000..ce82aef092 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/OpUtils.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +api::utils::uvec3 adaptive_work_group_size( + const api::utils::uvec3& global_work_group) { + api::utils::uvec3 local_group_size = {4, 4, 4}; + if (global_work_group.data[2u] == 1) { + if (global_work_group.data[1u] < 8) { + local_group_size.data[0u] = 16; + local_group_size.data[1u] = 4; + local_group_size.data[2u] = 1; + } else { + local_group_size.data[0u] = 8; + local_group_size.data[1u] = 8; + local_group_size.data[2u] = 1; + } + } + return local_group_size; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/ops/OpUtils.h b/backends/vulkan/runtime/graph/ops/OpUtils.h new file mode 100644 index 0000000000..2a98337721 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/OpUtils.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { + +/* + * Maps a semantic dimension name to an integer that corresponds to its + * innermost ordering in a 4D tensor in NCHW format. Width is the innermost + * dimension, so it corresponds to 1, height is the next innermost, so it + * corresponds to 2, and so on. + */ +struct Dim4D { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t Channel = 3u; + static constexpr uint32_t Batch = 4u; +}; + +/* + * Semantic dimension names for a 1D tensor + */ +struct Dim1D { + static constexpr uint32_t Length = 1u; +}; + +/* + * Semantic dimension names for a 2D Convolution kernel. + */ +struct DimConv2DKernel { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t InChannels = 3u; + static constexpr uint32_t OutChannels = 4u; +}; + +/* + * The same as the above, except for a 2D Transposed Convolution kernel. + */ +struct DimTConv2DKernel { + static constexpr uint32_t Width = 1u; + static constexpr uint32_t Height = 2u; + static constexpr uint32_t OutChannels = 3u; + static constexpr uint32_t InChannels = 4u; +}; + +/* + * The functions below safely return the size of the dimension at the N-th + * innermost index. If the dimensionality of the size array is not sufficient + * then 1 will be returned. The structs above are intended to be used with + * these functions. + */ +template +uint32_t dim_at(const std::vector& sizes) { + const uint32_t dims = sizes.size(); + return dims < N ? 1 : api::utils::safe_downcast(sizes[dims - N]); +} + +template +uint32_t dim_at(const vTensor& v_in) { + return dim_at(v_in.sizes()); +} + +/* + * For most global work group sizes, returns {4, 4, 4}, but adjusts the size for + * 2D global work group sizes. 
Always maintains a total of 64 invocations + */ +api::utils::uvec3 adaptive_work_group_size( + const api::utils::uvec3& global_work_group); + +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/backends/vulkan/runtime/graph/ops/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/StagingUtils.cpp index 18e2159980..1637cfb2e1 100644 --- a/backends/vulkan/runtime/graph/ops/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/StagingUtils.cpp @@ -8,10 +8,9 @@ #include +#include #include -#include - namespace at { namespace native { namespace vulkan { diff --git a/backends/vulkan/runtime/graph/ops/Utils.cpp b/backends/vulkan/runtime/graph/ops/Utils.cpp index 868b5b7068..dda813c669 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.cpp +++ b/backends/vulkan/runtime/graph/ops/Utils.cpp @@ -8,6 +8,8 @@ #include +#include + namespace at { namespace native { namespace vulkan { diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index ee8a32f77b..9d6153e1d1 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -10,8 +10,6 @@ #ifdef USE_VULKAN_API -#include - #include namespace at { diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index cbc1a5600b..d635ea9a7f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -8,6 +8,8 @@ #include +#include + #include namespace at { diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 06fe57a26a..8017f6c4c4 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -10,8 +10,6 @@ #ifdef USE_VULKAN_API -#include - #include #include diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 205496555b..41104532d4 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -8,11 +8,10 @@ #include +#include #include #include -#include - namespace at { namespace native { namespace vulkan { diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 345f18801f..2045cb3725 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -53,7 +53,6 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], exported_deps = [ - "//caffe2:torch_vulkan_ops", "//caffe2:torch_vulkan_spv", ], define_static_target = False, diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 271a0ffeaa..51b58720c3 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -10,6 +10,7 @@ #include +#include #include #include From 34db73d72c4c8b9e2700e14f8723100cd78a7ade Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Tue, 5 Mar 2024 22:01:52 -0800 Subject: [PATCH 048/290] Fix 4bit groupwise dynamic linear quantization (#2251) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2251 This diff fixes following issues: - removes scales packing/unpacking - separate compute precision from scales storage precision, instead of maintaining activation/weight precision - defaults to fp32 everywhere unless specified otherwise. 
This is because atm groupwise quant kernels in xnnpack are for fp32. - Removes some dead code - Remove k tile constraints: These were from GPU and are not needed here - Replaces torch.ops.aten.linear with nn.functional.linear: This had to be done because otherwise delegation doesnt recognize the pattern. Yet another issue with pattern matching. ghstack-source-id: 217579450 exported-using-ghexport bypassing check because oss failures are unrelated bypass-github-export-checks Reviewed By: cccclai Differential Revision: D54427828 fbshipit-source-id: 634c34212e6ec80c41b21ae1dd1ad3211bf04862 --- examples/models/llama2/export_llama_lib.py | 7 +- examples/models/llama2/quantize.py | 200 ++++++++++----------- 2 files changed, 98 insertions(+), 109 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 1ab2bb3bd9..7b0d1c44af 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -191,7 +191,8 @@ def quantize( return WeightOnlyInt8QuantHandler(model).quantized_model() elif qmode == "int4": model_int4 = Int8DynActInt4WeightQuantHandler( - model, activation_precision=torch_dtype + model, + precision=torch_dtype, ).quantized_model() print("quantized model:", model_int4) return model_int4 @@ -397,10 +398,6 @@ def _export_llama(modelname, args) -> str: # noqa: C901 modelname = f"xnnpack_{modelname}" # TODO: remove this after xnnpack delegation is ready - if args.quantization_mode == "int4": - raise Exception( - "some quantized ops should be lowered to xnnpack, but xnnpack delegate is not ready yet" - ) builder = ( load_llama_model( diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py index ec699fcb8e..ddb1010edb 100644 --- a/examples/models/llama2/quantize.py +++ b/examples/models/llama2/quantize.py @@ -376,7 +376,7 @@ def dequantize_per_token_meta( return torch.empty_like(input, dtype=output_dtype) -def get_group_qparams_symmetric(w, n_bit=4, groupsize=128, precision=torch.float16): +def get_group_qparams_symmetric(w, n_bit=4, groupsize=128, precision=torch.float32): # needed for GPTQ with padding if groupsize > w.shape[-1]: groupsize = w.shape[-1] @@ -422,13 +422,6 @@ def pack_scales_and_zeros(scales, zeros, precision=torch.float16): ) -def unpack_scales_and_zeros(scales_and_zeros): - assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2 - # why is this float? 
- # assert scales_and_zeros.dtype == torch.float - return torch.split(scales_and_zeros.transpose(0, 1), 1, 2) - - quantized_decomposed_lib.define( "quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, " "int quant_max, ScalarType dtype, int group_size) -> Tensor" @@ -513,8 +506,10 @@ def quantize_per_channel_group_meta( return torch.empty_like(input, dtype=dtype) -def group_quantize_tensor_symmetric(w, n_bit=4, group_size=128): - scales, zeros = get_group_qparams_symmetric(w, n_bit, group_size) +def group_quantize_tensor_symmetric( + w, n_bit=4, group_size=128, precision=torch.float32 +): + scales, zeros = get_group_qparams_symmetric(w, n_bit, group_size, precision) n_bit = 4 max_int = 2 ** (n_bit - 1) - 1 min_int = -(2 ** (n_bit - 1)) @@ -524,8 +519,7 @@ def group_quantize_tensor_symmetric(w, n_bit=4, group_size=128): w, scales, zeros, min_int, max_int, torch.int8, group_size ) - scales_and_zeros = pack_scales_and_zeros(scales, zeros) - return w_int8, scales_and_zeros + return w_int8, scales, zeros quantized_decomposed_lib.define( @@ -584,19 +578,6 @@ def dequantize_per_channel_group( return w_dq -def group_dequantize_tensor_symmetric( - w_int8, scales_and_zeros, n_bit=4, group_size=128 -): - # TODO: remove this - scales, zero_points = unpack_scales_and_zeros(scales_and_zeros) - n_bit = 4 - quant_min = -(2 ** (n_bit - 1)) - quant_max = 2 ** (n_bit - 1) - 1 - return torch.ops.quantized_decomposed.quantize_per_channel_group( - w_int8, scales, zero_points, quant_min, quant_max, torch.int8, group_size - ) - - def down_size(size): assert size[-1] % 2 == 0, f"{size} last dim not divisible by two" return (*size[:-1], size[-1] // 2) @@ -900,29 +881,27 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor: ##### weight only int4 per channel groupwise quantized code ###### -def prepare_int4_weight_and_scales_and_zeros(weight_bf16, group_size, inner_k_tiles): - weight_int8, scales_and_zeros = group_quantize_tensor_symmetric( - weight_bf16, n_bit=4, group_size=group_size +def prepare_int4_weight_and_scales_and_zeros(weight, group_size, precision): + weight_int8, scales, zeros = group_quantize_tensor_symmetric( + weight, + n_bit=4, + group_size=group_size, + precision=precision, ) # TODO: better API # weight_int4packed = torch.ops.quantized_decomposed.pack_int4_from_int8(weight_int8) - return weight_int8, scales_and_zeros + return weight_int8, scales, zeros def linear_forward_int4( - x, weight_int8, scales_and_zeros, out_features, group_size, precision + x, weight_int8, scales, zeros, out_features, group_size, precision ): - origin_x_size = x.size() - x = x.reshape(-1, origin_x_size[-1]) # TODO: better API - # TODO: remove? 
- scales_and_zeros = scales_and_zeros.to(torch.float) + # weight_int8 = torch.ops.quantized_decomposed.unpack_int4_to_int8(weight_int4packed) n_bit = 4 quant_min = -(2 ** (n_bit - 1)) quant_max = 2 ** (n_bit - 1) - 1 - scales, zeros = unpack_scales_and_zeros(scales_and_zeros) - # weight_int8 = torch.ops.quantized_decomposed.unpack_int4_to_int8(weight_int4packed) w_dq = torch.ops.quantized_decomposed.dequantize_per_channel_group( weight_int8, scales, @@ -936,10 +915,8 @@ def linear_forward_int4( # x = x.to(torch.float16) # w_dq = w_dq.to(torch.float16) - c = torch.ops.aten.linear(x, w_dq) + c = torch.nn.functional.linear(x, w_dq) - new_shape = origin_x_size[:-1] + (out_features,) - c = c.reshape(new_shape) return c @@ -950,28 +927,24 @@ def find_multiple(n: int, *args: Tuple[int]) -> int: return n + k - (n % k) -def _check_linear_int4_k(k, group_size=1, inner_k_tiles=1): - return k % group_size == 0 and k % (inner_k_tiles * 16) == 0 +def _check_linear_int4_k(k, group_size=1): + return k % group_size == 0 -def _calc_padded_size_linear_int4(k, groupsize=1, inner_k_tiles=1): - return find_multiple(k, groupsize, inner_k_tiles * 16) +def _calc_padded_size_linear_int4(k, groupsize=1): + return find_multiple(k, groupsize) def replace_linear_8da4w( module, group_size, - inner_k_tiles, padding_allowed, - activation_precision, - weight_precision, + precision, + scales_precision, ): for name, child in module.named_children(): if isinstance(child, nn.Linear): - if ( - _check_linear_int4_k(child.in_features, group_size, inner_k_tiles) - or padding_allowed - ): + if _check_linear_int4_k(child.in_features, group_size) or padding_allowed: setattr( module, name, @@ -980,19 +953,17 @@ def replace_linear_8da4w( child.out_features, bias=False, group_size=group_size, - inner_k_tiles=inner_k_tiles, - activation_precision=activation_precision, - weight_precision=weight_precision, + precision=precision, + scales_precision=scales_precision, ), ) else: replace_linear_8da4w( child, group_size, - inner_k_tiles, padding_allowed, - activation_precision, - weight_precision, + precision, + scales_precision, ) @@ -1000,20 +971,17 @@ class Int8DynActInt4WeightQuantHandler: def __init__( self, mod, - group_size=128, - inner_k_tiles=8, - padding_allowed=True, - activation_precision=torch.float16, - weight_precision=torch.float16, + group_size=256, + padding_allowed=False, + precision=torch.float32, + scales_precision=torch.float32, ): self.mod = mod self.group_size = group_size - self.inner_k_tiles = inner_k_tiles self.padding_allowed = padding_allowed - self.activation_precision = activation_precision - self.weight_precision = weight_precision - assert group_size in [32, 64, 128, 256] - assert inner_k_tiles in [2, 4, 8] + self.precision = precision + self.scales_precision = scales_precision + # assert group_size in [32, 64, 128, 256] @torch.no_grad() def create_quantized_state_dict(self): @@ -1027,37 +995,43 @@ def create_quantized_state_dict(self): # assert out_features % 8 == 0, "require out_features % 8 == 0" print(f"linear: {fqn}, in={in_features}, out={out_features}") + assert ( + in_features % self.group_size == 0 + ), f"require in_features:{in_features} % self.group_size:{self.group_size} == 0" + weight = mod.weight.data + """ if not _check_linear_int4_k( - in_features, self.group_size, self.inner_k_tiles + in_features, self.group_size ): if self.padding_allowed: print( f"warning: {fqn} is padded to satisfy in_features % 1024 == 0" ) padded_in_features = _calc_padded_size_linear_int4( - in_features, 1024 + 
in_features, self.group_size ) weight = F.pad( weight, pad=(0, padded_in_features - in_features) ) else: - print( + raise RuntimeError( f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " - + "and that group_size and inner_k_tiles*16 evenly divide into it" + + "and that group_size" ) - - continue + """ ( weight_int4pack, - scales_and_zeros, + scales, + zeros, ) = prepare_int4_weight_and_scales_and_zeros( - weight.to(self.weight_precision), + weight.to(self.precision), self.group_size, - self.inner_k_tiles, + self.scales_precision, ) cur_state_dict[f"{fqn}.weight"] = weight_int4pack.to("cpu") - cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to("cpu") + cur_state_dict[f"{fqn}.scales"] = scales.to("cpu") + cur_state_dict[f"{fqn}.zeros"] = zeros.to("cpu") return cur_state_dict @@ -1065,10 +1039,9 @@ def convert_for_runtime(self): replace_linear_8da4w( self.mod, self.group_size, - self.inner_k_tiles, self.padding_allowed, - self.activation_precision, - self.weight_precision, + self.precision, + self.scales_precision, ) return self.mod @@ -1086,6 +1059,15 @@ class Int8DynActInt4WeightLinear(torch.nn.Module): out_features: int weight: torch.Tensor + """ + This module implements a dynamic quantized linear layer with int4 weight. + Weights are per channel groupwise quantized. Parameters of importance + group_size: the number of elements in each quantized group + precision: precision of input and output. e.g. torch.float32 means input + activation is float32 and output is float32. + scales_precision: precision of per group scale. + """ + def __init__( self, in_features: int, @@ -1093,50 +1075,60 @@ def __init__( bias=True, device=None, dtype=None, - group_size: int = 128, - inner_k_tiles: int = 8, - activation_precision: torch.dtype = torch.float16, - weight_precision: torch.dtype = torch.float16, + group_size: int = 256, + precision: torch.dtype = torch.float32, + scales_precision: torch.dtype = torch.float32, ) -> None: super().__init__() # always pad if needed since it becomes a noop at runtime if not needed - self.origin_in_features = in_features - in_features = _calc_padded_size_linear_int4( - in_features, group_size, inner_k_tiles - ) + # self.origin_in_features = in_features + assert ( + in_features % group_size == 0 + ), f"require in_features:{in_features} % group_size:{group_size} == 0" + # in_features = _calc_padded_size_linear_int4( + # in_features, group_size + # ) self.in_features = in_features self.out_features = out_features assert not bias, "require bias=False" self.group_size = group_size - self.inner_k_tiles = inner_k_tiles - self.weight_precision = weight_precision - self.activation_precision = activation_precision + # Precision of the activation which also indicates + # output precision of the dynamically quantized linear layer + # that his module represents. 
+ self.precision = precision - # assert out_features % 8 == 0, "require out_features % 8 == 0" - assert ( - in_features % (inner_k_tiles * 16) == 0 - ), "require in_features % (innerKTiles * 16) == 0" # currently storing unpacked int8 weights self.register_buffer( "weight", torch.empty((out_features, in_features), dtype=torch.int8), ) self.register_buffer( - "scales_and_zeros", + "scales", torch.empty( - (in_features // group_size, out_features, 2), - dtype=self.weight_precision, + (out_features, in_features // group_size), + dtype=scales_precision, + ), + ) + self.register_buffer( + "zeros", + torch.empty( + (out_features, in_features // group_size), + dtype=scales_precision, ), ) def forward(self, input: torch.Tensor) -> torch.Tensor: - input = input.to(self.activation_precision) - input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) + input = input.to(self.precision) + # Change this to pad if needed later + # else this op will always show up + # input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) ( scales, zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token(input, torch.int8) + ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( + input, torch.int8 + ) # TODO: get these from torch.int8 quant_min = -128 @@ -1151,15 +1143,15 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: quant_min, quant_max, torch.int8, - self.activation_precision, + self.precision, ) - input = input.to(self.activation_precision) return linear_forward_int4( input, self.weight, - self.scales_and_zeros, + self.scales, + self.zeros, self.out_features, self.group_size, - self.weight_precision, + self.precision, ) From 61a69d5cf154b4dec59cb6623e0dea38629ab71d Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Wed, 6 Mar 2024 08:01:39 -0800 Subject: [PATCH 049/290] Use dynamic quantized linear partitioner of xnnpack (#2252) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2252 For grupwise 4bit quant we need dynamic quantized linear partitioner. Ideally -X option just uses both dqlinear as well as regular partitioner but the latter doesnt yet work. ghstack-source-id: 217594372 bypass-github-export-checks Reviewed By: mikekgfb Differential Revision: D54492109 fbshipit-source-id: 638f274dd2074818672aed738b361fc24927324c --- examples/models/llama2/export_llama_lib.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py index 7b0d1c44af..8702c4bdc4 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama2/export_llama_lib.py @@ -394,11 +394,16 @@ def _export_llama(modelname, args) -> str: # noqa: C901 modelname = f"xnnpack_dq_{modelname}" if args.xnnpack: - partitioners[XnnpackPartitioner.__name__] = XnnpackPartitioner() + # Following changes due to. + # 1. We need dynamically quantized partitioner for both pt2e_quantize options + # as well as "qmode int4" which is also dynamic quantizes linear layers. + # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops. 
+ partitioners[XnnpackDynamicallyQuantizedPartitioner.__name__] = ( + XnnpackDynamicallyQuantizedPartitioner() + ) + # partitioners[XnnpackPartitioner.__name__] = XnnpackPartitioner() modelname = f"xnnpack_{modelname}" - # TODO: remove this after xnnpack delegation is ready - builder = ( load_llama_model( checkpoint=checkpoint_path, From 8ad8a2e1f5b1a2a3fd2163db5fbf8da630316a24 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 6 Mar 2024 10:09:04 -0800 Subject: [PATCH 050/290] Serialize storage_offset in sym int. (#2267) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2267 as title. bypass-github-export-checks Reviewed By: tugsbayasgalan, angelayi Differential Revision: D54553809 fbshipit-source-id: cf6025f6995eaba4342f7209c4b7933e790e8932 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- exir/serde/export_serialize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index ec597b9c72..d157ae370b 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -f5b99976adcbb01fd71bd0a39ea15bdac6c9e48a +export-D54553770 diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index 7d07b20bfd..a8fd01a7f1 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -220,7 +220,7 @@ def serialize_tensor_meta(t: torch.Tensor) -> TensorMeta: requires_grad=t.requires_grad, device=Device(type=t.device.type, index=t.device.index), strides=[serialize_sym_int(s) for s in t.stride()], - storage_offset=0, + storage_offset=serialize_sym_int(0), layout=_TORCH_TO_SERIALIZE_LAYOUT[t.layout], ) From c0167c556a1cbc7c59f6689a1e5f7fe52ceb15f9 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 6 Mar 2024 10:09:26 -0800 Subject: [PATCH 051/290] Fix tests. (#2277) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2277 as title. 
bypass-github-export-checks Reviewed By: angelayi Differential Revision: D54588001 fbshipit-source-id: df805d81b7bc2f91c8440d41703a02971f2d365e --- exir/backend/test/test_backends_lifted.py | 54 ++++++++++++++--------- exir/program/test/test_program.py | 12 ++++- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/exir/backend/test/test_backends_lifted.py b/exir/backend/test/test_backends_lifted.py index c712ab19c3..e219198027 100644 --- a/exir/backend/test/test_backends_lifted.py +++ b/exir/backend/test/test_backends_lifted.py @@ -1012,17 +1012,19 @@ def false_fn(x, y): x = x - y return x - def f(x, y): - x = x + y - x = control_flow.cond(x[0][0] == 1, true_fn, false_fn, [x, y]) - x = x - y - return x + class Module(torch.nn.Module): + def forward(self, x, y): + x = x + y + x = control_flow.cond(x[0][0] == 1, true_fn, false_fn, [x, y]) + x = x - y + return x + f = Module() inputs = (torch.ones(2, 2), torch.ones(2, 2)) orig_res = f(*inputs) orig = to_edge( export( - torch.export.WrapperModule(f), + f, inputs, ) ) @@ -1066,15 +1068,17 @@ def map_fn(x, y): x = x + y return x - def f(xs, y): - y = torch.mm(y, y) - return control_flow.map(map_fn, xs, y) + class Module(torch.nn.Module): + def forward(self, xs, y): + y = torch.mm(y, y) + return control_flow.map(map_fn, xs, y) + f = Module() inputs = (torch.ones(2, 2), torch.ones(2, 2)) orig_res = f(*inputs) orig = to_edge( export( - torch.export.WrapperModule(f), + f, inputs, ) ) @@ -1132,9 +1136,10 @@ def map_fn(x, pred1, pred2, y): x = x + y return x.sin() - def f(xs, pred1, pred2, y): - y = torch.mm(y, y) - return control_flow.map(map_fn, xs, pred1, pred2, y) + class Module(torch.nn.Module): + def forward(self, xs, pred1, pred2, y): + y = torch.mm(y, y) + return control_flow.map(map_fn, xs, pred1, pred2, y) inputs = ( torch.ones(2, 2), @@ -1143,10 +1148,11 @@ def f(xs, pred1, pred2, y): torch.ones(2, 2), ) + f = Module() orig_res = f(*inputs) orig = to_edge( export( - torch.export.WrapperModule(f), + f, inputs, ) ) @@ -1205,12 +1211,14 @@ def f(xs, pred1, pred2, y): ) def test_list_input(self): - def f(x: List[torch.Tensor]): - y = x[0] + x[1] - return y + class Module(torch.nn.Module): + def forward(self, x: List[torch.Tensor]): + y = x[0] + x[1] + return y + f = Module() inputs = ([torch.randn(2, 2), torch.randn(2, 2)],) - edge_prog = to_edge(export(torch.export.WrapperModule(f), inputs)) + edge_prog = to_edge(export(f, inputs)) lowered_gm = to_backend( BackendWithCompilerDemo.__name__, edge_prog.exported_program(), [] ) @@ -1227,12 +1235,14 @@ def forward(self, x: List[torch.Tensor]): gm.exported_program().module()(*inputs) def test_dict_input(self): - def f(x: Dict[str, torch.Tensor]): - y = x["a"] + x["b"] - return y + class Module(torch.nn.Module): + def forward(self, x: Dict[str, torch.Tensor]): + y = x["a"] + x["b"] + return y + f = Module() inputs = ({"a": torch.randn(2, 2), "b": torch.randn(2, 2)},) - edge_prog = to_edge(export(torch.export.WrapperModule(f), inputs)) + edge_prog = to_edge(export(f, inputs)) lowered_gm = to_backend( BackendWithCompilerDemo.__name__, edge_prog.exported_program(), [] ) diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py index 8c2ddddb7c..01de1f3bef 100644 --- a/exir/program/test/test_program.py +++ b/exir/program/test/test_program.py @@ -30,6 +30,16 @@ from torch.library import impl, Library + +class WrapperModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, *args, **kwargs): + return self.fn(*args, 
**kwargs) + + lib = Library("test_op", "DEF") # Fake a operator for testing. @@ -374,7 +384,7 @@ def _test_edge_dialect_verifier(self, callable, validate_ir=True): two, ) if not isinstance(callable, torch.nn.Module): - callable = torch.export.WrapperModule(callable) + callable = WrapperModule(callable) exported_foo = export(callable, inputs) _ = to_edge(exported_foo, compile_config=edge_compile_config) From 3b0dd33ac5daea6ab13a966765bbba5cb51e44d5 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Wed, 6 Mar 2024 12:22:11 -0800 Subject: [PATCH 052/290] Revert D54553809: Serialize storage_offset in sym int. Differential Revision: D54553809 Original commit changeset: cf6025f6995e Original Phabricator Diff: D54553809 fbshipit-source-id: 295b2d9476aae5943b80b3af7e2f089a8406b92a --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- exir/serde/export_serialize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index d157ae370b..ec597b9c72 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -export-D54553770 +f5b99976adcbb01fd71bd0a39ea15bdac6c9e48a diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index a8fd01a7f1..7d07b20bfd 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -220,7 +220,7 @@ def serialize_tensor_meta(t: torch.Tensor) -> TensorMeta: requires_grad=t.requires_grad, device=Device(type=t.device.type, index=t.device.index), strides=[serialize_sym_int(s) for s in t.stride()], - storage_offset=serialize_sym_int(0), + storage_offset=0, layout=_TORCH_TO_SERIALIZE_LAYOUT[t.layout], ) From 12fcfcf29b1056461b4bd1bcc7f962a5a7576bd4 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Wed, 6 Mar 2024 13:15:02 -0800 Subject: [PATCH 053/290] Extend support for scalars and scalar lists in Value class (#2271) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2271 ## Context This changeset enables serialization and execution of Operators with arbitrary function signatures. Previously, only operators with a very specific schema were supported (2 inputs, 1 output). This is achieved by extending the `Value` class (which is essentially a tagged union) to support all necessary types. All objects needed to execute an operator are now serialized/deserialized as a tagged union. This changeset also refactors `VulkanBackend.cpp` by introducing `GraphBuilder` which makes constructing a `ComputeGraph` from a serialized flatbuffer much clearer. 
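For readers skimming the diff, here is a minimal standalone sketch of the tagged-union idea, written with `std::variant` purely for illustration. It is not the actual `Value` implementation: as the diff shows, the real class manages a raw union with placement new so it can also hold non-trivially-copyable members such as `vTensor`, `api::StorageBuffer`, and `TensorRef`, which are omitted here to keep the sketch self-contained. All names below are hypothetical.

```
// Illustration only: a simplified stand-in for a graph Value tagged union.
#include <cassert>
#include <cstdint>
#include <string>
#include <variant>
#include <vector>

namespace sketch {

// Hypothetical handle into a graph's value table.
using ValueRef = int32_t;

// One alternative per type tag; a list of other graph values is just a
// vector of ValueRef, mirroring the VALUELIST tag added in this change.
using Value = std::variant<
    std::monostate,          // NONE
    int64_t,                 // INT
    double,                  // DOUBLE
    bool,                    // BOOL
    std::vector<int64_t>,    // INTLIST
    std::vector<double>,     // DOUBLELIST
    std::vector<bool>,       // BOOLLIST
    std::vector<ValueRef>,   // VALUELIST
    std::string>;            // STRING

// An op implementation can pull out whichever argument types its schema
// needs instead of assuming a fixed (2 inputs, 1 output) signature.
inline int64_t to_int(const Value& v) {
  assert(std::holds_alternative<int64_t>(v) && "expected an INT value");
  return std::get<int64_t>(v);
}

} // namespace sketch

int main() {
  std::vector<sketch::Value> values;
  values.emplace_back(int64_t{4});                          // e.g. a scalar arg
  values.emplace_back(std::vector<sketch::ValueRef>{0, 1}); // e.g. refs to two tensors
  std::string op_name = "some.op.name";                     // hypothetical op name
  values.emplace_back(op_name);
  return sketch::to_int(values[0]) == 4 ? 0 : 1;
}
```

Because every argument kind is representable as a value in the graph's value table, an `OperatorCall` can simply carry a list of value ids; `GraphBuilder` resolves each id to a `ValueRef` and hands the whole list to the registered op function, which is what removes the old 2-inputs/1-output restriction.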
bypass-github-pytorch-ci-checks bypass-github-export-checks Reviewed By: jorgep31415 Differential Revision: D54561567 fbshipit-source-id: c66d5d6b49e33a4373c7d6e83d510bc85da4b5d4 --- backends/vulkan/runtime/VulkanBackend.cpp | 323 +++++++++--------- .../vulkan/runtime/graph/ComputeGraph.cpp | 6 + backends/vulkan/runtime/graph/ComputeGraph.h | 39 +++ .../vulkan/runtime/graph/containers/Types.cpp | 29 +- .../vulkan/runtime/graph/containers/Types.h | 15 +- .../vulkan/runtime/graph/containers/Value.h | 258 +++++++++----- backends/vulkan/runtime/graph/ops/OpUtils.h | 16 + .../runtime/graph/ops/OperatorRegistry.h | 7 +- backends/vulkan/runtime/graph/ops/Utils.h | 2 +- .../runtime/graph/ops/impl/Arithmetic.cpp | 48 +-- .../runtime/graph/ops/impl/Arithmetic.h | 10 +- backends/vulkan/serialization/schema.fbs | 49 ++- .../serialization/vulkan_graph_builder.py | 73 +++- .../serialization/vulkan_graph_schema.py | 60 +++- .../vulkan/test/vulkan_compute_api_test.cpp | 89 ++++- 15 files changed, 697 insertions(+), 327 deletions(-) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 9c554a232c..1222ee38e5 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -23,12 +23,15 @@ #include /* strtol */ #include #include +#include namespace torch { namespace executor { namespace vulkan { namespace { +using namespace at::native::vulkan; + // Flatbuffer types using VkGraphPtr = const vkgraph::VkGraph*; using OpCallPtr = const vkgraph::OperatorCall*; @@ -51,102 +54,194 @@ const uint8_t* getConstantDataPtr( return constant_data + constant_bytes->offset(); } -using namespace at::native::vulkan; +api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { + switch (vk_datatype) { + case (vkgraph::VkDataType::fp32): { + return api::kFloat; + } + } +} + +GraphConfig generate_config() { + const uint32_t submit_frequency = UINT32_MAX; + + const api::CommandPoolConfig cmd_config{ + 4u, // cmdPoolInitialSize + 2u, // cmdPoolBatchSize + }; + + const api::DescriptorPoolConfig descriptor_pool_config{ + 1024u, // descriptorPoolMaxSets + 1024u, // descriptorUniformBufferCount + 1024u, // descriptorStorageBufferCount + 1024u, // descriptorCombinedSamplerCount + 1024u, // descriptorStorageImageCount + 32u, // descriptorPileSizes + }; + + const api::QueryPoolConfig query_pool_config{}; + + const api::ContextConfig context_config{ + submit_frequency, // cmdSubmitFrequency + cmd_config, // cmdPoolConfig + descriptor_pool_config, // descriptorPoolConfig + query_pool_config, // queryPoolConfig + }; + + const GraphConfig graph_config{ + context_config, + }; + + return graph_config; +} + +class GraphBuilder { + ComputeGraph* compute_graph_; + VkGraphPtr flatbuffer_; + const uint8_t* constant_data_; + + std::unordered_map ref_mapping_; -class VulkanBackend final : public PyTorchBackendInterface { public: - ~VulkanBackend() override = default; + explicit GraphBuilder( + ComputeGraph* compute_graph, + VkGraphPtr flatbuffer, + const uint8_t* constant_data) + : compute_graph_(compute_graph), + flatbuffer_(flatbuffer), + constant_data_(constant_data), + ref_mapping_() {} + + bool fb_id_exists(const uint32_t fb_id) { + const std::unordered_map::iterator found_ref = + ref_mapping_.find(fb_id); - bool is_available() const override { - return true; + return found_ref != ref_mapping_.end(); } - api::ScalarType get_scalar_type( - const vkgraph::VkDataType& vk_datatype) const { - switch (vk_datatype) { - case 
(vkgraph::VkDataType::fp32): { - return api::kFloat; - } - } + ValueRef get_fb_id_valueref(const uint32_t fb_id) { + const std::unordered_map::iterator found_ref = + ref_mapping_.find(fb_id); + + ET_CHECK_MSG( + found_ref != ref_mapping_.end(), + "Trying to extract a value that hasn't yet been added to the graph."); + + return found_ref->second; } - ValueRef get_value_ref( - const uint32_t value_id, - VkGraphPtr flatbuffer_graph, - ComputeGraph* compute_graph, - std::unordered_map& ref_mapping, - VkValuesVector value_mapping, - const uint8_t* constant_data) const { - const std::unordered_map::iterator found_ref = - ref_mapping.find(value_id); + void add_tensor_to_graph(const uint32_t fb_id, VkTensorPtr tensor_fb) { + const api::ScalarType& dtype = get_scalar_type(tensor_fb->datatype()); + + UIntVector dims_fb = tensor_fb->dims(); + const std::vector dims_vector(dims_fb->cbegin(), dims_fb->cend()); - if (found_ref != ref_mapping.end()) { - return found_ref->second; + ValueRef ref; + if (tensor_fb->constant_id() >= 0) { + const uint8_t* tensor_data = getConstantDataPtr( + flatbuffer_, tensor_fb->constant_id(), constant_data_); + + ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data); + } else { + ref = compute_graph_->add_tensor( + dims_vector, dtype, tensor_fb->mem_obj_id()); } - VkValuePtr vk_value = value_mapping->Get(value_id); - VkTensorPtr vk_tensor = vk_value->value(); + ref_mapping_[fb_id] = ref; + } + + template + typename std::enable_if::value, void>::type + add_scalar_to_graph(const uint32_t fb_id, T value) { + ValueRef ref = compute_graph_->add_scalar(value); + ref_mapping_[fb_id] = ref; + } + + void add_string_to_graph(const uint32_t fb_id, VkValuePtr value) { + const auto fb_str = value->value_as_String()->string_val(); + std::string string(fb_str->cbegin(), fb_str->cend()); + ValueRef ref = compute_graph_->add_string(std::move(string)); + ref_mapping_[fb_id] = ref; + } + void add_value_to_graph(const uint32_t fb_id, VkValuePtr value) { ET_CHECK_MSG( - vk_tensor->constant_id() >= 0, - "Only constant buffers are supported when adding tensors to compute graph (indicated by constant_id < 0), but got constant_id of %d", - vk_tensor->constant_id()); + !fb_id_exists(fb_id), + "Trying to add a value that has already been added to the graph."); + + switch (value->value_type()) { + case vkgraph::GraphTypes::Int: + add_scalar_to_graph(fb_id, value->value_as_Int()->int_val()); + break; + case vkgraph::GraphTypes::Double: + add_scalar_to_graph(fb_id, value->value_as_Double()->double_val()); + break; + case vkgraph::GraphTypes::Bool: + add_scalar_to_graph(fb_id, value->value_as_Bool()->bool_val()); + break; + case vkgraph::GraphTypes::VkTensor: + add_tensor_to_graph(fb_id, value->value_as_VkTensor()); + break; + case vkgraph::GraphTypes::String: + add_string_to_graph(fb_id, value); + break; + default: + ET_CHECK_MSG(false, "Unsupported value type."); + } + } - const api::ScalarType& tensor_dtype = - get_scalar_type(vk_tensor->datatype()); + void build_graph() { + // First, add all values to the graph + for (uint32_t fb_id = 0; fb_id < flatbuffer_->values()->size(); ++fb_id) { + VkValuePtr value = flatbuffer_->values()->Get(fb_id); + add_value_to_graph(fb_id, value); + } - UIntVector tensor_dims_fb = vk_tensor->dims(); - const std::vector tensor_dims_vector( - tensor_dims_fb->cbegin(), tensor_dims_fb->cend()); + // Parse the inputs + for (const uint32_t fb_id : *flatbuffer_->input_ids()) { + const ValueRef ref = get_fb_id_valueref(fb_id); + 
compute_graph_->set_input_tensor(ref); + } - const uint8_t* tensor_data = getConstantDataPtr( - flatbuffer_graph, vk_tensor->constant_id(), constant_data); + // Parse the operators + for (OpCallPtr op_call : *(flatbuffer_->chain())) { + std::string op_name = op_call->name()->str(); + ET_CHECK_MSG(hasOpsFn(op_name), "Missing operator: %s", op_name.c_str()); - const ValueRef value_ref = compute_graph->add_tensorref( - tensor_dims_vector, tensor_dtype, tensor_data); + const std::vector arg_fb_ids( + op_call->args()->cbegin(), op_call->args()->cend()); - ref_mapping[value_id] = value_ref; + std::vector args; + for (const int arg_fb_id : arg_fb_ids) { + args.push_back(get_fb_id_valueref(arg_fb_id)); + } - return value_ref; + auto vkFn = getOpsFn(op_name); + vkFn(*compute_graph_, args); + } + + // Parse the outputs + for (const uint32_t fb_id : *flatbuffer_->output_ids()) { + const ValueRef ref = get_fb_id_valueref(fb_id); + compute_graph_->set_output_tensor(ref); + } } +}; - GraphConfig generate_config() const { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; +class VulkanBackend final : public PyTorchBackendInterface { + public: + ~VulkanBackend() override = default; + + bool is_available() const override { + // TODO(ssjia): replace with an actual Vulkan runtime availability check + return true; } __ET_NODISCARD Error compileModel(const void* buffer_pointer, ComputeGraph* compute_graph) const { Result header = VulkanDelegateHeader::Parse(buffer_pointer); + const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; @@ -169,92 +264,10 @@ class VulkanBackend final : public PyTorchBackendInterface { VkGraphPtr flatbuffer_graph = vkgraph::GetVkGraph(flatbuffer_data); - // Mapping from serialized VkValue ids to compute graph ValueRefs - // This will be populated as the compute graph is built - std::unordered_map ref_mapping; - - // A vector which acts as a mapping from VkValue ids (vector indices) to - // VkValues - VkValuesVector value_mapping = flatbuffer_graph->values(); + GraphBuilder builder = + GraphBuilder(compute_graph, flatbuffer_graph, constant_data); - // 1. 
Add all inputs (and corresponding tensors) to the compute graph - UIntVector input_ids = flatbuffer_graph->input_ids(); - - for (size_t input_index = 0; input_index < input_ids->size(); - ++input_index) { - const uint32_t input_id = input_ids->Get(input_index); - VkValuePtr input_vk_value = value_mapping->Get(input_id); - - VkTensorPtr input_vk_tensor = input_vk_value->value(); - - ET_CHECK_MSG( - input_vk_tensor->constant_id() < 0, - "Expected constant buffer index for input at index %zu with id %d to be < 0 (since it is non-constant), but got: %d", - input_index, - input_id, - input_vk_tensor->constant_id()); - - const api::ScalarType& input_dtype = - get_scalar_type(input_vk_tensor->datatype()); - - UIntVector input_dims_fb = input_vk_tensor->dims(); - const std::vector input_dims_vector( - input_dims_fb->cbegin(), input_dims_fb->cend()); - - const ValueRef input_ref = compute_graph->add_tensor( - input_dims_vector, input_dtype, input_vk_tensor->mem_obj_id()); - - ref_mapping[input_id] = input_ref; - compute_graph->set_input_tensor(input_ref); - } - - // 2. Add all ops to the graph - // TODO: Generalize for ops that don't have 2 inputs and 1 output. - for (OpCallPtr op_call : *(flatbuffer_graph->chain())) { - std::string op_name = op_call->name()->str(); - - ET_CHECK_MSG( - op_call->args() != nullptr && op_call->args()->size() == 3, - "Vulkan currently only supports OperatorCall with 3 args"); - const auto arg_ids = op_call->args()->data(); - - const uint32_t input1_id = arg_ids[0]; - const uint32_t input2_id = arg_ids[1]; - const uint32_t output_id = arg_ids[2]; - - const ValueRef input1_ref = get_value_ref( - input1_id, - flatbuffer_graph, - compute_graph, - ref_mapping, - value_mapping, - constant_data); - - const ValueRef input2_ref = get_value_ref( - input2_id, - flatbuffer_graph, - compute_graph, - ref_mapping, - value_mapping, - constant_data); - - ET_CHECK_MSG(hasOpsFn(op_name), "Missing operator: %s", op_name.c_str()); - auto vkFn = getOpsFn(op_name); - const at::native::vulkan::ValueRef output_ref = vkFn( - *compute_graph, - {input1_ref, - input2_ref, - 1, - value_mapping->Get(output_id)->value()->mem_obj_id()}); - - ref_mapping[output_id] = output_ref; - } - - // 3. 
Add all outputs to the compute graph - for (const uint32_t output_id : *flatbuffer_graph->output_ids()) { - const ValueRef output_ref = ref_mapping[output_id]; - compute_graph->set_output_tensor(output_ref); - } + builder.build_graph(); compute_graph->encode_prepack(); compute_graph->prepack(); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 647371424c..0900dfb9c1 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -77,6 +77,12 @@ ValueRef ComputeGraph::add_staging( return idx; } +ValueRef ComputeGraph::add_string(std::string&& str) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(std::move(str)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index ec8d3ba1db..a45e449ae2 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -28,6 +28,19 @@ namespace at { namespace native { namespace vulkan { +// Define valid scalar types that the Value class can accept +template +struct is_valid_scalar_type : std::false_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + +template <> +struct is_valid_scalar_type : std::true_type {}; + /* * This is the core data structure used to execute Vulkan models in graph mode. * As opposed to ATen/eager mode where a command buffer is encoded every @@ -123,6 +136,16 @@ class ComputeGraph final { const void* const data); ValueRef add_staging(const api::ScalarType dtype, const size_t numel); + template + typename std::enable_if::value, ValueRef>::type + add_scalar_list(std::vector&& values); + + template + typename std::enable_if::value, ValueRef>::type + add_scalar(T value); + + ValueRef add_string(std::string&& str); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); @@ -163,6 +186,22 @@ class ComputeGraph final { void execute() const; }; +template +inline typename std::enable_if::value, ValueRef>::type +ComputeGraph::add_scalar_list(std::vector&& values) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(std::move(values)); + return idx; +} + +template +inline typename std::enable_if::value, ValueRef>::type +ComputeGraph::add_scalar(T value) { + ValueRef idx(static_cast(values_.size())); + values_.emplace_back(value); + return idx; +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index bbfde572b0..0779ed8716 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -12,20 +12,25 @@ namespace at { namespace native { namespace vulkan { +#define PRINT_CASE(name) \ + case TypeTag::name: \ + out << #name; \ + break; + std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { switch (tag) { - case TypeTag::NONE: - out << "NONE"; - break; - case TypeTag::TENSOR: - out << "TENSOR"; - break; - case TypeTag::STAGING: - out << "STAGING"; - break; - default: - out << "UNKNOWN"; - break; + PRINT_CASE(NONE) + PRINT_CASE(INT) + PRINT_CASE(DOUBLE) + PRINT_CASE(BOOL) + PRINT_CASE(TENSOR) + 
PRINT_CASE(STAGING) + PRINT_CASE(TENSORREF) + PRINT_CASE(INTLIST) + PRINT_CASE(DOUBLELIST) + PRINT_CASE(BOOLLIST) + PRINT_CASE(VALUELIST) + PRINT_CASE(STRING) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index a7162d777a..d5dee7ea0d 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -23,12 +23,21 @@ namespace vulkan { */ enum class TypeTag : uint32_t { NONE, - TENSOR, - STAGING, - TENSORREF, + // Scalar types INT, DOUBLE, BOOL, + // Tensor and tensor adjacent types + TENSOR, + STAGING, + TENSORREF, + // Scalar lists + INTLIST, + DOUBLELIST, + BOOLLIST, + // Special Type + VALUELIST, + STRING, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index d56791b4fa..82ba941713 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -22,6 +22,19 @@ namespace at { namespace native { namespace vulkan { +using ValueRef = int32_t; + +constexpr ValueRef kDummyValueRef = -1; + +inline bool is_valid(ValueRef value_ref) { + return value_ref >= 0; +} + +struct IOValueRef { + ValueRef value; + ValueRef staging; +}; + /* * This class is modelled after c10::IValue; however, it is simplified and does * not support as many types. However, the core design is the same; it is a @@ -48,6 +61,17 @@ struct Value final { api::StorageBuffer as_staging; TensorRef as_tensorref; + std::vector as_int_list; + std::vector as_double_list; + std::vector as_bool_list; + + // The below is a special type that is used to represent a list of other + // values stored in the graph. One application of the type is to represent + // a list of tensors or a list of optional tensors. 
+ std::vector as_value_list; + + std::string as_string; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -68,21 +92,48 @@ struct Value final { Value& operator=(Value&&) = delete; +#define CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(type_tag, member_name) \ + case type_tag: \ + payload.u.member_name = rhs.payload.u.member_name; \ + break; + +#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name) \ + case type_tag: \ + new (&payload.member_name) type(std::move(rhs.payload.member_name)); \ + break; + Value(Value&& rhs) noexcept : tag(rhs.tag) { - if (rhs.isTensor()) { - new (&payload.as_tensor) vTensor(std::move(rhs.payload.as_tensor)); - } else if (rhs.isStaging()) { - new (&payload.as_staging) - api::StorageBuffer(std::move(rhs.payload.as_staging)); - } else if (rhs.isTensorRef()) { - payload.as_tensorref = std::move(rhs.payload.as_tensorref); - } else { - payload.u = rhs.payload.u; + switch (tag) { + // Scalar types + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::INT, as_int); + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double); + CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool); + // Tensor and tensor adjacent types + CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSOR, vTensor, as_tensor); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::STAGING, api::StorageBuffer, as_staging); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::TENSORREF, TensorRef, as_tensorref); + // Scalar lists + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::INTLIST, std::vector, as_int_list); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::DOUBLELIST, std::vector, as_double_list); + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::BOOLLIST, std::vector, as_bool_list); + // Special types + CASE_MOVE_MOVEABLE_TYPE( + TypeTag::VALUELIST, std::vector, as_value_list); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::STRING, std::string, as_string); + + case TypeTag::NONE: + clearToNone(); + break; } - tag = rhs.tag; rhs.clearToNone(); } +#undef CASE_MOVE_TRIVIALLY_COPYABLE_TYPE +#undef CASE_MOVE_MOVEABLE_TYPE + // // Accessors // @@ -96,77 +147,127 @@ struct Value final { // ~Value() { - if (this->isTensor()) { - payload.as_tensor.~vTensor(); - } else if (this->isStaging()) { - payload.as_staging.~StorageBuffer(); - } else if (this->isTensorRef()) { - payload.as_tensorref.~TensorRef(); + switch (tag) { + case TypeTag::TENSOR: + payload.as_tensor.~vTensor(); + break; + case TypeTag::STAGING: + payload.as_staging.~StorageBuffer(); + break; + case TypeTag::TENSORREF: + payload.as_tensorref.~TensorRef(); + break; + case TypeTag::INTLIST: + payload.as_int_list.~vector(); + break; + case TypeTag::DOUBLELIST: + payload.as_double_list.~vector(); + break; + case TypeTag::BOOLLIST: + payload.as_bool_list.~vector(); + break; + case TypeTag::VALUELIST: + payload.as_value_list.~vector(); + break; + case TypeTag::STRING: + payload.as_string.~basic_string(); + break; + // Manually list out the types so that if a type here is added later and + // not handled the compiler can catch it. 
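+      // The scalar tags below hold trivially destructible payloads, so no
+      // explicit destructor call is needed for them.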
+ case TypeTag::NONE: + case TypeTag::INT: + case TypeTag::DOUBLE: + case TypeTag::BOOL: + break; } } - // - // Tensor - // - - explicit Value(vTensor&& t) : tag(TypeTag::TENSOR) { - new (&payload.as_tensor) vTensor(std::move(t)); - } - - inline bool isTensor() const { - return TypeTag::TENSOR == tag; - } - - inline vTensor& toTensor() { - VK_CHECK_COND( - isTensor(), - "Expected value to have type TENSOR, got ", - tag, - " instead."); - return payload.as_tensor; +#define SUPPORT_TRIVIALLY_COPYABLE_TYPE( \ + type, type_name, type_tag, member_name) \ + explicit Value(type t) : tag(type_tag) { \ + payload.u.member_name = t; \ + } \ + inline bool is##type_name() const { \ + return tag == type_tag; \ + } \ + inline const type& to##type_name() const { \ + VK_CHECK_COND( \ + is##type_name(), \ + "Expected value to have type " #type_name ", got ", \ + tag, \ + " instead."); \ + return payload.u.member_name; \ } - // - // Staging - // - - explicit Value(api::StorageBuffer&& t) : tag(TypeTag::STAGING) { - new (&payload.as_staging) api::StorageBuffer(std::move(t)); - } - - inline bool isStaging() const { - return TypeTag::STAGING == tag; - } - - inline api::StorageBuffer& toStaging() { - VK_CHECK_COND( - isStaging(), - "Expected value to have type STAGING, got ", - tag, - " instead."); - return payload.as_staging; - } - - // - // TensorRef - // - - explicit Value(TensorRef&& t) : tag(TypeTag::TENSORREF) { - payload.as_tensorref = std::move(t); - } - - inline bool isTensorRef() const { - return TypeTag::TENSORREF == tag; + SUPPORT_TRIVIALLY_COPYABLE_TYPE(int64_t, Int, TypeTag::INT, as_int); + SUPPORT_TRIVIALLY_COPYABLE_TYPE(double, Double, TypeTag::DOUBLE, as_double); + SUPPORT_TRIVIALLY_COPYABLE_TYPE(bool, Bool, TypeTag::BOOL, as_bool); + +#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE + +#define SUPPORT_TRIVIALLY_MOVEABLE_TYPE( \ + type, type_name, type_tag, member_name) \ + explicit Value(type&& t) : tag(type_tag) { \ + new (&payload.member_name) type(std::move(t)); \ + } \ + inline bool is##type_name() const { \ + return tag == type_tag; \ + } \ + inline type& to##type_name() { \ + VK_CHECK_COND( \ + is##type_name(), \ + "Expected value to have type " #type_name ", got ", \ + tag, \ + " instead."); \ + return payload.member_name; \ } - inline TensorRef& toTensorRef() { - VK_CHECK_COND( - isTensorRef(), - "Expected value to have type TENSORREF, got ", - tag, - " instead."); - return payload.as_tensorref; - } + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(vTensor, Tensor, TypeTag::TENSOR, as_tensor); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + api::StorageBuffer, + Staging, + TypeTag::STAGING, + as_staging); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + TensorRef, + TensorRef, + TypeTag::TENSORREF, + as_tensorref); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + IntList, + TypeTag::INTLIST, + as_int_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + DoubleList, + TypeTag::DOUBLELIST, + as_double_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + BoolList, + TypeTag::BOOLLIST, + as_bool_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::vector, + ValueList, + TypeTag::VALUELIST, + as_value_list); + + SUPPORT_TRIVIALLY_MOVEABLE_TYPE( + std::string, + String, + TypeTag::STRING, + as_string); + +#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE +#undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE private: Payload payload; @@ -177,18 +278,11 @@ struct Value final { // inline void clearToNone() noexcept { - payload.u.as_int = 0; + payload.u.as_int = -1; tag = TypeTag::NONE; } }; -using ValueRef = int32_t; - -struct IOValueRef { - 
ValueRef value; - ValueRef staging; -}; - } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/OpUtils.h b/backends/vulkan/runtime/graph/ops/OpUtils.h index 2a98337721..b5acb3945a 100644 --- a/backends/vulkan/runtime/graph/ops/OpUtils.h +++ b/backends/vulkan/runtime/graph/ops/OpUtils.h @@ -12,6 +12,8 @@ #include +#include + namespace at { namespace native { namespace vulkan { @@ -80,6 +82,20 @@ uint32_t dim_at(const vTensor& v_in) { api::utils::uvec3 adaptive_work_group_size( const api::utils::uvec3& global_work_group); +template +T extract_scalar(const Value& value) { + if (value.isInt()) { + return static_cast(value.toInt()); + } + if (value.isDouble()) { + return static_cast(value.toDouble()); + } + if (value.isBool()) { + return static_cast(value.toBool()); + } + VK_THROW("Cannot extract scalar from Value with type ", value.type()); +} + } // namespace vulkan } // namespace native } // namespace at diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h index c11aa0168e..06245d889e 100644 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h +++ b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h @@ -19,11 +19,8 @@ namespace at { namespace native { namespace vulkan { -using OpFunction = const std::function&)>; // TODO: Generalize to - // support float, - // int64_t. +using OpFunction = + const std::function&)>; bool hasOpsFn(const std::string& name); diff --git a/backends/vulkan/runtime/graph/ops/Utils.h b/backends/vulkan/runtime/graph/ops/Utils.h index 9d6153e1d1..918318178b 100644 --- a/backends/vulkan/runtime/graph/ops/Utils.h +++ b/backends/vulkan/runtime/graph/ops/Utils.h @@ -17,7 +17,7 @@ namespace native { namespace vulkan { #define DECLARE_OP_FN(function) \ - ValueRef function(ComputeGraph& graph, const std::vector& args); + void function(ComputeGraph& graph, const std::vector& args); api::utils::ivec4 get_size_as_ivec4(const vTensor& t); diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index d635ea9a7f..f5895c1544 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -16,42 +16,35 @@ namespace at { namespace native { namespace vulkan { +#define DEFINE_ARITHMETIC_WITH_ALPHA_FN(function, shader) \ + void function(ComputeGraph& graph, const std::vector& args) { \ + return add_arithmetic_node( \ + graph, args[0], args[1], args[2], args[3], VK_KERNEL(shader)); \ + } + #define DEFINE_ARITHMETIC_FN(function, shader) \ - ValueRef function(ComputeGraph& graph, const std::vector& args) { \ + void function(ComputeGraph& graph, const std::vector& args) { \ return add_arithmetic_node( \ - graph, args[0], args[1], args[2], VK_KERNEL(shader), args[3]); \ + graph, args[0], args[1], kDummyValueRef, args[2], VK_KERNEL(shader)); \ } -DEFINE_ARITHMETIC_FN(add, add); -DEFINE_ARITHMETIC_FN(sub, sub); +DEFINE_ARITHMETIC_WITH_ALPHA_FN(add, add); +DEFINE_ARITHMETIC_WITH_ALPHA_FN(sub, sub); + +// Floor div does not have an alpha, but a string argument (which is unused) is +// passed in at the same location as the alpha argument in other op. 
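+// Inside add_arithmetic_node, a string Value found in the alpha slot is
+// ignored and the default alpha of 1.0f is used instead.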
+DEFINE_ARITHMETIC_WITH_ALPHA_FN(floor_div, floor_divide); + DEFINE_ARITHMETIC_FN(mul, mul); DEFINE_ARITHMETIC_FN(div, div); -DEFINE_ARITHMETIC_FN(floor_div, floor_divide); DEFINE_ARITHMETIC_FN(pow, pow); -// TODO(T180908843): Bypass this entrypoint function by creating `ValueRef out` -// ahead of time. -ValueRef add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const float alpha, - const api::ShaderInfo& shader, - const int64_t shared_object_idx) { - std::vector in1_sizes = graph.get_val_sizes(in1); - api::ScalarType in1_dtype = graph.get_val_dtype(in1); - - ValueRef out = graph.add_tensor(in1_sizes, in1_dtype, shared_object_idx); - add_arithmetic_node(graph, in1, in2, out, alpha, shader); - return out; -} - void add_arithmetic_node( ComputeGraph& graph, const ValueRef in1, const ValueRef in2, + const ValueRef alpha, const ValueRef out, - const float alpha, const api::ShaderInfo& shader) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = prepack_if_tensor_ref(graph, in2); @@ -63,11 +56,18 @@ void add_arithmetic_node( api::utils::uvec3 global_size = t_out.extents(); api::utils::uvec3 local_size = adaptive_work_group_size(global_size); + float alpha_val = 1.0f; + // String is checked since floor_div passes in an unused string argument in + // place of alpha + if (is_valid(alpha) && !graph.get_val(alpha).isString()) { + alpha_val = extract_scalar(graph.get_val(alpha)); + } + ArithmeticParams block{ get_size_as_ivec4(t_out), get_size_as_ivec4(t_in1), get_size_as_ivec4(t_in2), - 1.0, + alpha_val, }; api::UniformParamsBuffer params(graph.context(), block); diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h index 8017f6c4c4..3ef3cb3e42 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.h @@ -25,20 +25,12 @@ DECLARE_OP_FN(div); DECLARE_OP_FN(floor_div); DECLARE_OP_FN(pow); -ValueRef add_arithmetic_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const float alpha, - const api::ShaderInfo& shader, - const int64_t shared_object_idx = -1); - void add_arithmetic_node( ComputeGraph& graph, const ValueRef in1, const ValueRef in2, + const ValueRef alpha, const ValueRef out, - const float alpha, const api::ShaderInfo& shader); struct ArithmeticParams final { diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs index 3d8dab9a2f..e5139b5fd5 100644 --- a/backends/vulkan/serialization/schema.fbs +++ b/backends/vulkan/serialization/schema.fbs @@ -26,8 +26,55 @@ table VkTensor { mem_obj_id:int; } +table Null {} + +table Int { + int_val:long; +} + +table Bool { + bool_val:bool; +} + +table Double { + double_val:double; +} + +table String { + string_val:string; +} + +table IntList { + items:[long]; +} + +table DoubleList { + items:[double]; +} + +table BoolList { + items:[bool]; +} + +table ValueList { + items:[int]; +} + +union GraphTypes { + Null, + Int, + Double, + Bool, + VkTensor, + IntList, + DoubleList, + BoolList, + ValueList, + String, +} + table VkValue { - value:VkTensor; + value:GraphTypes; } // Abstraction to represent a region of bytes in a raw data buffer. 
Useful for referencing raw data diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index 68e54c2bc3..572ef018bc 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional +from typing import Optional, Union import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema @@ -15,6 +15,9 @@ from torch.export import ExportedProgram from torch.fx import Node +_ScalarType = Union[int, bool, float] +_Argument = Union[torch.fx.Node, int, bool, float, str] + class VkGraphBuilder: def __init__(self, program: ExportedProgram) -> None: @@ -106,7 +109,7 @@ def maybe_add_constant_tensor(self, node: Node) -> int: return const_buffer_idx - def create_single_vk_value(self, node: Node) -> int: + def create_single_tensor_value(self, node: Node) -> int: constant_id = self.maybe_add_constant_tensor(node) spec = node.meta.get("spec") @@ -138,17 +141,48 @@ def create_single_vk_value(self, node: Node) -> int: ) return new_id - def create_vk_values_for(self, node: Node): + def create_tensor_values(self, node: Node) -> int: spec = node.meta.get("spec") if isinstance(spec, TensorSpec): - return self.create_single_vk_value(node) + return self.create_single_tensor_value(node) else: raise RuntimeError( "Creating values for nodes with collection types is not supported yet." ) + def create_scalar_value(self, scalar: _ScalarType) -> int: + new_id = len(self.values) + if isinstance(scalar, int): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Int(scalar))) + if isinstance(scalar, float): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Double(scalar))) + if isinstance(scalar, bool): + self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Bool(scalar))) + return new_id + + def create_string_value(self, string: str) -> int: + new_id = len(self.values) + self.values.append( + vk_graph_schema.VkValue(vk_graph_schema.String(string_val=string)) + ) + return new_id + + def get_or_create_value_for(self, arg: _Argument): + if isinstance(arg, torch.fx.Node): + # If the value has already been created, return the existing id + if arg in self.node_to_value_ids: + return self.node_to_value_ids[arg] + # Return id for a newly created value + return self.create_tensor_values(arg) + elif isinstance(arg, (int, float, bool)): + return self.create_scalar_value(arg) + elif isinstance(arg, str): + return self.create_string_value(arg) + else: + raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") + def process_placeholder_node(self, node: Node) -> None: - ids = self.create_vk_values_for(node) + ids = self.create_tensor_values(node) if not self.is_param_node(node): if isinstance(ids, int): self.input_ids.append(ids) @@ -156,27 +190,32 @@ def process_placeholder_node(self, node: Node) -> None: self.input_ids += ids def process_call_function_node(self, node) -> None: - args = [] - # Add input nodes - for inp_node in node.all_input_nodes: - if inp_node not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to current node in node_to_value_ids. This means " - "this node is being serialized before its input which is not allowed." 
- ) - args.append(self.node_to_value_ids[inp_node]) + operator_call_args = [] + + for i, schema_arg in enumerate(node.target._schema.arguments): + if not schema_arg.kwarg_only and i < len(node.args): + function_arg = node.args[i] + elif schema_arg.name in node.kwargs: + function_arg = node.kwargs[schema_arg.name] + else: + function_arg = schema_arg.default_value + + # Create a value for each function argument. If the argument has been + # previously encountered, then use the existing value id. + operator_call_args.append(self.get_or_create_value_for(function_arg)) + # Add output node - args.append(self.create_vk_values_for(node)) + operator_call_args.append(self.create_tensor_values(node)) self.chain.append( vk_graph_schema.OperatorCall( name=node.target.__name__, - args=args, + args=operator_call_args, ), ) def process_getattr_node(self, node: Node) -> None: - self.create_vk_values_for(node) + self.create_tensor_values(node) def process_output_node(self, node: Node) -> None: if node.all_input_nodes[0] not in self.node_to_value_ids: diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py index eeb1589a2a..1c5a05727b 100644 --- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ b/backends/vulkan/serialization/vulkan_graph_schema.py @@ -12,7 +12,7 @@ from dataclasses import dataclass from enum import IntEnum -from typing import List +from typing import List, Union @dataclass @@ -34,13 +34,67 @@ class VkTensor: @dataclass -class VkScalar: +class Null: pass +@dataclass +class Int: + int_val: int + + +@dataclass +class Bool: + bool_val: bool + + +@dataclass +class Double: + double_val: float + + +@dataclass +class IntList: + items: List[int] + + +@dataclass +class DoubleList: + items: List[float] + + +@dataclass +class BoolList: + items: List[bool] + + +@dataclass +class ValueList: + items: List[int] + + +@dataclass +class String: + string_val: str + + +GraphTypes = Union[ + Null, + Int, + Double, + Bool, + VkTensor, + IntList, + BoolList, + DoubleList, + ValueList, + String, +] + + @dataclass class VkValue: - value: VkTensor + value: "GraphTypes" @dataclass diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 51b58720c3..c53444ff0b 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -427,6 +427,60 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { graph.get_val(name.value).toTensor().gpu_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +TEST(VulkanComputeGraphTest, test_values_scalars) { + GraphConfig config = generate_graph_config(); + ComputeGraph graph(config); + + ValueRef idx; + + idx = graph.add_scalar(4); + EXPECT_TRUE(graph.get_val(idx).toInt() == 4); + + idx = graph.add_scalar(5.5f); + EXPECT_TRUE(graph.get_val(idx).toDouble() == 5.5f); +} + +TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { + GraphConfig config = generate_graph_config(); + ComputeGraph graph(config); + + ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); + std::vector& arr = graph.get_val(idx).toIntList(); + EXPECT_TRUE(arr.size() == 4); + for (int i = 0; i < 4; i++) { + EXPECT_TRUE(arr[i] == i + 1); + } +} + +TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { + GraphConfig config = generate_graph_config(); + ComputeGraph graph(config); + + ValueRef idx; + { + std::vector data = {5.0, 4.0, 3.0, 2.0, 1.0}; + 
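+    // The vector is moved into the graph's Value list, so the stored list
+    // remains valid after `data` goes out of scope at the end of this block.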
idx = graph.add_scalar_list(std::move(data)); + } + std::vector& arr = graph.get_val(idx).toDoubleList(); + EXPECT_TRUE(arr.size() == 5); + for (int i = 0; i < 5; i++) { + EXPECT_TRUE(arr[i] == (5 - i)); + } +} + +TEST(VulkanComputeGraphTest, test_values_string) { + GraphConfig config = generate_graph_config(); + ComputeGraph graph(config); + + ValueRef idx; + { + std::string data = "hello, world"; + idx = graph.add_string(std::move(data)); + } + std::string& stored = graph.get_val(idx).toString(); + EXPECT_TRUE(stored == "hello, world"); +} + TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config = generate_graph_config(); ComputeGraph graph(config); @@ -441,7 +495,10 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { IOValueRef out = {}; - out.value = add_arithmetic_node(graph, a.value, b.value, 1.0, VK_KERNEL(add)); + out.value = graph.add_tensor(size_big, api::kFloat); + + add_arithmetic_node( + graph, a.value, b.value, kDummyValueRef, out.value, VK_KERNEL(add)); out.staging = graph.set_output_tensor(out.value); @@ -487,8 +544,11 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { IOValueRef a = graph.add_input_tensor(size_big, api::kFloat); - ValueRef c = add_arithmetic_node(graph, a.value, w1, 1.0, VK_KERNEL(add)); - ValueRef e = add_arithmetic_node(graph, c, w2, 1.0, VK_KERNEL(mul)); + ValueRef c = graph.add_tensor(size_big, api::kFloat); + ValueRef e = graph.add_tensor(size_big, api::kFloat); + + add_arithmetic_node(graph, a.value, w1, kDummyValueRef, c, VK_KERNEL(add)); + add_arithmetic_node(graph, c, w2, kDummyValueRef, e, VK_KERNEL(mul)); IOValueRef out = {}; out.value = e; @@ -541,14 +601,14 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { // 1 staging buffer for each input tensor EXPECT_TRUE(get_vma_allocation_count() == 4); - ValueRef c = add_arithmetic_node( - graph, - a.value, - b.value, - 1.0, - VK_KERNEL(add), + ValueRef c = graph.add_tensor( + size_big, + api::kFloat, /*shared_object_idx = */ 6); + add_arithmetic_node( + graph, a.value, b.value, kDummyValueRef, c, VK_KERNEL(add)); + IOValueRef d = graph.add_input_tensor( size_small, api::kFloat, @@ -560,14 +620,13 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { // 1 staging buffer for the input tensor EXPECT_TRUE(get_vma_allocation_count() == 7); - ValueRef e = add_arithmetic_node( - graph, - c, - d.value, - 1.0, - VK_KERNEL(mul), + ValueRef e = graph.add_tensor( + size_big, + api::kFloat, /*shared_object_idx = */ 4); + add_arithmetic_node(graph, c, d.value, kDummyValueRef, e, VK_KERNEL(mul)); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); From 05702947e30cd720dc0a3f1e0df7bb209c879725 Mon Sep 17 00:00:00 2001 From: Lucy Qiu Date: Wed, 6 Mar 2024 13:44:37 -0800 Subject: [PATCH 054/290] Add cord data structure (#2273) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2273 Introduce cord data structure to store bytes/bytearrays during serialization. This allows us to manipulate bytes/bytearrays without copying data. 
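As a rough usage sketch of the API introduced below (the chunk names are made up, and `io.BytesIO` stands in for a real output file):

```python
import io

from executorch.exir._serialize._cord import Cord

# Build a file image from independent chunks without concatenating bytes.
header = Cord(b"\x00" * 16)   # hypothetical fixed-size header placeholder
body = Cord()
body.append(b"segment-one")
body.append(b"segment-two")

image = Cord()
image.append(header)          # extends the internal buffer list; no byte copy
image.append(body)

assert len(image) == 16 + len(b"segment-one") + len(b"segment-two")

out = io.BytesIO()
image.write_to_file(out)      # writes each underlying buffer in order
assert out.getvalue() == bytes(image)
```

Because `append` only stores references to the underlying buffers, nothing is concatenated until the data is converted with `bytes()` or written out buffer by buffer.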
bypass-github-export-checks Reviewed By: dbort Differential Revision: D54514244 fbshipit-source-id: 65397dcdea93054d54feea1b9f3ebfb0940c8513 --- exir/_serialize/TARGETS | 1 + exir/_serialize/_cord.py | 49 ++++++++++++++++++++++++ exir/_serialize/test/TARGETS | 10 +++++ exir/_serialize/test/test_cord.py | 63 +++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 exir/_serialize/_cord.py create mode 100644 exir/_serialize/test/test_cord.py diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS index 2b319d254b..63918b8dae 100644 --- a/exir/_serialize/TARGETS +++ b/exir/_serialize/TARGETS @@ -29,6 +29,7 @@ runtime.python_library( name = "lib", srcs = [ "__init__.py", + "_cord.py", "_dataclass.py", "_flatbuffer.py", "_program.py", diff --git a/exir/_serialize/_cord.py b/exir/_serialize/_cord.py new file mode 100644 index 0000000000..b8be3572e1 --- /dev/null +++ b/exir/_serialize/_cord.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import io +from typing import List, Optional, Union + + +class Cord: + """A `bytes`-like sequence of bytes, stored non-contiguously. + + Users can use a Cord to assemble large files and data blobs using references + to and slices of other data, instead of copying and appending that data to a + `bytes` or `bytearray` object. + """ + + def __init__(self, data: Optional[Union[bytes, "Cord"]] = None) -> None: + """Initialize Cord data structure.""" + self._buffers: List[bytes] = [] + self._byte_size: int = 0 + + if data is not None: + self.append(data) + + def __len__(self): + """Number of bytes in the Cord.""" + return self._byte_size + + def __bytes__(self) -> bytes: + """Return the contents of the Cord as a single `bytes` object.""" + return b"".join(self._buffers) + + def append(self, data: Union[bytes, "Cord"]) -> None: + """Append a bytes or Cord to the current Cord.""" + if isinstance(data, bytes): + self._buffers.append(data) + self._byte_size += len(data) + elif isinstance(data, Cord): + self._buffers.extend(data._buffers) + self._byte_size += len(data) + else: + raise TypeError(f"Can only append bytes or Cords, received {type(data)}") + + def write_to_file(self, outfile: io.BufferedIOBase) -> None: + """Write the Cord to a file.""" + for item in self._buffers: + outfile.write(item) diff --git a/exir/_serialize/test/TARGETS b/exir/_serialize/test/TARGETS index 682f03f0f1..853d82b8a9 100644 --- a/exir/_serialize/test/TARGETS +++ b/exir/_serialize/test/TARGETS @@ -23,3 +23,13 @@ python_unittest( "//executorch/exir/_serialize:lib", ], ) + +python_unittest( + name = "cord", + srcs = [ + "test_cord.py", + ], + deps = [ + "//executorch/exir/_serialize:lib", + ], +) diff --git a/exir/_serialize/test/test_cord.py b/exir/_serialize/test/test_cord.py new file mode 100644 index 0000000000..d6c60255f5 --- /dev/null +++ b/exir/_serialize/test/test_cord.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
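+# These tests exercise the core Cord behaviors: length and byte accounting,
+# zero-copy appends of bytes and other Cords (checked via buffer object
+# identity), and streaming the contents to a file-like object.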
+ + +import io +import unittest + +from executorch.exir._serialize._cord import Cord + + +class TestCord(unittest.TestCase): + def test_cord_init(self) -> None: + cord_empty = Cord() + self.assertEqual(0, len(cord_empty)) + + cord = Cord(b"HelloWorld") + self.assertEqual(10, len(cord)) + self.assertEqual(b"HelloWorld", bytes(cord)) + + cord2 = Cord(cord) + self.assertEqual(10, len(cord2)) + self.assertEqual(b"HelloWorld", bytes(cord)) + + # Confirm no copies were made. + self.assertEqual(id(cord._buffers[0]), id(cord2._buffers[0])) + + def test_cord_append(self) -> None: + cord = Cord() + cord.append(b"Hello") + self.assertEqual(5, len(cord)) + self.assertEqual(b"Hello", bytes(cord)) + + cord.append(b"World") + self.assertEqual(10, len(cord)) + self.assertEqual(b"HelloWorld", bytes(cord)) + + def test_cord_append_cord(self) -> None: + cord = Cord() + cord.append(b"Hello") + cord.append((b"World")) + + cord2 = Cord() + cord2.append(b"Prefix") + cord2.append(cord) + + self.assertEqual(16, len(cord2)) + self.assertEqual(b"PrefixHelloWorld", bytes(cord2)) + + # Confirm that no copies were made when appending a Cord. + self.assertEqual(id(cord2._buffers[1]), id(cord._buffers[0])) + self.assertEqual(id(cord2._buffers[2]), id(cord._buffers[1])) + + def test_cord_write_to_file(self) -> None: + cord = Cord() + cord.append(b"Hello") + cord.append(b"World") + + outfile = io.BytesIO() + cord.write_to_file(outfile) + self.assertEqual(b"HelloWorld", outfile.getvalue()) From aed32c40096c668b8808a2e068d01db31260ec25 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Wed, 6 Mar 2024 19:04:51 -0800 Subject: [PATCH 055/290] Use lazy descriptor pool allocation (#2285) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2285 ## Context In Vulkan, memory for Descriptor Sets (which are used to bind data to shader arguments) must be pre-allocated. Previously, the convention is that a large number of descriptor sets are allocated upon creation of a Vulkan Context. While this worked well in Lite Interpreter, where only a global vulkan context is used, it will lead to overallocating descriptor sets in the Vulkan Delegate, where every `ComputeGraph` has its own dedicated Context. https://github.com/pytorch/pytorch/pull/121134 allows the Descriptor Set pool to be initialized in a deferred fashion. This means that a ComputeGraph can count the total number of descriptors needed across all the compute shaders that will be encoded, and then allocate a Descriptor Set Pool of the appropriate size. ## Implementation Overview 1. When constructing `ComputeGraph`, make sure that the descriptor pool config contains 0 for number of max sets. This will ensure that no descriptor pool will be initialized when constructing the graph's `api::Context` instance 2. When building the graph, `ExecuteNode` and `PrepackNode` will call `graph.update_descriptor_counts(shader)` upon construction, which allows `ComputeGraph` to count the total number of descriptor sets needed. 3. There is a separate descriptor count object for prepack and execute, since they correspond to different command buffers. 4. Before encoding any command buffers, call `graph.prepare()` which will construct a descriptor pool config from the descriptor counts. ## Notes One interesting finding is that I had to apply a safety factor to the descriptor counts to prevent the pool from running out of memory. This was reproducible on both Linux and Android. A more robust design, i.e. 
as discussed [here](https://www.reddit.com/r/vulkan/comments/17v66fi/question_about_descriptor_pool_allocations/) may be to maintain separate descriptor pools for each layout type. We should revisit this refactor at a later time. bypass-github-export-checks Reviewed By: jorgep31415 Differential Revision: D54603935 fbshipit-source-id: eb04403b5f0967d69b390153c778b58bd940004e --- backends/vulkan/runtime/VulkanBackend.cpp | 37 +---------- .../vulkan/runtime/graph/ComputeGraph.cpp | 65 +++++++++++++++++++ backends/vulkan/runtime/graph/ComputeGraph.h | 13 ++++ backends/vulkan/runtime/graph/GraphConfig.cpp | 56 ++++++++++++++++ backends/vulkan/runtime/graph/GraphConfig.h | 10 +++ .../vulkan/runtime/graph/ops/ExecuteNode.cpp | 15 +++++ .../vulkan/runtime/graph/ops/ExecuteNode.h | 8 +-- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 17 +++++ .../vulkan/runtime/graph/ops/PrepackNode.h | 9 +-- .../runtime/graph/ops/impl/Arithmetic.cpp | 1 + .../vulkan/runtime/graph/ops/impl/Staging.cpp | 4 +- .../vulkan/test/vulkan_compute_api_test.cpp | 51 ++++----------- 12 files changed, 198 insertions(+), 88 deletions(-) create mode 100644 backends/vulkan/runtime/graph/GraphConfig.cpp diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 1222ee38e5..b5d3441886 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -62,39 +62,6 @@ api::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { } } -GraphConfig generate_config() { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; -} - class GraphBuilder { ComputeGraph* compute_graph_; VkGraphPtr flatbuffer_; @@ -269,6 +236,8 @@ class VulkanBackend final : public PyTorchBackendInterface { builder.build_graph(); + compute_graph->prepare(); + compute_graph->encode_prepack(); compute_graph->prepack(); @@ -284,7 +253,7 @@ class VulkanBackend final : public PyTorchBackendInterface { ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR( context.get_runtime_allocator(), ComputeGraph); - new (compute_graph) ComputeGraph(generate_config()); + new (compute_graph) ComputeGraph(GraphConfig()); Error err = compileModel(processed->data(), compute_graph); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index 0900dfb9c1..6aa9171d9f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -18,6 +18,8 @@ namespace vulkan { ComputeGraph::ComputeGraph(GraphConfig config) : config_{config}, + prepack_descriptor_counts_{}, + execute_descriptor_counts_{}, context_{new api::Context( api::runtime()->default_adapter_i(), config_.contextConfig)}, @@ -27,6 +29,19 @@ ComputeGraph::ComputeGraph(GraphConfig config) 
execute_nodes_{}, inputs_{}, outputs_{} { + // Ensure that descriptor counts are initialized to 0 + prepack_descriptor_counts_.descriptorPoolMaxSets = 0; + prepack_descriptor_counts_.descriptorUniformBufferCount = 0; + prepack_descriptor_counts_.descriptorStorageBufferCount = 0; + prepack_descriptor_counts_.descriptorCombinedSamplerCount = 0; + prepack_descriptor_counts_.descriptorStorageImageCount = 0; + + execute_descriptor_counts_.descriptorPoolMaxSets = 0; + execute_descriptor_counts_.descriptorUniformBufferCount = 0; + execute_descriptor_counts_.descriptorStorageBufferCount = 0; + execute_descriptor_counts_.descriptorCombinedSamplerCount = 0; + execute_descriptor_counts_.descriptorStorageImageCount = 0; + context_->set_cmd(/*reusable = */ true); } @@ -39,6 +54,33 @@ ComputeGraph::~ComputeGraph() { context_->flush(); } +void ComputeGraph::update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute) { + api::DescriptorPoolConfig* config = + execute ? &execute_descriptor_counts_ : &prepack_descriptor_counts_; + + config->descriptorPoolMaxSets += 1; + for (const VkDescriptorType arg_type : shader_info.kernel_layout) { + switch (arg_type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + config->descriptorUniformBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + config->descriptorStorageBufferCount += 1; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + config->descriptorCombinedSamplerCount += 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + config->descriptorStorageImageCount += 1; + break; + default: + VK_THROW("Unsupported descriptor type!"); + } + } +} + ValueRef ComputeGraph::add_tensor( const std::vector& sizes, const api::ScalarType dtype, @@ -138,6 +180,29 @@ void ComputeGraph::copy_from_staging( copy_staging_to_ptr(staging, data, nbytes); } +void ComputeGraph::prepare() { +#define MERGE_FIELD(field) \ + static_cast(std::ceil( \ + std::max( \ + execute_descriptor_counts_.field, \ + prepack_descriptor_counts_.field) * \ + config_.descriptorPoolSafetyFactor)) + + api::DescriptorPoolConfig config{ + MERGE_FIELD(descriptorPoolMaxSets), + MERGE_FIELD(descriptorUniformBufferCount), + MERGE_FIELD(descriptorStorageBufferCount), + MERGE_FIELD(descriptorCombinedSamplerCount), + MERGE_FIELD(descriptorStorageImageCount), + 1u, + }; + + if (!context_->descriptor_pool()) { + context_->descriptor_pool().init(config); + } +#undef MERGE_FIELD +} + void ComputeGraph::encode_prepack() { for (std::unique_ptr& node : prepack_nodes_) { node->encode(this); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index a45e449ae2..7917304f0c 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -60,6 +60,9 @@ class ComputeGraph final { private: GraphConfig config_; + api::DescriptorPoolConfig prepack_descriptor_counts_; + api::DescriptorPoolConfig execute_descriptor_counts_; + std::unique_ptr context_; std::vector shared_objects_; std::vector values_; @@ -87,6 +90,10 @@ class ComputeGraph final { return outputs_; } + void update_descriptor_counts( + const api::ShaderInfo& shader_info, + bool execute); + /* * Returns the value at a particular reference */ @@ -163,6 +170,12 @@ class ComputeGraph final { SharedObject& get_shared_object(const int64_t idx); + // + // Graph Preparation + // + + void prepare(); + // // Input/Output // diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp new file mode 
100644 index 0000000000..8cda518dae --- /dev/null +++ b/backends/vulkan/runtime/graph/GraphConfig.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace at { +namespace native { +namespace vulkan { + +GraphConfig::GraphConfig() { + // No automatic submissions + const uint32_t submit_frequency = UINT32_MAX; + + // Only one command buffer will be encoded at a time + const api::CommandPoolConfig cmd_config{ + 1u, // cmdPoolInitialSize + 1u, // cmdPoolBatchSize + }; + + // Use lazy descriptor pool initialization by default; the graph runtime will + // tally up the number of descriptor sets needed while building the graph and + // trigger descriptor pool initialization with exact sizes before encoding the + // command buffer. + const api::DescriptorPoolConfig descriptor_pool_config{ + 0u, // descriptorPoolMaxSets + 0u, // descriptorUniformBufferCount + 0u, // descriptorStorageBufferCount + 0u, // descriptorCombinedSamplerCount + 0u, // descriptorStorageImageCount + 0u, // descriptorPileSizes + }; + + const api::QueryPoolConfig query_pool_config{}; + + const api::ContextConfig context_config{ + submit_frequency, // cmdSubmitFrequency + cmd_config, // cmdPoolConfig + descriptor_pool_config, // descriptorPoolConfig + query_pool_config, // queryPoolConfig + }; + + contextConfig = context_config; + + // Empirically selected safety factor. If descriptor pools start running out + // of memory, increase this safety factor. + descriptorPoolSafetyFactor = 1.25; +} + +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 0cb9bb6f53..e2c8d6bed0 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -18,6 +18,16 @@ namespace vulkan { struct GraphConfig final { api::ContextConfig contextConfig; + + // Creating a descriptor pool with exactly the number of descriptors tallied + // by iterating through the shader layouts of shaders used in the graph risks + // the descriptor pool running out of memory, therefore apply a safety factor + // to descriptor counts when creating the descriptor pool to mitigate this + // risk. 
+ float descriptorPoolSafetyFactor; + + // Generate a default graph config with pre-configured settings + explicit GraphConfig(); }; } // namespace vulkan diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp index 7c1f0fe807..c9c338bc17 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp @@ -16,6 +16,21 @@ namespace at { namespace native { namespace vulkan { +ExecuteNode::ExecuteNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const std::vector& args, + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + args_(args), + params_(std::move(params)) { + graph.update_descriptor_counts(shader, /*execute = */ true); +} + void ExecuteNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); api::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h index ddd50c1f67..f3c2bba9c0 100644 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h @@ -50,16 +50,12 @@ class ExecuteNode final { public: ExecuteNode( + ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const std::vector& args, - api::UniformParamsBuffer&& params) - : shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - args_(args), - params_(std::move(params)) {} + api::UniformParamsBuffer&& params); ~ExecuteNode() = default; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index d16c671ba4..69e6ffabd6 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -17,6 +17,23 @@ namespace at { namespace native { namespace vulkan { +PrepackNode::PrepackNode( + ComputeGraph& graph, + const api::ShaderInfo& shader, + const api::utils::uvec3& global_workgroup_size, + const api::utils::uvec3& local_workgroup_size, + const ValueRef tref, + const ValueRef packed, + api::UniformParamsBuffer&& params) + : shader_(shader), + global_workgroup_size_(global_workgroup_size), + local_workgroup_size_(local_workgroup_size), + tref_(tref), + packed_(packed), + params_(std::move(params)) { + graph.update_descriptor_counts(shader, /*execute = */ false); +} + void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); api::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index b3a5fd0086..59071e9371 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -33,18 +33,13 @@ class PrepackNode final { public: PrepackNode( + ComputeGraph& graph, const api::ShaderInfo& shader, const api::utils::uvec3& global_workgroup_size, const api::utils::uvec3& local_workgroup_size, const ValueRef tref, const ValueRef packed, - api::UniformParamsBuffer&& params) - : shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - tref_(tref), - 
packed_(packed), - params_(std::move(params)) {} + api::UniformParamsBuffer&& params); ~PrepackNode() = default; diff --git a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp index f5895c1544..108ff2b2dc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Arithmetic.cpp @@ -72,6 +72,7 @@ void add_arithmetic_node( api::UniformParamsBuffer params(graph.context(), block); graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 41104532d4..953a06426a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -48,6 +48,7 @@ void add_staging_to_tensor_node( graph.context(), create_staging_params(t_out)); graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, @@ -90,6 +91,7 @@ void add_tensor_to_staging_node( } graph.execute_nodes().emplace_back(new ExecuteNode( + graph, shader, global_size, local_size, @@ -112,7 +114,7 @@ ValueRef prepack(ComputeGraph& graph, const ValueRef vref) { api::UniformParamsBuffer params(graph.context(), sp); graph.prepack_nodes().emplace_back(new PrepackNode( - shader, global_size, local_size, vref, v, std::move(params))); + graph, shader, global_size, local_size, vref, v, std::move(params))); return v; } diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index c53444ff0b..5c1fc8f3c5 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -108,39 +108,6 @@ size_t get_vma_allocation_count() { return get_vma_stats().total.statistics.allocationCount; } -GraphConfig generate_graph_config() { - const uint32_t submit_frequency = UINT32_MAX; - - const api::CommandPoolConfig cmd_config{ - 4u, // cmdPoolInitialSize - 2u, // cmdPoolBatchSize - }; - - const api::DescriptorPoolConfig descriptor_pool_config{ - 1024u, // descriptorPoolMaxSets - 1024u, // descriptorUniformBufferCount - 1024u, // descriptorStorageBufferCount - 1024u, // descriptorCombinedSamplerCount - 1024u, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const api::QueryPoolConfig query_pool_config{}; - - const api::ContextConfig context_config{ - submit_frequency, // cmdSubmitFrequency - cmd_config, // cmdPoolConfig - descriptor_pool_config, // descriptorPoolConfig - query_pool_config, // queryPoolConfig - }; - - const GraphConfig graph_config{ - context_config, - }; - - return graph_config; -} - // // Test Wrapper // @@ -428,7 +395,7 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); TEST(VulkanComputeGraphTest, test_values_scalars) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -441,7 +408,7 @@ TEST(VulkanComputeGraphTest, test_values_scalars) { } TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); @@ -453,7 +420,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { } TEST(VulkanComputeGraphTest, 
test_values_scalar_list_outside_constructed) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -469,7 +436,7 @@ TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { } TEST(VulkanComputeGraphTest, test_values_string) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); ValueRef idx; @@ -482,7 +449,7 @@ TEST(VulkanComputeGraphTest, test_values_string) { } TEST(VulkanComputeGraphTest, test_simple_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -502,6 +469,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { out.staging = graph.set_output_tensor(out.value); + graph.prepare(); graph.encode_execute(); // Run graph @@ -531,7 +499,7 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { ValueRef name = graph.add_tensorref(sizes, api::kFloat, data_##name.data()); TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -554,6 +522,8 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { out.value = e; out.staging = graph.set_output_tensor(out.value); + graph.prepare(); + graph.encode_prepack(); graph.prepack(); @@ -579,7 +549,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { } TEST(VulkanComputeGraphTest, test_simple_shared_objects) { - GraphConfig config = generate_graph_config(); + GraphConfig config; ComputeGraph graph(config); std::vector size_big = {4, 4, 4}; @@ -637,6 +607,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects) { // 1 staging buffer for the input tensor EXPECT_TRUE(get_vma_allocation_count() == 10); + graph.prepare(); graph.encode_execute(); // Allocation count will be 13, three shared objects are allocated for total: From 63b0a22ad6f2ac4b49efa39afa89e9513282ae49 Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Wed, 6 Mar 2024 19:57:26 -0800 Subject: [PATCH 056/290] Enable tensor closeness check for additional types (#2256) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2256 Enable tensor closeness check for additional types Reviewed By: manuelcandales Differential Revision: D54538932 fbshipit-source-id: 14d36f2bdcdee833a30995b664b9089e3264b511 --- .../exec_aten/testing_util/tensor_util.cpp | 18 +++++++++++++++--- runtime/core/portable_type/half.h | 12 ++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index 815e86bcb8..dd05468152 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -32,9 +32,7 @@ namespace { * T must be a floating point type. Non-floating point data should be compared * directly. */ -template < - typename T, - typename = std::enable_if_t::value>> +template bool data_is_close( const T* a, const T* b, @@ -110,6 +108,13 @@ bool tensors_are_close( a.numel(), rtol, atol); + } else if (a.scalar_type() == ScalarType::Half) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); } else { // Non-floating-point types can be compared bitwise. 
return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; @@ -150,6 +155,13 @@ bool tensor_data_is_close( a.numel(), rtol, atol); + } else if (a.scalar_type() == ScalarType::Half) { + return data_is_close( + a.const_data_ptr(), + b.const_data_ptr(), + a.numel(), + rtol, + atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index 448114b5ef..ad17054965 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -681,6 +681,18 @@ std::ostream& operator<<(std::ostream& out, const Half& value); namespace std { +static inline int isinf(torch::executor::Half value) { + return (value.x & 0x7FFF) == 0x7C00; +} + +static inline int isnan(torch::executor::Half value) { + return ((value.x & 0x7C00) == 0x7C00) && ((value.x & 0x03ff) != 0); +} + +static inline int isfinite(torch::executor::Half value) { + return !(isinf(value) || isnan(value)); +} + template <> class numeric_limits { public: From 187079e25fa7248a31bb7e32ff3f734d3120758b Mon Sep 17 00:00:00 2001 From: Michael Gschwind Date: Wed, 6 Mar 2024 19:59:19 -0800 Subject: [PATCH 057/290] Stop after first EOS post-prompt (#2255) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2255 Stop after first EOS post-prompt Reviewed By: larryliu0820, shoumikhin Differential Revision: D54525365 fbshipit-source-id: 22b9fc57eb126d18597daf162fb92bf65cc1f0e9 --- examples/models/llama2/runner/runner.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp index 4d53442314..38e9d77a0d 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama2/runner/runner.cpp @@ -197,7 +197,6 @@ Error Runner::generate( int next; // will store the next token in the sequence int64_t pos = num_prompt_tokens - 1; // position in the sequence int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens - int eos_counter = 0; // counter to capture EOS int logits_index = 0; // index of the logits tensor in the output int k_cache_index = 0; int v_cache_index = 0; @@ -340,13 +339,8 @@ Error Runner::generate( // data-dependent terminating condition: we have n_eos_ number of EOS if (pos >= num_prompt_tokens && next == eos_id_) { - eos_counter++; - if (eos_counter == n_eos_) { - ET_LOG(Info, "Reached to the end of generation"); - break; - } - } else { - eos_counter = 0; + ET_LOG(Info, "Reached to the end of generation"); + break; } token = next; From 8602e9e225cd3a7f92da74a298dce56e912787c9 Mon Sep 17 00:00:00 2001 From: "Ning Liu(Singapore)" Date: Wed, 6 Mar 2024 21:00:55 -0800 Subject: [PATCH 058/290] Revert D54538932: Enable tensor closeness check for additional types Differential Revision: D54538932 Original commit changeset: 14d36f2bdcde Original Phabricator Diff: D54538932 fbshipit-source-id: f7092646e852d6f45ec38fedc2635befcc384ea4 --- .../exec_aten/testing_util/tensor_util.cpp | 18 +++--------------- runtime/core/portable_type/half.h | 12 ------------ 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/runtime/core/exec_aten/testing_util/tensor_util.cpp b/runtime/core/exec_aten/testing_util/tensor_util.cpp index dd05468152..815e86bcb8 100644 --- a/runtime/core/exec_aten/testing_util/tensor_util.cpp +++ b/runtime/core/exec_aten/testing_util/tensor_util.cpp @@ -32,7 +32,9 @@ namespace { * T 
must be a floating point type. Non-floating point data should be compared * directly. */ -template +template < + typename T, + typename = std::enable_if_t::value>> bool data_is_close( const T* a, const T* b, @@ -108,13 +110,6 @@ bool tensors_are_close( a.numel(), rtol, atol); - } else if (a.scalar_type() == ScalarType::Half) { - return data_is_close( - a.const_data_ptr(), - b.const_data_ptr(), - a.numel(), - rtol, - atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; @@ -155,13 +150,6 @@ bool tensor_data_is_close( a.numel(), rtol, atol); - } else if (a.scalar_type() == ScalarType::Half) { - return data_is_close( - a.const_data_ptr(), - b.const_data_ptr(), - a.numel(), - rtol, - atol); } else { // Non-floating-point types can be compared bitwise. return memcmp(a.const_data_ptr(), b.const_data_ptr(), a.nbytes()) == 0; diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index ad17054965..448114b5ef 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -681,18 +681,6 @@ std::ostream& operator<<(std::ostream& out, const Half& value); namespace std { -static inline int isinf(torch::executor::Half value) { - return (value.x & 0x7FFF) == 0x7C00; -} - -static inline int isnan(torch::executor::Half value) { - return ((value.x & 0x7C00) == 0x7C00) && ((value.x & 0x03ff) != 0); -} - -static inline int isfinite(torch::executor::Half value) { - return !(isinf(value) || isnan(value)); -} - template <> class numeric_limits { public: From 47b837b9991127e013551cea72aca51494ffb298 Mon Sep 17 00:00:00 2001 From: Lucy Qiu Date: Wed, 6 Mar 2024 21:32:40 -0800 Subject: [PATCH 059/290] Use cords to store constant and delegate segment data (#2281) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2281 Update `extract_constant_segment` and `extract_delegate_segment` to place constants and delegates into cord data structures. Update `serialize_pte_binary` to compile the cords together. Remove the original extract_constant_segment/extract_delegate_segment logic, remove append_segments. Reviewed By: dbort Differential Revision: D54523957 fbshipit-source-id: 196b00710f5980344406aa435eebe75a97430ddf --- exir/_serialize/TARGETS | 1 + exir/_serialize/_program.py | 289 ++++++++------------------- exir/_serialize/test/test_program.py | 1 + 3 files changed, 83 insertions(+), 208 deletions(-) diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS index 63918b8dae..b3c86953b4 100644 --- a/exir/_serialize/TARGETS +++ b/exir/_serialize/TARGETS @@ -61,5 +61,6 @@ runtime.python_library( ], deps = [ "//executorch/exir:schema", + "//executorch/exir:tensor", ], ) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 4135ce00cd..7b05dcf377 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -13,6 +13,7 @@ from dataclasses import dataclass from typing import ClassVar, List, Literal, Optional, Tuple +from executorch.exir._serialize._cord import Cord from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import ( _FlatbufferResult, @@ -29,6 +30,7 @@ Program, SubsegmentOffsets, ) +from executorch.exir.tensor import ALIGNMENT # Byte order of numbers written to program headers. 
Always little-endian @@ -240,15 +242,15 @@ def _get_extended_header(program_data: bytes) -> Optional[_ExtendedHeader]: def _extract_delegate_segments( - program: Program, segments: List[bytes], segment_alignment: int + program: Program, + segments: List[Cord], ) -> None: - """The input program and segments list are modified in place. + """Extracts the delegate segments inlined in the program into a list of buffers. + The program is modified in-place to remove the delegate data. Args: program: The program to extract segments from. Modified in-place. - segments: A list to which extracted segments will be appended. Modified in-place. - segment_alignment: Alignment in bytes. The starting offset of each - segment will be aligned to this value. + segments: A list of buffers to append extracted segments to. Modified in-place. """ remaining_inline: List[BackendDelegateInlineData] = [] inline_indices_seen: set[int] = set() @@ -278,24 +280,11 @@ def _extract_delegate_segments( if inline.data: # Move the delegate data out of the program. segment_index = len(segments) - segments.append(inline.data) + segments.append(Cord(inline.data)) delegate.processed = BackendDelegateDataReference( location=DataLocation.SEGMENT, index=segment_index, ) - - # Update the segment list in the root Program object. - prev_end = ( - program.segments[-1].offset + program.segments[-1].size - if program.segments - else 0 - ) - program.segments.append( - DataSegment( - offset=_aligned_size(prev_end, segment_alignment), - size=len(inline.data), - ), - ) else: # Not moving into a segment. Keep it inline, but update the # index. @@ -321,183 +310,32 @@ def _extract_delegate_segments( def _extract_constant_segment( constant_buffer: List[Buffer], tensor_alignment: int, -) -> Tuple[bytes, List[int]]: - """Copies the tensors from the provided list into a single buffer and tracks the offsets - of each tensor. +) -> Tuple[Cord, List[int]]: + """Copies the tensors from the provided list into a Cord and tracks the offsets + of each tensor. + Args: constant_buffer: list of Buffers from which to extract constants from. Not modified. - tensor_alignment: Alignment in bytes. The starting offset of each tensor in the - constant segment will be aligned to this value. Default to 16. + tensor_alignment: Alignment in bytes. Each tensor in the cord will be padded to align + with this value. Defaults to ALIGNMENT. Returns: A tuple of (constant segment, list of offsets for each tensor in the segment) """ - constant_segment_data: bytearray = bytearray() + constant_segment_data: Cord = Cord() constant_segment_offsets: List[int] = [] current_offset: int = 0 for i in range(len(constant_buffer)): buffer = constant_buffer[i] + constant_segment_data.append(buffer.storage) buffer_length = len(buffer.storage) pad_length = _padding_required(buffer_length, tensor_alignment) - - # Append each constant buffer to the constant segment. - constant_segment_data += buffer.storage - # Add padding for all but the last tensor. if i < len(constant_buffer) - 1: - constant_segment_data += b"\x00" * pad_length - - # Append constant data offset. 
+ constant_segment_data.append(b"\x00" * pad_length) constant_segment_offsets.append(current_offset) current_offset += buffer_length + pad_length - return bytes(constant_segment_data), constant_segment_offsets - - -def _extract_segments( - program: Program, - extract_delegate_segments: bool, - extract_constant_segment: bool, - segment_alignment: int, - constant_tensor_alignment: int, -) -> Tuple[Program, List[bytes]]: - """Extracts constant and/or delegate data from a given Program into separate segments. - - Args: - program: The Program to extract segments from. - extract_delegate_segments: Whether to extract delegate data blobs from the program. - extract_constant_segment: Whether to extract constant data from the program. - segment_alignment: Alignment in bytes. The starting offset of each - segment will be aligned to this value in the output data. - constant_tensor_alignment: Alignment in bytes. The starting offset of each tensor - in the constant segment will be aligned to this value. - Returns: - A tuple of (modified program, list of segment data). - Raises: - ValueError, if the program already contains segments. - """ - if program.segments: - raise ValueError( - f"Program already has {len(program.segments)} segments: " - + f"{repr(program.segments)}" - ) - - # Don't modify the original program. - # TODO(T144120904): Could avoid yet more huge copies with a more shallow - # copy, reusing the actual data blobs. - program = copy.deepcopy(program) - - # Segment data to be written to the file following the flatbuffer data. - segments: List[bytes] = [] - - if extract_constant_segment: - constant_segment_data, constant_segment_offsets = _extract_constant_segment( - program.constant_buffer, tensor_alignment=constant_tensor_alignment - ) - - if constant_segment_data: - # Append constant_segment_data to the list of segments if non-empty. - segments.append(constant_segment_data) - # Append constant_segment offset to the list of DataSegments. Added as the - # first segment here, but it's not mandatory that the constant segment be - # the first one in the file. - program.segments.append( - DataSegment(offset=0, size=len(constant_segment_data)) - ) - - # Fill in constant_segment offsets and clear the constant buffer; only one of - # constant_segment and constant_buffer should be non-empty. - program.constant_segment = SubsegmentOffsets( - segment_index=0, offsets=constant_segment_offsets - ) - program.constant_buffer = [] - - if extract_delegate_segments: - _extract_delegate_segments( - program, segments=segments, segment_alignment=segment_alignment - ) - return program, segments - - -def _append_segments( - program_data: bytes, - segments: List[bytes], - alignment: int, - segment_table: List[DataSegment], - base_offset: int, -) -> bytes: - """Appends segments to the end of the program data. - - Appends each element of `segments` to `program_data`, with '\0' padding to - ensure that the offset of each segment is aligned to `alignment`. - - Args: - program_data: The flatbuffer-serialized Program. - segments: The list of segments to append to `program_data`. - alignment: Alignment in bytes. The starting offset of each - segment will be aligned to this value in the output data. - segment_table: The expected offsets and sizes of each element in - `segments`. This is typically `program.segments`. Must have the - same length as `segments`. - base_offset: The expected segment base offset from the extended header. - Should point to the aligned offset following the end of - `program_data`. 
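# The _extract_segments/_append_segments logic being removed here built up
# lists of raw byte strings; the replacement accumulates chunks in a Cord and
# joins them once at the end. A minimal sketch of the Cord interface this patch
# relies on (assumption: the real class lives in exir/_serialize/_cord.py and
# may differ in detail): construct from bytes, append bytes or another Cord,
# query the length, and materialize with bytes().
from typing import List, Optional, Union

class Cord:
    def __init__(self, data: Optional[Union[bytes, "Cord"]] = None) -> None:
        self._chunks: List[bytes] = []
        self._length: int = 0
        if data is not None:
            self.append(data)

    def __len__(self) -> int:
        return self._length

    def __bytes__(self) -> bytes:
        # A single join at the end avoids repeated reallocation of large buffers.
        return b"".join(self._chunks)

    def append(self, data: Union[bytes, "Cord"]) -> None:
        if isinstance(data, Cord):
            self._chunks.extend(data._chunks)
        else:
            self._chunks.append(data)
        self._length += len(data)

# Usage mirroring the new code path: c = Cord(b"program"); c.append(b"\x00" * 8); bytes(c)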
- Returns: - A copy of `program_data` with the segment data and padding appended. - If there are no segments, returns `program_data` directly. - Raises: - ValueError: If the length of `segments` doesn't match the length of - `segment_table`. - """ - if len(segments) != len(segment_table): - raise ValueError( - f"Segments length {len(segments)} does not match " - + f"segment_table length {len(segment_table)}" - ) - if not segments: - return program_data - - # The pieces that will be concatenated to create the output data. - # `program_data` will be its first element. - padded_segments: List[bytes] = [] - # Length of all elements in padded_segments. Only used for assertions. - current_offset: int = 0 - for i, segment in enumerate([program_data] + segments): - # Add padding if necessary to align the start of this segment. - pad_length: int = _padding_required(current_offset, alignment) - if pad_length > 0: - padded_segments.append(b"\x00" * pad_length) - current_offset += pad_length - - # Make sure that we're about to add this segment to the offset that - # agrees with program.segments. Skip the first entry, which is the - # Program itself and isn't included in program.segments. - if i == 1: - # The first real segment should start at the base offset. - assert current_offset == base_offset, ( - f"Offset of first segment {current_offset} " - + f"!= base_offset {base_offset}" - ) - if i > 0: - # Adding a real segment, not `program_data`. - expected_segment = segment_table[i - 1] - expected_offset = base_offset + expected_segment.offset - assert current_offset == expected_offset, ( - f"Segment {i} offset {current_offset} " - + f"!= expected offset {expected_offset} " - + f"(base {base_offset} + {expected_segment.offset}) " - ) - assert expected_segment.size == len(segment), ( - f"Segment {i} size {len(segment)} " - + f"!= expected size {expected_segment.size}" - ) - - # Add the payload. If this is the final segment, it does not need - # padding after it. - padded_segments.append(segment) - current_offset += len(segment) - # Use join() instead of appending to avoid O(n) reallocation of these - # potentially-large buffers. - return b"".join(padded_segments) + return constant_segment_data, constant_segment_offsets def serialize_pte_binary( @@ -524,9 +362,8 @@ def serialize_pte_binary( into a separate segment. segment_alignment: Alignment in bytes. The starting offset of each segment will be aligned to this value in the output data. - constant_tensor_alignment: If provided, the minimum alignment of tensor - buffers in the program. Must be a power of 2. If not provided, uses - the value in the schema file. + constant_tensor_alignment: The minimum alignment of tensor + buffers in the program. Must be a power of 2. Defaults to ALIGNMENT. delegate_alignment: If provided, the minimum alignment of delegate data in the program. Must be a power of 2. If not provided, uses the value in the schema file. @@ -535,20 +372,53 @@ def serialize_pte_binary( """ # Default tensor alignment. if constant_tensor_alignment is None: - constant_tensor_alignment = 16 + constant_tensor_alignment = ALIGNMENT - # Segment data to be written to the file following the flatbuffer data. - segments: List[bytes] = [] + # Don't modify the original program. + # TODO(T144120904): Could avoid yet more huge copies with a more shallow + # copy, reusing the actual data blobs. + program = copy.deepcopy(program) + + # Store extracted segment data; this may be constant data or delegate data. 
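# Further below, program.segments is rebuilt by placing each extracted segment
# at the end of the previous one, rounded up to segment_alignment. A sketch of
# that offset computation, with plain (offset, size) tuples standing in for
# DataSegment; the round-up formula in aligned_size is an assumption (the real
# helper is _aligned_size in this module).
from typing import List, Tuple

def aligned_size(input_size: int, alignment: int) -> int:
    return ((input_size + alignment - 1) // alignment) * alignment

def build_segment_table(
    segment_sizes: List[int], segment_alignment: int
) -> List[Tuple[int, int]]:
    table: List[Tuple[int, int]] = []  # offsets are relative to the segment base offset
    prev_end = 0
    for size in segment_sizes:
        offset = aligned_size(prev_end, segment_alignment)
        table.append((offset, size))
        prev_end = offset + size
    return table

# build_segment_table([100, 40], 128) -> [(0, 100), (128, 40)]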
+ segments: List[Cord] = [] + + if extract_constant_segment: + constant_segment_data, constant_segment_offsets = _extract_constant_segment( + program.constant_buffer, tensor_alignment=constant_tensor_alignment + ) + if len(constant_segment_data) > 0: + # Update program.constant_segment with constant subsegment offset information. + program.constant_segment = SubsegmentOffsets( + segment_index=len(segments), offsets=constant_segment_offsets + ) + # Clear the constant buffer, as constant data will be stored in segments. + program.constant_buffer = [] + # Add to the aggregate segments cord. + segments.append(constant_segment_data) - # Extract constant segment and delegate segments, if requested. - if extract_constant_segment or extract_delegate_segments: - program, segments = _extract_segments( - program=program, - extract_delegate_segments=extract_delegate_segments, - extract_constant_segment=extract_constant_segment, - segment_alignment=segment_alignment, - constant_tensor_alignment=constant_tensor_alignment, + if extract_delegate_segments: + _extract_delegate_segments(program, segments) + + # Append all segments into a single Cord, adding any necessary padding to ensure that + # each segment begins at the required alignment. + # Update program.segments with the offsets to each segment. + segments_data = Cord() + for data in segments: + prev_end = ( + (program.segments[-1].offset + program.segments[-1].size) + if program.segments + else 0 + ) + program.segments.append( + DataSegment( + offset=_aligned_size(prev_end, segment_alignment), size=len(data) + ) ) + # Add to aggregate segments cord with padding. + padding_length = _padding_required(len(segments_data), segment_alignment) + if padding_length > 0: + segments_data.append(b"\x00" * padding_length) + segments_data.append(data) # Convert to a standard flatbuffer binary. result: _FlatbufferResult = _program_json_to_flatbuffer( @@ -558,7 +428,7 @@ def serialize_pte_binary( ) # If there are no segments present, do not insert the extended header. - if not segments: + if len(segments_data) == 0: return result.data # Size of the header to insert. Its size is padded to the largest @@ -572,7 +442,7 @@ def serialize_pte_binary( # Offset to the first segment, or zero if there are no segments. segment_base_offset: int = ( _aligned_size(input_size=program_size, alignment=segment_alignment) - if segments + if len(segments_data) > 0 else 0 ) @@ -600,18 +470,21 @@ def serialize_pte_binary( assert eh.program_size == program_size assert eh.segment_base_offset == segment_base_offset - if segments: - # Add segments to the end of the data, in order, with the appropriate - # padding. - program_data = _append_segments( - program_data=program_data, - segments=segments, - alignment=segment_alignment, - segment_table=program.segments, - base_offset=segment_base_offset, - ) - - return program_data + # Construct the final pte file containing: + # - program data; written to offset 0. + # - segments data (optional); aligned to segment_alignment. + pte_data = Cord(program_data) + if len(segments_data) > 0: + padding_length = _padding_required(len(pte_data), segment_alignment) + pte_data.append(b"\x00" * padding_length) + # The first segment after program data should start at the segment base offset. 
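# Sketch of the final file layout assembled below: the flatbuffer-serialized
# program at offset 0, zero padding up to the aligned segment base offset
# recorded in the extended header, then the concatenated segment data. Names
# here are illustrative, not the module's API.
def aligned_size(input_size: int, alignment: int) -> int:
    return ((input_size + alignment - 1) // alignment) * alignment

def assemble_pte(program_data: bytes, segments_data: bytes, segment_alignment: int) -> bytes:
    if len(segments_data) == 0:
        # No segments: the .pte file is just the program data.
        return program_data
    segment_base_offset = aligned_size(len(program_data), segment_alignment)
    padding = b"\x00" * (segment_base_offset - len(program_data))
    return program_data + padding + segments_data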
+ assert ( + len(pte_data) == segment_base_offset + ), f"Offset of first segment {len(pte_data)} != segment base offset {segment_base_offset}" + pte_data.append(segments_data) + + # TODO(lfq): this creates a copy of all the data; once we update existing callsites this will change. + return bytes(pte_data) def _restore_segments(program: Program, segment_data: bytes) -> Program: diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index 94cd40f624..1754400dc6 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -461,6 +461,7 @@ def gen_blob_data(size: int, pattern: bytes) -> bytes: assert len(ret) == size return ret + @unittest.skip("TODO(T181362263): Update restore segments to restore cords") def test_round_trip_with_segments(self) -> None: # Create a program with some delegate data blobs. program = get_test_program() From 6be02ceec4480a2c2ce82a12299aa7a3cfd6d7be Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Thu, 7 Mar 2024 11:35:16 -0800 Subject: [PATCH 060/290] Serialize union fields with single entry dict. (#2279) Summary: X-link: https://github.com/pytorch/pytorch/pull/121337 Pull Request resolved: https://github.com/pytorch/executorch/pull/2279 remove "$type" and "$value" fields, instead only serialize as {type: value} for union fields directly. bypass-github-export-checks Reviewed By: tugsbayasgalan Differential Revision: D54600943 fbshipit-source-id: d4bab91541db42eec2d8ead1176db6acf0db8ef3 --- exir/serde/export_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exir/serde/export_serialize.py b/exir/serde/export_serialize.py index 7d07b20bfd..a8fd01a7f1 100644 --- a/exir/serde/export_serialize.py +++ b/exir/serde/export_serialize.py @@ -220,7 +220,7 @@ def serialize_tensor_meta(t: torch.Tensor) -> TensorMeta: requires_grad=t.requires_grad, device=Device(type=t.device.type, index=t.device.index), strides=[serialize_sym_int(s) for s in t.stride()], - storage_offset=0, + storage_offset=serialize_sym_int(0), layout=_TORCH_TO_SERIALIZE_LAYOUT[t.layout], ) From 424f1a99bb69a61bca0e7ef12aeca0597d166d6d Mon Sep 17 00:00:00 2001 From: Rob Elliott Date: Thu, 7 Mar 2024 11:51:28 -0800 Subject: [PATCH 061/290] Move vela to a newer pin and remove patches we've merged upstream. (#2158) Summary: passes run.sh locally ready for ci fixes in https://github.com/pytorch/executorch/issues/2143. They can import in any order. 
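The union encoding change in #2279 above can be pictured with a small sketch (illustrative only; the real serializer lives in exir/serde/export_serialize.py): a tagged union is written as a single-entry dict keyed by the variant name, rather than a dict with explicit "$type" and "$value" keys. The variant names below are made up for the example.

from typing import Any, Dict, Tuple

def serialize_union(variant: str, value: Any) -> Dict[str, Any]:
    # New form: {variant: value}
    return {variant: value}

def deserialize_union(obj: Dict[str, Any]) -> Tuple[str, Any]:
    # Exactly one entry is expected in the single-entry-dict encoding.
    (variant, value), = obj.items()
    return variant, value

# serialize_union("as_int", 7) -> {"as_int": 7}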
Pull Request resolved: https://github.com/pytorch/executorch/pull/2158 Reviewed By: mergennachin Differential Revision: D54638819 Pulled By: digantdesai fbshipit-source-id: 505cb8cf21af2961cbb39898e3a48209c79f8436 --- ...001-Improve-rescale-codegen-for-TOSA.patch | 129 ------------------ ...new-12f0e94aca6c17d0c6dc9b463277ab38.patch | 26 ---- examples/arm/setup.sh | 2 +- 3 files changed, 1 insertion(+), 156 deletions(-) delete mode 100644 examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch delete mode 100644 examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch diff --git a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch deleted file mode 100644 index e131ca76ee..0000000000 --- a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0001-Improve-rescale-codegen-for-TOSA.patch +++ /dev/null @@ -1,129 +0,0 @@ -From ef07230fbb15edbf27ecaf48994fb157430a5e7c Mon Sep 17 00:00:00 2001 -From: Rob Elliott -Date: Thu, 5 Oct 2023 16:45:42 +0000 -Subject: [PATCH] Improve rescale codegen for TOSA - -Signed-off-by: Rob Elliott ---- - ethosu/vela/tosa_graph_optimiser.py | 56 +++++++++++------------------ - ethosu/vela/tosa_mapping.py | 2 +- - 2 files changed, 22 insertions(+), 36 deletions(-) - -diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py -index df6b575..b2e3697 100644 ---- a/ethosu/vela/tosa_graph_optimiser.py -+++ b/ethosu/vela/tosa_graph_optimiser.py -@@ -337,7 +337,8 @@ def rewrite_concat(op): - - def remove_memory_ops(op, arch): - if op.run_on_npu and op.type in (Op.Reshape, Op.Identity): -- bypass_memory_only_ops(op) -+ # TODO: is this ok - function doesn't use arch or nng -+ bypass_memory_only_ops(op, arch, None) - - - def rewrite_activation(op, arch, nng): -@@ -357,7 +358,6 @@ def rewrite_activation(op, arch, nng): - - return op - -- - def rewrite_rescale(op, arch, nng): - if op.type == Op.Rescale: - ifm = op.ifm -@@ -368,7 +368,7 @@ def rewrite_rescale(op, arch, nng): - prev_op = ifm.ops[0] - - # TODO currently not supported -- assert len(ifm.consumer_list) == 1 -+ #assert len(ifm.consumer_list) == 1 - - input_zp = op.attrs["input_zp"] - output_zp = op.attrs["output_zp"] -@@ -390,6 +390,9 @@ def rewrite_rescale(op, arch, nng): - assert False - ifm.quantization.zero_point = input_zp - ofm.quantization.zero_point = output_zp -+ -+ assert False == per_channel, "Don't like per_channel!" 
-+ - for s, m in zip(shift, multiplier): - # TODO these are the TOSA limitations - assert m >= 0 -@@ -403,45 +406,28 @@ def rewrite_rescale(op, arch, nng): - else: - rounding_mode = RoundingMode.HalfUp - -- if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected: -+ fuse = len(ifm.ops) == 1 and prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() -+ if fuse: -+ # TODO: ERROR: bias.values didn't exist for an op like Add - presumably not a capability of that op - assert len(multiplier) == len(shift) == len(prev_op.bias.values) -- -- if ifm.dtype == DataType.int32 and per_channel: -- prev_op.explicit_scaling = explicit_scaling -- prev_op.rounding_mode = rounding_mode -- -- # Bypass op -- prev_op.set_output_tensor(ofm) -- DebugDatabase.add_optimised(op, prev_op) -- return op -- else: -- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) -- assert False -- # TODO which are the cases we need to and can do standalone Rescale? -- # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops? -- # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE? -- # limited to these at the moment: -- elif ( -- (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) -- or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8) -- or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8) -- ): -- # Create NOP performing the RESCALE -+ # TODO: generate replacement fusion code from below -+ assert False, "Fusion possible but i've not implemented it" -+ else: -+ # Generate Rescale behaviour attached to a compatible NOP -+ # TODO: I assume this attaches a new operator into the graph?? - avgpool_op = replace_rescale_with_avg_pool(op) - avgpool_op.rounding_mode = rounding_mode -- -+ - if per_channel: -- # TODO -- avgpool_op.explicit_scaling = explicit_scaling -- print("Warning, unsupported TOSA Rescale") -- assert False -+ assert False, "Assert above removed but still not implemented... 
:/" - else: - avgpool_op.explicit_scaling = explicit_scaling -- else: -- print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) -- assert False -- return op - -+ #print( len(multiplier), len(shift), len(prev_op.get_bias_tensors()) ) -+ #print( ifm.dtype, "PC:", per_channel, op.type ) -+ #print( ifm.dtype, ofm.dtype ) -+ -+ return op - - def convert_pad_in_width(op): - """ -diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py -index 2dafd81..ed5aa2e 100644 ---- a/ethosu/vela/tosa_mapping.py -+++ b/ethosu/vela/tosa_mapping.py -@@ -148,7 +148,7 @@ transpose_conv_attrs = AttrSerializer( - ) - transpose_attrs = AttrSerializer("TransposeAttribute", (("perms", is_vec),)) - axis_attrs = AttrSerializer("AxisAttribute", ("axis",)) --reshape_attrs = AttrSerializer("ReshapeAttribute", (("shape", is_vec),)) -+reshape_attrs = AttrSerializer("ReshapeAttribute", (("newShape", is_vec),)) - slice_attrs = AttrSerializer("SliceAttribute", (("start", is_vec), ("size", is_vec))) - tile_attrs = AttrSerializer("TileAttribute", (("multiplies", is_vec),)) - resize_attrs = AttrSerializer( --- -2.41.0 - diff --git a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch b/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch deleted file mode 100644 index 5d9a560b08..0000000000 --- a/examples/arm/ethos-u-setup/ethos-u-vela/patches/0002-Use-TOSA-0.80.1_new-12f0e94aca6c17d0c6dc9b463277ab38.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 394636ef2063e386f54abde094298bb0e40a2cb7 Mon Sep 17 00:00:00 2001 -From: Zingo Andersen -Date: Sat, 20 Jan 2024 10:34:45 +0100 -Subject: [PATCH 2/2] Use TOSA 0.80.1 - -Signed-off-by: Zingo Andersen ---- - ethosu/vela/tosa_reader.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ethosu/vela/tosa_reader.py b/ethosu/vela/tosa_reader.py -index 56af59d..7cb2bf3 100644 ---- a/ethosu/vela/tosa_reader.py -+++ b/ethosu/vela/tosa_reader.py -@@ -294,7 +294,7 @@ class TosaGraph: - def check_version(self, tosa_graph): - version = tosa_graph.Version() - version_str = f"{version._Major()}.{version._Minor()}.{version._Patch()}" -- if version_str != "0.80.0": -+ if version_str != "0.80.1": - print(f"Unsupported TOSA version: {version_str}") - assert False - --- -2.25.1 - diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 991772166a..d1eeb84173 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -215,7 +215,7 @@ function setup_vela() { if [[ ! 
-e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=00a15db3e1a188b25065d095152d701f4394cdc5 + base_rev=78b9412b07e0a46e58e8ecb9da8d661399c006a5 patch_repo fi cd "${root_dir}/ethos-u-vela" From b56065b0d7b9b6036b875ef348bea5af672a38af Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Thu, 7 Mar 2024 12:18:12 -0800 Subject: [PATCH 062/290] Add generated_lib dep (#2297) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2297 bypass-github-export-checks Reviewed By: larryliu0820, mikekgfb Differential Revision: D54568050 fbshipit-source-id: 3f2530423550546b01846d25114bb95fb169967a --- examples/models/llama2/ops/quantized.yaml | 6 ------ examples/models/llama2/runner/targets.bzl | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml index e3b6e2bbbe..8e435169e1 100644 --- a/examples/models/llama2/ops/quantized.yaml +++ b/examples/models/llama2/ops/quantized.yaml @@ -9,9 +9,3 @@ kernels: - arg_meta: null kernel_name: torch::executor::quantized_embedding_byte_dtype_out - -- func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) - variants: function - kernels: - - arg_meta: null - kernel_name: torch::executor::quantized_mixed_linear_out diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl index 57d25be8fa..31d05285e5 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama2/runner/targets.bzl @@ -6,7 +6,7 @@ def _get_operator_lib(aten = False): elif runtime.is_oss: return ["//executorch/kernels/portable:generated_lib_all_ops"] else: - return ["//executorch/kernels/portable:generated_lib_all_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops"] + return ["//executorch/kernels/portable:generated_lib_all_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"] def define_common_targets(): for aten in (True, False): From 990f6ca5e2750cf1db838dbcbc7625a64b9b8316 Mon Sep 17 00:00:00 2001 From: Hansong Zhang Date: Thu, 7 Mar 2024 12:24:53 -0800 Subject: [PATCH 063/290] UX Improvements (#2291) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2291 * Chat UX * RAM info * Local model selection * Model load time * Tokens/sec info bypass-github-pytorch-ci-checks CI broken Reviewed By: shoumikhin Differential Revision: D54621204 fbshipit-source-id: a12282d5dfd34770f5f6de46db6a6132eaf7d950 --- .../example/executorchdemo/MainActivity.java | 209 ++++++++++++------ .../com/example/executorchdemo/Message.java | 40 ++++ .../executorchdemo/MessageAdapter.java | 40 ++++ .../main/res/drawable/received_message.xml | 6 + .../src/main/res/drawable/sent_message.xml | 6 + .../app/src/main/res/drawable/three_dots.xml | 5 + .../app/src/main/res/layout/activity_main.xml | 50 ++--- .../src/main/res/layout/received_message.xml | 42 ++++ .../app/src/main/res/layout/sent_message.xml | 34 +++ extension/android/jni/jni_layer_llama.cpp | 5 + .../org/pytorch/executorch/LlamaModule.java | 3 + 11 files changed, 343 insertions(+), 97 deletions(-) create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/Message.java create mode 100644 
examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MessageAdapter.java create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/received_message.xml create mode 100644 examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/sent_message.xml diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java index c24c367860..dd7cbfe50f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MainActivity.java @@ -9,92 +9,118 @@ package com.example.executorchllamademo; import android.app.Activity; +import android.app.ActivityManager; import android.app.AlertDialog; import android.content.Context; import android.os.Bundle; import android.widget.Button; import android.widget.EditText; -import android.widget.TextView; +import android.widget.ImageButton; +import android.widget.ListView; import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; public class MainActivity extends Activity implements Runnable, LlamaCallback { private EditText mEditTextMessage; - private TextView mTextViewChat; private Button mSendButton; - private Button mStopButton; - private Button mModelButton; + private ImageButton mModelButton; + private ListView mMessagesView; + private MessageAdapter mMessageAdapter; private LlamaModule mModule = null; - private String mResult = null; + private Message mResultMessage = null; - private static String assetFilePath(Context context, String assetName) throws IOException { - File file = new File(context.getFilesDir(), assetName); - if (file.exists() && file.length() > 0) { - return file.getAbsolutePath(); - } - - try (InputStream is = context.getAssets().open(assetName)) { - try (OutputStream os = new FileOutputStream(file)) { - byte[] buffer = new byte[4 * 1024]; - int read; - while ((read = is.read(buffer)) != -1) { - os.write(buffer, 0, read); - } - os.flush(); - } - return file.getAbsolutePath(); - } - } + private int mNumTokens = 0; + private long mRunStartTime = 0; + private String mModelFilePath = ""; + private String mTokenizerFilePath = ""; @Override public void onResult(String result) { System.out.println("onResult: " + result); - mResult = result; + mResultMessage.appendText(result); + mNumTokens++; run(); } - private void setModel(String modelPath, String tokenizerPath) { - try { - String model = MainActivity.assetFilePath(getApplicationContext(), modelPath); - String tokenizer = MainActivity.assetFilePath(getApplicationContext(), tokenizerPath); - mModule = new LlamaModule(model, tokenizer, 0.8f); - } catch (IOException e) { - finish(); + private static String[] listLocalFile(String path, String suffix) { + File directory = new File(path); + if (directory.exists() && directory.isDirectory()) { + File[] files = directory.listFiles((dir, name) -> 
name.toLowerCase().endsWith(suffix)); + String[] result = new String[files.length]; + for (int i = 0; i < files.length; i++) { + if (files[i].isFile() && files[i].getName().endsWith(suffix)) { + result[i] = files[i].getAbsolutePath(); + } + } + return result; } + return null; } private void setLocalModel(String modelPath, String tokenizerPath) { + long runStartTime = System.currentTimeMillis(); mModule = new LlamaModule(modelPath, tokenizerPath, 0.8f); + int loadResult = mModule.load(); + if (loadResult != 0) { + AlertDialog.Builder builder = new AlertDialog.Builder(this); + builder.setTitle("Load failed: " + loadResult); + AlertDialog alert = builder.create(); + alert.show(); + } + + long runDuration = System.currentTimeMillis() - runStartTime; + String modelInfo = + "Model path: " + + modelPath + + "\nTokenizer path: " + + tokenizerPath + + "\nModel loaded time: " + + runDuration + + " ms"; + Message modelLoadedMessage = new Message(modelInfo, false); + mMessageAdapter.add(modelLoadedMessage); + mMessageAdapter.notifyDataSetChanged(); + } + + private String memoryInfo() { + final ActivityManager am = (ActivityManager) getSystemService(Context.ACTIVITY_SERVICE); + ActivityManager.MemoryInfo memInfo = new ActivityManager.MemoryInfo(); + am.getMemoryInfo(memInfo); + return "Total RAM: " + + Math.floorDiv(memInfo.totalMem, 1000000) + + " MB. Available RAM: " + + Math.floorDiv(memInfo.availMem, 1000000) + + " MB."; } private void modelDialog() { - AlertDialog.Builder builder = new AlertDialog.Builder(this); - builder.setTitle("Select a Model"); - builder.setSingleChoiceItems( - new String[] {"stories", "language"}, + String[] pteFiles = listLocalFile("/data/local/tmp/llama/", ".pte"); + String[] binFiles = listLocalFile("/data/local/tmp/llama/", ".bin"); + AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); + modelPathBuilder.setTitle("Select model path"); + AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); + tokenizerPathBuilder.setTitle("Select tokenizer path"); + modelPathBuilder.setSingleChoiceItems( + binFiles, -1, - new android.content.DialogInterface.OnClickListener() { - public void onClick(android.content.DialogInterface dialog, int item) { - switch (item) { - case 0: - setModel("stories110M.pte", "tokenizer.bin"); - break; - case 1: - setLocalModel("/data/local/tmp/language.pte", "/data/local/tmp/language.bin"); - break; - } - mEditTextMessage.setText(""); - mTextViewChat.setText(""); - dialog.dismiss(); - } + (dialog, item) -> { + mTokenizerFilePath = binFiles[item]; + mEditTextMessage.setText(""); + dialog.dismiss(); + tokenizerPathBuilder.create().show(); + }); + + tokenizerPathBuilder.setSingleChoiceItems( + pteFiles, + -1, + (dialog, item) -> { + mModelFilePath = pteFiles[item]; + setLocalModel(mModelFilePath, mTokenizerFilePath); + dialog.dismiss(); }); - AlertDialog alert = builder.create(); - alert.show(); + + modelPathBuilder.create().show(); } @Override @@ -103,38 +129,76 @@ protected void onCreate(Bundle savedInstanceState) { setContentView(R.layout.activity_main); mEditTextMessage = findViewById(R.id.editTextMessage); - mTextViewChat = findViewById(R.id.textViewChat); mSendButton = findViewById(R.id.sendButton); - mStopButton = findViewById(R.id.stopButton); mModelButton = findViewById(R.id.modelButton); + mMessagesView = findViewById(R.id.messages_view); + mMessageAdapter = new MessageAdapter(this, R.layout.sent_message); + mMessagesView.setAdapter(mMessageAdapter); + mModelButton.setOnClickListener( + view -> { + 
mModule.stop(); + mMessageAdapter.clear(); + mMessageAdapter.notifyDataSetChanged(); + modelDialog(); + }); + setLocalModel("/data/local/tmp/llama/stories110M.pte", "/data/local/tmp/llama/tokenizer.bin"); + onModelRunStopped(); + } + + private void onModelRunStarted() { + mSendButton.setText("Stop"); + mSendButton.setOnClickListener( + view -> { + mModule.stop(); + }); + + mRunStartTime = System.currentTimeMillis(); + } + + private void onModelRunStopped() { + setTitle(memoryInfo()); + long runDuration = System.currentTimeMillis() - mRunStartTime; + if (mResultMessage != null) { + mResultMessage.setTokensPerSecond(1.0f * mNumTokens / (runDuration / 1000.0f)); + } + mSendButton.setText("Generate"); mSendButton.setOnClickListener( view -> { String prompt = mEditTextMessage.getText().toString(); - mTextViewChat.append(prompt); + mMessageAdapter.add(new Message(prompt, true)); + mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); + mResultMessage = new Message("", false); + mMessageAdapter.add(mResultMessage); Runnable runnable = new Runnable() { @Override public void run() { + runOnUiThread( + new Runnable() { + @Override + public void run() { + onModelRunStarted(); + } + }); + mModule.generate(prompt, MainActivity.this); + + runOnUiThread( + new Runnable() { + @Override + public void run() { + onModelRunStopped(); + } + }); } }; new Thread(runnable).start(); }); - - mStopButton.setOnClickListener( - view -> { - mModule.stop(); - }); - - mModelButton.setOnClickListener( - view -> { - mModule.stop(); - modelDialog(); - }); - - setModel("stories110M.pte", "tokenizer.bin"); + mNumTokens = 0; + mRunStartTime = 0; + mMessageAdapter.notifyDataSetChanged(); } @Override @@ -143,7 +207,8 @@ public void run() { new Runnable() { @Override public void run() { - mTextViewChat.append(mResult); + mMessageAdapter.notifyDataSetChanged(); + setTitle(memoryInfo()); } }); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/Message.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/Message.java new file mode 100644 index 0000000000..81b77b1aba --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/Message.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package com.example.executorchllamademo; + +public class Message { + private String text; + private boolean isSent; + private float tokensPerSecond; + + public Message(String text, boolean isSent) { + this.text = text; + this.isSent = isSent; + } + + public String getText() { + return text; + } + + public void appendText(String text) { + this.text += text; + } + + public boolean getIsSent() { + return isSent; + } + + public void setTokensPerSecond(float tokensPerSecond) { + this.tokensPerSecond = tokensPerSecond; + } + + public float getTokensPerSecond() { + return tokensPerSecond; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MessageAdapter.java new file mode 100644 index 0000000000..656da1967d --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchdemo/MessageAdapter.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +import android.view.LayoutInflater; +import android.view.View; +import android.view.ViewGroup; +import android.widget.ArrayAdapter; +import android.widget.TextView; + +public class MessageAdapter extends ArrayAdapter { + public MessageAdapter(android.content.Context context, int resource) { + super(context, resource); + } + + @Override + public View getView(int position, View convertView, ViewGroup parent) { + Message currentMessage = getItem(position); + + int layoutIdForListItem = + currentMessage.getIsSent() ? 
R.layout.sent_message : R.layout.received_message; + View listItemView = + LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false); + TextView messageTextView = listItemView.findViewById(R.id.message_text); + messageTextView.setText(currentMessage.getText()); + + if (currentMessage.getTokensPerSecond() > 0) { + TextView tokensView = listItemView.findViewById(R.id.tokens_per_second); + tokensView.setText("" + currentMessage.getTokensPerSecond() + " t/s"); + } + + return listItemView; + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml new file mode 100644 index 0000000000..ea2d1bbfa1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml new file mode 100644 index 0000000000..e8d13ca4e1 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml @@ -0,0 +1,6 @@ + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml new file mode 100644 index 0000000000..afbe22da80 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml @@ -0,0 +1,5 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index f769578d33..089acb572b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -1,44 +1,44 @@ - - - + -