diff --git a/include/onnxruntime/core/framework/tensor_shape.h b/include/onnxruntime/core/framework/tensor_shape.h
index 89d9b105946ef..365cbe26c58e3 100644
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include "onnxruntime_config.h"

 namespace onnxruntime {
@@ -16,60 +17,47 @@ namespace onnxruntime {
 #pragma GCC diagnostic ignored "-Wnull-dereference"
 #endif
 #endif
-class TensorShape : private std::vector<int64_t> {
-  // TODO - Use a custom STL allocator to avoid heap allocations in the common case.
+class TensorShape {
   // We use negative numbers for unknown symbolic dimension. Each negative
   // number represents a unique symbolic dimension.
-  // Private inheritance is used to prevent ambiguity of element versus dimension size
  public:
   TensorShape() = default;

-  TensorShape(const TensorShape& /*other*/) = default;
-  TensorShape& operator=(const TensorShape& /*other*/) = default;
+  TensorShape(const TensorShape& other) : TensorShape(other.GetDims()) {}
+  TensorShape& operator=(const TensorShape& other);

-  TensorShape(TensorShape&& /*other*/) = default;
-  TensorShape& operator=(TensorShape&& /*other*/) = default;
+  TensorShape(TensorShape&& other) { operator=(std::move(other)); }
+  TensorShape& operator=(TensorShape&& other);

-  TensorShape(const std::vector<int64_t>& dims) : std::vector<int64_t>(dims) {}
+  TensorShape(gsl::span<const int64_t> dims);
+  TensorShape(const std::vector<int64_t>& dims) : TensorShape(gsl::make_span(dims)) {}
+  TensorShape(const std::initializer_list<int64_t>& dims) : TensorShape(gsl::make_span(dims)) {}
+  TensorShape(const int64_t* dimension_sizes, size_t dimension_count) : TensorShape(gsl::span<const int64_t>(dimension_sizes, dimension_count)) {}
+  TensorShape(const std::vector<int64_t>& dims, size_t start, size_t end) : TensorShape(gsl::span<const int64_t>(&dims[start], end - start)) {}

-  TensorShape(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {}
-
-  TensorShape(const std::initializer_list<int64_t>& dims) : std::vector<int64_t>(dims) {}
-
-  TensorShape(const int64_t* dimension_sizes, size_t dimension_count);
-
-  TensorShape(const std::vector<int64_t>& dims, size_t start, size_t end);
+  // Create a TensorShape that points to an existing buffer internally. As no copy is made, 'data' must remain valid for the life of the TensorShape
+  static const TensorShape FromExistingBuffer(const std::vector<int64_t>& data) {
+    return TensorShape(External{}, gsl::span<int64_t>(const_cast<int64_t*>(data.data()), data.size()));
+  }

   /**
      Return the dimension specified by <idx>.
  */
-  const int64_t& operator[](size_t idx) const {
-    return std::vector<int64_t>::operator[](static_cast<int>(idx));
-  }
-
-  int64_t& operator[](size_t idx) {
-    return std::vector<int64_t>::operator[](static_cast<int>(idx));
-  }
+  int64_t operator[](size_t idx) const { return values_[idx]; }
+  int64_t& operator[](size_t idx) { return values_[idx]; }

-  bool operator==(const TensorShape& other) const noexcept {
-    auto thisVector = static_cast<const std::vector<int64_t>*>(this);
-    auto otherVector = static_cast<const std::vector<int64_t>*>(&other);
-    return *thisVector == *otherVector;
-  }
-
-  bool operator!=(const TensorShape& other) const noexcept {
-    return !(*this == other);
-  }
+  bool operator==(const TensorShape& other) const noexcept { return GetDims() == other.GetDims(); }
+  bool operator!=(const TensorShape& other) const noexcept { return GetDims() != other.GetDims(); }

   size_t NumDimensions() const noexcept {
-    return size();
+    return values_.size();
   }

   /**
      Copy dims into an array with given size
  */
   void CopyDims(int64_t* dims, size_t num_dims) const {
-    memcpy(dims, data(), sizeof(value_type) * std::min(num_dims, NumDimensions()));
+    memcpy(dims, values_.begin(), sizeof(int64_t) * std::min(num_dims, NumDimensions()));
   }

   /**
@@ -78,13 +66,14 @@ class TensorShape : private std::vector<int64_t> {
      and this function does no checks to ensure that
  */
   void CopyDims(int64_t* dims, size_t start_dim, size_t num_dims) const {
-    memcpy(dims, data() + start_dim, sizeof(value_type) * std::min(num_dims, NumDimensions() - start_dim));
+    memcpy(dims, values_.begin() + start_dim, sizeof(int64_t) * std::min(num_dims, NumDimensions() - start_dim));
   }

   /**
      Return underlying vector representation.
  */
-  const std::vector<int64_t>& GetDims() const { return *this; }
+  gsl::span<const int64_t> GetDims() const { return values_; }
+  std::vector<int64_t> GetDimsAsVector() const { return std::vector<int64_t>(values_.begin(), values_.end()); }

   /**
    * Return the total number of elements. Returns 1 for an empty (rank 0) TensorShape.
@@ -116,7 +105,7 @@ class TensorShape : private std::vector<int64_t> {
   /**
      Return a new TensorShape of the dimensions from dimstart to end.
  */
-  TensorShape Slice(size_t dimstart) const;
+  TensorShape Slice(size_t dimstart) const { return Slice(dimstart, values_.size()); }

   /**
      output dimensions nicely formatted
@@ -134,14 +123,22 @@ class TensorShape : private std::vector<int64_t> {
      empty shape or 1D shape (1) is regarded as scalar tensor
  */
   bool IsScalar() const {
-    size_t len = size();
-    return len == 0 || (len == 1 && operator[](0) == 1);
+    size_t len = values_.size();
+    return len == 0 || (len == 1 && values_[0] == 1);
   }

-  static const TensorShape& ReinterpretBaseType(const std::vector<int64_t>& dimensions) {
-    static_assert(sizeof(TensorShape) == sizeof(std::vector<int64_t>), "Size of TensorShape prevents safe casting from vector");
-    return *static_cast<const TensorShape*>(&dimensions);
-  }
+ private:
+
+  struct External {};
+  TensorShape(External, gsl::span<int64_t> buffer) : values_{buffer} {}
+
+  void Allocate(size_t size);
+
+  gsl::span<int64_t> values_;
+  int64_t small_buffer_[5];
+  std::unique_ptr<int64_t[]> allocated_buffer_;
+
+  friend struct ProviderHostImpl;  // So that the shared provider interface can access Allocate
 };
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc
index f5014d6abebea..8f27b8a86633e 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc
@@ -423,7 +423,7 @@ Status Attention<T>::Compute(OpKernelContext* context) const {
                                   past,
                                   extra_add_qk));

-  const auto& shape = input->Shape().GetDims();
+  const auto shape = input->Shape().GetDims();
   const int batch_size = static_cast<int>(shape[0]);
   const int sequence_length = static_cast<int>(shape[1]);
   const int input_hidden_size = static_cast<int>(shape[2]);
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h
index 8ad3b0c217670..dff54d5f3b1c9 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h
@@ -61,7 +61,7 @@ class AttentionCPUBase : public AttentionBase {
     BufferUniquePtr mask_data_buffer(mask_data, BufferDeleter(allocator));

     const int32_t* mask_index_data = mask_index != nullptr ? mask_index->template Data<int32_t>() : nullptr;
-    const std::vector<int64_t>* mask_index_dims = mask_index != nullptr ? &(mask_index->Shape().GetDims()) : nullptr;
+    gsl::span<const int64_t> mask_index_dims = mask_index != nullptr ? mask_index->Shape().GetDims() : gsl::span<const int64_t>{};
     const T* past_data = past != nullptr ? past->template Data<T>() : nullptr;
     T* present_data = present != nullptr ? present->template MutableData<T>() : nullptr;

@@ -97,7 +97,7 @@ class AttentionCPUBase : public AttentionBase {
                          const T* Q,                  // Q data. Its size is BxNxSxH
                          const T* K,                  // k data. Its size is BxNxSxH
                          const int32_t* mask_index,   // mask index. nullptr if no mask or its size is B
-                         const std::vector<int64_t>* mask_index_dims,  // mask index shape
+                         gsl::span<const int64_t> mask_index_dims,     // mask index shape
                          T* mask_data,                // buffer for mask data. It is nullptr if mask_index is nullptr and not unidirectional, otherwise its shape is BxSxS*
                          bool has_unidirectional,     // has unidirectional mask
                          int batch_size,              // batch size of self-attention
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
index 3a87949caf63f..6e21d82d6a972 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_helper.h
@@ -62,7 +62,7 @@ inline void ComputeAttentionSoftmaxInplace(float* score, int N, int D, ThreadPoo
 template <typename T>
 void PrepareMask(const int32_t* mask_index,
-                 const std::vector<int64_t>* mask_index_dims,
+                 gsl::span<const int64_t> mask_index_dims,
                  T* mask_data,
                  bool is_unidirectional,
                  int batch_size,
@@ -74,12 +74,12 @@ void PrepareMask(const int32_t* mask_index,
   T* p_mask = mask_data;

   // 4D mask in Megatron GPT2 is currently not support in CPU kernel
-  if (nullptr != mask_index_dims && mask_index_dims->size() == 4) {
+  if (nullptr != mask_index && mask_index_dims.size() == 4) {
     ORT_NOT_IMPLEMENTED("4D mask in attention cpu kernel is not supported");
   }

   // For 3D mask, convert values 0 to -10000.0, and 1 to 0.0, then apply unidirectional mask if any.
-  if (nullptr != mask_index_dims && mask_index_dims->size() == 3) {
+  if (nullptr != mask_index && mask_index_dims.size() == 3) {
     for (int i = 0; i < batch_size * sequence_length * all_sequence_length; i++) {
       p_mask[i] = (mask_index[i] > 0) ? static_cast<T>(0.0f) : static_cast<T>(-10000.0f);
     }
@@ -98,8 +98,8 @@ void PrepareMask(const int32_t* mask_index,
     return;
   }

-  bool is_raw_attention_mask = (nullptr != mask_index_dims && mask_index_dims->size() == 2);
-  bool has_mask_start_position = (nullptr != mask_index_dims && mask_index_dims->size() == 1 && static_cast<int>(mask_index_dims->at(0)) == 2 * batch_size);
+  bool is_raw_attention_mask = (nullptr != mask_index && mask_index_dims.size() == 2);
+  bool has_mask_start_position = (nullptr != mask_index && mask_index_dims.size() == 1 && static_cast<int>(mask_index_dims.at(0)) == 2 * batch_size);

   for (int b_i = 0; b_i < batch_size; b_i++) {
     // TODO: mask_index can be used in softmax to save some calculation.
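Note: the PrepareMask changes above illustrate the call-site pattern used throughout this patch. An optional `const std::vector<int64_t>*` parameter becomes a by-value `gsl::span<const int64_t>` that is simply empty when there is no shape, and "is there a mask" checks move to the data pointer. A minimal standalone sketch of that pattern follows, assuming the Microsoft GSL header <gsl/gsl> is available; `DescribeMask` is an illustrative helper, not an ONNX Runtime function.

#include <cstdint>
#include <cstdio>
#include <vector>
#include <gsl/gsl>

// Old style: void DescribeMask(const int32_t* mask, const std::vector<int64_t>* mask_dims);
// New style: pass the dims by value as a span; an empty span replaces the null pointer.
void DescribeMask(const int32_t* mask, gsl::span<const int64_t> mask_dims) {
  if (mask == nullptr) {  // presence is decided by the data pointer, not the dims
    std::printf("no mask\n");
    return;
  }
  if (mask_dims.size() == 2) {
    std::printf("raw 2D mask: %lld x %lld\n",
                static_cast<long long>(mask_dims[0]), static_cast<long long>(mask_dims[1]));
  } else if (mask_dims.size() == 1) {
    std::printf("1D mask index of length %lld\n", static_cast<long long>(mask_dims[0]));
  }
}

int main() {
  std::vector<int64_t> dims{2, 128};
  std::vector<int32_t> mask(2 * 128, 1);
  DescribeMask(mask.data(), gsl::span<const int64_t>(dims.data(), dims.size()));  // in ORT the caller passes Shape().GetDims()
  DescribeMask(nullptr, gsl::span<const int64_t>{});                              // "no mask" case
  return 0;
}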
diff --git a/onnxruntime/contrib_ops/cpu/expand_dims.h b/onnxruntime/contrib_ops/cpu/expand_dims.h index 72930508d68eb..71d5f99c9d996 100644 --- a/onnxruntime/contrib_ops/cpu/expand_dims.h +++ b/onnxruntime/contrib_ops/cpu/expand_dims.h @@ -30,7 +30,7 @@ class ExpandDims final : public OpKernel { if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const TensorShape& X_shape = X->Shape(); - std::vector expanded_shape(X_shape.GetDims()); + std::vector expanded_shape(X_shape.GetDimsAsVector()); int64_t X_NumDims = X_shape.Size(); ORT_ENFORCE(axis <= X_NumDims && axis >= -X_NumDims, "Axis must be within range [", -X_NumDims, ", ", X_NumDims, "].", " Axis is ", axis); diff --git a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc index 2312e6dfb6a90..d8314d4cfd37d 100644 --- a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc +++ b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc @@ -81,7 +81,7 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const { int64_t image_size = std::accumulate(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end, 1LL, std::multiplies()); - std::vector output_dims(x_shape); + std::vector output_dims(x_shape.begin(), x_shape.end()); std::transform(x_shape.cbegin() + spatial_dim_start, x_shape.cbegin() + spatial_dim_end, output_dims.begin() + spatial_dim_start, [](const int64_t&) { return int64_t{1}; }); Tensor& Y = *context->Output(0, output_dims); diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc index 6edd75d8a3cf8..154c4620b360d 100644 --- a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc +++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc @@ -541,7 +541,7 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const { std::vector kernel_shape = pool_attrs_.kernel_shape; if (channels_last_) { - std::vector x_dims = x_shape.GetDims(); + std::vector x_dims = x_shape.GetDimsAsVector(); SwitchDimsNchwNhwc(x_dims, false); x_shape = TensorShape(x_dims); } diff --git a/onnxruntime/contrib_ops/cpu/tokenizer.cc b/onnxruntime/contrib_ops/cpu/tokenizer.cc index c991f045f7741..ac0cede673138 100644 --- a/onnxruntime/contrib_ops/cpu/tokenizer.cc +++ b/onnxruntime/contrib_ops/cpu/tokenizer.cc @@ -21,14 +21,14 @@ class Tokenizer final : public OpKernel { private: Status CharTokenize(OpKernelContext* context, size_t N, size_t C, - const std::vector& input_dims) const; + gsl::span input_dims) const; Status SeparatorExpressionTokenizer(OpKernelContext* context, size_t N, size_t C, - const std::vector& input_dims) const; + gsl::span input_dims) const; Status TokenExpression(OpKernelContext* ctx, size_t N, size_t C, - const std::vector& input_dims) const; + gsl::span input_dims) const; bool mark_{false}; std::string pad_value_; @@ -114,7 +114,7 @@ Tokenizer::Tokenizer(const OpKernelInfo& info) : OpKernel(info) { } Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, - const std::vector& input_dims) const { + gsl::span input_dims) const { // With char tokenzation we get as many tokens as the number of // utf8 characters in the string. 
So for every string we calculate its character(utf8) length // add padding and add start/end test separators if necessary @@ -137,7 +137,7 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, ++curr_input; } - std::vector output_dims(input_dims); + std::vector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to apparently empty strings input. if (max_tokens == 0) { output_dims.push_back(0); @@ -193,7 +193,7 @@ Status Tokenizer::CharTokenize(OpKernelContext* ctx, size_t N, size_t C, Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, size_t N, size_t C, - const std::vector& input_dims) const { + gsl::span input_dims) const { using namespace re2; std::vector> rows; rows.reserve(N * C); @@ -276,7 +276,7 @@ Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, ++curr_input; } - std::vector output_dims(input_dims); + std::vector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to either empty input // everything is a separator if (max_tokens == 0) { @@ -334,7 +334,7 @@ Status Tokenizer::SeparatorExpressionTokenizer(OpKernelContext* ctx, Status Tokenizer::TokenExpression(OpKernelContext* ctx, size_t N, size_t C, - const std::vector& input_dims) const { + gsl::span input_dims) const { using namespace re2; // Represents a token that will be output after // first is the index, second is the size; @@ -400,7 +400,7 @@ Status Tokenizer::TokenExpression(OpKernelContext* ctx, } // Check for empty output - std::vector output_dims(input_dims); + std::vector output_dims(input_dims.begin(), input_dims.end()); // Check if we have no output due to either empty input // everything is a separator if (max_tokens == 0) { @@ -468,7 +468,7 @@ Status Tokenizer::Compute(OpKernelContext* ctx) const { } auto& input_shape = X->Shape(); - auto& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); size_t N = 0; size_t C = 0; if (input_dims.size() == 1) { diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index c4594cf74f99f..b0ad840219b82 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -99,7 +99,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), - nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()), + nullptr == mask_index ? 
gsl::span() : mask_index->Shape().GetDims(), output->template MutableData(), batch_size, sequence_length, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 0e0cdba9a81b9..2f48d806e5eb9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -65,7 +65,7 @@ bool QkvToContext( const cudaDeviceProp& prop, cublasHandle_t& cublas, cudaStream_t stream, const int batch_size, const int sequence_length, const int num_heads, const int head_size, const size_t element_size, const T* input, T* output, T* workspace, - const int* mask_index, const std::vector* mask_index_dims, + const int* mask_index, gsl::span mask_index_dims, bool is_unidirectional, int past_sequence_length, const T* past, const T* extra_add_qk, T* present, bool use_persistent_softmax) { const int all_sequence_length = past_sequence_length + sequence_length; const size_t bytes = GetAttentionScratchSize(element_size, batch_size, num_heads, sequence_length, all_sequence_length); @@ -106,7 +106,7 @@ bool QkvToContext( } // Raw attention mask could be 2D (BxS) or 3D (BxSxS*) or 4D(Bx1xMxM), where M is the max sequence length. - bool use_raw_attention_mask = (nullptr != mask_index && nullptr != mask_index_dims && mask_index_dims->size() >= 2); + bool use_raw_attention_mask = (nullptr != mask_index && mask_index_dims.size() >= 2); // compute Q*K' (as K'*Q), scaled by 1/sqrt(H) and store in scratch1: BxNxSxS* // Q: BxNxSxH, K (present_k): BxNxS*xH, Q*K': BxNxSxS* @@ -126,8 +126,8 @@ bool QkvToContext( // apply softmax and store result P to scratch2: BxNxSxS* if (use_raw_attention_mask) { // 2d, 3d or 4d attention mask - const int mask_dimension = static_cast(mask_index_dims->size()); - const int64_t max_sequence_length = mask_dimension == 4 ? mask_index_dims->at(3) : 0; + const int mask_dimension = static_cast(mask_index_dims.size()); + const int64_t max_sequence_length = mask_dimension == 4 ? mask_index_dims.at(3) : 0; T* persistent_softmax_workspace = scratch1; // replace Q*K' in place with masked score if persistent softmax is selected. if (!ComputeSoftmaxWithRawMask(stream, all_sequence_length, sequence_length, batch_size, num_heads, mask_index, extra_add_qk, scratch1, scratch2, @@ -136,9 +136,9 @@ bool QkvToContext( return false; } } else if (nullptr != mask_index) { // 1d mask index - ORT_ENFORCE(nullptr != mask_index_dims && mask_index_dims->size() == 1); + ORT_ENFORCE(mask_index_dims.size() == 1); // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions. - const int* mask_start = (mask_index_dims->at(0) > batch_size) ? mask_index + batch_size : nullptr; + const int* mask_start = (mask_index_dims.at(0) > batch_size) ? 
mask_index + batch_size : nullptr; if (!ComputeSoftmaxWithMask1D(stream, all_sequence_length, sequence_length, batch_size, num_heads, mask_index, mask_start, extra_add_qk, scratch1, scratch2, is_unidirectional)) { return false; } @@ -164,7 +164,7 @@ bool LaunchAttentionKernel( cudaStream_t stream, const void* input, const int* mask_index, - const std::vector* mask_index_dims, + gsl::span mask_index_dims, void* output, const int batch_size, const int sequence_length, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h index d34cdccfe6a3d..e07b45e8979c9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h @@ -23,7 +23,7 @@ bool LaunchAttentionKernel( cudaStream_t stream, // cuda stream const void* input, // Input tensor const int* mask_index, // Attention mask raw data or index (end position of each sequence, or end positions and start positions). NULL means no mask. - const std::vector* mask_index_dims, // Mask index shape + gsl::span mask_index_dims, // Mask index shape void* output, // Output tensor int batch_size, // Batch size (B) int sequence_length, // Sequence length (S) diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index fbdefd137732e..975a73d212845 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -93,7 +93,7 @@ class FusedConv : public onnxruntime::cuda::Conv { if (Base::s_.post_slicing_required) { ORT_RETURN_IF_ERROR(onnxruntime::cuda::SliceOutUnwantedOutputSection( this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), - Base::s_.y_dims, Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size)); + Base::s_.y_dims.GetDims(), Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size)); } return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index 9f31e4b9ae4d5..5c260139371a1 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -170,7 +170,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), - nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()), + nullptr == mask_index ? gsl::span() : mask_index->Shape().GetDims(), output->template MutableData(), batch_size, sequence_length, diff --git a/onnxruntime/core/dlpack/dlpack_converter.cc b/onnxruntime/core/dlpack/dlpack_converter.cc index 8937d5467e05e..fe9f2385ef1bd 100644 --- a/onnxruntime/core/dlpack/dlpack_converter.cc +++ b/onnxruntime/core/dlpack/dlpack_converter.cc @@ -212,7 +212,7 @@ DLManagedTensor* OrtValueToDlpack(OrtValue& ort_value) { ort_dlmanaged_tensor->tensor.dl_tensor.ndim = static_cast(tensor.Shape().NumDimensions()); ort_dlmanaged_tensor->tensor.dl_tensor.dtype = GetDlpackDataType(ort_value); ort_dlmanaged_tensor->tensor.dl_tensor.shape = - tensor.Shape().NumDimensions() > 0 ? const_cast(&tensor.Shape()[0]) : nullptr; + tensor.Shape().NumDimensions() > 0 ? 
&const_cast(tensor.Shape())[0] : nullptr; ort_dlmanaged_tensor->tensor.dl_tensor.strides = nullptr; ort_dlmanaged_tensor->tensor.dl_tensor.byte_offset = 0; return &(ort_dlmanaged_tensor->tensor); diff --git a/onnxruntime/core/framework/tensor_shape.cc b/onnxruntime/core/framework/tensor_shape.cc index 93e66814b8567..5ff2175da27c5 100644 --- a/onnxruntime/core/framework/tensor_shape.cc +++ b/onnxruntime/core/framework/tensor_shape.cc @@ -9,29 +9,59 @@ namespace onnxruntime { -TensorShape::TensorShape(const int64_t* dimension_sizes, size_t dimension_count) - : std::vector(dimension_count) { - for (size_t i = 0; i < dimension_count; ++i) { - (*this)[i] = dimension_sizes[i]; - } +TensorShape::TensorShape(gsl::span dims) { + Allocate(dims.size()); + gsl::copy(dims, values_); +} + +TensorShape& TensorShape::operator=(const TensorShape& other) { + if (&other==this) + return *this; + + Allocate(other.values_.size()); + gsl::copy(other.GetDims(), values_); + return *this; +} + +TensorShape& TensorShape::operator=(TensorShape&& other) { + if (&other==this) + return *this; + + // If the other TensorShape allocated a buffer, then take ownership of it + if (other.allocated_buffer_) { + allocated_buffer_ = std::move(other.allocated_buffer_); + values_ = other.values_; + } else + operator=(other); // Otherwise we do a copy using the regular operator= + + other.values_ = {}; // Just to be safe, set the other to be an empty shape + return *this; } -TensorShape::TensorShape(const std::vector& dims, size_t start, size_t end) { - assign(dims.begin() + start, dims.begin() + end); +void TensorShape::Allocate(size_t size) { + if (values_.size() == size) + return; + + allocated_buffer_.reset(); + + if (size > std::size(small_buffer_)) { + allocated_buffer_ = std::make_unique(size); + values_ = gsl::span(allocated_buffer_.get(), size); + } else + values_ = gsl::span(small_buffer_, size); } /** * Return the total number of elements. Returns 1 for an empty (rank 0) TensorShape. */ int64_t TensorShape::Size() const { - size_t arraySize = size(); - int64_t size = SizeHelper(0, arraySize); + int64_t size = SizeHelper(0, values_.size()); //should we cache the size? as multiple operation may be expensive. return size; } int64_t TensorShape::SizeToDimension(size_t dimension) const { - const size_t num_dims = size(); + const size_t num_dims = values_.size(); ORT_ENFORCE(dimension <= num_dims, "Invalid dimension of ", dimension, " for SizeFromDimension. Tensor has ", num_dims, " dimensions."); @@ -41,7 +71,7 @@ int64_t TensorShape::SizeToDimension(size_t dimension) const { } int64_t TensorShape::SizeFromDimension(size_t dimension) const { - const size_t num_dims = size(); + const size_t num_dims = values_.size(); ORT_ENFORCE(dimension <= num_dims, "Invalid dimension of ", dimension, " for SizeFromDimension. 
Tensor has ", num_dims, " dimensions."); @@ -51,13 +81,10 @@ int64_t TensorShape::SizeFromDimension(size_t dimension) const { } TensorShape TensorShape::Slice(size_t dimstart, size_t dimend) const { - ORT_ENFORCE(dimstart <= dimend && dimend <= size(), + ORT_ENFORCE(dimstart <= dimend && dimend <= values_.size(), "Invalid tensor shape slice argument."); - return TensorShape(*this, dimstart, dimend); -} - -TensorShape TensorShape::Slice(size_t dimstart) const { - return Slice(dimstart, size()); + return TensorShape(GetDims().subspan(dimstart, dimend - dimstart)); + ; } // output dimensions @@ -66,7 +93,7 @@ std::string TensorShape::ToString() const { result.append("{"); bool first = true; - for (auto dim : (*this)) { + for (auto dim : GetDims()) { if (!first) { result.append(","); } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 2698bebf362af..eddaa41b15de3 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -600,7 +600,7 @@ Status TensorProtoToTensor(const Env& env, const ORTCHAR_T* model_path, Tensor& tensor) { // Validate tensor compatibility std::vector tensor_shape_vec = GetTensorShapeFromTensorProto(tensor_proto); - if (tensor_shape_vec != tensor.Shape().GetDims()) { + if (gsl::make_span(tensor_shape_vec) != tensor.Shape().GetDims()) { return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "TensorProtoToTensor() tensor shape mismatch!"); } const DataTypeImpl* const source_type = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType(); diff --git a/onnxruntime/core/language_interop_ops/pyop/pyop.cc b/onnxruntime/core/language_interop_ops/pyop/pyop.cc index 5c88d3a5a76fa..717aaccbcf657 100644 --- a/onnxruntime/core/language_interop_ops/pyop/pyop.cc +++ b/onnxruntime/core/language_interop_ops/pyop/pyop.cc @@ -270,7 +270,7 @@ void PyCustomKernel::Compute(OrtKernelContext* context) { auto ort_value = ort_.KernelContext_GetInput(context, i); inputs.push_back(const_cast(ort_value)->Get().DataRaw()); inputs_type.push_back(GetType(ort_value)); - inputs_dim.push_back(const_cast(ort_value)->Get().Shape().GetDims()); + inputs_dim.push_back(const_cast(ort_value)->Get().Shape().GetDimsAsVector()); } std::string err; diff --git a/onnxruntime/core/optimizer/transpose_optimizer/api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/api_impl.cc index bb024014224cd..2baa76232173b 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/api_impl.cc @@ -153,7 +153,7 @@ std::optional> ApiValueInfo::Shape() const { } TensorShape shape = utils::GetTensorShapeFromTensorShapeProto(*shape_proto); - return shape.GetDims(); + return shape.GetDimsAsVector(); } api::DataType ApiValueInfo::DType() const { diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc index 99e6210b08dc7..3891d0f717d47 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc @@ -240,7 +240,7 @@ std::vector ChannelFirstToLastPerm(size_t rank) { } // Adds 1 dimensions to indices of shape corresponding to axes. Unsafe if axes has negative/duplicated entries. 
-static std::vector UnsqueezeShape(const std::vector& shape, const std::vector& axes) { +static std::vector UnsqueezeShape(gsl::span shape, const std::vector& axes) { size_t new_rank = shape.size() + axes.size(); std::vector new_shape(new_rank); diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc index 61977a6a948cf..49a5cb19ec631 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc @@ -95,7 +95,7 @@ Status AllocateOutput(OpKernelContextInternal& context, const GraphViewer& subgr } TensorShape output_shape = onnxruntime::utils::GetTensorShapeFromTensorShapeProto(*graph_output_shape); - auto& graph_output_dims(output_shape.GetDims()); + auto graph_output_dims(output_shape.GetDims()); std::vector scan_output_dims; scan_output_dims.reserve(graph_output_dims.size() + 2); diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc index ea85037d2f5d2..c3bcfe37ad555 100644 --- a/onnxruntime/core/providers/cpu/generator/random.cc +++ b/onnxruntime/core/providers/cpu/generator/random.cc @@ -292,7 +292,7 @@ Status Multinomial::Compute(OpKernelContext* ctx) const { const auto* tensor_pointer = ctx->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& X = *tensor_pointer; - auto& X_dims = X.Shape().GetDims(); + auto X_dims = X.Shape().GetDims(); if (X_dims.empty()) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Empty dimensions for input tensor"); diff --git a/onnxruntime/core/providers/cpu/math/cumsum.cc b/onnxruntime/core/providers/cpu/math/cumsum.cc index 5a121ae460680..1984b467eedd4 100644 --- a/onnxruntime/core/providers/cpu/math/cumsum.cc +++ b/onnxruntime/core/providers/cpu/math/cumsum.cc @@ -18,7 +18,7 @@ std::vector GetStarts(int64_t rank, int64_t axis, int64_t index) { } template void ZeroOutSliceAtIndex(Tensor& output, int64_t rank, int64_t axis, int64_t index, - const std::vector& slice_dims, const std::vector& steps, const int64_t slice_size) { + gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { T zero{}; auto output_starts(GetStarts(rank, axis, index)); WritableSliceIterator output_iterator(output, output_starts, slice_dims, steps); @@ -29,7 +29,7 @@ void ZeroOutSliceAtIndex(Tensor& output, int64_t rank, int64_t axis, int64_t ind template void CopySlices(const Tensor& input, Tensor& output, const std::vector& input_starts, const std::vector& output_starts, - const std::vector& slice_dims, const std::vector& steps, const int64_t slice_size) { + gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { SliceIterator input_iterator(input, input_starts, slice_dims, steps); WritableSliceIterator output_iterator(output, output_starts, slice_dims, steps); for (int64_t k = 0; k < slice_size; ++k, ++output_iterator, ++input_iterator) { @@ -39,7 +39,7 @@ void CopySlices(const Tensor& input, Tensor& output, template void SumSlices(const Tensor& input, Tensor& output, const std::vector& input_starts, const std::vector& output_starts, const std::vector& previous_output_starts, - const std::vector& slice_dims, const std::vector& steps, const int64_t slice_size) { + gsl::span slice_dims, const std::vector& steps, const int64_t slice_size) { SliceIterator input_iterator(input, input_starts, slice_dims, steps); WritableSliceIterator output_iterator(output, output_starts, 
slice_dims, steps); SliceIterator previous_output_iterator(output, previous_output_starts, slice_dims, steps); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index b87aa1733b574..fb923bbfc3cea 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -49,7 +49,7 @@ Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, // CPU specific ReduceSum helper template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& reduce_axes, +std::unique_ptr ReduceSum(const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* /*einsum_cuda_assets*/) { @@ -158,7 +158,7 @@ static std::unique_ptr DiagonalInnermostDims(const Tensor& input, std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* /*einsum_cuda_assets*/) { const auto& input_shape = input.Shape(); - const auto& input_dims = input_shape.GetDims(); + const auto input_dims = input_shape.GetDims(); auto rank = static_cast(input_dims.size()); ORT_ENFORCE(rank >= 2 && dim_1 != dim_2 && input_dims[dim_1] == input_dims[dim_2], @@ -237,7 +237,7 @@ std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim } // Make copy of the output dims - auto output_dims = output->Shape().GetDims(); + auto output_dims = output->Shape().GetDimsAsVector(); // Unsqueeze the reduced dim auto iter = output_dims.begin() + second_dim; @@ -273,10 +273,10 @@ bool IsTransposeRequired(size_t input_rank, const std::vector& permutati } // The following are thin wrappers over device specific helpers -std::unique_ptr Transpose(const Tensor& input, const std::vector& input_shape_override, +std::unique_ptr Transpose(const Tensor& input, const TensorShape& input_shape_override, const std::vector& permutation, AllocatorPtr allocator, void* einsum_cuda_assets, const DeviceHelpers::Transpose& device_transpose_func) { - auto input_rank = input_shape_override.size(); + auto input_rank = input_shape_override.NumDimensions(); ORT_ENFORCE(input_rank == permutation.size(), "Length of permutation must match the rank of the input to be permutated"); std::vector output_dims; @@ -346,12 +346,11 @@ std::unique_ptr MatMul(const Tensor& input_1, const std::vector } template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, +std::unique_ptr ReduceSum(const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& device_reduce_sum_func) { - TensorShape overriden_shape(input_shape_override); - return device_reduce_sum_func(input, reduce_axes, true, allocator, &overriden_shape, tp, einsum_cuda_assets); + return device_reduce_sum_func(input, reduce_axes, true, allocator, &input_shape_override, tp, einsum_cuda_assets); } // Explicit template instantiations of functions @@ -370,14 +369,14 @@ template std::unique_ptr MatMul( const DeviceHelpers::MatMul& device_matmul_func); template std::unique_ptr DeviceHelpers::CpuDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, 
AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); template std::unique_ptr ReduceSum( - const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, + const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& device_reduce_sum_func); // int32_t @@ -394,14 +393,14 @@ template std::unique_ptr MatMul( const DeviceHelpers::MatMul& device_matmul_func); template std::unique_ptr DeviceHelpers::CpuDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); template std::unique_ptr ReduceSum( - const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, + const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& device_reduce_sum_func); @@ -419,14 +418,14 @@ template std::unique_ptr MatMul( const DeviceHelpers::MatMul& device_matmul_func); template std::unique_ptr DeviceHelpers::CpuDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); template std::unique_ptr ReduceSum( - const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, + const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& device_reduce_sum_func); @@ -438,7 +437,7 @@ template Status DeviceHelpers::CpuDeviceHelpers::MatMul( void* einsum_cuda_assets); template std::unique_ptr DeviceHelpers::CpuDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); @@ -450,8 +449,8 @@ template std::unique_ptr MatMul( const DeviceHelpers::MatMul& device_matmul_func); template std::unique_ptr ReduceSum( - const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, + const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& reduce_sum_func); // MLFloat16 @@ -462,8 +461,8 @@ template std::unique_ptr MatMul( const DeviceHelpers::MatMul& device_matmul_func); template std::unique_ptr ReduceSum( - const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, + const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* einsum_cuda_assets, const DeviceHelpers::ReduceSum& device_reduce_sum_func); diff --git 
a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h index dd21654c4a013..2409ac3b2f106 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h @@ -41,7 +41,7 @@ using MatMul = std::function -using ReduceSum = std::function(const Tensor& input, const std::vector& reduce_axes, +using ReduceSum = std::function(const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets)>; @@ -73,7 +73,7 @@ Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, void* einsum_cuda_assets); template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& reduce_axes, +std::unique_ptr ReduceSum(const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); @@ -88,7 +88,7 @@ std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim bool IsTransposeRequired(size_t input_rank, const std::vector& permutation); // Thin wrapper over the Transpose op to be called from Einsum that does some checks and invokes the device specific helper -std::unique_ptr Transpose(const Tensor& input, const std::vector& input_shape_override, +std::unique_ptr Transpose(const Tensor& input, const TensorShape& input_shape_override, const std::vector& permutation, AllocatorPtr allocator, void* einsum_cuda_assets, const DeviceHelpers::Transpose& device_transpose_func); @@ -103,8 +103,8 @@ std::unique_ptr MatMul(const Tensor& input_1, const std::vector // Thin wrapper over the ReduceSum op template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& input_shape_override, - const std::vector& reduce_axes, AllocatorPtr allocator, +std::unique_ptr ReduceSum(const Tensor& input, const TensorShape& input_shape_override, + gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, void* cuda_ep, const DeviceHelpers::ReduceSum& device_reduce_sum_func); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index ce14eaef90ecb..1c11b71ff147a 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -71,7 +71,7 @@ void EinsumTypedComputeProcessor::FinalizeOutput(const Tensor& candidate_outp } static bool IsTransposeReshapeForEinsum(const std::vector& perm, - const std::vector& input_dims, + gsl::span input_dims, std::vector& new_shape) { // As long as the dims with values > 1 stay in the same order, it's a reshape. // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1). 
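Note: the call-site edits in these files follow one rule of thumb: code that only reads the dimensions takes the `gsl::span<const int64_t>` returned by `GetDims()` directly, while code that needs an owning or mutable copy builds a `std::vector` from it, which is all `GetDimsAsVector()` does. A small sketch under that assumption; `ElementCount` and `WithAppendedDim` are illustrative names, not ORT APIs.

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>
#include <gsl/gsl>

// Read-only consumer: works on the span, no copy of the dims is made.
int64_t ElementCount(gsl::span<const int64_t> dims) {
  return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
}

// Consumer that mutates the shape: materialize a vector (equivalent to GetDimsAsVector()).
std::vector<int64_t> WithAppendedDim(gsl::span<const int64_t> dims, int64_t extra) {
  std::vector<int64_t> out(dims.begin(), dims.end());
  out.push_back(extra);
  return out;
}

Keeping the span-returning accessor as the default avoids the per-call heap allocation that returning a std::vector by value would reintroduce.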
@@ -83,7 +83,7 @@ static bool IsTransposeReshapeForEinsum(const std::vector& perm, return false; last_permuted_axis = perm[i]; } - new_shape = input_dims; + new_shape = std::vector(input_dims.begin(), input_dims.end()); for (size_t i = 0; i < perm.size(); ++i) { new_shape[i] = input_dims[perm[i]]; } @@ -159,15 +159,13 @@ std::unique_ptr EinsumTypedComputeProcessor::PairwiseOperandProcess(c reduced_size *= left_dim; } else if (has_left_dim) { // if the dim to be reduced is only in one of left and right, we can reduce right away const Tensor& tensor_to_be_reduced = current_left ? *current_left : left; - const std::vector& tensor_to_be_reduced_dims = - current_left ? current_left->Shape().GetDims() : left_dims; + auto tensor_to_be_reduced_dims = current_left ? current_left->Shape().GetDims() : left_dims; current_left = EinsumOp::ReduceSum( tensor_to_be_reduced, tensor_to_be_reduced_dims, {i}, allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_); } else if (has_right_dim) { const Tensor& tensor_to_be_reduced = current_right ? *current_right : right; - const std::vector& tensor_to_be_reduced_dims = - current_right ? current_right->Shape().GetDims() : right_dims; + auto tensor_to_be_reduced_dims = current_right ? current_right->Shape().GetDims() : right_dims; current_right = EinsumOp::ReduceSum( tensor_to_be_reduced, tensor_to_be_reduced_dims, {i}, allocator_, tp_, einsum_ep_assets_, device_reduce_sum_func_); diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.h b/onnxruntime/core/providers/cpu/math/element_wise_ops.h index d45dc64ad3291..b95738c878386 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.h +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.h @@ -537,7 +537,7 @@ struct BroadcastIterator { }; struct Broadcaster { - Broadcaster(const std::vector& shape1, const std::vector& shape2) { + Broadcaster(gsl::span shape1, gsl::span shape2) { size_t dimension_count_max = std::max(shape1.size(), shape2.size()); size_t dimension_count_min = std::min(shape1.size(), shape2.size()); output_shape_.resize(dimension_count_max); diff --git a/onnxruntime/core/providers/cpu/math/top_k.cc b/onnxruntime/core/providers/cpu/math/top_k.cc index fb38cbe98b968..db0b5086e6c80 100644 --- a/onnxruntime/core/providers/cpu/math/top_k.cc +++ b/onnxruntime/core/providers/cpu/math/top_k.cc @@ -431,7 +431,7 @@ static Status ComputeImplOpset1011(OpKernelContext* p_op_kernel_context, int axi "the tensor to be processed and a tensor containing k value"); } - const vector& y_shape = Y->Shape().GetDims(); + auto y_shape = Y->Shape().GetDims(); if (y_shape.size() != 1 || y_shape[0] != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "k tensor should be a 1D tensor of size 1"); } diff --git a/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc b/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc index c7e23fe5c4efe..7d664b673d917 100644 --- a/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc +++ b/onnxruntime/core/providers/cpu/ml/feature_vectorizer.cc @@ -86,7 +86,7 @@ template static void VectorizeTensor(const Tensor& input_tensor, int64_t feature_size, int64_t sum_input_dimensions, typename gsl::span::iterator out_iter) { auto& shape = input_tensor.Shape(); - auto& input_dims = shape.GetDims(); + auto input_dims = shape.GetDims(); auto input_size = input_dims.size() == 1 ? input_dims[0] : input_tensor.Shape().SizeFromDimension(1); auto N = input_dims.size() == 1 ? 
1 : input_dims[0]; diff --git a/onnxruntime/core/providers/cpu/ml/imputer.cc b/onnxruntime/core/providers/cpu/ml/imputer.cc index ff489862a5533..b9550b4c6db11 100644 --- a/onnxruntime/core/providers/cpu/ml/imputer.cc +++ b/onnxruntime/core/providers/cpu/ml/imputer.cc @@ -76,7 +76,7 @@ common::Status ComputeByType(OpKernelContext* context, if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& X = *tensor_pointer; const TensorShape& x_shape = X.Shape(); - auto& dims = x_shape.GetDims(); + auto dims = x_shape.GetDims(); if (dims.empty()) { return Status(ONNXRUNTIME, FAIL, "Empty input dimensions."); } diff --git a/onnxruntime/core/providers/cpu/ml/onehotencoder.cc b/onnxruntime/core/providers/cpu/ml/onehotencoder.cc index 4030bb77a8dbc..36201a5d1e98b 100644 --- a/onnxruntime/core/providers/cpu/ml/onehotencoder.cc +++ b/onnxruntime/core/providers/cpu/ml/onehotencoder.cc @@ -73,7 +73,7 @@ common::Status OneHotEncoderOp::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); const TensorShape& input_shape = X->Shape(); - std::vector output_shape(input_shape.GetDims()); + auto output_shape=input_shape.GetDimsAsVector(); output_shape.push_back(num_categories_); Tensor* Y = context->Output(0, TensorShape(output_shape)); @@ -98,7 +98,7 @@ common::Status OneHotEncoderOp::Compute(OpKernelContext* context) c const auto* X = context->Input(0); const TensorShape& input_shape = X->Shape(); - std::vector output_shape(input_shape.GetDims()); + std::vector output_shape(input_shape.GetDims().begin(), input_shape.GetDims().end()); output_shape.push_back(num_categories_); Tensor* Y = context->Output(0, TensorShape(output_shape)); diff --git a/onnxruntime/core/providers/cpu/ml/scaler.cc b/onnxruntime/core/providers/cpu/ml/scaler.cc index 015612613de83..b31c3b51e58b3 100644 --- a/onnxruntime/core/providers/cpu/ml/scaler.cc +++ b/onnxruntime/core/providers/cpu/ml/scaler.cc @@ -78,7 +78,7 @@ common::Status ScalerOp::Compute(OpKernelContext* context) const { Tensor* Y = context->Output(0, x_shape); const T* x_data = X.template Data(); auto* y_data = Y->template MutableData(); - const vector& x_dims = x_shape.GetDims(); + auto x_dims = x_shape.GetDims(); if (x_dims.empty()) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid argument: input has empty dimensions."); } diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc index 216bacd607cd0..566e40aa4153f 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc @@ -164,7 +164,7 @@ TreeEnsembleClassifier::TreeEnsembleClassifier(const OpKernelInfo& info) template common::Status TreeEnsembleClassifier::Compute(OpKernelContext* context) const { const Tensor& X = *context->Input(0); - const std::vector& x_dims = X.Shape().GetDims(); + auto x_dims = X.Shape().GetDims(); if (x_dims.empty()) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, "X dims is empty."); } diff --git a/onnxruntime/core/providers/cpu/ml/zipmap.cc b/onnxruntime/core/providers/cpu/ml/zipmap.cc index 36fa961c549b4..5828a7affa6ef 100644 --- a/onnxruntime/core/providers/cpu/ml/zipmap.cc +++ b/onnxruntime/core/providers/cpu/ml/zipmap.cc @@ -48,7 +48,7 @@ common::Status ZipMapOp::Compute(OpKernelContext* context) const { const auto* tensor_pointer = context->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input 
count mismatch"); const Tensor& X = *tensor_pointer; - const std::vector& x_dims = X.Shape().GetDims(); + auto x_dims = X.Shape().GetDims(); if (x_dims.empty()) { return Status(ONNXRUNTIME, diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h index c813773b7cedd..85e555ef233c0 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h @@ -106,12 +106,12 @@ class BatchNormHelper { static void NormalizeDims(const TensorShape& x_shape, std::vector& new_dims) { new_dims.clear(); - auto& orig_dims = x_shape.GetDims(); + auto orig_dims = x_shape.GetDims(); ORT_ENFORCE(orig_dims.size() < 6, "Input dim size should be < 6 for BatchNorm, but got ", std::to_string(orig_dims.size())); if (orig_dims.size() == 4 /*supported size by CUDA*/ || orig_dims.size() == 5 /*supported size by CUDA*/) { - new_dims = orig_dims; + new_dims = std::vector(orig_dims.begin(), orig_dims.end()); return; } diff --git a/onnxruntime/core/providers/cpu/nn/conv_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_attributes.h index d7041d41450d7..7fff54137d92f 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h @@ -81,7 +81,7 @@ struct ConvAttributes { } } } else { - auto& weight_dims = weight_shape.GetDims(); + auto weight_dims = weight_shape.GetDims(); kernel_shape = std::vector(weight_dims.begin() + 2, weight_dims.end()); } diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 452fe528314cb..11c11b6b6cb59 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -109,7 +109,7 @@ struct PoolAttributes { return output_dims; } - void InferOutputSize(const std::vector& input_dims, + void InferOutputSize(gsl::span input_dims, std::vector* output_dims, std::vector* actual_pads) const { ORT_ENFORCE(input_dims.size() >= 2); diff --git a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc index 4e768077ca382..015c3e97ed59e 100644 --- a/onnxruntime/core/providers/cpu/nn/string_normalizer.cc +++ b/onnxruntime/core/providers/cpu/nn/string_normalizer.cc @@ -298,7 +298,7 @@ Status StringNormalizer::Compute(OpKernelContext* ctx) const { auto X = ctx->Input(0); if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); - auto& input_dims = X->Shape().GetDims(); + auto input_dims = X->Shape().GetDims(); size_t N = 0; size_t C = 0; diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc index a5185ce775836..d6677a482c510 100644 --- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc +++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc @@ -389,7 +389,7 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const { int32_t num_rows = 0; size_t B = 0; size_t C = 0; - auto& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); if (input_dims.empty()) { num_rows = 1; C = 1; diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index e8e689508fcce..4aabb65122590 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -223,22 +223,12 @@ bool 
IsFastReduceKindAvailable(FastReduceKind scenario, FastReduceKind available return (static_cast(scenario) & static_cast(available)) > 0; } -bool ResultsNoTransposePrepareForReduce::equal(const std::vector& local_input_shape, - const std::vector& local_reduced_axes) { - if (input_shape.size() != local_input_shape.size()) +bool ResultsNoTransposePrepareForReduce::equal(gsl::span local_input_shape, + gsl::span local_reduced_axes) { + if (gsl::make_span(input_shape) != local_input_shape) return false; - if (reduced_axes.size() != local_reduced_axes.size()) + if (gsl::make_span(reduced_axes) != local_reduced_axes) return false; - for (std::vector::const_iterator it1 = input_shape.begin(), it2 = local_input_shape.begin(); - it1 != input_shape.end(); ++it1, ++it2) { - if (*it1 != *it2) - return false; - } - for (std::vector::const_iterator it1 = reduced_axes.begin(), it2 = local_reduced_axes.begin(); - it1 != reduced_axes.end(); ++it1, ++it2) { - if (*it1 != *it2) - return false; - } return true; } @@ -284,10 +274,10 @@ TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int64_t elemen } void NoTransposePrepareForReduce(const TensorShape& new_input_shape, - const std::vector& reduced_axes, + gsl::span reduced_axes, ResultsNoTransposePrepareForReduce& results) { // Common initialisation for the indices. - std::vector cumulative_shape = new_input_shape.GetDims(); + auto cumulative_shape = new_input_shape.GetDimsAsVector(); cumulative_shape[cumulative_shape.size() - 1] = 1; for (int i = static_cast(cumulative_shape.size()) - 2; i >= 0; --i) { cumulative_shape[i] = cumulative_shape[i + 1] * new_input_shape[i + 1]; @@ -395,7 +385,7 @@ struct ParallelizedData { template void NoTransposeReduce1Loop(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, - const std::vector& reduced_axes, concurrency::ThreadPool* tp, + gsl::span reduced_axes, concurrency::ThreadPool* tp, ResultsNoTransposePrepareForReduce& last_results) { auto output_shape = output->Shape(); const typename AGG::input_type* from_data = input.template Data(); @@ -460,7 +450,7 @@ void NoTransposeReduce1Loop(Tensor* output, const TensorShape& new_input_shape, template void NoTransposeReduce2Loops(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, - const std::vector& reduced_axes, concurrency::ThreadPool* tp, + gsl::span reduced_axes, concurrency::ThreadPool* tp, ResultsNoTransposePrepareForReduce& last_results) { auto output_shape = output->Shape(); const typename AGG::input_type* from_data = input.template Data(); @@ -544,16 +534,16 @@ void DropDimensions(const std::vector& input_shape, } } -FastReduceKind OptimizeShapeForFastReduce(const std::vector& input_shape, - const std::vector& reduced_axes, +FastReduceKind OptimizeShapeForFastReduce(gsl::span input_shape, + gsl::span reduced_axes, std::vector& fast_shape, std::vector& fast_output_shape, std::vector& fast_axes, bool keep_dims, bool noop_with_empty_axes) { if (input_shape.empty()) { - fast_shape = input_shape; - fast_output_shape = input_shape; - fast_axes = reduced_axes; + fast_shape = std::vector(input_shape.begin(), input_shape.end()); + fast_output_shape = fast_shape; + fast_axes = std::vector(reduced_axes.begin(), reduced_axes.end()); return FastReduceKind::kNone; } @@ -595,7 +585,7 @@ FastReduceKind OptimizeShapeForFastReduce(const std::vector& input_shap } if (noop_with_empty_axes) { fast_axes.clear(); - fast_output_shape = input_shape; + fast_output_shape = std::vector(input_shape.begin(), input_shape.end()); return 
FastReduceKind::kK; } else { if (keep_dims) { @@ -884,7 +874,7 @@ Status ReduceSum::Compute(OpKernelContext* ctx) const { } template -std::unique_ptr ReduceSum::Impl(const Tensor& input, const std::vector& reduce_axes, +std::unique_ptr ReduceSum::Impl(const Tensor& input, gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims, const TensorShape* input_shape_override) { std::vector axes; diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index 156bf648d284e..9f064fa3453dd 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -54,8 +54,8 @@ TensorOpCost ParallelReduceFastCost(int64_t n_row, int64_t n_col, int64_t elemen with vectors operations. Method WhichFastReduce() returns which case case be optimized for which aggregator. */ -FastReduceKind OptimizeShapeForFastReduce(const std::vector& input_shape, - const std::vector& reduced_axes, +FastReduceKind OptimizeShapeForFastReduce(gsl::span input_shape, + gsl::span reduced_axes, std::vector& fast_shape, std::vector& fast_output_shape, std::vector& fast_axes, @@ -79,7 +79,7 @@ class ResultsNoTransposePrepareForReduce { last_loop_inc = 0; } - bool equal(const std::vector& local_input_shape, const std::vector& local_reduced_axes); + bool equal(gsl::span local_input_shape, gsl::span local_reduced_axes); void ValidateNotEmpty(); }; @@ -598,13 +598,13 @@ void NoTransposePrepareForReduce(const TensorShape& new_input_shape, template void NoTransposeReduce1Loop(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, - const std::vector& reduced_axes, concurrency::ThreadPool* tp, + gsl::span reduced_axes, concurrency::ThreadPool* tp, ResultsNoTransposePrepareForReduce& last_results); // Specific case for ReduceLogSumExp. template void NoTransposeReduce2Loops(Tensor* output, const TensorShape& new_input_shape, const Tensor& input, - const std::vector& reduced_axes, concurrency::ThreadPool* tp, + gsl::span reduced_axes, concurrency::ThreadPool* tp, ResultsNoTransposePrepareForReduce& last_results); template @@ -735,7 +735,7 @@ class ReduceSum final : public ReduceKernel { // For external calls requiring ReduceSum implementation - will return the reduced output. //`input_shape_override` overrides the shape of `input` for compute purposes. 
- static std::unique_ptr Impl(const Tensor& input, const std::vector& reduce_axes, + static std::unique_ptr Impl(const Tensor& input, gsl::span reduce_axes, AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims, const TensorShape* input_shape_override = nullptr); }; diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 5ed9b8f282057..412c6d6aa0373 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -379,7 +379,7 @@ Status SplitToSequence::PrepareForCompute(const TensorShape& input_shape, int64_ int& after_dims_including_split_axis, int& after_dims_excluding_split, bool& is_uneven_split, int& num_remaining_splits, std::vector& split_sizes) const { - auto& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); const auto num_dimensions = gsl::narrow_cast(input_shape.NumDimensions()); axis = HandleNegativeAxis(axis_, num_dimensions); // handle negative and enforce axis is valid const int64_t split_dim_size = input_dims[axis]; @@ -506,8 +506,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu split_sizes)); // copy dimensions so we can update the selected axis in place - auto& input_dims = input_shape.GetDims(); - std::vector output_dimensions{input_dims}; + auto output_dimensions = input_shape.GetDimsAsVector(); std::vector tensors; int64_t input_offset = 0; const T* input_data = input.template Data(); diff --git a/onnxruntime/core/providers/cpu/tensor/compress.cc b/onnxruntime/core/providers/cpu/tensor/compress.cc index 3007feef92fb0..9915f0d327b0e 100644 --- a/onnxruntime/core/providers/cpu/tensor/compress.cc +++ b/onnxruntime/core/providers/cpu/tensor/compress.cc @@ -28,7 +28,7 @@ ONNX_CPU_OPERATOR_KERNEL( Status Compress::Compute(OpKernelContext* ctx) const { const auto* input_tensor = ctx->Input(0); size_t rank = input_tensor->Shape().NumDimensions(); - auto& input_dimensions = input_tensor->Shape().GetDims(); + auto input_dimensions = input_tensor->Shape().GetDims(); int64_t axis = axis_; if (has_axis_) { axis = HandleNegativeAxis(axis, rank); // handle negative and enforce axis is valid @@ -50,7 +50,7 @@ Status Compress::Compute(OpKernelContext* ctx) const { } } - std::vector output_dims(input_dimensions); + std::vector output_dims(input_dimensions.begin(), input_dimensions.end()); if (has_axis_) { output_dims[axis] = positive_condition_count; } else { diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc index 3747df9f2fe5f..9ca662231d6e2 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.cc +++ b/onnxruntime/core/providers/cpu/tensor/concat.cc @@ -81,7 +81,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, const auto& shape = input->Shape(); const auto num_elements = shape.Size(); if (num_elements > 0) { - reference_dims = shape.GetDims(); + reference_dims = shape.GetDimsAsVector(); reference_rank = reference_dims.size(); reference_tensor_index = index; input_tensor_sizes.push_back(num_elements); @@ -97,7 +97,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, // No shape/rank validations will be done (as all inputs are empty). // But the rest of the execution flow (filling in the Prepare instance - p) // can use this info. 
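// --- Editor's illustrative sketch; not part of this patch. It contrasts the two accessors used by the
// --- call sites above: GetDims() returns a non-owning view for read-only use, while GetDimsAsVector()
// --- returns an owning copy that can be edited (e.g. to overwrite the split axis). Function and
// --- parameter names below are made up; the usual ORT include path is assumed.
#include <cstdint>
#include <vector>
#include "core/framework/tensor_shape.h"

std::vector<int64_t> OutputDimsForSplit(const onnxruntime::TensorShape& input_shape,
                                        size_t axis, int64_t split_size) {
  // Read-only: cheap view, no allocation.
  auto input_dims = input_shape.GetDims();
  (void)input_dims[axis];

  // Mutable copy: needed when a dimension is rewritten in place.
  auto output_dims = input_shape.GetDimsAsVector();
  output_dims[axis] = split_size;
  return output_dims;
}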
- reference_dims = input_tensors[0]->Shape().GetDims(); + reference_dims = input_tensors[0]->Shape().GetDimsAsVector(); reference_rank = reference_dims.size(); } @@ -116,7 +116,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, const auto* input = input_tensors[index]; ORT_ENFORCE(input != nullptr, "input count mismatch"); const auto& input_shape = input->Shape(); - const auto& input_dims = input_shape.GetDims(); + const auto input_dims = input_shape.GetDims(); // Skip shape/rank validation for inputs that are empty. // The ONNX spec states that all dim values along axes not concatentated on diff --git a/onnxruntime/core/providers/cpu/tensor/copy.h b/onnxruntime/core/providers/cpu/tensor/copy.h index ea7b48a27b43b..cc14e022825a0 100644 --- a/onnxruntime/core/providers/cpu/tensor/copy.h +++ b/onnxruntime/core/providers/cpu/tensor/copy.h @@ -113,7 +113,7 @@ void StridedCopy(concurrency::ThreadPool* thread_pool, // Coalesce dimensions std::vector dst_strides = dst_strides_in; std::vector src_strides = src_strides_in; - std::vector copy_shape(copy_shape_in.GetDims()); + std::vector copy_shape(copy_shape_in.GetDimsAsVector()); CoalesceDimensions({dst_strides, src_strides}, copy_shape); ORT_ENFORCE(dst_strides.size() == src_strides.size() && diff --git a/onnxruntime/core/providers/cpu/tensor/onehot.cc b/onnxruntime/core/providers/cpu/tensor/onehot.cc index 054c84e6dc4db..cd621f968d56d 100644 --- a/onnxruntime/core/providers/cpu/tensor/onehot.cc +++ b/onnxruntime/core/providers/cpu/tensor/onehot.cc @@ -89,9 +89,9 @@ Status PrepareOutputShape(const Tensor* indices, const int64_t depth_val, const int64_t& prefix_dim_size, int64_t& suffix_dim_size, std::vector& output_shape) { const auto& indices_shape = indices->Shape(); - const auto& indices_dims = indices_shape.GetDims(); + const auto indices_dims = indices_shape.GetDims(); const auto indices_num_dims = indices_shape.NumDimensions(); - output_shape = indices_dims; + output_shape = indices_shape.GetDimsAsVector(); // output rank is always 1 more than the input rank as a new dimension is added to the input shape const auto output_rank = static_cast(indices_num_dims + 1); diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc index 4132710edca00..1cf7d7e59c379 100644 --- a/onnxruntime/core/providers/cpu/tensor/pad.cc +++ b/onnxruntime/core/providers/cpu/tensor/pad.cc @@ -275,7 +275,7 @@ static Status PadImpl(OpKernelContext* ctx, const auto& input_tensor = *ctx->Input(0); const auto& orig_input_shape = input_tensor.Shape(); - std::vector output_dims(orig_input_shape.GetDims()); + std::vector output_dims(orig_input_shape.GetDimsAsVector()); size_t data_rank = output_dims.size(); // make copy of raw_pads as it may be mutated below @@ -484,7 +484,7 @@ Status Pad::Compute(OpKernelContext* ctx) const { size_t data_rank = input_tensor.Shape().NumDimensions(); const Tensor& pads_tensor = *ctx->Input(1); - const std::vector& pads_tensor_dims = pads_tensor.Shape().GetDims(); + auto pads_tensor_dims = pads_tensor.Shape().GetDims(); ORT_ENFORCE(pads_tensor.IsDataType(), "Pads tensor should be an INT64 tensor"); ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1), diff --git a/onnxruntime/core/providers/cpu/tensor/scatter.cc b/onnxruntime/core/providers/cpu/tensor/scatter.cc index 8ab616caf15ac..53ad4a35700f7 100644 --- a/onnxruntime/core/providers/cpu/tensor/scatter.cc +++ b/onnxruntime/core/providers/cpu/tensor/scatter.cc @@ -255,8 +255,8 @@ 
Status Scatter::Compute(OpKernelContext* context) const { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "data type is different from updates type"); } - auto& indices_dims = indices_input->Shape().GetDims(); - auto& updates_dims = updates_input->Shape().GetDims(); + auto indices_dims = indices_input->Shape().GetDims(); + auto updates_dims = updates_input->Shape().GetDims(); if (indices_dims.size() != updates_dims.size()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices and updates must have the same rank"); @@ -272,7 +272,7 @@ Status Scatter::Compute(OpKernelContext* context) const { // According to the spec the rank of ind/upd shall be the same as input(data) // and we also want to make sure that the dimensions of the of the ind/upd do not // exceed that of the input - auto& input_dims = input_data_shape.GetDims(); + auto input_dims = input_data_shape.GetDims(); if (input_dims.size() != indices_dims.size()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices must have the same rank as Input. Indices rank=", indices_dims.size(), ". Input rank=", input_dims.size()); diff --git a/onnxruntime/core/providers/cpu/tensor/slice.cc b/onnxruntime/core/providers/cpu/tensor/slice.cc index 1d91c5854e1f0..3d01feb7870ce 100644 --- a/onnxruntime/core/providers/cpu/tensor/slice.cc +++ b/onnxruntime/core/providers/cpu/tensor/slice.cc @@ -77,8 +77,8 @@ ONNX_CPU_OPERATOR_KERNEL( // Updates starts and steps to match flattened_output_dims if it is. // e.g. if input shape is { 2, 2, 2 }, output shape is { 1, 2, 2 }, and the 'steps' value for the last two dims is 1, // we are keeping all the data of the inner most two dimensions so can combine those into dims of { 1, 4 } -static void FlattenOutputDims(const std::vector& input_dimensions, - const std::vector& output_dims, +static void FlattenOutputDims(gsl::span input_dimensions, + gsl::span output_dims, std::vector& starts, std::vector& ends, std::vector& steps, @@ -94,7 +94,7 @@ static void FlattenOutputDims(const std::vector& input_dimensions, if (num_to_combine > 1) { auto num_dims = output_dims.size() - num_to_combine + 1; - *flattened_output_dims = output_dims; + *flattened_output_dims = std::vector(output_dims.begin(), output_dims.end()); flattened_output_dims->resize(num_dims); int64_t dim_value = 1; @@ -232,7 +232,7 @@ static Status SliceImpl(OpKernelContext* ctx, if (compute_metadata.p_flattened_output_dims_) { // if we have flattened output dims we need to also flatten the input dims. 
// as we're combining the innermost dims and keeping all values we can just copy the size of the last dim - std::vector flattened_input_dims(input_tensor.Shape().GetDims()); + auto flattened_input_dims = input_tensor.Shape().GetDimsAsVector(); flattened_input_dims.resize(compute_metadata.p_flattened_output_dims_->size()); flattened_input_dims.back() = compute_metadata.p_flattened_output_dims_->back(); TensorShape input_shape(std::move(flattened_input_dims)); @@ -265,7 +265,7 @@ static inline bool CallSliceImplIfEnabled(OpKernelContext* ctx, Status SliceBase::Compute(OpKernelContext* ctx) const { const auto* input_tensor_ptr = ctx->Input(0); const auto& input_tensor = *input_tensor_ptr; - const auto& input_dimensions = input_tensor.Shape().GetDims(); + const auto input_dimensions = input_tensor.Shape().GetDims(); if (input_dimensions.empty()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot slice scalars"); diff --git a/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h index 0eb37124ce37f..ad7b32ab56490 100644 --- a/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h +++ b/onnxruntime/core/providers/cpu/tensor/slice_compute_metadata.h @@ -11,16 +11,16 @@ namespace onnxruntime { namespace SliceOp { struct PrepareForComputeMetadata { - explicit PrepareForComputeMetadata(const std::vector& input_dimensions) + explicit PrepareForComputeMetadata(gsl::span input_dimensions) : input_dimensions_(input_dimensions), - ends_(input_dimensions), - output_dims_(input_dimensions) { + ends_(input_dimensions.begin(), input_dimensions.end()), + output_dims_(input_dimensions.begin(), input_dimensions.end()) { size_t dimension_count = input_dimensions.size(); starts_.resize(dimension_count, 0); steps_.resize(dimension_count, 1); } - const std::vector& input_dimensions_; + gsl::span input_dimensions_; std::vector starts_; std::vector ends_; std::vector steps_; diff --git a/onnxruntime/core/providers/cpu/tensor/split.cc b/onnxruntime/core/providers/cpu/tensor/split.cc index 1666efd6a1ceb..74d8be1fce974 100644 --- a/onnxruntime/core/providers/cpu/tensor/split.cc +++ b/onnxruntime/core/providers/cpu/tensor/split.cc @@ -69,7 +69,7 @@ ONNX_CPU_OPERATOR_KERNEL( Status SplitBase::PrepareForCompute(const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims, int& after_dims_including_split_axis, int& after_dims_excluding_split, std::vector& split_sizes) const { - auto& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); const auto num_dimensions = gsl::narrow_cast(input_shape.NumDimensions()); axis = HandleNegativeAxis(axis_, num_dimensions); // handle negative and enforce axis is valid const int64_t split_dim_size = input_dims[axis]; @@ -173,8 +173,7 @@ Status Split::ComputeImpl(OpKernelContext& context, const Tensor& input) const { split_sizes)); // copy dimensions so we can update the selected axis in place - auto& input_dims = input_shape.GetDims(); - std::vector output_dimensions{input_dims}; + auto output_dimensions = input_shape.GetDimsAsVector(); int64_t input_offset = 0; const T* input_data = input.template Data(); diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc index 73f5122aebb90..523f99f918e97 100644 --- a/onnxruntime/core/providers/cpu/tensor/tile.cc +++ b/onnxruntime/core/providers/cpu/tensor/tile.cc @@ -154,7 +154,7 @@ Status Tile::Compute(OpKernelContext* ctx) const { // Calculate the shape of the
output tensor const auto* repeats = repeats_tensor.template Data(); - std::vector output_dims = input_shape.GetDims(); + std::vector output_dims = input_shape.GetDimsAsVector(); for (size_t axis = 0; axis < input_rank; axis++) { output_dims[axis] *= repeats[axis]; } diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 85ec8af3ba618..16d0f5c746fc1 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -71,7 +71,7 @@ struct MultiIndex { * be transposed, if source_dims is the shape, stride[i] = source_dims[i+1] * source_dims[i+2] * ... * 1. * element_size is the size of the tensor element (sizeof(float), sizeof(double)). */ -static void IncrementIndexAndComputeOffsetSetup(MultiIndex& mindex, size_t num_axes, const std::vector& target_dims, +static void IncrementIndexAndComputeOffsetSetup(MultiIndex& mindex, size_t num_axes, gsl::span target_dims, const std::vector& stride, size_t element_size) { mindex.Init(num_axes); size_t naxes = 0; @@ -132,7 +132,7 @@ static inline void DoTransposeSingleBlock(size_t num_elts_in_block, const std::s // DoTranspose: copies source tensor to target, transposing elements. // The stride vector indicates the transposition. -static void DoTransposeImpl(int64_t num_axes, const std::vector& target_dims, +static void DoTransposeImpl(int64_t num_axes, gsl::span target_dims, size_t num_blocks, size_t num_elts_in_block, const std::vector& stride, const uint8_t* source, uint8_t* target, size_t element_size) { size_t blocksize = num_elts_in_block * element_size; @@ -148,7 +148,7 @@ static void DoTransposeImpl(int64_t num_axes, const std::vector& target } } -static void DoTransposeImpl(int64_t num_axes, const std::vector& target_dims, +static void DoTransposeImpl(int64_t num_axes, gsl::span target_dims, size_t num_blocks, size_t num_elts_in_block, const std::vector& stride, const std::string* source, std::string* target) { ORT_ENFORCE(num_axes > 0, "Transpose not implemented for empty tensors."); @@ -171,7 +171,7 @@ inline void CopyPrim(uint8_t* target, const uint8_t* source) { // The function does not check num_axes > 0 but this is expected. template -static bool TypedDoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, +static bool TypedDoTransposeEltWise(int64_t num_axes, gsl::span target_dims, size_t num_blocks, const std::vector& stride, const uint8_t* source, uint8_t* target) { constexpr bool enabled = utils::HasTypeWithSameSize(); @@ -194,7 +194,7 @@ static bool TypedDoTransposeEltWise(int64_t num_axes, const std::vector // DoTransposeEltWise: specialization of DoTranspose for the num_elts_in_block=1 case. // copies source tensor to target, transposing elements. // The stride vector indicates the transposition. 
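// --- Editor's illustrative sketch; not part of this patch. It shows what switching helpers such as
// --- DoTransposeEltWise to gsl::span parameters means for callers: std::vector and std::array
// --- arguments convert implicitly with no copy, while a braced initializer list does not (which is
// --- why fixed-size dims elsewhere in this diff are wrapped in std::array before being passed).
// --- The helper name below is made up; const int64_t is assumed as the elided element type.
#include <array>
#include <cstdint>
#include <vector>
#include <gsl/gsl>

int64_t ProductOfDims(gsl::span<const int64_t> dims) {
  int64_t product = 1;
  for (int64_t d : dims) product *= d;
  return product;
}

void SpanArgumentSketch() {
  std::vector<int64_t> dynamic_dims{2, 3, 4};
  std::array<int64_t, 4> fixed_dims{1, 8, 64, 1};

  (void)ProductOfDims(dynamic_dims);  // vector -> span, no copy
  (void)ProductOfDims(fixed_dims);    // std::array -> span, no copy
  // ProductOfDims({1, 8, 64, 1});    // would not compile: a span cannot bind to a braced list
}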
-Status DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, +Status DoTransposeEltWise(int64_t num_axes, gsl::span target_dims, size_t num_blocks, const std::vector& stride, const uint8_t* source, uint8_t* target, size_t element_size) { bool enabled = false; @@ -221,7 +221,7 @@ Status DoTransposeEltWise(int64_t num_axes, const std::vector& target_d element_size); } -static void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, +static void DoTransposeEltWise(int64_t num_axes, gsl::span target_dims, size_t num_blocks, const std::vector& stride, const std::string* source, std::string* target) { ORT_ENFORCE(num_axes > 0, "Transpose not implemented for empty tensors."); MultiIndex mindex; @@ -652,7 +652,7 @@ bool IsMovingSingleAxis(const std::vector& permutations, size_t& from, s } // namespace -bool IsTransposeReshape(const std::vector& perm, const std::vector& input_dims) { +bool IsTransposeReshape(const std::vector& perm, gsl::span input_dims) { // As long as the dims with values > 1 stay in the same order, it's a reshape. // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1). size_t last_permuted_axis = 0; @@ -705,7 +705,7 @@ Status Transpose::Compute(OpKernelContext* ctx) const { ORT_ENFORCE(input_tensor_ptr != nullptr); const Tensor& X = *input_tensor_ptr; const TensorShape& input_shape = X.Shape(); - const std::vector& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); size_t rank = input_dims.size(); std::vector output_dims(rank); diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h index aa335dcf55bae..76c97fec1f508 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.h +++ b/onnxruntime/core/providers/cpu/tensor/transpose.h @@ -17,10 +17,10 @@ namespace onnxruntime { empty dimensions can change place, not empty dimensions must be in the same order in the permuted tenosr. 
*/ -bool IsTransposeReshape(const std::vector& perm, const std::vector& input_dims); +bool IsTransposeReshape(const std::vector& perm, gsl::span input_dims); // Public function for element-wise transpose, primarily to unit test any out of bounds access -Status DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, +Status DoTransposeEltWise(int64_t num_axes, gsl::span target_dims, size_t num_blocks, const std::vector& stride, const uint8_t* source, uint8_t* target, size_t element_size); diff --git a/onnxruntime/core/providers/cpu/tensor/unique.cc b/onnxruntime/core/providers/cpu/tensor/unique.cc index 7232fc9701f94..2073574a1e8e5 100644 --- a/onnxruntime/core/providers/cpu/tensor/unique.cc +++ b/onnxruntime/core/providers/cpu/tensor/unique.cc @@ -218,7 +218,7 @@ static void CreateOutput(OpKernelContext& context, int64_t num_cols = subtensor_shape.SizeFromDimension(axis); int64_t num_rows = subtensor_shape.SizeToDimension(axis); - const std::vector subtensor_dims = subtensor_shape.GetDims(); + auto subtensor_dims = subtensor_shape.GetDims(); std::vector Y_dims; Y_dims.reserve(subtensor_dims.size()); for (int64_t i = 0, end = subtensor_dims.size(); i < end; ++i) { diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index 9ac56c5500a08..af6ef46d09279 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -1028,7 +1028,7 @@ Status Upsample::BaseCompute(OpKernelContext* context, const std::vector& output_dims) const { const auto* X = context->Input(0); ORT_ENFORCE(X != nullptr); - const std::vector& dims = X->Shape().GetDims(); + auto dims = X->Shape().GetDims(); ORT_ENFORCE(output_dims.size() == dims.size(), "Rank of input and output tensor should be same."); Tensor* Y = context->Output(0, output_dims); diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h index dcd3405c30e5f..756c96089404a 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample.h @@ -325,8 +325,8 @@ class UpsampleBase { } } - void ParseScalesDataFromOutputSize(const std::vector& output_dims, - const std::vector& input_dims, + void ParseScalesDataFromOutputSize(gsl::span output_dims, + gsl::span input_dims, std::vector& scales) const { for (size_t i = 0, end = input_dims.size(); i < end; ++i) { // Handle corner case to avoid dividing by zero in the next step @@ -348,7 +348,7 @@ class UpsampleBase { } void ComputeOutputShape(const std::vector& scales, - const std::vector& input_dims, + gsl::span input_dims, std::vector& output_dims) const { for (std::size_t i = 0; i < input_dims.size(); i++) { output_dims[i] = static_cast(scales[i] * input_dims[i]); diff --git a/onnxruntime/core/providers/cpu/tensor/utils.h b/onnxruntime/core/providers/cpu/tensor/utils.h index 17beb1714ce43..1334bf91e7264 100644 --- a/onnxruntime/core/providers/cpu/tensor/utils.h +++ b/onnxruntime/core/providers/cpu/tensor/utils.h @@ -15,10 +15,14 @@ struct TensorPitches : std::vector { TensorPitches(const TensorShape& shape, size_t rank = 0) : TensorPitches(shape.GetDims(), rank) {} TensorPitches(const std::vector& dims, size_t rank = 0) : std::vector(std::max(rank, dims.size()), 0) { + Calculate(gsl::span(data(), size()), gsl::make_span(dims)); + } + TensorPitches(gsl::span dims, size_t rank = 0) + : std::vector(std::max(rank, dims.size()), 0) { Calculate(gsl::span(data(), size()), 
dims); } - static bool Calculate(gsl::span p, const std::vector& dims) { + static bool Calculate(gsl::span p, gsl::span dims) { // The pitches is the size of the next inner axis. Aka the amount to move by one of the next inner axis. // For a tensor with shape(2,3,4,5) the values would be: (3*4*5, 4*5, 5, 1) // Note that the outermost '2' is never used, as you never need to move by the entire size of the outermost axis @@ -135,7 +139,7 @@ struct ExtentAxisCounters { struct SliceSkips : std::vector { SliceSkips(const TensorShape& input_shape, gsl::span extents, gsl::span steps) : std::vector(input_shape.NumDimensions(), 0) { - auto& dims = input_shape.GetDims(); + auto dims = input_shape.GetDims(); ORT_ENFORCE(dims.size() == extents.size() && dims.size() >= steps.size()); @@ -175,7 +179,7 @@ struct SliceIteratorBase { SliceIteratorBase(const Tensor& tensor, gsl::span starts, gsl::span extents, gsl::span steps) : tensor_(tensor), extents_(extents), skips_(tensor_.Shape(), extents, steps), indices_(extents.size(), 0) { - auto& dims = tensor_.Shape().GetDims(); + auto dims = tensor_.Shape().GetDims(); Init(dims, starts, steps); } @@ -186,12 +190,12 @@ struct SliceIteratorBase { SliceIteratorBase(const Tensor& tensor, const TensorShape& tensor_shape, gsl::span starts, gsl::span extents, gsl::span steps) : tensor_(tensor), extents_(extents), skips_(tensor_shape, extents, steps), indices_(extents.size(), 0) { - const auto& dims = tensor_shape.GetDims(); + auto dims = tensor_shape.GetDims(); Init(dims, starts, steps); } // Initialize initial skip and inner_extent. - void Init(const std::vector& dims, gsl::span starts, gsl::span steps) { + void Init(gsl::span dims, gsl::span starts, gsl::span steps) { ORT_ENFORCE(dims.size() == starts.size() && dims.size() == extents_.size() && dims.size() >= steps.size()); @@ -381,7 +385,7 @@ struct WritableSliceIterator { WritableSliceIterator(Tensor& tensor, gsl::span starts, gsl::span extents, gsl::span steps) : tensor_(tensor), input_(tensor_.template MutableData()), extents_(extents), skips_(tensor_.Shape(), extents, steps), indices_(extents.size(), 0) { - auto& dims = tensor_.Shape().GetDims(); + auto dims = tensor_.Shape().GetDims(); Init(dims, starts, steps); } @@ -392,12 +396,12 @@ struct WritableSliceIterator { WritableSliceIterator(Tensor& tensor, const TensorShape& tensor_shape, gsl::span starts, gsl::span extents, gsl::span steps) : tensor_(tensor), input_(tensor_.template MutableData()), extents_(extents), skips_(tensor_shape, extents, steps), indices_(extents.size(), 0) { - auto& dims = tensor_shape.GetDims(); + auto dims = tensor_shape.GetDims(); Init(dims, starts, steps); } // Initialize initial skip and inner_extent. 
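// --- Editor's illustrative sketch; not part of this patch. The iterator helpers above now take and
// --- hold gsl::span views of the dims, so the usual view-lifetime rule applies: a span returned by
// --- TensorShape::GetDims() does not own its elements and must not outlive the shape it came from.
// --- The function name below is made up; the usual ORT include path is assumed.
#include <cstdint>
#include <vector>
#include <gsl/gsl>
#include "core/framework/tensor_shape.h"

void SpanLifetimeSketch() {
  std::vector<int64_t> backing{2, 3, 4};
  onnxruntime::TensorShape shape(backing);          // this constructor copies the dims

  gsl::span<const int64_t> dims = shape.GetDims();  // view into 'shape'; valid while 'shape' lives
  (void)dims;

  // Anti-pattern: a view taken from a temporary shape dangles as soon as the temporary is destroyed.
  // gsl::span<const int64_t> dangling = onnxruntime::TensorShape({2, 3, 4}).GetDims();
}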
- void Init(const std::vector& dims, gsl::span starts, + void Init(gsl::span dims, gsl::span starts, gsl::span steps) { ORT_ENFORCE(dims.size() == starts.size(), "dims.size()=", dims.size(), " != ", "starts.size()=", starts.size()); diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index dda49c130f87b..52889ff8c9c16 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -26,7 +26,7 @@ Status CudnnTensor::CreateTensorIfNeeded() { return Status::OK(); } -Status CudnnTensor::Set(const std::vector& input_dims, cudnnDataType_t dataType) { +Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); int rank = gsl::narrow_cast(input_dims.size()); diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index b9337d9127412..6526fc0aae31f 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -16,7 +16,7 @@ class CudnnTensor final { ~CudnnTensor(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudnnTensor); - Status Set(const std::vector& input_dims, cudnnDataType_t dataType); + Status Set(gsl::span input_dims, cudnnDataType_t dataType); Status Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode); operator cudnnTensorDescriptor_t() const { return tensor_; } diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc index 5868bf225be77..d8ee94d383c8f 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc @@ -78,7 +78,7 @@ Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, // CUDA EP specific ReduceSum helper template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& reduce_axes, +std::unique_ptr ReduceSum(const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets) { @@ -108,7 +108,7 @@ std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim } // Make a copy - we are going to mutate the dims - std::vector output_dims = input_dims; + std::vector output_dims = input_shape.GetDimsAsVector(); // Remove the dim value in `second_dim` - // The diagonal values are stored along `first_dim` @@ -155,7 +155,7 @@ template Status DeviceHelpers::CudaDeviceHelpers::MatMul( void* einsum_cuda_assets); template std::unique_ptr DeviceHelpers::CudaDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); @@ -168,7 +168,7 @@ template Status DeviceHelpers::CudaDeviceHelpers::MatMul( void* einsum_cuda_assets); template std::unique_ptr DeviceHelpers::CudaDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); @@ -181,7 +181,7 @@ template Status DeviceHelpers::CudaDeviceHelpers::MatMul( void* einsum_cuda_assets); template std::unique_ptr 
DeviceHelpers::CudaDeviceHelpers::ReduceSum( - const Tensor& input, const std::vector& reduce_axes, + const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h index 6ba139f2dafc4..5db1468e1a72c 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h @@ -47,7 +47,7 @@ Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, void* einsum_cuda_assets); template -std::unique_ptr ReduceSum(const Tensor& input, const std::vector& reduce_axes, +std::unique_ptr ReduceSum(const Tensor& input, gsl::span reduce_axes, bool keep_dims, AllocatorPtr allocator, const TensorShape* input_shape_override, concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets); diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc index 6baaaf2a6d58e..bb68c8f0f36f1 100644 --- a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -73,7 +73,7 @@ Status TopK::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - auto elem_nums = tensor_X->Shape().GetDims(); + auto elem_nums = tensor_X->Shape().GetDimsAsVector(); auto dimension = elem_nums[axis]; for (auto i = static_cast(elem_nums.size()) - 2; i >= 0; --i) { elem_nums[i] *= elem_nums[i + 1]; diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index b0744285a0b1b..f224006bcb799 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -69,9 +69,9 @@ size_t GetMaxWorkspaceSize(const CudnnConvState& } Status SliceOutUnwantedOutputSection(cudaStream_t stream, - const void* input_data, const std::vector& input_dims, + const void* input_data, gsl::span input_dims, void* output_data, - const std::vector& output_dims, + gsl::span output_dims, std::vector starts, const std::vector& ends, const std::vector& axes, @@ -81,7 +81,7 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata)); // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape - ORT_ENFORCE(compute_metadata.output_dims_ == output_dims); + ORT_ENFORCE(gsl::make_span(compute_metadata.output_dims_) == output_dims); return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } @@ -91,13 +91,13 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const //set X const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); - const auto& x_dims = x_shape.GetDims(); + const auto x_dims = x_shape.GetDims(); s_.x_data = reinterpret_cast(X->template Data()); s_.element_size = X->DataType()->Size(); //set W const Tensor* W = context->Input(1); const TensorShape& w_shape = W->Shape(); - std::vector w_dims = w_shape.GetDims(); + auto w_dims = w_shape.GetDimsAsVector(); s_.w_data = reinterpret_cast(W->template Data()); //set B if (context->InputCount() >= 3) { @@ -188,7 +188,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const s_.y_data = reinterpret_cast(s_.Y->template 
MutableData()); } - std::vector x_dims_cudnn = x_dims; + std::vector x_dims_cudnn{x_dims.begin(), x_dims.end()}; std::vector y_dims_cudnn = !post_slicing_required ? y_dims : y_dims_with_adjusted_pads; if (rank < 2) { // TODO: Explore padding the provided input shape [N, C, D] to [N, C, 1, D] @@ -343,8 +343,8 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions // This may have lead to extra results that are unnecessary and hence we slice that off here if (s_.post_slicing_required) { - ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(), s_.y_data, s_.y_dims_with_adjusted_pads, - s_.Y->MutableDataRaw(), s_.y_dims, s_.slice_starts, + ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(), s_.y_data, gsl::make_span(s_.y_dims_with_adjusted_pads), + s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts, s_.slice_ends, s_.slice_axes, s_.element_size)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 1d037c20d8e5a..845f089a07ac4 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -114,11 +114,11 @@ struct CudnnConvState { cudnnHandle_t handle; // if x/w dims changed, update algo and cudnnTensors - std::vector last_x_dims; - std::vector last_w_dims; + TensorShape last_x_dims; + TensorShape last_w_dims; // these would be recomputed if x/w dims change - std::vector y_dims; + TensorShape y_dims; std::vector y_dims_with_adjusted_pads; size_t workspace_bytes; decltype(AlgoPerfType().algo) algo; @@ -194,9 +194,9 @@ class Conv : public CudaKernel { Status SliceOutUnwantedOutputSection(cudaStream_t stream, const void* input_data, - const std::vector& input_dims, + gsl::span input_dims, void* output_data, - const std::vector& output_dims, + gsl::span output_dims, std::vector starts, const std::vector& ends, const std::vector& axes, diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index ad8f9a1362100..dd1d7002cc45c 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -41,7 +41,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); - auto x_dims = x_shape.GetDims(); + auto x_dims = x_shape.GetDimsAsVector(); auto x_data = reinterpret_cast(X->template Data()); auto x_dimensions = X->Shape().NumDimensions(); @@ -52,7 +52,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ } const Tensor* W = context->Input(1); const TensorShape& w_shape = W->Shape(); - std::vector w_dims = w_shape.GetDims(); + std::vector w_dims = w_shape.GetDimsAsVector(); auto w_data = reinterpret_cast(W->template Data()); size_t num_inputs = OpKernel::Node().InputDefs().size(); @@ -81,7 +81,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ ConvTransposeAttributes::Prepare p; ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding)); - auto y_dims = p.Y->Shape().GetDims(); + auto y_dims = p.Y->Shape().GetDimsAsVector(); if (x_dimensions == 3) { y_dims.insert(y_dims.begin() + 2, 1); p.kernel_shape.insert(p.kernel_shape.begin(), 1); @@ -160,7 +160,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool 
dynamic_ // The following block will be executed in case there has been no change in the shapes of the // input and the filter compared to the previous run if (!y_data) { - auto y_dims = s_.y_dims; + auto y_dims = s_.y_dims.GetDimsAsVector(); if (x_dimensions == 3) { y_dims.erase(y_dims.begin() + 2); } diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc index 3360db8f25453..cd0a13418d6f9 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc @@ -95,10 +95,10 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co auto image_size = input_count / stats_count; CudnnTensor data_desc; - ORT_RETURN_IF_ERROR(data_desc.Set({1, stats_count, image_size, 1}, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(data_desc.Set(std::array{1, stats_count, image_size, 1}, CudnnTensor::GetDataType())); CudnnTensor stats_desc; - ORT_RETURN_IF_ERROR(stats_desc.Set({1, stats_count, 1, 1}, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(stats_desc.Set(std::array{1, stats_count, 1, 1}, CudnnTensor::GetDataType())); auto mean = GetScratchBuffer(stats_count); auto variance = GetScratchBuffer(stats_count); diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index 4e9d7cca74a8d..25606513390f1 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -125,7 +125,7 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); - const auto& x_dims = x_shape.GetDims(); + const auto x_dims = x_shape.GetDims(); if (x_shape.NumDimensions() < 3) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3."); @@ -151,8 +151,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { auto x_data = reinterpret_cast(X->template Data()); auto y_data = reinterpret_cast(Y->template MutableData()); - std::vector x_dims_cudnn = x_dims; - std::vector y_dims_cudnn = y_dims; + std::vector x_dims_cudnn(x_dims.begin(), x_dims.end()); + std::vector y_dims_cudnn(y_dims.begin(), y_dims.end()); if (kernel_shape.size() < 2) { // cudnn only takes 4D or 5D input, so pad dimensions if needed x_dims_cudnn.push_back(1); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc index 26ba679356fbf..85fb629ada722 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc @@ -90,7 +90,7 @@ optional> GetMinAndMaxContiguousAxes( ApplicableMatrixReduction get_applicable_matrix_reduction( const cudnnReduceTensorOp_t cudnn_reduce_op, - const std::vector& dims, const std::vector& original_axes, + gsl::span dims, gsl::span original_axes, int& m_out, int& n_out) { if (cudnn_reduce_op != CUDNN_REDUCE_TENSOR_ADD && cudnn_reduce_op != CUDNN_REDUCE_TENSOR_AVG) { return ApplicableMatrixReduction::None; @@ -151,7 +151,7 @@ ApplicableMatrixReduction get_applicable_matrix_reduction( // the axis index right after the last flattened into matrix rows const int64_t m_end_axis = axes_from_beginning ? 
max_axis + 1 : min_axis; - const TensorShape& shape = TensorShape::ReinterpretBaseType(new_dims); + const auto shape = TensorShape::FromExistingBuffer(new_dims); const auto m = shape.SizeToDimension(m_end_axis); const auto n = shape.SizeFromDimension(m_end_axis); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h index 1ffcffa1d0f97..40c879568f522 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h @@ -77,7 +77,7 @@ enum class ApplicableMatrixReduction { */ ApplicableMatrixReduction get_applicable_matrix_reduction( const cudnnReduceTensorOp_t cudnn_reduce_op, - const std::vector& dims, const std::vector& axes, + gsl::span dims, gsl::span axes, int& m, int& n); /** diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index ba1ca2b0390c1..88e3af5ff6852 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -186,7 +186,6 @@ Status ReduceKernel::ReduceKernelShared( } } - const auto& input_dims = input_shape.GetDims(); int64_t input_count = input_shape.Size(); IAllocatorUniquePtr temp_X; if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_FLATTENED_INDICES && std::is_same::value) { @@ -197,7 +196,7 @@ Status ReduceKernel::ReduceKernelShared( } // CUDNN requires at least 3D input, so pad 1s if needed - std::vector input_dims_cudnn = input_dims; + std::vector input_dims_cudnn = input_shape.GetDimsAsVector(); std::vector output_dims_cudnn = output_dims; if (rank < 3) { std::vector pads(3 - rank, 1); @@ -367,7 +366,7 @@ template Status ReduceKernel::ReduceKernelShared& axes, + gsl::span axes, PrepareReduceMetadata& prepare_reduce_metadata, const TensorShape* input_shape_override) { ORT_ENFORCE(nullptr != X); @@ -382,9 +381,8 @@ Status PrepareForReduce(const Tensor* X, const auto& input_dims = input_shape.GetDims(); std::vector reduced(rank, false); - prepare_reduce_metadata.output_dims.reserve(input_dims.size()); if (axes.size() > 0) { - prepare_reduce_metadata.output_dims = input_dims; + prepare_reduce_metadata.output_dims = input_shape.GetDimsAsVector(); for (auto axis : axes) { axis = HandleNegativeAxis(axis, rank); ORT_ENFORCE(input_dims[axis] != 0, @@ -396,6 +394,7 @@ Status PrepareForReduce(const Tensor* X, } } else { // no axes provided (i.e.) default axes => reduce on all dims + prepare_reduce_metadata.output_dims.reserve(input_dims.size()); for (auto dim : input_dims) { ORT_ENFORCE(keepdims || dim != 0, "Can't reduce on dim with value of 0 if 'keepdims' is false. 
" @@ -420,7 +419,7 @@ Status PrepareForReduce(const Tensor* X, } // CUDNN requires at least 3D input, so pad 1s if needed - prepare_reduce_metadata.input_dims_cudnn = input_dims; + prepare_reduce_metadata.input_dims_cudnn = input_shape.GetDimsAsVector(); prepare_reduce_metadata.output_dims_cudnn = prepare_reduce_metadata.output_dims; if (rank < 3) { std::vector pads(3 - rank, 1); @@ -437,7 +436,7 @@ Status PrepareForReduce(const Tensor* X, template Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op, - const std::vector& axes, + gsl::span axes, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override) { typedef typename ToCudaType::MappedType CudaT; @@ -947,7 +946,7 @@ namespace ReductionOps { template std::unique_ptr ReduceCompute(CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, + const Tensor& input, gsl::span axes, bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override) { PrepareReduceMetadata prepare_reduce_metadata; @@ -978,21 +977,21 @@ std::unique_ptr ReduceCompute(CUDAExecutionProvider& cuda_ep, cudnnReduc template std::unique_ptr ReduceCompute( CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, + const Tensor& input, gsl::span axes, bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override); template std::unique_ptr ReduceCompute( CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, + const Tensor& input, gsl::span axes, bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override); template std::unique_ptr ReduceCompute( CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, + const Tensor& input, gsl::span axes, bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h index 3bb1ea42ec8d6..406cae8e02d9c 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h @@ -17,7 +17,7 @@ namespace ReductionOps { template std::unique_ptr ReduceCompute(CUDAExecutionProvider& cuda_ep, cudnnReduceTensorOp_t cudnn_reduce_op, AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, + const Tensor& input, gsl::span axes, bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override = nullptr); @@ -219,14 +219,14 @@ class ReduceLogSumExp final : public ReduceKernel { Status PrepareForReduce(const Tensor* X, bool keepdims, - const std::vector& axes, + gsl::span axes, PrepareReduceMetadata& prepare_reduce_metadata, const TensorShape* input_shape_override = nullptr); template Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, 
PrepareReduceMetadata& prepare_reduce_metadata, /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op, - const std::vector& axes, + gsl::span axes, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override = nullptr); diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 1c4b497e88ce9..7d1fc1b495f33 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "core/providers/cuda/shared_inc/fast_divmod.h" @@ -63,6 +64,11 @@ struct TArray { memcpy(data_, vec.data(), vec.size() * sizeof(T)); } + TArray(gsl::span vec) : TArray(static_cast(vec.size())) { + static_assert(std::is_trivially_copyable::value, "T must be trivially copyable."); + memcpy(data_, vec.data(), vec.size() * sizeof(T)); + } + void SetSize(int32_t size) { ORT_ENFORCE( 0 <= size && size <= capacity, diff --git a/onnxruntime/core/providers/cuda/tensor/compress.cc b/onnxruntime/core/providers/cuda/tensor/compress.cc index 45281fb810851..3e746ae12a04d 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress.cc +++ b/onnxruntime/core/providers/cuda/tensor/compress.cc @@ -32,7 +32,7 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { const Tensor* input_tensor = ctx->Input(0); ORT_ENFORCE(input_tensor); size_t rank = input_tensor->Shape().NumDimensions(); - auto& input_dimensions = input_tensor->Shape().GetDims(); + auto input_dimensions = input_tensor->Shape().GetDims(); int64_t axis = 0; if (has_axis_) { axis = HandleNegativeAxis(axis_, rank); @@ -71,7 +71,7 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { int32_t positive_condition_count = 0; CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), cudaMemcpyDeviceToHost, Stream())); - std::vector output_dims(input_dimensions); + std::vector output_dims(input_dimensions.begin(), input_dimensions.end()); if (has_axis_) { output_dims[axis] = positive_condition_count; } else { diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc index 4d58e4fb40631..411336acd4c27 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand.cc +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -74,8 +74,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - output_dims = output_shape.GetDims(); - auto input_dims = input_data_tensor.Shape().GetDims(); + output_dims = output_shape.GetDimsAsVector(); + auto input_dims = input_data_tensor.Shape().GetDimsAsVector(); CalcEffectiveDims(input_dims, output_dims); int rank = gsl::narrow_cast(output_dims.size()); diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.cc b/onnxruntime/core/providers/cuda/tensor/eye_like.cc index 95fed4c6018d1..ab26bbb95e51f 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like.cc +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc @@ -42,7 +42,7 @@ Status EyeLike::ComputeInternal(OpKernelContext* context) const { const auto* T1 = context->Input(0); ORT_ENFORCE(T1 != nullptr); - const std::vector& input_dims = T1->Shape().GetDims(); + auto input_dims = T1->Shape().GetDims(); if (input_dims.size() != 2) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, "EyeLike : Input tensor dimension is not 2"); } diff --git 
a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc index f4f4004986f24..791f508e4b7e3 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc @@ -51,13 +51,13 @@ NONZERO_TYPED_KERNEL(float) template Status NonZero::ComputeInternal(OpKernelContext* context) const { - static const std::vector kScalarDims = {1}; + static const TensorShape kScalarDims{1}; const auto x = context->Input(0); int nonzero_elements = 0; const auto& x_shape = x->Shape(); const int x_rank = x_shape.IsScalar() ? 1 : static_cast(x_shape.NumDimensions()); - const std::vector& x_dims = (x_shape.IsScalar()) ? kScalarDims : x_shape.GetDims(); + auto x_dims = (x_shape.IsScalar()) ? kScalarDims.GetDims() : x_shape.GetDims(); const int64_t x_size = x_shape.Size(); if (x_size > 0) { auto x_data = reinterpret_cast::MappedType*>(x->template Data()); diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index fb406aeb80fe5..3aa61407e56f6 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -91,7 +91,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { std::vector slices; if (is_dynamic_) { const Tensor& pads_tensor = *ctx->Input(1); - const std::vector& pads_tensor_dims = pads_tensor.Shape().GetDims(); + const auto pads_tensor_dims = pads_tensor.Shape().GetDims(); ORT_ENFORCE(utils::IsPrimitiveDataType(pads_tensor.DataType()), "Pads tensor should be an INT64 tensor"); ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1), @@ -132,7 +132,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { TArray input_dims(input_shape.GetDims()); TArray input_strides(input_pitches); - std::vector output_dims(input_shape.GetDims()); + std::vector output_dims(input_shape.GetDimsAsVector()); ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); // Calculate output dimensions, and handle any negative padding diff --git a/onnxruntime/core/providers/cuda/tensor/slice.cc b/onnxruntime/core/providers/cuda/tensor/slice.cc index 1619efa3085ff..df768932ba3bf 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.cc +++ b/onnxruntime/core/providers/cuda/tensor/slice.cc @@ -130,7 +130,7 @@ static Status ComputeSliceStrides(const TensorShape& input_shape, aggregated_last_dim *= input_dimensions[i]; } - auto flattened_input_dims(input_dimensions); + std::vector flattened_input_dims(input_dimensions.begin(), input_dimensions.end()); flattened_input_dims.resize(dimension_count); flattened_input_dims.back() = aggregated_last_dim; ORT_ENFORCE(TensorPitches::Calculate(input_strides_span, flattened_input_dims)); diff --git a/onnxruntime/core/providers/cuda/tensor/split.cc b/onnxruntime/core/providers/cuda/tensor/split.cc index 2b26efccdbad7..48a96427c9893 100644 --- a/onnxruntime/core/providers/cuda/tensor/split.cc +++ b/onnxruntime/core/providers/cuda/tensor/split.cc @@ -63,8 +63,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { auto input_data = input_tensor->DataRaw(); - auto& input_dims = input_shape.GetDims(); - std::vector output_dimensions{input_dims}; + auto input_dims = input_shape.GetDims(); + std::vector output_dimensions{input_shape.GetDimsAsVector()}; CudaAsyncBuffer output_ptr(this, num_outputs); gsl::span output_ptr_span = output_ptr.CpuSpan(); diff --git 
a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc index a226c93e5fb8a..9fff9f6cba286 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile.cc +++ b/onnxruntime/core/providers/cuda/tensor/tile.cc @@ -53,8 +53,8 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { // Calculate the shape of the output tensor auto* repeats = repeats_tensor.template Data(); const auto& input_shape = input_tensor.Shape(); - const auto& input_dims = input_shape.GetDims(); - std::vector output_dims(input_dims); + const auto input_dims = input_shape.GetDims(); + std::vector output_dims(input_shape.GetDimsAsVector()); for (auto axis = 0; axis < rank; axis++) output_dims[axis] *= repeats[axis]; TensorShape output_shape(output_dims); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 80fd55319b49a..99922b88c113f 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -90,16 +90,16 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, if (output.Shape().Size() == 0) return Status::OK(); - const std::vector& input_dims = input_shape_override ? input_shape_override->GetDims() : input.Shape().GetDims(); - const std::vector& output_dims = output.Shape().GetDims(); + const auto input_dims = input_shape_override ? input_shape_override->GetDims() : input.Shape().GetDims(); + const auto output_dims = output.Shape().GetDims(); auto rank = static_cast(input_dims.size()); // flatten the adjacent dimensions which are contiguous // for example: permutations[0, 2, 3, 1] -> [0, 2, 1], permutations[0, 3, 1, 2] -> [0, 2, 1] auto new_rank = rank; std::vector new_permutations(permutations); - std::vector new_input_dims(input_dims); - std::vector new_output_dims(output_dims); + std::vector new_input_dims(input_dims.begin(), input_dims.end()); + std::vector new_output_dims(output_dims.begin(), output_dims.end()); for (auto i = rank - 1; i > 0; i--) { auto curr = new_permutations[i]; @@ -228,8 +228,7 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const { if (X_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& X = *X_ptr; const TensorShape& input_shape = X.Shape(); - const std::vector& input_dims = input_shape.GetDims(); - int32_t rank = gsl::narrow_cast(input_dims.size()); + int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); std::vector output_dims(rank); std::vector default_perm(rank); diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index 150787f92e943..507d959601dff 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -43,7 +43,7 @@ Status Upsample::BaseCompute(OpKernelContext* context, const std::vector& scales, const std::vector& output_dims) const { const Tensor* X = context->Input(0); - const std::vector& X_dims = X->Shape().GetDims(); + auto X_dims = X->Shape().GetDims(); int32_t rank = static_cast(X_dims.size()); ORT_ENFORCE(output_dims.size() == rank, "Rank of input and output tensor should be same."); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 9a9b48a47b1da..57cfbb12295c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -422,7 +422,7 @@ namespace Dml onnxruntime::Tensor* tensor = kernelContext->Output( static_cast(i), - onnxruntime::TensorShape::ReinterpretBaseType(outputDims) + onnxruntime::TensorShape::FromExistingBuffer(outputDims) ); uint64_t allocId; diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_codegen_ctx.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_codegen_ctx.cc index 91d274f05d423..43e640d545737 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_codegen_ctx.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_codegen_ctx.cc @@ -113,7 +113,7 @@ static const Tensor* Marshalling( // input const auto& tensor_shape = original_initializer->Shape(); - auto input_shape = tensor_shape.GetDims(); + auto input_shape = tensor_shape.GetDimsAsVector(); if (input_shape.empty()) input_shape.push_back(1); const void* input_data = original_initializer->DataRaw(); diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc index 5c4540dba6ee2..2859a42d0e701 100644 --- a/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc +++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_op_ir_builder.cc @@ -91,7 +91,7 @@ bool CreateScalarTensorFromInitializer(const Tensor* tensor, std::string normalized_name = NormalizeCppName(name); auto tvm_tensor = tvm::compute( - tvm_codegen::ToTvmArray(tensor->Shape().GetDims()), + tvm_codegen::ToTvmArray(tensor->Shape().GetDimsAsVector()), [&](const tvm::Array&) { return constant_scalar; }, @@ -120,7 +120,7 @@ const tvm::Tensor& GetOrCreateInitializer(const std::string& name, DLDataType dtype = tvm_codegen::ToTvmDLDataType(ONNXRUNTIME_data_type); HalideIR::Type halide_type((halideir_type_code_t)dtype.code, dtype.bits, dtype.lanes); std::string normalized_name = NormalizeCppName(name); - auto tvm_shape = tvm_codegen::ToTvmArray(tensor->Shape().GetDims()); + auto tvm_shape = tvm_codegen::ToTvmArray(tensor->Shape().GetDimsAsVector()); auto tvm_tensor = CreateInputPlaceholder(tvm_shape, halide_type, normalized_name, is_sliced); // create the layout info ctx_codegen.CreateWeightLayoutInfo(name, tvm_tensor); diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc index b56f2f673a50f..16f54665409e1 100644 --- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc +++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc @@ -377,7 +377,7 @@ Status NupharExecutionProvider::SaveInitializer( for (int i = 0; i < dims.size(); ++i) shape_dims[i] = dims[i]; - const TensorShape& shape = TensorShape::ReinterpretBaseType(shape_dims); + const auto shape = TensorShape::FromExistingBuffer(shape_dims); auto data_type = OrtTypeInfo::ElementTypeFromProto(proto->data_type()); auto t = std::make_unique( data_type, diff --git a/onnxruntime/core/providers/nuphar/runtime/control_flow/scan_exec_ctx.cc b/onnxruntime/core/providers/nuphar/runtime/control_flow/scan_exec_ctx.cc index fd14af2c9d69c..03da39c8f86d5 100644 --- a/onnxruntime/core/providers/nuphar/runtime/control_flow/scan_exec_ctx.cc +++ b/onnxruntime/core/providers/nuphar/runtime/control_flow/scan_exec_ctx.cc @@ -256,7 +256,7 @@ void ScanExecCtx::InitContext(KernelComputeCtx* kernel_compute_ctx, // if ith variable is a state output, we just call OutputData2 API with realized_shape output_data = 
kernel_compute_ctx->OutputData(func_info, ort_output_idx, - TensorShape::ReinterpretBaseType(realized_shape), + TensorShape::FromExistingBuffer(realized_shape), data_type); // set current_ort_state_output_ptrs_ as ort_state_input_buffers_ @@ -278,7 +278,7 @@ void ScanExecCtx::InitContext(KernelComputeCtx* kernel_compute_ctx, output_data = kernel_compute_ctx->OutputData(func_info, ort_output_idx, - TensorShape::ReinterpretBaseType(shape), + TensorShape::FromExistingBuffer(shape), data_type); // Check whether it is backward Scan @@ -319,7 +319,7 @@ void ScanExecCtx::InitContext(KernelComputeCtx* kernel_compute_ctx, ort_state_output_buffers_[ort_state_idx] = kernel_compute_ctx->OutputData(func_info, ort_state_idx, - TensorShape::ReinterpretBaseType(dl_output_shapes[tvm_output_idx]), + TensorShape::FromExistingBuffer(dl_output_shapes[tvm_output_idx]), data_type); state_bytes_size_[ort_state_idx] = BytesOfShape(dl_output_shapes[tvm_output_idx], data_type); } @@ -470,7 +470,7 @@ void ScanExecCtx::UpdateContext(KernelComputeCtx* kernel_compute_ctx, if (ort_output_idx < gsl::narrow(num_state_variables)) { output_data = kernel_compute_ctx->OutputData(func_info, ort_output_idx, - TensorShape::ReinterpretBaseType(ort_output_shape), + TensorShape::FromExistingBuffer(ort_output_shape), data_type); // set current_ort_state_output_ptrs_ as ort_state_input_buffers_ // Note it is "ort_state_input_buffers_", since we will perform double buffering later. @@ -482,7 +482,7 @@ void ScanExecCtx::UpdateContext(KernelComputeCtx* kernel_compute_ctx, ort_output_shape[output_scan_axis] = seq_length_; output_data = kernel_compute_ctx->OutputData(func_info, ort_output_idx, - TensorShape::ReinterpretBaseType(ort_output_shape), + TensorShape::FromExistingBuffer(ort_output_shape), data_type); // Check whether it is backward Scan // If so, we need to use the last frame, instead of the first frame. 
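// --- Reviewer annotation (illustrative sketch, not part of the patch) ---------
// The Scan/runtime call sites above swap ReinterpretBaseType for
// TensorShape::FromExistingBuffer. A minimal usage sketch, assuming only the
// TensorShape API introduced by this change; the function name below is
// hypothetical.

#include <vector>
#include "core/framework/tensor_shape.h"

static void FromExistingBufferSketch() {
  std::vector<int64_t> dims{2, 3, 4};  // buffer owned by the caller
  // Wraps dims.data() without copying, so 'dims' must stay alive (and unresized)
  // for as long as 'shape' is in use.
  const onnxruntime::TensorShape shape = onnxruntime::TensorShape::FromExistingBuffer(dims);
  // Copy-constructing from it produces a normal, owning TensorShape again.
  onnxruntime::TensorShape owning_copy = shape;
  (void)owning_copy;
}
// ------------------------------------------------------------------------------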
@@ -516,7 +516,7 @@ void ScanExecCtx::UpdateContext(KernelComputeCtx* kernel_compute_ctx, ort_state_output_buffers_[ort_state_idx] = kernel_compute_ctx->OutputData(func_info, ort_state_idx, - TensorShape::ReinterpretBaseType(dl_output_shapes[tvm_output_idx]), + TensorShape::FromExistingBuffer(dl_output_shapes[tvm_output_idx]), data_type); state_bytes_size_[ort_state_idx] = BytesOfShape(dl_output_shapes[tvm_output_idx], data_type); } diff --git a/onnxruntime/core/providers/nuphar/runtime/sequential/basic.cc b/onnxruntime/core/providers/nuphar/runtime/sequential/basic.cc index 062287d0cf33a..0271adfe5e515 100644 --- a/onnxruntime/core/providers/nuphar/runtime/sequential/basic.cc +++ b/onnxruntime/core/providers/nuphar/runtime/sequential/basic.cc @@ -49,7 +49,7 @@ void BasicExecBlock::Run(KernelComputeCtx* kernel_compute_ctx) { int ort_output_idx = p.first; size_t tvm_idx = p.second; size_t tvm_output_idx = tvm_idx - func_info_->func_input_count; - const TensorShape& shape = TensorShape::ReinterpretBaseType(dl_output_shapes[tvm_output_idx]); + const TensorShape shape = TensorShape::FromExistingBuffer(dl_output_shapes[tvm_output_idx]); MLDataType dtype = output_metas[tvm_output_idx].dtype; void* dst = kernel_compute_ctx->OutputData(func_info_, ort_output_idx, shape, dtype); void* src = dl_tensors[tvm_idx].data; @@ -125,7 +125,7 @@ void BasicExecBlock::InitContext(KernelComputeCtx* kernel_compute_ctx) const { MLDataType data_type = output_meta.dtype; void* output_data = kernel_compute_ctx->OutputData(func_info_, ort_output_idx, - TensorShape::ReinterpretBaseType(realized_output_shape), + TensorShape::FromExistingBuffer(realized_output_shape), data_type); ORT_ENFORCE_DEBUG(kernel_compute_ctx->GetRuntimeHandle()->allow_unaligned_buffers || @@ -183,7 +183,7 @@ void BasicExecBlock::UpdateContext(KernelComputeCtx* kernel_compute_ctx) const { // update pointer dl_tensor.data = kernel_compute_ctx->OutputData(func_info_, ort_output_idx, - TensorShape::ReinterpretBaseType(dl_output_shapes[tvm_output_idx]), + TensorShape::FromExistingBuffer(dl_output_shapes[tvm_output_idx]), output_meta.dtype); ++tvm_output_idx; } diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc index 7d38d5d09d69e..4110b3f5e7ab2 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.cc +++ b/onnxruntime/core/providers/rocm/miopen_common.cc @@ -26,7 +26,7 @@ Status MiopenTensor::CreateTensorIfNeeded() { return Status::OK(); } -Status MiopenTensor::Set(const std::vector& input_dims, miopenDataType_t dataType) { +Status MiopenTensor::Set(gsl::span input_dims, miopenDataType_t dataType) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); int rank = gsl::narrow_cast(input_dims.size()); diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h index 8d73ce3f98198..6d17c216b2f83 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.h +++ b/onnxruntime/core/providers/rocm/miopen_common.h @@ -24,7 +24,7 @@ class MiopenTensor final { ~MiopenTensor(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MiopenTensor); - Status Set(const std::vector& input_dims, miopenDataType_t dataType); + Status Set(gsl::span input_dims, miopenDataType_t dataType); Status Set(const MiopenTensor& x_desc, miopenBatchNormMode_t mode); operator miopenTensorDescriptor_t() const { return tensor_; } diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc index 1675bfe850655..7e5bea700f52e 100644 --- 
a/onnxruntime/core/providers/rocm/nn/conv.cc +++ b/onnxruntime/core/providers/rocm/nn/conv.cc @@ -67,9 +67,9 @@ size_t GetMaxWorkspaceSize(const MiopenConvState& s, } Status SliceOutUnwantedOutputSection(hipStream_t stream, - const void* input_data, const std::vector& input_dims, + const void* input_data, gsl::span input_dims, void* output_data, - const std::vector& output_dims, + gsl::span output_dims, std::vector starts, const std::vector& ends, const std::vector& axes, @@ -79,7 +79,7 @@ Status SliceOutUnwantedOutputSection(hipStream_t stream, ORT_THROW_IF_ERROR(SliceBase::PrepareForCompute(starts, ends, axes, compute_metadata)); // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape - ORT_ENFORCE(compute_metadata.output_dims_ == output_dims); + ORT_ENFORCE(gsl::make_span(compute_metadata.output_dims_) == output_dims); return SliceRocm::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } @@ -89,13 +89,13 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const //set X const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); - const auto& x_dims = x_shape.GetDims(); + const auto x_dims = x_shape.GetDims(); s_.x_data = reinterpret_cast(X->template Data()); s_.element_size = X->DataType()->Size(); //set W const Tensor* W = context->Input(1); const TensorShape& w_shape = W->Shape(); - std::vector w_dims = w_shape.GetDims(); + auto w_dims = w_shape.GetDimsAsVector(); s_.w_data = reinterpret_cast(W->template Data()); //set B if (context->InputCount() >= 3) { @@ -186,7 +186,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const s_.y_data = reinterpret_cast(s_.Y->template MutableData()); } - std::vector x_dims_miopen = x_dims; + std::vector x_dims_miopen{x_dims.begin(), x_dims.end()}; std::vector y_dims_miopen = !post_slicing_required ? y_dims : y_dims_with_adjusted_pads; if (rank < 2) { // TODO: Remove asym padding correction. 
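// --- Reviewer annotation (illustrative sketch, not part of the patch) ---------
// The conv hunks above alternate between the non-owning view returned by
// GetDims() and an explicit owning vector. A short sketch of the two idioms,
// assuming GetDims() returns gsl::span<const int64_t> as introduced by this
// change; the function name below is hypothetical.

#include <vector>
#include "core/framework/tensor_shape.h"

static void DimsViewVsCopySketch(const onnxruntime::TensorShape& x_shape) {
  gsl::span<const int64_t> x_dims = x_shape.GetDims();             // cheap, non-owning view
  std::vector<int64_t> x_dims_copy{x_dims.begin(), x_dims.end()};  // explicit owning copy (as for x_dims_miopen)
  std::vector<int64_t> same_copy = x_shape.GetDimsAsVector();      // equivalent convenience helper
  x_dims_copy.push_back(1);  // mutation (e.g. padding rank up to 3 for MIOpen) requires the owning copy
  (void)same_copy;
}
// ------------------------------------------------------------------------------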
@@ -305,7 +305,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { // This may have lead to extra results that are unnecessary and hence we slice that off here if (s_.post_slicing_required) { ORT_RETURN_IF_ERROR(SliceOutUnwantedOutputSection(Stream(), s_.y_data, s_.y_dims_with_adjusted_pads, - s_.Y->MutableDataRaw(), s_.y_dims, s_.slice_starts, + s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts, s_.slice_ends, s_.slice_axes, s_.element_size)); } return Status::OK(); diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h index 1087cd05d3272..4431aa2eee98a 100644 --- a/onnxruntime/core/providers/rocm/nn/conv.h +++ b/onnxruntime/core/providers/rocm/nn/conv.h @@ -114,11 +114,11 @@ struct MiopenConvState { miopenHandle_t handle; // if x/w dims changed, update algo and miopenTensors - std::vector last_x_dims; - std::vector last_w_dims; + TensorShape last_x_dims; + TensorShape last_w_dims; // these would be recomputed if x/w dims change - std::vector y_dims; + TensorShape y_dims; std::vector y_dims_with_adjusted_pads; size_t workspace_bytes; decltype(AlgoPerfType().bwd_data_algo) bwd_data_algo; @@ -200,9 +200,9 @@ class Conv : public RocmKernel { Status SliceOutUnwantedOutputSection(hipStream_t stream, const void* input_data, - const std::vector& input_dims, + gsl::span input_dims, void* output_data, - const std::vector& output_dims, + gsl::span output_dims, std::vector starts, const std::vector& ends, const std::vector& axes, diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc index 9b9b0f189b6ab..c39b92b98776e 100644 --- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc @@ -42,7 +42,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); - auto x_dims = x_shape.GetDims(); + auto x_dims = x_shape.GetDimsAsVector(); auto x_data = reinterpret_cast(X->template Data()); auto x_dimensions = X->Shape().NumDimensions(); @@ -53,7 +53,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ } const Tensor* W = context->Input(1); const TensorShape& w_shape = W->Shape(); - std::vector w_dims = w_shape.GetDims(); + std::vector w_dims = w_shape.GetDimsAsVector(); auto w_data = reinterpret_cast(W->template Data()); size_t num_inputs = OpKernel::Node().InputDefs().size(); @@ -82,7 +82,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ ConvTransposeAttributes::Prepare p; ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding)); - auto y_dims = p.Y->Shape().GetDims(); + auto y_dims = p.Y->Shape().GetDimsAsVector(); if (x_dimensions == 3) { y_dims.insert(y_dims.begin() + 2, 1); p.kernel_shape.insert(p.kernel_shape.begin(), 1); @@ -159,7 +159,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ // The following block will be executed in case there has been no change in the shapes of the // input and the filter compared to the previous run if (!y_data) { - auto y_dims = s_.y_dims; + auto y_dims = s_.y_dims.GetDimsAsVector(); if (x_dimensions == 3) { y_dims.erase(y_dims.begin() + 2); } diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index a8f6dee0d15ae..928a6fe3d25ab 100644 --- 
a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -183,7 +183,6 @@ Status ReduceKernel::ReduceKernelShared( } } - const auto& input_dims = input_shape.GetDims(); int64_t input_count = input_shape.Size(); IAllocatorUniquePtr temp_X; if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES && std::is_same::value) { @@ -194,7 +193,7 @@ Status ReduceKernel::ReduceKernelShared( } // MIOpen requires at least 3D input, so pad 1s if needed - std::vector input_dims_miopen = input_dims; + std::vector input_dims_miopen = input_shape.GetDimsAsVector(); std::vector output_dims_miopen = output_dims; if (rank < 3) { std::vector pads(3 - rank, 1); @@ -383,7 +382,7 @@ Status PrepareForReduce(const Tensor* X, std::vector reduced(rank, false); prepare_reduce_metadata.output_dims.reserve(input_dims.size()); if (axes.size() > 0) { - prepare_reduce_metadata.output_dims = input_dims; + prepare_reduce_metadata.output_dims = input_shape.GetDimsAsVector(); for (auto axis : axes) { axis = HandleNegativeAxis(axis, rank); ORT_ENFORCE(input_dims[axis] != 0, @@ -419,7 +418,7 @@ Status PrepareForReduce(const Tensor* X, } // MIOpen requires at least 3D input, so pad 1s if needed - prepare_reduce_metadata.input_dims_miopen = input_dims; + prepare_reduce_metadata.input_dims_miopen = input_shape.GetDimsAsVector(); prepare_reduce_metadata.output_dims_miopen = prepare_reduce_metadata.output_dims; if (rank < 3) { std::vector pads(3 - rank, 1); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 87f7b9715748b..e3dbfd1cc25f4 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -212,20 +212,27 @@ Status IDataTransfer::CopySparseTensors(const std::vector& src const Node& OpKernel::Node() const { return g_host->OpKernel__Node(this); } -TensorShape::TensorShape(const int64_t* dimension_sizes, size_t dimension_count) - : std::vector(dimension_count) { - for (size_t i = 0; i < dimension_count; ++i) { - (*this)[i] = dimension_sizes[i]; - } +TensorShape::TensorShape(gsl::span dims) { + Allocate(dims.size()); + gsl::copy(dims, values_); +} + +TensorShape& TensorShape::operator=(const TensorShape& other) { + g_host->TensorShape__operator_assign(this, other); + return *this; } -TensorShape::TensorShape(const std::vector& dims, size_t start, size_t end) { - assign(dims.begin() + start, dims.begin() + end); +TensorShape& TensorShape::operator=(TensorShape&& other) { + g_host->TensorShape__operator_move_assign(this, std::move(other)); + return *this; +} + +void TensorShape::Allocate(size_t size) { + g_host->TensorShape__Allocate(this, size); } int64_t TensorShape::Size() const { - size_t arraySize = size(); - int64_t size = SizeHelper(0, arraySize); + int64_t size = SizeHelper(0, values_.size()); //should we cache the size? as multiple operation may be expensive. return size; } @@ -235,12 +242,8 @@ int64_t TensorShape::SizeHelper(size_t start, size_t end) const { } TensorShape TensorShape::Slice(size_t dimstart, size_t dimend) const { - assert(dimstart <= dimend && dimend <= size()); // "Invalid tensor shape slice argument." 
- return TensorShape(*this, dimstart, dimend); -} - -TensorShape TensorShape::Slice(size_t dimstart) const { - return Slice(dimstart, size()); + assert(dimstart <= dimend && dimend <= values_.size()); // "Invalid tensor shape slice argument." + return TensorShape(GetDims().subspan(dimstart, dimend - dimstart)); } std::string TensorShape::ToString() const { diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 8ede33c9f9992..cc30736c3de4b 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -220,6 +220,9 @@ struct ProviderHost { virtual std::string Status__ToString(const Status* p) = 0; // TensorShape + virtual void TensorShape__operator_assign(TensorShape* p, const TensorShape& other) = 0; + virtual void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) = 0; + virtual void TensorShape__Allocate(TensorShape* p, size_t size) = 0; virtual int64_t TensorShape__SizeHelper(const TensorShape* p, size_t start, size_t end) = 0; virtual std::string TensorShape__ToString(const TensorShape* p) = 0; virtual int64_t TensorShape__SizeToDimension(const TensorShape* p, size_t dimension) = 0; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index a0d60d7d5d0f4..05a985c9add97 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -282,6 +282,9 @@ struct ProviderHostImpl : ProviderHost { std::string Status__ToString(const Status* p) override { return p->Status::ToString(); } // TensorShape (direct) + void TensorShape__operator_assign(TensorShape* p, const TensorShape& other) override { p->TensorShape::operator=(other); } + void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) override { p->TensorShape::operator=(std::move(other)); } + void TensorShape__Allocate(TensorShape* p, size_t size) override { p->TensorShape::Allocate(size); } int64_t TensorShape__SizeHelper(const TensorShape* p, size_t start, size_t end) override { return p->TensorShape::SizeHelper(start, end); } std::string TensorShape__ToString(const TensorShape* p) override { return p->TensorShape::ToString(); } int64_t TensorShape__SizeToDimension(const TensorShape* p, size_t dimension) override { return p->TensorShape::SizeToDimension(dimension); } diff --git a/onnxruntime/gsl/gsl-lite.hpp b/onnxruntime/gsl/gsl-lite.hpp index 40cb6825cc0b7..e6191e98ee705 100644 --- a/onnxruntime/gsl/gsl-lite.hpp +++ b/onnxruntime/gsl/gsl-lite.hpp @@ -1838,6 +1838,11 @@ gsl_DISABLE_MSVC_WARNINGS(26410 26415 26418 26472 26439 26440 26473 26481 26482 return first_; } + gsl_api gsl_constexpr reference back() const { + Expects(size()>0); + return last_[-1]; + } + // 26.7.3.6 Iterator support [span.iterators] gsl_api gsl_constexpr iterator begin() const gsl_noexcept { diff --git a/onnxruntime/test/common/tensor_op_test_utils.h b/onnxruntime/test/common/tensor_op_test_utils.h index 2380a67242def..7a7c9b512b3c0 100644 --- a/onnxruntime/test/common/tensor_op_test_utils.h +++ b/onnxruntime/test/common/tensor_op_test_utils.h @@ -17,7 +17,7 @@ namespace onnxruntime { namespace test { namespace detail { -inline int64_t SizeFromDims(const std::vector& dims) { +inline int64_t SizeFromDims(gsl::span dims) { const int64_t size = std::accumulate( dims.cbegin(), dims.cend(), static_cast(1), std::multiplies{}); ORT_ENFORCE(size 
>= 0); @@ -41,7 +41,7 @@ class RandomValueGenerator { typename std::enable_if< std::is_floating_point::value, std::vector>::type - Uniform(const std::vector& dims, TFloat min, TFloat max) { + Uniform(gsl::span dims, TFloat min, TFloat max) { std::vector val(detail::SizeFromDims(dims)); std::uniform_real_distribution distribution(min, max); for (size_t i = 0; i < val.size(); ++i) { @@ -55,7 +55,7 @@ class RandomValueGenerator { typename std::enable_if< std::is_integral::value && !utils::IsByteType::value, std::vector>::type - Uniform(const std::vector& dims, TInt min, TInt max) { + Uniform(gsl::span dims, TInt min, TInt max) { std::vector val(detail::SizeFromDims(dims)); std::uniform_int_distribution distribution(min, max - 1); for (size_t i = 0; i < val.size(); ++i) { @@ -68,7 +68,7 @@ class RandomValueGenerator { typename std::enable_if< utils::IsByteType::value, std::vector>::type - Uniform(const std::vector& dims, TByte min, TByte max) { + Uniform(gsl::span dims, TByte min, TByte max) { std::vector val(detail::SizeFromDims(dims)); std::uniform_int_distribution distribution(min, max - 1); for (size_t i = 0; i < val.size(); ++i) { @@ -82,7 +82,7 @@ class RandomValueGenerator { typename std::enable_if< std::is_floating_point::value, std::vector>::type - Gaussian(const std::vector& dims, TFloat mean, TFloat stddev) { + Gaussian(gsl::span dims, TFloat mean, TFloat stddev) { std::vector val(detail::SizeFromDims(dims)); std::normal_distribution distribution(mean, stddev); for (size_t i = 0; i < val.size(); ++i) { diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index 046f92b737946..b6a3d4461fb99 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -46,7 +46,7 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](T& zp) { - zp = static_cast(random.Uniform({1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform({B_dims.back()}, -0.1f, 0.1f); diff --git a/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc b/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc index 761104cfa15c4..7bf3fc676d1ba 100644 --- a/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc +++ b/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc @@ -111,7 +111,7 @@ void ProcessInputs(const std::vector& input_dims, const std::vector& std::vector& modified_input_dims, std::vector& input_vals) { auto rank = input_dims.size(); ORT_ENFORCE(rank >= 1); - int64_t size0 = TensorShape::ReinterpretBaseType(input_dims).SizeHelper(0, rank); + int64_t size0 = TensorShape::FromExistingBuffer(input_dims).SizeHelper(0, rank); std::vector input_vals_raw(common_input_vals.cbegin(), common_input_vals.cbegin() + size0); input_vals.resize(size0); @@ -123,7 +123,7 @@ void ProcessInputs(const std::vector& input_dims, const std::vector& if (trans_flag) { modified_input_dims[rank - 1] = input_dims[rank - 2]; modified_input_dims[rank - 2] = input_dims[rank - 1]; - auto batch_size = TensorShape::ReinterpretBaseType(input_dims).SizeHelper(0, rank - 2); + auto batch_size = TensorShape::FromExistingBuffer(input_dims).SizeHelper(0, rank - 2); Transpose(input_vals_raw, input_vals, batch_size, input_dims[rank - 2], input_dims[rank - 1]); } else { 
input_vals = input_vals_raw; diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 9cae4acb9d01b..a84c5fa77e2ef 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -44,7 +44,7 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, return static_cast(v); }); - std::vector A_scale = random.Uniform({1}, -0.1f, 0.1f); + std::vector A_scale = random.Uniform(std::array{1}, -0.1f, 0.1f); std::vector A_zero_point{127}; int64_t b_scale_zp_size = per_column ? B_dims.back() : 1; @@ -54,7 +54,7 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, std::for_each(B_zero_point.begin(), B_zero_point.end(), [&random](T& zp) { - zp = static_cast(random.Uniform({1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); + zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]); }); std::vector Bias = random.Uniform({B_dims.back()}, -0.1f, 0.1f); diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc index 957c48dc1e06d..7979020856486 100644 --- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc @@ -710,7 +710,7 @@ void TestQuantizedAttentionPastState(int64_t batch, std::vector weight_zero_point(weight_scale_zp_size); for (auto& zp : weight_zero_point) { - zp = static_cast(random.Uniform({1}, weight_min, weight_max)[0]); + zp = static_cast(random.Uniform(std::array{1}, weight_min, weight_max)[0]); } WeightT weight_mean = (weight_min + weight_max) / 2 + 1; diff --git a/onnxruntime/test/contrib_ops/quantize_lstm_op_test.cc b/onnxruntime/test/contrib_ops/quantize_lstm_op_test.cc index 6cd9fb191f070..af197737923cc 100644 --- a/onnxruntime/test/contrib_ops/quantize_lstm_op_test.cc +++ b/onnxruntime/test/contrib_ops/quantize_lstm_op_test.cc @@ -209,12 +209,12 @@ static void RunQuantLSTM(int64_t input_size, // X int64_t seq_len = 1; // only use seq length 1 to model the test std::vector X_dims = {seq_len, batch_size, input_size}; - std::vector X_data = rand_gen.Gaussian({seq_len, batch_size, input_size}, 0.0f, 0.25f); + std::vector X_data = rand_gen.Gaussian(std::array{seq_len, batch_size, input_size}, 0.0f, 0.25f); test.AddInput("X", X_dims, X_data); // W std::vector W_dims = {num_directions, input_size, 4 * hidden_size}; - std::vector W_data = rand_gen.Gaussian({num_directions, 4 * hidden_size, input_size}, 0.0f, 0.25f); + std::vector W_data = rand_gen.Gaussian(std::array{num_directions, 4 * hidden_size, input_size}, 0.0f, 0.25f); std::vector w_scale; std::vector w_zp; @@ -224,7 +224,7 @@ static void RunQuantLSTM(int64_t input_size, // R std::vector R_dims = {num_directions, hidden_size, 4 * hidden_size}; - std::vector R_data = rand_gen.Gaussian({num_directions, 4 * hidden_size, hidden_size}, 0.0f, 0.25f); + std::vector R_data = rand_gen.Gaussian(std::array{num_directions, 4 * hidden_size, hidden_size}, 0.0f, 0.25f); std::vector r_scale; std::vector r_zp; @@ -384,12 +384,12 @@ TEST(DynamicQuantLSTMTest, SharedPrepackedWeights) { // X int64_t seq_len = 1; // only use seq length 1 to model the test std::vector X_dims = {seq_len, batch_size, input_size}; - std::vector X_data = rand_gen.Gaussian({seq_len, batch_size, input_size}, 0.0f, 0.25f); + std::vector X_data = rand_gen.Gaussian(std::array{seq_len, batch_size, 
input_size}, 0.0f, 0.25f); test.AddInput("X", X_dims, X_data); // W std::vector W_dims = {num_directions, input_size, 4 * hidden_size}; - std::vector W_data = rand_gen.Gaussian({num_directions, 4 * hidden_size, input_size}, 0.0f, 0.25f); + std::vector W_data = rand_gen.Gaussian(std::array{num_directions, 4 * hidden_size, input_size}, 0.0f, 0.25f); std::vector w_scale; std::vector w_zp; @@ -399,7 +399,7 @@ TEST(DynamicQuantLSTMTest, SharedPrepackedWeights) { // R std::vector R_dims = {num_directions, hidden_size, 4 * hidden_size}; - std::vector R_data = rand_gen.Gaussian({num_directions, 4 * hidden_size, hidden_size}, 0.0f, 0.25f); + std::vector R_data = rand_gen.Gaussian(std::array{num_directions, 4 * hidden_size, hidden_size}, 0.0f, 0.25f); std::vector r_scale; std::vector r_zp; diff --git a/onnxruntime/test/eager/ort_invoker_test.cc b/onnxruntime/test/eager/ort_invoker_test.cc index 4d3e36f7ab4b5..d5b0c02d09b52 100644 --- a/onnxruntime/test/eager/ort_invoker_test.cc +++ b/onnxruntime/test/eager/ort_invoker_test.cc @@ -39,7 +39,7 @@ TEST(InvokerTest, Basic) { ASSERT_STATUS_OK(kernel_invoker.Invoke("Add", {A, B}, result, nullptr)); const Tensor& C = result.back().Get(); auto& c_shape = C.Shape(); - EXPECT_EQ(c_shape.GetDims(), dims_mul_x); + EXPECT_EQ(c_shape.GetDimsAsVector(), dims_mul_x); std::vector expected_result = {2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f}; auto* c_data = C.Data(); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index a49de9a102926..7672ffed1bed2 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -124,8 +124,7 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) { session.Run(std::unordered_map{{"X1", value}}, std::vector{"Out"}, &outputs)); ASSERT_TRUE(1 == outputs.size()); const Tensor& output = outputs[0].Get(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&output.Shape()), *reinterpret_cast*>(&shape)); + EXPECT_EQ(output.Shape(), shape); EXPECT_EQ(output.DataType(), DataTypeImpl::GetType()); float expected_output[4] = {13.0f, -18.0f, -27.0f, 40.0f}; @@ -174,8 +173,7 @@ TEST(CUDAFenceTests, TileWithInitializer) { session.Run(std::unordered_map{{"X1", value}}, std::vector{"Y"}, &outputs)); ASSERT_TRUE(1 == outputs.size()); const Tensor& output = outputs[0].Get(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&output.Shape()), (std::vector{2, 4})); + EXPECT_EQ(output.Shape(), (TensorShape{2, 4})); EXPECT_EQ(output.DataType(), DataTypeImpl::GetType()); float expected_output[8] = {-1, 2, -1, 2, 3, -4, 3, -4}; @@ -235,8 +233,7 @@ TEST(CUDAFenceTests, TileWithComputedInput) { session.Run(std::unordered_map{{"X1", value}}, std::vector{"Out"}, &outputs)); ASSERT_TRUE(1 == outputs.size()); const Tensor& output = outputs[0].Get(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&output.Shape()), (std::vector{4, 4})); + EXPECT_EQ(output.Shape(), (TensorShape{4, 4})); EXPECT_EQ(output.DataType(), DataTypeImpl::GetType()); float expected_output[16] = {7, -10, 7, -10, -15, 22, -15, 22, 7, -10, 7, -10, -15, 22, -15, 22}; diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index 186eb9e707cbc..a9c2f49447ac2 100644 --- 
a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -82,9 +82,7 @@ TEST_F(ExecutionFrameTest, TensorAllocationTest) { ASSERT_TRUE(p_ml_value != nullptr); Tensor* p_tensor = p_ml_value->GetMutable(); ASSERT_TRUE(p_tensor != nullptr); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&p_tensor->Shape()), - *reinterpret_cast*>(&shape)); + ASSERT_EQ(p_tensor->Shape(), shape); ASSERT_EQ(p_tensor->DataType(), DataTypeImpl::GetType()); //test share memory from tensor @@ -99,9 +97,7 @@ TEST_F(ExecutionFrameTest, TensorAllocationTest) { const OrtValue* p_ml_value_const = frame.GetNodeInputOrOutputMLValue(1); auto tensor2 = p_ml_value_const ? &(p_ml_value_const->Get()) : nullptr; ASSERT_TRUE(tensor2); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&tensor2->Shape()), - *reinterpret_cast*>(&shape2)); + ASSERT_EQ(tensor2->Shape(), shape2); ASSERT_EQ(tensor2->template Data(), p_tensor->template Data()); } @@ -203,9 +199,7 @@ TEST_F(ExecutionFrameTest, FeedInDataTest) { OrtValue* p_ml_value = frame.GetMutableNodeInputOrOutputMLValue(0); Tensor* p_tensor_arg_0 = p_ml_value ? p_ml_value->GetMutable() : nullptr; ASSERT_TRUE(p_tensor_arg_0); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&p_tensor_arg_0->Shape()), - *reinterpret_cast*>(&shape)); + ASSERT_EQ(p_tensor_arg_0->Shape(), shape); ASSERT_EQ(p_tensor_arg_0->DataType(), DataTypeImpl::GetType()); ASSERT_EQ(p_tensor_arg_0->MutableData(), value.GetMutable()->MutableData()); } @@ -502,7 +496,7 @@ TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { ASSERT_TRUE(results[0].IsSparseTensor()); const SparseTensor& result = results[0].Get(); ASSERT_EQ(result.DataType(), DataTypeImpl::GetType()); - EXPECT_THAT(result.DenseShape().GetDims(), ::testing::ContainerEq(dense_shape)); + EXPECT_THAT(result.DenseShape().GetDims(), ::testing::ContainerEq(gsl::make_span(dense_shape))); ASSERT_EQ(result.NumValues(), 3U); EXPECT_THAT(result.Values().DataAsSpan(), ::testing::ContainerEq(gsl::make_span(expected_values))); auto coo_view = result.AsCoo(); diff --git a/onnxruntime/test/framework/float_16_test.cc b/onnxruntime/test/framework/float_16_test.cc index 2a84fcf048167..cf453d099e844 100644 --- a/onnxruntime/test/framework/float_16_test.cc +++ b/onnxruntime/test/framework/float_16_test.cc @@ -46,7 +46,7 @@ class MulFP16Kernel final : public OpKernel { auto X_Data = X->Data(); auto W_Data = W->Data(); - auto& shape = X->Shape().GetDims(); + auto shape = X->Shape().GetDims(); auto* Y = p_context->Output(0, shape); auto* Y_Data = Y->MutableData(); @@ -123,8 +123,7 @@ void RunSession(InferenceSession& session_object, ASSERT_EQ(1u, fetches.size()); auto& rtensor = fetches.front().Get(); TensorShape expected_shape(dims_y); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&rtensor.Shape())); + EXPECT_EQ(expected_shape, rtensor.Shape()); const std::vector found(rtensor.template Data(), rtensor.template Data() + expected_shape.Size()); ASSERT_EQ(found.size(), values_y.size()); for (size_t i = 0; i < found.size(); i++) diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 
b2764baa50551..9d5fa3ec74d16 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -212,8 +212,7 @@ template void VerifyOutputs(const Tensor& tensor, const std::vector& expected_dims, const std::vector& expected_values) { TensorShape expected_shape(expected_dims); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&tensor.Shape())); + ASSERT_EQ(expected_shape, tensor.Shape()); const std::vector found(tensor.template Data(), tensor.template Data() + expected_values.size()); ASSERT_EQ(expected_values, found); @@ -1773,8 +1772,7 @@ TEST(InferenceSessionTests, TestTruncatedSequence) { ASSERT_EQ(1u, fetches.size()); auto& rtensor = fetches.front().Get(); TensorShape expected_shape(Y_dims); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&rtensor.Shape())); + ASSERT_EQ(expected_shape, rtensor.Shape()); for (size_t i = 0; i < Y_data.size(); ++i) EXPECT_NEAR(Y_data[i], rtensor.template Data()[i], FLT_EPSILON); @@ -1816,8 +1814,7 @@ TEST(InferenceSessionTests, TestTruncatedSequence) { std::vector truncated_output_dims = Y_dims; truncated_output_dims[0] = truncated_len; TensorShape truncated_shape(truncated_output_dims); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&truncated_shape), *reinterpret_cast*>(&truncated_rtensor.Shape())); + ASSERT_EQ(truncated_shape, truncated_rtensor.Shape()); auto seq_output_stride = truncated_shape.SizeFromDimension(1); for (int i = 0; i < truncated_shape.Size(); ++i) EXPECT_NEAR(Y_data[i + seq_start * seq_output_stride], truncated_rtensor.template Data()[i], FLT_EPSILON); @@ -2653,7 +2650,7 @@ TEST(InferenceSessionTests, InitializerSharing_EnsureSessionsUseUserAddedInitial CreateMLValue(allocator, {3, 2}, input_data_vec, &val_to_share_from_allocator); OrtMemoryInfo mem_info{CPU, OrtArenaAllocator}; - CreateMLValue({3, 2}, input_data_vec.data(), mem_info, &val_to_share); + CreateMLValue(std::array{3, 2}, input_data_vec.data(), mem_info, &val_to_share); // create sessions to share the allocator SessionOptions so1; diff --git a/onnxruntime/test/framework/local_kernel_registry_test.cc b/onnxruntime/test/framework/local_kernel_registry_test.cc index 68b1b40dc9993..1cfdf97311d57 100644 --- a/onnxruntime/test/framework/local_kernel_registry_test.cc +++ b/onnxruntime/test/framework/local_kernel_registry_test.cc @@ -142,7 +142,7 @@ class OptionalOpKernel : public OpKernel { const auto* W = context->Input(1); auto* X_Data = X->Data(); - auto& shape = X->Shape().GetDims(); + auto shape = X->Shape().GetDims(); auto* Y = context->Output(0, shape); auto* Y_Data = Y->MutableData(); size_t size = 1; @@ -215,8 +215,7 @@ void RunSession(InferenceSession& session_object, ASSERT_EQ(1u, fetches.size()); auto& rtensor = fetches.front().Get(); TensorShape expected_shape(dims_y); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&rtensor.Shape())); + EXPECT_EQ(expected_shape, rtensor.Shape()); const std::vector found(rtensor.template Data(), rtensor.template Data() + expected_shape.Size()); ASSERT_EQ(values_y, found); } diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc 
b/onnxruntime/test/framework/sparse_kernels_test.cc index 64101a4212f5a..3bba5d1a9d0ee 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -1291,7 +1291,7 @@ TEST(SparseTensorConversionTests, CsrConversion) { auto* cpu_provider = TestCPUExecutionProvider(); auto cpu_allocator = cpu_provider->GetAllocator(0, OrtMemTypeDefault); - const std::vector dense_shape{3, 3}; + const TensorShape dense_shape{3, 3}; std::vector dense_data = { 0, 0, 1, 1, 0, 1, @@ -1435,7 +1435,7 @@ TEST(SparseTensorConversionTests, CsrConversion) { gsl::make_span(expected_inner), gsl::make_span(expected_outer))); ASSERT_EQ(str_cpu_src.Format(), SparseFormat::kCsrc); ASSERT_TRUE(str_cpu_src.IsDataTypeString()); - ASSERT_EQ(str_cpu_src.DenseShape().GetDims(), dense_shape); + ASSERT_EQ(str_cpu_src.DenseShape(), dense_shape); ASSERT_EQ(str_cpu_src.NumValues(), expected_values_str.size()); auto values = str_cpu_src.Values().DataAsSpan(); ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend())); @@ -1638,7 +1638,7 @@ TEST(SparseTensorConversionTests, CooConversion) { gsl::make_span(expected_linear_indices))); ASSERT_EQ(str_cpu_src.Format(), SparseFormat::kCoo); ASSERT_TRUE(str_cpu_src.IsDataTypeString()); - ASSERT_EQ(str_cpu_src.DenseShape().GetDims(), dense_shape); + ASSERT_EQ(str_cpu_src.DenseShape(), dense_shape); ASSERT_EQ(str_cpu_src.NumValues(), expected_values_str.size()); auto values = str_cpu_src.Values().DataAsSpan(); ASSERT_TRUE(std::equal(expected_values_str.cbegin(), expected_values_str.cend(), values.cbegin(), values.cend())); diff --git a/onnxruntime/test/framework/tensor_shape_test.cc b/onnxruntime/test/framework/tensor_shape_test.cc new file mode 100644 index 0000000000000..7c799baf0f341 --- /dev/null +++ b/onnxruntime/test/framework/tensor_shape_test.cc @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/framework/tensor_shape.h" + +#include + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace utils { +namespace test { + +static void TestShapeWithVector(const std::vector& vector) { + + // Test constructing from a vector + TensorShape shape{vector}; + EXPECT_EQ(shape, vector); + + // Test copying to a new shape + TensorShape shape_copy{shape}; + EXPECT_EQ(shape, shape_copy); + + // Test copying to itself + TensorShape &shape2=shape; + shape = shape2; + EXPECT_EQ(shape, shape_copy); +} + +TEST(TensorShapeTest, VariousSizes) { + + // Test various sizes of copying between vectors + TestShapeWithVector({}); + TestShapeWithVector({10}); + TestShapeWithVector({10, 20}); + TestShapeWithVector({10, 20, 30}); + TestShapeWithVector({10, 20, 30, 40}); + TestShapeWithVector({12, 23, 34, 45, 56, 67, 78, 89, 90}); + + // Test assigning a shape to a large then a small vector (causing it to switch from small block to large, then back to small) + std::vector small{1, 2, 3}; + std::vector large{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + + TensorShape shape{small}; + EXPECT_EQ(shape.GetDims(), gsl::make_span(small)); + + shape=TensorShape{large}; + EXPECT_EQ(shape.GetDims(), gsl::make_span(large)); + + shape=TensorShape{small}; + EXPECT_EQ(shape.GetDims(), gsl::make_span(small)); +} + +TEST(TensorShapeTest, FromExistingBuffer) { + + std::vector buffer{12, 23, 34, 45, 56, 67, 78, 89}; + auto shape = TensorShape::FromExistingBuffer(buffer); + auto shape_copy=shape; + + // Pointers and sizes should match as they're the same buffer + EXPECT_EQ(gsl::make_span(buffer).begin(), shape.GetDims().begin()); + EXPECT_EQ(gsl::make_span(buffer).size(), shape.GetDims().size()); + + // Pointers should not match as they're no longer the same buffer + EXPECT_NE(gsl::make_span(buffer).begin(), shape_copy.GetDims().begin()); + // Size should still match + EXPECT_EQ(gsl::make_span(buffer).size(), shape_copy.GetDims().size()); + + EXPECT_EQ(shape, shape_copy); + + // Test assigning from an empty shape + TensorShape empty_shape; + shape_copy=empty_shape; + + EXPECT_EQ(shape_copy, empty_shape); +} + +} // namespace test +} // namespace utils +} // namespace onnxruntime diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc index 33e6189390b27..503b92d947187 100644 --- a/onnxruntime/test/framework/tensor_test.cc +++ b/onnxruntime/test/framework/tensor_test.cc @@ -159,8 +159,7 @@ TEST(TensorTest, StringTensorTest) { Tensor t(DataTypeImpl::GetType(), shape, alloc); auto& tensor_shape = t.Shape(); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - EXPECT_EQ(*reinterpret_cast*>(&shape), *reinterpret_cast*>(&tensor_shape)); + EXPECT_EQ(shape, tensor_shape); EXPECT_EQ(t.DataType(), DataTypeImpl::GetType()); auto& location = t.Location(); ASSERT_STREQ(location.name, CPU); diff --git a/onnxruntime/test/framework/test_utils.h b/onnxruntime/test/framework/test_utils.h index 697d882670aa3..70254aa7bd0eb 100644 --- a/onnxruntime/test/framework/test_utils.h +++ b/onnxruntime/test/framework/test_utils.h @@ -73,7 +73,7 @@ void CreateMLValue(AllocatorPtr alloc, const std::vector& dims, const s // Lifetime of data_buffer should be managed by the caller. 
template -void CreateMLValue(const std::vector& dims, T* data_buffer, const OrtMemoryInfo& info, +void CreateMLValue(gsl::span dims, T* data_buffer, const OrtMemoryInfo& info, OrtValue* p_mlvalue) { TensorShape shape(dims); auto element_type = DataTypeImpl::GetType(); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 876e3caf6c589..2fda29c58f53c 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -1417,8 +1417,7 @@ TEST_F(GraphTransformationTests, FuseConvBnAddMulFloat16) { ASSERT_EQ(1u, fetches.size()); auto& rtensor = fetches.front().Get(); TensorShape expected_shape(expected_dims_prod); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&rtensor.Shape())); + ASSERT_EQ(expected_shape, rtensor.Shape()); const std::vector found(rtensor.template Data(), rtensor.template Data() + expected_dims_prod.size()); ASSERT_EQ(expected_values_prod, found); diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index 8dc8bc24fab9c..fc6d8f6ab7cf7 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -735,8 +735,7 @@ TEST(Loop, SubgraphInputShadowsOuterScopeValue) { auto& b_out = fetches[0].Get(); TensorShape expected_shape(scalar); - //Use reinterpret_cast to bypass a gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51213 - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), *reinterpret_cast*>(&b_out.Shape())); + ASSERT_EQ(expected_shape, b_out.Shape()); ASSERT_EQ(b_out.DataAsSpan()[0], expected_value_b); auto user_defined_vals_out = fetches[1].Get().DataAsSpan(); diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index aa89880eca071..68c764813fe63 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1070,7 +1070,7 @@ static void TestSumMultipleInputsNoBroadcasting(size_t num_inputs, const TensorS OpTester test{"Sum", 8}; - const auto& dims = shape.GetDims(); + const auto dims = shape.GetDimsAsVector(); const std::vector input_data(shape.Size(), 1); for (size_t i = 0; i < num_inputs; ++i) { diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index a4edc580e80e6..c286d52cfc69b 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -107,11 +107,11 @@ void RunMatMulTest(int32_t opset_version, bool is_a_constant, bool is_b_constant for (auto t : GenerateTestCases()) { OpTester test("MatMul", opset_version); - int64_t size0 = TensorShape::ReinterpretBaseType(t.input0_dims).SizeHelper(0, t.input0_dims.size()); + int64_t size0 = TensorShape::FromExistingBuffer(t.input0_dims).SizeHelper(0, t.input0_dims.size()); std::vector input0_vals(common_input_vals.cbegin(), common_input_vals.cbegin() + size0); test.AddInput("A", t.input0_dims, input0_vals, is_a_constant); - int64_t size1 = TensorShape::ReinterpretBaseType(t.input1_dims).SizeHelper(0, t.input1_dims.size()); + int64_t size1 = TensorShape::FromExistingBuffer(t.input1_dims).SizeHelper(0, t.input1_dims.size()); std::vector 
input1_vals(common_input_vals.cbegin(), common_input_vals.cbegin() + size1); test.AddInput("B", t.input1_dims, input1_vals, is_b_constant); diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 616429776bd3f..56e509ebdd021 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -36,7 +36,7 @@ template void TestCastOp(gsl::span input, gsl::span output, - const std::vector& dimensions, + const std::vector &dimensions, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& expected_failure_string = "") { OpTester test("Cast", 13); @@ -113,7 +113,7 @@ struct CastNonStringTester { auto output_span = gsl::make_span(output_buffer.get(), size); CastSpan(input_span, output_span); - TestCastOp(input_span, output_span, shape.GetDims()); + TestCastOp(input_span, output_span, shape.GetDimsAsVector()); } }; diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc index 7a677b6ed2eec..bcce48ff30677 100644 --- a/onnxruntime/test/providers/cuda/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc @@ -269,8 +269,8 @@ TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) { const optional& expected_n = nullopt) { SCOPED_TRACE(MakeString( "cudnn_op: ", cudnn_op, - ", dims: ", TensorShape::ReinterpretBaseType(dims), - ", axes: ", TensorShape::ReinterpretBaseType(axes))); + ", dims: ", TensorShape::FromExistingBuffer(dims), + ", axes: ", TensorShape::FromExistingBuffer(axes))); int m{}, n{}; EXPECT_EQ( static_cast(get_applicable_matrix_reduction(cudnn_op, dims, axes, m, n)), diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc index 4ba07d4c19de5..5400a22acc37d 100644 --- a/onnxruntime/test/providers/provider_test_utils.cc +++ b/onnxruntime/test/providers/provider_test_utils.cc @@ -514,8 +514,8 @@ void OpTester::AddNodes( add_attribute_fn(node); } -std::vector OpTester::GetDimsForProto(const std::vector& dims) { - std::vector dims_for_proto{dims}; +std::vector OpTester::GetDimsForProto(gsl::span dims) { + std::vector dims_for_proto{dims.begin(), dims.end()}; if (add_symbolic_dim_to_tensor_data_ >= 0 && dims.size() > static_cast(add_symbolic_dim_to_tensor_data_)) { dims_for_proto[add_symbolic_dim_to_tensor_data_] = -1; @@ -523,7 +523,7 @@ std::vector OpTester::GetDimsForProto(const std::vector& dims) return dims_for_proto; } -void OpTester::AddShapeToTensorData(NodeArg& node_arg, const std::vector& dims, +void OpTester::AddShapeToTensorData(NodeArg& node_arg, gsl::span dims, const std::vector* dim_params) { if (dim_params && !(dim_params->empty()) && add_shape_to_tensor_data_) { // If dim_params presents, configure node_arg's dim value based on dim_params, which supports symbolic dim and dim broadcast. 
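// --- Reviewer annotation (illustrative sketch, not part of the patch) ---------
// Several test call sites above (random.Uniform, rand_gen.Gaussian, the type
// proto helpers) now wrap literal dims in std::array, presumably because a
// braced list such as {1} no longer binds to the gsl::span parameter. Sketch
// under that assumption; SumDims is a hypothetical stand-in for the
// span-taking helpers touched by this patch.

#include <array>
#include <cstdint>
#include "core/framework/tensor_shape.h"  // brings in gsl::span as used throughout this patch

static int64_t SumDims(gsl::span<const int64_t> dims) {
  int64_t total = 0;
  for (int64_t d : dims) total += d;  // iterate the view without copying
  return total;
}

static void SpanCallSiteSketch() {
  // SumDims({1});                            // would not compile: no span deduction from a braced list
  SumDims(std::array<int64_t, 1>{1});         // std::array converts to gsl::span
  const std::array<int64_t, 3> dims{2, 3, 4};
  SumDims(dims);
}
// ------------------------------------------------------------------------------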
diff --git a/onnxruntime/test/providers/provider_test_utils.h b/onnxruntime/test/providers/provider_test_utils.h index a6d457fc30700..ee24e3a65e9f8 100644 --- a/onnxruntime/test/providers/provider_test_utils.h +++ b/onnxruntime/test/providers/provider_test_utils.h @@ -822,7 +822,7 @@ class OpTester { protected: template - void AddData(std::vector& data, const char* name, const std::vector& dims, const T* values, + void AddData(std::vector& data, const char* name, gsl::span dims, const T* values, int64_t values_count, bool is_initializer = false, bool sort_output = false, const std::vector* dim_params = nullptr, float rel_error = 0.0f, float abs_error = 0.0f, bool is_optional_type_tensor = false) { @@ -944,9 +944,9 @@ class OpTester { optional(), optional())); } - std::vector GetDimsForProto(const std::vector& dims); + std::vector GetDimsForProto(gsl::span dims); - void AddShapeToTensorData(NodeArg& node_arg, const std::vector& dims, const std::vector* dim_params); + void AddShapeToTensorData(NodeArg& node_arg, gsl::span dims, const std::vector* dim_params); void CopyDataToTensor(gsl::span data, Tensor& dst); diff --git a/orttraining/orttraining/core/graph/graph_augmenter.h b/orttraining/orttraining/core/graph/graph_augmenter.h index adc98515ae891..8409150599d4a 100644 --- a/orttraining/orttraining/core/graph/graph_augmenter.h +++ b/orttraining/orttraining/core/graph/graph_augmenter.h @@ -173,7 +173,7 @@ class GraphAugmenter { return graph_type_protos_.back().get(); } - TypeProto* CreateTypeProto(const std::vector& dims, ONNX_NAMESPACE::TensorProto_DataType data_type) { + TypeProto* CreateTypeProto(gsl::span dims, ONNX_NAMESPACE::TensorProto_DataType data_type) { TypeProto* type_proto = CreateTypeProto(); type_proto->mutable_tensor_type()->set_elem_type(data_type); for (int64_t dim : dims) diff --git a/orttraining/orttraining/core/graph/loss_func/softmax_cross_entropy.cc b/orttraining/orttraining/core/graph/loss_func/softmax_cross_entropy.cc index ee12a281a40bd..d8880370cdaf0 100644 --- a/orttraining/orttraining/core/graph/loss_func/softmax_cross_entropy.cc +++ b/orttraining/orttraining/core/graph/loss_func/softmax_cross_entropy.cc @@ -30,7 +30,7 @@ GraphAugmenter::GraphDefs SoftmaxCrossEntropy::operator()( new_nodes.emplace_back(NodeDef(OpDef("SoftmaxCrossEntropy", kMSDomain, 1), // Op {ArgDef(prediction_name), ArgDef(label_name, label_type_proto)}, // Inputs - {ArgDef(loss_name, graph_defs.CreateTypeProto({1,}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT)), + {ArgDef(loss_name, graph_defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT)), ArgDef(prob_name)}, // Outputs NodeAttributes(), "SoftmaxCrossEntropy" // name diff --git a/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc b/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc index 357c98c4af42c..dd68be9ba29b7 100644 --- a/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc @@ -132,7 +132,7 @@ Status AdamOptimizerBuilder::Build( output_args.push_back(ArgDef()); } if (!opt_configs[i].loss_scale_input_name.empty()) { - input_args.emplace_back(ArgDef(opt_configs[i].loss_scale_input_name, graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); + input_args.emplace_back(ArgDef(opt_configs[i].loss_scale_input_name, graph_defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); } else { input_args.emplace_back(ArgDef()); } diff --git 
a/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc b/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc index 1a3ed2ea7a8e2..b89933ce1dd45 100644 --- a/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc @@ -45,7 +45,7 @@ Status LambOptimizerBuilder::Build( // Loss scale ArgDef. if (!opt_configs[0].loss_scale_input_name.empty()) { - input_argdefs.emplace_back(ArgDef(opt_configs[0].loss_scale_input_name, graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); + input_argdefs.emplace_back(ArgDef(opt_configs[0].loss_scale_input_name, graph_defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT))); } else { input_argdefs.emplace_back(ArgDef()); } diff --git a/orttraining/orttraining/core/graph/optimizer_builder.h b/orttraining/orttraining/core/graph/optimizer_builder.h index 81399384f249d..f5f614f9ee38f 100644 --- a/orttraining/orttraining/core/graph/optimizer_builder.h +++ b/orttraining/orttraining/core/graph/optimizer_builder.h @@ -139,7 +139,7 @@ class OptimizerBuilder { } static const ONNX_NAMESPACE::TypeProto* CreateLearningRateTypeProto(GraphAugmenter::GraphDefs& graph_defs) { - return graph_defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + return graph_defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); } private: diff --git a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc index 1eaa37e9eabb5..3072592389611 100644 --- a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc +++ b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc @@ -146,7 +146,7 @@ static std::vector AddPartitionsForParameter( view_outputs.push_back(partition_argdef); } else { auto dtype = ONNX_NAMESPACE::TensorProto_DataType_FLOAT; - auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto({shapes[i].Size()}, dtype)); + auto partition_argdef = ArgDef(partition_name, graph_defs.CreateTypeProto(std::array{shapes[i].Size()}, dtype)); view_outputs.push_back(partition_argdef); } view_num++; @@ -168,7 +168,7 @@ static std::vector AddViewForParameter( ArgDef shape_argdef(argdef.name + "_view_shape_" + std::to_string(view_num), graph_defs.CreateTypeProto({dims}, ONNX_NAMESPACE::TensorProto_DataType_INT64)); - graph_defs.AddInitializers({CreateTensorProto(shape_argdef.name, shape.GetDims(), {dims})}); + graph_defs.AddInitializers({CreateTensorProto(shape_argdef.name, shape.GetDimsAsVector(), {dims})}); auto dtype = static_cast(argdef.type_proto->tensor_type().elem_type()); ArgDef view_argdef(GetViewName(argdef.name, view_num), @@ -360,7 +360,7 @@ static Status ModifyParametersForOptimizerPartitioning( new_weight_argdefs.push_back(weight_argdef); new_gradient_argdefs.push_back(gradient_argdef); } else { - weight_partition_info[weight_argdef.name].original_dim = tensor_shape.GetDims(); + weight_partition_info[weight_argdef.name].original_dim = tensor_shape.GetDimsAsVector(); if (offset < rank_start && offset + tensor_count <= rank_end) { int64_t size_for_previous_rank = rank_start - offset; int64_t size_for_current_rank = offset + tensor_count - rank_start; diff --git a/orttraining/orttraining/core/session/tensor_helper.cc b/orttraining/orttraining/core/session/tensor_helper.cc index a9e7b39269a36..fa5bc3971d46b 100644 --- 
a/orttraining/orttraining/core/session/tensor_helper.cc +++ b/orttraining/orttraining/core/session/tensor_helper.cc @@ -12,7 +12,7 @@ namespace training { // Return the shape of a tensor slice. std::vector GetSliceShape( - const std::vector& shape, // before-slicing tensor shape + gsl::span shape, // before-slicing tensor shape const size_t slice_axis, // axis to slice along const size_t num_slices) { // number of slices along the slicing axis ORT_ENFORCE(shape.size() > 0); @@ -34,7 +34,7 @@ std::vector GetSliceShape( // Given tensor's element type and shape, this function creates a tensor in the passed-in session. OrtValue CreateCpuTensorValue( const MLDataType elem_type, - std::vector shape, + gsl::span shape, onnxruntime::InferenceSession& session_state) { ORT_ENFORCE(elem_type->AsPrimitiveDataType(), "Tensor's element type must be a scalar type."); ORT_ENFORCE(shape.size() > 0, "Shape vector must be non-empty."); @@ -255,7 +255,7 @@ OrtValue ConcatenateTensors( // Concatenated tensors in CPU buffers. std::vector cpu_values; // Result tensor's shape. - std::vector new_shape = orig_values.front().Get().Shape().GetDims(); + std::vector new_shape = orig_values.front().Get().Shape().GetDimsAsVector(); // Tensor elements' type. MLDataType elem_type = orig_values.front().Get().DataType(); int64_t new_dim = 0; diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index e4ebb963ba4f2..afecdc7eb02aa 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -317,7 +317,7 @@ static Status AddFakeLossScaling( Graph& graph, std::string& loss_scale_name) { GraphAugmenter::GraphDefs defs{}; loss_scale_name = graph.GenerateNodeArgName("loss_scale"); - const auto* loss_scale_type = defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + const auto* loss_scale_type = defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); graph.GetOrCreateNodeArg(loss_scale_name, loss_scale_type); defs.AddGraphInputs({loss_scale_name}); ORT_RETURN_IF_ERROR(GraphAugmenter::AugmentGraph(graph, defs)); @@ -597,7 +597,7 @@ static Status AddLossScaling( GraphAugmenter::GraphDefs defs{}; *loss_scale_input_name = graph.GenerateNodeArgName("loss_scale"); const auto* loss_scale_input_type = - defs.CreateTypeProto({1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + defs.CreateTypeProto(std::array{1}, ONNX_NAMESPACE::TensorProto_DataType_FLOAT); scaled_loss_name = graph.GenerateNodeArgName("scaled_loss"); defs.AddNodeDef(NodeDef{ "Mul", diff --git a/orttraining/orttraining/eager/ort_tensor.cpp b/orttraining/orttraining/eager/ort_tensor.cpp index 7a231098ff560..72310817df7fe 100644 --- a/orttraining/orttraining/eager/ort_tensor.cpp +++ b/orttraining/orttraining/eager/ort_tensor.cpp @@ -85,7 +85,7 @@ void ORTTensorImpl::cacheSizeMetadata() { numel_ = shape.Size(); - sizes_and_strides_.set_sizes(shape.GetDims()); + sizes_and_strides_.set_sizes(shape.GetDimsAsVector()); for (std::size_t i = 0; i < strides.size(); i++) { sizes_and_strides_.stride_at_unchecked(i) = strides[i]; diff --git a/orttraining/orttraining/eager/ort_util.cpp b/orttraining/orttraining/eager/ort_util.cpp index e8d7b4acaf420..8a0cd87b8967d 100644 --- a/orttraining/orttraining/eager/ort_util.cpp +++ b/orttraining/orttraining/eager/ort_util.cpp @@ -39,7 +39,7 @@ void CreateMLValue(void* data_ptr, onnxruntime::MLDataType element_type, const s 
onnxruntime::DataTypeImpl::GetType()->GetDeleteFunc()); } -std::vector GetStrides(const std::vector& shape) { +std::vector GetStrides(gsl::span shape) { std::vector strides(shape.size(), 1); for (auto i = shape.size(); i > 1; --i) { strides[i - 2] = strides[i - 1] * shape[i - 1]; diff --git a/orttraining/orttraining/eager/ort_util.h b/orttraining/orttraining/eager/ort_util.h index 44952bb1dc076..4b467ee000ce9 100644 --- a/orttraining/orttraining/eager/ort_util.h +++ b/orttraining/orttraining/eager/ort_util.h @@ -52,7 +52,7 @@ inline void CopyVectorToTensor(onnxruntime::ORTInvoker& /*invoker*/, } } -std::vector GetStrides(const std::vector& shape); +std::vector GetStrides(gsl::span shape); } // namespace eager } // namespace torch_ort \ No newline at end of file diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc index e9e70eb225f11..44e61cbfa345c 100644 --- a/orttraining/orttraining/models/runner/training_runner.cc +++ b/orttraining/orttraining/models/runner/training_runner.cc @@ -346,7 +346,7 @@ Status TrainingRunner::PrepareFeedNamesAndFeeds(const SessionMode mode, feed_names.push_back(name); const float loss_scale = (mode == EvaluateStep) ? 1.0f : loss_scaler_->GetLossScale(); OrtValue loss_scale_val; - TrainingUtil::CreateCpuMLValue({1}, std::vector{loss_scale}, &loss_scale_val, input_allocator_); + TrainingUtil::CreateCpuMLValue(std::array{1}, std::vector{loss_scale}, &loss_scale_val, input_allocator_); feeds.push_back(loss_scale_val); } } @@ -359,7 +359,7 @@ Status TrainingRunner::PrepareFeedNamesAndFeeds(const SessionMode mode, // learning rate is 0 if there is no learning-rate scheduler. Otherwise, learning rate is obtained from the scheduler. const float learning_rate = lr_scheduler ? 
lr_scheduler->GetLearningRate(step_ + 1) : 0.0f; OrtValue lr_val; - TrainingUtil::CreateCpuMLValue({1}, std::vector{learning_rate}, &lr_val, input_allocator_); + TrainingUtil::CreateCpuMLValue(std::array{1}, std::vector{learning_rate}, &lr_val, input_allocator_); feeds.push_back(lr_val); } } diff --git a/orttraining/orttraining/models/runner/training_util.cc b/orttraining/orttraining/models/runner/training_util.cc index b7c610770e0ae..74f6c100f5379 100644 --- a/orttraining/orttraining/models/runner/training_util.cc +++ b/orttraining/orttraining/models/runner/training_util.cc @@ -81,7 +81,7 @@ common::Status DataSet::GetTensorDimensionsFromInputs(const std::mapsecond; const Tensor& first_tensor = data_[0]->at(input_index).Get(); - std::vector shape_vector = first_tensor.Shape().GetDims(); + auto shape_vector = first_tensor.Shape().GetDims(); ORT_RETURN_IF_NOT(metric.second < shape_vector.size(), "Index out of bounds for input: ", input_name.c_str(), "; requested index: ", metric.second, ", actual size: ", shape_vector.size()); @@ -99,7 +99,7 @@ std::vector DataSet::GetKthBatch(size_t batch_size, size_t k_th, Alloc const Tensor& first_tensor = data_[0]->at(input_index).Get(); MLDataType element_type = first_tensor.DataType(); - std::vector shape_vector = first_tensor.Shape().GetDims(); + std::vector shape_vector = first_tensor.Shape().GetDimsAsVector(); if (first_tensor.Shape().Size() > 1) { shape_vector.insert(shape_vector.begin(), batch_size); } else { diff --git a/orttraining/orttraining/models/runner/training_util.h b/orttraining/orttraining/models/runner/training_util.h index 76013b578e20c..5fa73be7a62dd 100644 --- a/orttraining/orttraining/models/runner/training_util.h +++ b/orttraining/orttraining/models/runner/training_util.h @@ -115,7 +115,7 @@ class RandomDataSet : public DataSet { class TrainingUtil { public: template - static void CreateCpuMLValue(const std::vector& dims, + static void CreateCpuMLValue(gsl::span dims, const std::vector& value, OrtValue* p_mlvalue, AllocatorPtr alloc = nullptr) { diff --git a/orttraining/orttraining/test/gradient/allreduce_op_test.cc b/orttraining/orttraining/test/gradient/allreduce_op_test.cc index a68b54a159c06..1528ac6f5c705 100644 --- a/orttraining/orttraining/test/gradient/allreduce_op_test.cc +++ b/orttraining/orttraining/test/gradient/allreduce_op_test.cc @@ -592,8 +592,7 @@ TEST(AllreduceTest, GPUHierarchicalAdasumAllreduceOptimizerTest) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce.size()); @@ -773,8 +772,7 @@ TEST(AllreduceTest, GPUHierarchicalAdasumAllreduceOptimizerFP16Test) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce.size()); @@ -872,8 +870,7 @@ TEST(AllreduceTest, GPUHierarchicalAdasumAllreduceTest) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape 
expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce.size()); @@ -983,8 +980,7 @@ TEST(AllreduceTest, GPUHierarchicalAdasumFP16AllreduceTest) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found_half(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce_half.size()); @@ -1089,8 +1085,7 @@ TEST(AllreduceTest, GPUAdasumAllreduceTest) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce.size()); @@ -1199,8 +1194,7 @@ TEST(AllreduceTest, GPUAdasumFP16AllreduceTest) { // Verify tensor data auto& actual_output_tensor = fetches[0].Get(); TensorShape expected_shape(expected_dims_allreduce); - ASSERT_EQ(*reinterpret_cast*>(&expected_shape), - *reinterpret_cast*>(&actual_output_tensor.Shape())); + ASSERT_EQ(expected_shape, actual_output_tensor.Shape()); const std::vector found_half(actual_output_tensor.template Data(), actual_output_tensor.template Data() + expected_values_allreduce_half.size()); diff --git a/orttraining/orttraining/test/gradient/gradient_checker.cc b/orttraining/orttraining/test/gradient/gradient_checker.cc index 4759b86e9fbc3..0bb1daf71fe33 100644 --- a/orttraining/orttraining/test/gradient/gradient_checker.cc +++ b/orttraining/orttraining/test/gradient/gradient_checker.cc @@ -78,25 +78,25 @@ inline std::vector GradientChecker::EvaluateFunctionA if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::vector int64_data(data.size()); std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast(x); }); - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), int64_data); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), int64_data); } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::vector int32_data(data.size()); std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast(x); }); - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), int32_data); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), int32_data); } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::unique_ptr p_data(new bool[data.size()]); for (size_t i = 0; i < data.size(); ++i) { p_data[i] = static_cast(data[i]); } - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), p_data.get(), data.size()); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), p_data.get(), data.size()); } else { - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), data); + op_session.AddInput(name.c_str(), 
x_infos[data_index].shape.GetDimsAsVector(), data); } } for (size_t data_index = 0; data_index < y_infos.size(); data_index++) { std::string name = "output" + std::to_string(data_index); - op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDims(), (*y_datas)[data_index]); + op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDimsAsVector(), (*y_datas)[data_index]); } op_session.Run(); return op_session.GetFetches(); @@ -142,25 +142,25 @@ inline Status GradientChecker::ComputeTheoreticalJacobianTransp if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::vector int64_data(data.size()); std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast(x); }); - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), int64_data); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), int64_data); } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::vector int32_data(data.size()); std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast(x); }); - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), int32_data); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), int32_data); } else if (x_infos[data_index].data_type == DataTypeImpl::GetTensorType()) { std::unique_ptr p_data(new bool[data.size()]); for (size_t i = 0; i < data.size(); ++i) { p_data[i] = static_cast(data[i]); } - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), p_data.get(), data.size()); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), p_data.get(), data.size()); } else { - op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDims(), data); + op_session.AddInput(name.c_str(), x_infos[data_index].shape.GetDimsAsVector(), data); } } for (size_t data_index = 0; data_index < y_num; data_index++) { std::string name = "output" + std::to_string(data_index); - op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDims(), (*y_datas)[data_index]); + op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDimsAsVector(), (*y_datas)[data_index]); } // While calculating theoritical jacobian transpose we calculate the gradient by @@ -215,7 +215,7 @@ inline Status GradientChecker::InitOpTesterWithGraph( std::vector int64_data(data.size()); std::transform(data.begin(), data.end(), int64_data.begin(), [](X_T x) { return static_cast(x); }); op_session.AddInput(name.c_str(), - x_infos[data_index].shape.GetDims(), + x_infos[data_index].shape.GetDimsAsVector(), int64_data, false, &x_infos[data_index].dim_params); @@ -223,7 +223,7 @@ inline Status GradientChecker::InitOpTesterWithGraph( std::vector int32_data(data.size()); std::transform(data.begin(), data.end(), int32_data.begin(), [](X_T x) { return static_cast(x); }); op_session.AddInput(name.c_str(), - x_infos[data_index].shape.GetDims(), + x_infos[data_index].shape.GetDimsAsVector(), int32_data, false, &x_infos[data_index].dim_params); @@ -233,14 +233,14 @@ inline Status GradientChecker::InitOpTesterWithGraph( p_data[i] = static_cast(data[i]); } op_session.AddInput(name.c_str(), - x_infos[data_index].shape.GetDims(), + x_infos[data_index].shape.GetDimsAsVector(), p_data.get(), data.size(), false, &x_infos[data_index].dim_params); } else { op_session.AddInput(name.c_str(), - x_infos[data_index].shape.GetDims(), + x_infos[data_index].shape.GetDimsAsVector(), data, false, 
&x_infos[data_index].dim_params); @@ -255,10 +255,10 @@ inline Status GradientChecker::InitOpTesterWithGraph( std::vector int64_data(data.size()); std::transform(data.begin(), data.end(), int64_data.begin(), [](Y_T x) { return static_cast(x); }); op_session.AddOutput(name.c_str(), - y_infos[data_index].shape.GetDims(), + y_infos[data_index].shape.GetDimsAsVector(), int64_data); } else { - op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDims(), data); + op_session.AddOutput(name.c_str(), y_infos[data_index].shape.GetDimsAsVector(), data); } } // Currently only allows setting int attributes to zero. TODO: Expand this diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 48baccfa453c0..d32232dd3d761 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -1640,7 +1640,7 @@ void TestSparseSoftmaxCrossEntropyGrad(const TensorShape& index_shape, const std // without weight { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); logit_shape.emplace_back(D); TensorInfo x_info(logit_shape); @@ -1654,7 +1654,7 @@ void TestSparseSoftmaxCrossEntropyGrad(const TensorShape& index_shape, const std // with weight { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); logit_shape.emplace_back(D); TensorInfo x_info(logit_shape); @@ -1702,7 +1702,7 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape, //label_sh // without weight and ignore_index { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); auto it = logit_shape.begin() + 1; logit_shape.insert(it, D); TensorInfo loss_info = {}; @@ -1722,7 +1722,7 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape, //label_sh // with weight and no ignore_index { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); auto it = logit_shape.begin() + 1; logit_shape.insert(it, D); TensorInfo loss_info = {}; @@ -1743,7 +1743,7 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape, //label_sh // without weight and ignore index { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); auto it = logit_shape.begin() + 1; logit_shape.insert(it, D); TensorInfo loss_info = {}; @@ -1763,7 +1763,7 @@ void TestSoftmaxCrossEntropyLossGrad(const TensorShape& index_shape, //label_sh // with weight and ignore_index { - std::vector logit_shape(index_shape.GetDims()); + std::vector logit_shape(index_shape.GetDimsAsVector()); auto it = logit_shape.begin() + 1; logit_shape.insert(it, D); TensorInfo loss_info = {}; @@ -1920,11 +1920,11 @@ void TestDropoutOp(float ratio, TensorShape& x_shape, bool default_ratio = true) std::vector x_data(x_shape.Size(), input_constant); std::vector y_data(x_shape.Size(), 3.0f); - test.AddInput("x", x_shape.GetDims(), x_data); + test.AddInput("x", x_shape.GetDimsAsVector(), x_data); if (!default_ratio) test.AddInput("ratio", {}, {ratio}); - test.AddOutput("y", x_shape.GetDims(), y_data); - test.AddOutput("mask", x_shape.GetDims(), {true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true}); + test.AddOutput("y", x_shape.GetDimsAsVector(), y_data); + test.AddOutput("mask", x_shape.GetDimsAsVector(), {true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true}); test.Run(); //Check output @@ -1965,12 +1965,12 @@ void TestDropoutGradOp(float ratio, TensorShape& x_shape, bool default_ratio = t output_constant, 0, output_constant, 0, output_constant, 0, output_constant, 0}); - test.AddInput("dy", x_shape.GetDims(), dy_data); + test.AddInput("dy", x_shape.GetDimsAsVector(), dy_data); - test.AddInput("mask", x_shape.GetDims(), {true, true, true, false, // - true, false, true, false, // - true, false, true, false, // - true, false, true, false}); + test.AddInput("mask", x_shape.GetDimsAsVector(), {true, true, true, false, // + true, false, true, false, // + true, false, true, false, // + true, false, true, false}); if (!default_ratio) { test.AddInput("ratio", {1}, ratio_data); } else { @@ -1979,7 +1979,7 @@ void TestDropoutGradOp(float ratio, TensorShape& x_shape, bool default_ratio = t test.AddInput("training_mode", {}, {true}); - test.AddOutput("dx", x_shape.GetDims(), dx_data); + test.AddOutput("dx", x_shape.GetDimsAsVector(), dx_data); test.Run(); } diff --git a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc index 4362619e2165e..3665526277282 100644 --- a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc +++ b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc @@ -243,12 +243,12 @@ TEST_F(OptimizerGraphBuilderTest, ZeroSplitInitialOptimizerState) { PartitionOptimizerState(partition_offset, partition_size, initial_states); std::vector expected_vec(init_value.begin() + partition_offset, init_value.begin() + partition_offset + partition_size); - std::vector expected_shape = {partition_size}; + std::array expected_shape = {partition_size}; for (const auto& state : initial_states) { const auto& init_tensor = state.second.Get(); const auto& shape = init_tensor.Shape().GetDims(); - ASSERT_EQ(shape, expected_shape); + ASSERT_EQ(shape, gsl::make_span(expected_shape)); const std::vector found(init_tensor.Data(), init_tensor.Data() + partition_size); ASSERT_EQ(expected_vec, found); diff --git a/orttraining/orttraining/test/session/training_session_test_utils.cc b/orttraining/orttraining/test/session/training_session_test_utils.cc index 66575c4629a3a..ea03e88bccbb0 100644 --- a/orttraining/orttraining/test/session/training_session_test_utils.cc +++ b/orttraining/orttraining/test/session/training_session_test_utils.cc @@ -115,8 +115,8 @@ void VerifyState(const DataTransferManager& data_transfer_mgr, const NameMLValMa // compare "Update_Count" or "Step" ASSERT_EQ(actual_tensor.GetElementType(), ONNX_NAMESPACE::TensorProto_DataType_INT64); ASSERT_EQ(expected_tensor.Shape(), actual_tensor.Shape()); - std::vector dims = {1}; - ASSERT_EQ(expected_tensor.Shape().GetDims(), dims); + std::array dims = {1}; + ASSERT_EQ(expected_tensor.Shape().GetDims(), gsl::make_span(dims)); auto size = expected_tensor.Shape().Size(); const std::vector expected(expected_tensor.template Data(), expected_tensor.template Data() + size); const std::vector actual(actual_tensor.template Data(), actual_tensor.template Data() + size); @@ -212,7 +212,7 @@ std::unique_ptr BuildAndRunTrainingSessionWithChecks( float lr = 0.001f; OrtValue lrMLValue; - TrainingUtil::CreateCpuMLValue({1}, std::vector{lr}, &lrMLValue); + TrainingUtil::CreateCpuMLValue(std::array{1}, std::vector{lr}, &lrMLValue); fw_feeds.first.push_back(lr_feed_name); fw_feeds.second.push_back(lrMLValue); } @@ -222,7 +222,7 @@ 
std::unique_ptr BuildAndRunTrainingSessionWithChecks( config_result.mixed_precision_config_result.value().loss_scale_input_name; float loss_scale = 2048.0f; OrtValue loss_scaleMLValue; - TrainingUtil::CreateCpuMLValue({1}, std::vector{loss_scale}, &loss_scaleMLValue); + TrainingUtil::CreateCpuMLValue(std::array{1}, std::vector{loss_scale}, &loss_scaleMLValue); fw_feeds.first.push_back(loss_scale_input_name); fw_feeds.second.push_back(loss_scaleMLValue); } diff --git a/orttraining/orttraining/test/training_ops/cpu/activation/activation_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/activation/activation_op_test.cc index a0f3685c01f29..d4a52f60788b4 100644 --- a/orttraining/orttraining/test/training_ops/cpu/activation/activation_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/activation/activation_op_test.cc @@ -215,16 +215,16 @@ void TestBiasGeluGradBroadcastBias(const std::string& op, int opset_version, con const std::vector dY(input_size, 1.0f); const std::vector B = ValueRange(bias_size, 1.0f); - test.AddInput("dY", input_shape.GetDims(), dY); - test.AddInput("X", input_shape.GetDims(), X); - test.AddInput("B", bias_shape.GetDims(), B); + test.AddInput("dY", input_shape.GetDimsAsVector(), dY); + test.AddInput("X", input_shape.GetDimsAsVector(), X); + test.AddInput("B", bias_shape.GetDimsAsVector(), B); std::vector expected_dX{}; for (int64_t i = 0; i < input_size; ++i) { expected_dX.push_back(compute_gelu_grad_scalar_fn(dY[i], X[i] + B[i % bias_size])); } - test.AddOutput("dX", input_shape.GetDims(), expected_dX); + test.AddOutput("dX", input_shape.GetDimsAsVector(), expected_dX); test.Run(); } diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc index 670078576499c..226f82da498a1 100644 --- a/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/nn/dropout_op_test.cc @@ -189,8 +189,8 @@ void RunDropoutGradTest(float ratio, const std::vector& input_dims, boo mask_buffer.get(), mask_buffer.get() + input_shape.Size(), std::back_inserter(dx_data), [output_constant](bool mask_value) { return mask_value ? 
output_constant : 0.0f; }); - test.AddInput("dy", input_shape.GetDims(), dy_data); - test.AddInput("mask", input_shape.GetDims(), mask_buffer.get(), input_shape.Size()); + test.AddInput("dy", input_shape.GetDimsAsVector(), dy_data); + test.AddInput("mask", input_shape.GetDimsAsVector(), mask_buffer.get(), input_shape.Size()); if (!default_ratio) { test.AddInput("ratio", {1}, ratio_data); } else { @@ -198,7 +198,7 @@ void RunDropoutGradTest(float ratio, const std::vector& input_dims, boo } test.AddInput("training_mode", {}, {true}); - test.AddOutput("dx", input_shape.GetDims(), dx_data); + test.AddOutput("dx", input_shape.GetDimsAsVector(), dx_data); test.Run(); } } // namespace diff --git a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc index 95638da84ac33..f955b7abb447f 100644 --- a/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/tensor/gather_grad_op_test.cc @@ -53,7 +53,7 @@ void ConfigureGatherGradRandomDataOpTester( ASSERT_LT(static_cast(axis), X_shape.NumDimensions()); const TensorShape dY_shape = [&]() { - std::vector dY_dims = X_shape.GetDims(); + std::vector dY_dims = X_shape.GetDimsAsVector(); auto it = dY_dims.erase(dY_dims.begin() + axis); dY_dims.insert( it, indices_shape.GetDims().begin(), indices_shape.GetDims().end()); @@ -67,10 +67,10 @@ void ConfigureGatherGradRandomDataOpTester( test.AddAttribute("axis", axis); test.AddInput( - "shape", {static_cast(X_shape.NumDimensions())}, X_shape.GetDims()); - test.AddInput("indices", indices_shape.GetDims(), indices); - test.AddInput("grad", dY_shape.GetDims(), grad); - test.AddOutput("output", X_shape.GetDims(), output); + "shape", {static_cast(X_shape.NumDimensions())}, X_shape.GetDimsAsVector()); + test.AddInput("indices", indices_shape.GetDimsAsVector(), indices); + test.AddInput("grad", dY_shape.GetDimsAsVector(), grad); + test.AddOutput("output", X_shape.GetDimsAsVector(), output); } template diff --git a/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc b/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc index fe1b9ebfa0348..335e6295fbd7b 100644 --- a/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/reduce_sum_test.cc @@ -24,7 +24,7 @@ static void TestReduceSum(const std::vector& X_dims, // create rand inputs RandomValueGenerator random{}; - const bool is_positive = random.Uniform({1}, 0, 2)[0] == 0; + const bool is_positive = random.Uniform(std::array{1}, 0, 2)[0] == 0; const float range_begin = is_positive ? 1.0f : -10.0f; const float range_end = is_positive ? 
10.0f : -1.0f; const std::vector X_data = random.Uniform(X_dims, range_begin, range_end); diff --git a/orttraining/orttraining/training_ops/cpu/nn/pool_gradient_op.cc b/orttraining/orttraining/training_ops/cpu/nn/pool_gradient_op.cc index f5eb52567bbe7..eb580bc14839f 100644 --- a/orttraining/orttraining/training_ops/cpu/nn/pool_gradient_op.cc +++ b/orttraining/orttraining/training_ops/cpu/nn/pool_gradient_op.cc @@ -63,7 +63,7 @@ Status MaxPoolGrad::Compute(OpKernelContext* context) const { template Status AveragePoolGrad::Compute3DAveragePoolGrad(OpKernelContext* context) const { - const TensorShape& dX_shape = TensorShape::ReinterpretBaseType(output_tensor_shapes_[0]); + const TensorShape dX_shape = TensorShape::FromExistingBuffer(output_tensor_shapes_[0]); Tensor* dX = context->Output(0, dX_shape); T* dX_data = dX->template MutableData(); @@ -120,7 +120,7 @@ Status AveragePoolGrad::Compute3DAveragePoolGrad(OpKernelContext* context) co template Status AveragePoolGrad::Compute2DAveragePoolGrad(OpKernelContext* context) const { - const TensorShape& dX_shape = TensorShape::ReinterpretBaseType(output_tensor_shapes_[0]); + const TensorShape dX_shape = TensorShape::FromExistingBuffer(output_tensor_shapes_[0]); Tensor* dX = context->Output(0, dX_shape); T* dX_data = dX->template MutableData(); @@ -169,7 +169,7 @@ Status AveragePoolGrad::Compute2DAveragePoolGrad(OpKernelContext* context) co } template Status AveragePoolGrad::Compute1DAveragePoolGrad(OpKernelContext* context) const { - const TensorShape& dX_shape = TensorShape::ReinterpretBaseType(output_tensor_shapes_[0]); + const TensorShape dX_shape = TensorShape::FromExistingBuffer(output_tensor_shapes_[0]); Tensor* dX = context->Output(0, dX_shape); T* dX_data = dX->template MutableData(); @@ -214,7 +214,7 @@ ONNX_CPU_OPERATOR_KERNEL( // only StorageOrder::NCHW supported template Status AveragePoolGrad::Compute(OpKernelContext* context) const { - const TensorShape& dX_shape = TensorShape::ReinterpretBaseType(output_tensor_shapes_[0]); + const TensorShape dX_shape = TensorShape::FromExistingBuffer(output_tensor_shapes_[0]); Tensor* dX = context->Output(0, dX_shape); T* dX_data = dX->template MutableData(); EigenVectorMap(dX_data, dX_shape.Size()).setZero(); diff --git a/orttraining/orttraining/training_ops/cpu/tensor/slice_grad.cc b/orttraining/orttraining/training_ops/cpu/tensor/slice_grad.cc index c2619cc208b51..0f2b1f5e88a7f 100644 --- a/orttraining/orttraining/training_ops/cpu/tensor/slice_grad.cc +++ b/orttraining/orttraining/training_ops/cpu/tensor/slice_grad.cc @@ -86,7 +86,7 @@ Status SliceGrad::ComputeImpl(OpKernelContext* ctx, if (flattened_output_dims) { // if we have flattened output dims we need to also flatten the input dims. 
// as we're combining the innermost dims and keeping all values we can just copy the size of the last dim - std::vector flattened_input_dims(output_grad_tensor.Shape().GetDims()); + std::vector flattened_input_dims(output_grad_tensor.Shape().GetDimsAsVector()); flattened_input_dims.resize(flattened_output_dims->size()); flattened_input_dims.back() = flattened_output_dims->back(); TensorShape input_shape(std::move(flattened_input_dims)); diff --git a/orttraining/orttraining/training_ops/cpu/tensor/split.cc b/orttraining/orttraining/training_ops/cpu/tensor/split.cc index d14c10951c7c3..73cb2db51913d 100644 --- a/orttraining/orttraining/training_ops/cpu/tensor/split.cc +++ b/orttraining/orttraining/training_ops/cpu/tensor/split.cc @@ -23,7 +23,7 @@ ONNX_OPERATOR_KERNEL_EX( Status PrepareForTrainingCompute(const TensorShape& input_shape, int num_outputs, int64_t& axis, int& before_dims, int& after_dims_including_split_axis, int& after_dims_excluding_split, std::vector& split_sizes) { - auto& input_dims = input_shape.GetDims(); + auto input_dims = input_shape.GetDims(); const auto num_dimensions = gsl::narrow_cast(input_shape.NumDimensions()); int64_t axis_value = axis; axis = HandleNegativeAxis(axis_value, num_dimensions); // handle negative and enforce axis is valid @@ -118,8 +118,7 @@ Status SplitTraining::ComputeImpl(OpKernelContext& context, const Tensor& input) split_sizes)); // copy dimensions so we can update the selected axis in place - auto& input_dims = input_shape.GetDims(); - std::vector output_dimensions{input_dims}; + std::vector output_dimensions{input_shape.GetDimsAsVector()}; int64_t input_offset = 0; const T* input_data = input.template Data(); diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc index fe94baceb91a6..46c55cd5ffeab 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc @@ -26,7 +26,7 @@ DIVGRAD_REGISTER_KERNEL_TYPED(double) std::vector prepended_dimension_1(const TensorShape& shape, size_t total_rank) { size_t input_rank = shape.NumDimensions(); if (input_rank == total_rank) - return shape.GetDims(); + return shape.GetDimsAsVector(); std::vector dims(total_rank, 1); diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc index c78e5033a32dd..ccee072b0cea1 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc @@ -249,15 +249,15 @@ template Status ConvGrad::PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& w, Tensor* dB, Tensor* dX, Tensor* dW) const { const TensorShape& x_shape = x.Shape(); - std::vector x_dims = x_shape.GetDims(); + std::vector x_dims = x_shape.GetDimsAsVector(); args_.x_data = reinterpret_cast(x.template Data()); const TensorShape& dy_shape = dY.Shape(); - std::vector dy_dims = dy_shape.GetDims(); + std::vector dy_dims = dy_shape.GetDimsAsVector(); args_.dy_data = reinterpret_cast(dY.template Data()); const TensorShape& w_shape = w.Shape(); - std::vector w_dims = w_shape.GetDims(); + std::vector w_dims = w_shape.GetDimsAsVector(); args_.w_data = reinterpret_cast(w.template Data()); args_.db_data = dB ? 
reinterpret_cast(dB->template MutableData()) : nullptr; diff --git a/orttraining/orttraining/training_ops/cuda/tensor/split.cc b/orttraining/orttraining/training_ops/cuda/tensor/split.cc index 1cc2348f5b167..674596a69dc8c 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/split.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/split.cc @@ -44,8 +44,8 @@ Status SplitTraining::ComputeInternal(OpKernelContext* ctx) const { auto input_data = input_tensor->DataRaw(); - auto& input_dims = input_shape.GetDims(); - std::vector output_dimensions{input_dims}; + auto input_dims = input_shape.GetDims(); + std::vector output_dimensions{input_shape.GetDimsAsVector()}; CudaAsyncBuffer output_ptr(this, num_outputs); gsl::span output_ptr_span = output_ptr.CpuSpan(); diff --git a/orttraining/orttraining/training_ops/rocm/math/div_grad.cc b/orttraining/orttraining/training_ops/rocm/math/div_grad.cc index 16a21403dbe1d..1b04424455452 100644 --- a/orttraining/orttraining/training_ops/rocm/math/div_grad.cc +++ b/orttraining/orttraining/training_ops/rocm/math/div_grad.cc @@ -26,7 +26,7 @@ DIVGRAD_REGISTER_KERNEL_TYPED(float) std::vector prepended_dimension_1(const TensorShape& shape, size_t total_rank) { size_t input_rank = shape.NumDimensions(); if (input_rank == total_rank) - return shape.GetDims(); + return shape.GetDimsAsVector(); std::vector dims(total_rank, 1); diff --git a/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc b/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc index 2e002c6afab4c..30f5b685099c4 100644 --- a/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc +++ b/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc @@ -233,15 +233,15 @@ template Status ConvGrad::PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& w, Tensor* dB, Tensor* dX, Tensor* dW) const { const TensorShape& x_shape = x.Shape(); - std::vector x_dims = x_shape.GetDims(); + std::vector x_dims = x_shape.GetDimsAsVector(); args_.x_data = reinterpret_cast(x.template Data()); const TensorShape& dy_shape = dY.Shape(); - std::vector dy_dims = dy_shape.GetDims(); + std::vector dy_dims = dy_shape.GetDimsAsVector(); args_.dy_data = reinterpret_cast(dY.template Data()); const TensorShape& w_shape = w.Shape(); - std::vector w_dims = w_shape.GetDims(); + std::vector w_dims = w_shape.GetDimsAsVector(); args_.w_data = reinterpret_cast(w.template Data()); args_.db_data = dB ? reinterpret_cast(dB->template MutableData()) : nullptr; diff --git a/tools/ci_build/github/linux/ort_minimal/build_minimal_ort_android_baseline_and_report_bin_size.sh b/tools/ci_build/github/linux/ort_minimal/build_minimal_ort_android_baseline_and_report_bin_size.sh old mode 100755 new mode 100644 index 7b1ea8067b685..b324f7b53e1b8 --- a/tools/ci_build/github/linux/ort_minimal/build_minimal_ort_android_baseline_and_report_bin_size.sh +++ b/tools/ci_build/github/linux/ort_minimal/build_minimal_ort_android_baseline_and_report_bin_size.sh @@ -29,7 +29,7 @@ python3 /onnxruntime_src/tools/ci_build/build.py \ --include_ops_by_config /home/onnxruntimedev/.test_data/include_no_operators.config # set current size limit to BINARY_SIZE_LIMIT_IN_BYTES. -BINARY_SIZE_LIMIT_IN_BYTES=1302000 +BINARY_SIZE_LIMIT_IN_BYTES=1303608 echo "The current preset binary size limit is $BINARY_SIZE_LIMIT_IN_BYTES" python3 /onnxruntime_src/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py \ --threshold=$BINARY_SIZE_LIMIT_IN_BYTES \
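
Note on the pattern behind the call-site changes above (editorial addition, not part of the patch): TensorShape::GetDims() now returns a gsl::span view rather than a reference to an owned std::vector<int64_t>, so the orttraining changes reduce to a few recurring rewrites. Callers that mutate or store the dims move to GetDimsAsVector(); brace-initialized shape arguments such as {1} become std::array{1}, presumably because a braced list cannot initialize a span parameter; and shapes built over an existing int64_t buffer use TensorShape::FromExistingBuffer, which by its name wraps the caller's buffer rather than copying it, so that buffer must outlive the shape. The sketch below illustrates the first two rewrites in isolation. It is a minimal stand-alone example under stated assumptions: MiniShape and ElementCount are hypothetical stand-ins, not onnxruntime APIs, and C++20 std::span is used in place of gsl::span so it compiles with only the standard library.

// Minimal, self-contained sketch (C++20). std::span stands in for gsl::span;
// MiniShape and ElementCount are illustrative names, not onnxruntime APIs.
#include <array>
#include <cstdint>
#include <iostream>
#include <span>
#include <vector>

// Stand-in for a shape type whose GetDims() returns a non-owning view.
class MiniShape {
 public:
  explicit MiniShape(std::span<const int64_t> dims)
      : values_(dims.begin(), dims.end()) {}

  // View over the dims; only valid while this MiniShape is alive.
  std::span<const int64_t> GetDims() const { return values_; }

  // Owning copy for callers that need to modify or extend the dims.
  std::vector<int64_t> GetDimsAsVector() const {
    return std::vector<int64_t>(values_.begin(), values_.end());
  }

 private:
  std::vector<int64_t> values_;
};

// Stand-in for a helper that used to take const std::vector<int64_t>& and
// now takes a span, analogous to CreateTypeProto / CreateCpuMLValue above.
int64_t ElementCount(std::span<const int64_t> dims) {
  int64_t count = 1;
  for (int64_t d : dims) count *= d;
  return count;
}

int main() {
  // ElementCount({1});  // ill-formed: C++20 std::span (like gsl::span here)
  //                     // has no initializer_list constructor, hence the
  //                     // std::array{1} rewrites throughout the patch.
  std::cout << ElementCount(std::array<int64_t, 1>{1}) << "\n";  // prints 1

  MiniShape shape(std::array<int64_t, 3>{2, 3, 4});

  // Read-only consumers pass the span view straight through.
  std::cout << ElementCount(shape.GetDims()) << "\n";  // prints 24

  // Mutating consumers copy first, mirroring the GetDims() ->
  // GetDimsAsVector() rewrites (e.g. logit_shape.emplace_back(D)).
  std::vector<int64_t> logit_shape = shape.GetDimsAsVector();
  logit_shape.emplace_back(10);
  std::cout << ElementCount(logit_shape) << "\n";  // prints 240
  return 0;
}

The same trade-off drives the patch: read-only consumers get a zero-copy view of the dims, while GetDimsAsVector() remains available for code paths (mostly tests and shape arithmetic) that genuinely need an owning, growable vector.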