diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
index 1274d7250fd..1eb3e223d4c 100644
--- a/core/factorization/par_ilu.cpp
+++ b/core/factorization/par_ilu.cpp
@@ -181,8 +181,8 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     // Since the transposed version has the exact same non-zero positions
     // as `u_factor`, we can both skip the allocation and the `make_srow()`
     // call from CSR, leaving just the `transpose()` kernel call
-    exec->run(par_ilu_factorization::make_csr_transpose(u_factor.get(),
-                                                        u_factor_transpose));
+    exec->run(par_ilu_factorization::make_csr_transpose(u_factor_transpose,
+                                                        u_factor.get()));
 
     return Composition<ValueType>::create(std::move(l_factor),
                                           std::move(u_factor));
diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
index 9c7da605fa1..96ce4ff81f3 100644
--- a/core/matrix/coo.cpp
+++ b/core/matrix/coo.cpp
@@ -113,7 +113,7 @@ void Coo<ValueType, IndexType>::convert_to(
         result->get_strategy());
     tmp->values_ = this->values_;
     tmp->col_idxs_ = this->col_idxs_;
-    exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    exec->run(coo::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
@@ -128,7 +128,7 @@ void Coo<ValueType, IndexType>::move_to(Csr<ValueType, IndexType> *result)
         result->get_strategy());
     tmp->values_ = std::move(this->values_);
     tmp->col_idxs_ = std::move(this->col_idxs_);
-    exec->run(coo::make_convert_to_csr(tmp.get(), this));
+    exec->run(coo::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
@@ -139,7 +139,7 @@ void Coo<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(coo::make_convert_to_dense(tmp.get(), this));
+    exec->run(coo::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp
index f7d163bd854..9a56a0fe6e8 100644
--- a/core/matrix/coo_kernels.hpp
+++ b/core/matrix/coo_kernels.hpp
@@ -69,15 +69,15 @@ namespace kernels {
                         const matrix::Dense<ValueType> *b,           \
                         matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Coo<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)  \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *result,   \
-                        const matrix::Coo<ValueType, IndexType> *source)
+#define GKO_DECLARE_COO_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Coo<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_COO_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Coo<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                             \
     template <typename ValueType, typename IndexType>            \
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 46bf8e2d01c..051b9162445 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -129,7 +129,7 @@ void Csr<ValueType, IndexType>::convert_to(
         exec, this->get_size(), this->get_num_stored_elements());
     tmp->values_ = this->values_;
     tmp->col_idxs_ = this->col_idxs_;
-    exec->run(csr::make_convert_to_coo(tmp.get(), this));
+    exec->run(csr::make_convert_to_coo(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -146,7 +146,7 @@ void Csr<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(csr::make_convert_to_dense(tmp.get(), this));
+    exec->run(csr::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -176,7 +176,7 @@ void Csr<ValueType, IndexType>::convert_to(
     auto tmp = Hybrid<ValueType, IndexType>::create(
         exec, this->get_size(), max_nnz_per_row, stride, coo_nnz,
         result->get_strategy());
-    exec->run(csr::make_convert_to_hybrid(tmp.get(), this));
+    exec->run(csr::make_convert_to_hybrid(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -204,7 +204,7 @@ void Csr<ValueType, IndexType>::convert_to(
                                              slice_size));
     auto tmp = Sellp<ValueType, IndexType>::create(
         exec, this->get_size(), slice_size, stride_factor, total_cols);
-    exec->run(csr::make_convert_to_sellp(tmp.get(), this));
+    exec->run(csr::make_convert_to_sellp(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -251,7 +251,7 @@ void Csr<ValueType, IndexType>::convert_to(
     exec->run(csr::make_calculate_max_nnz_per_row(this, &max_nnz_per_row));
     auto tmp = Ell<ValueType, IndexType>::create(exec, this->get_size(),
                                                  max_nnz_per_row);
-    exec->run(csr::make_convert_to_ell(tmp.get(), this));
+    exec->run(csr::make_convert_to_ell(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -328,7 +328,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::transpose() const
         Csr::create(exec, gko::transpose(this->get_size()),
                     this->get_num_stored_elements(), this->get_strategy());
 
-    exec->run(csr::make_transpose(trans_cpy.get(), this));
+    exec->run(csr::make_transpose(this, trans_cpy.get()));
     trans_cpy->make_srow();
     return std::move(trans_cpy);
 }
@@ -342,7 +342,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::conj_transpose() const
         Csr::create(exec, gko::transpose(this->get_size()),
                     this->get_num_stored_elements(), this->get_strategy());
 
-    exec->run(csr::make_conj_transpose(trans_cpy.get(), this));
+    exec->run(csr::make_conj_transpose(this, trans_cpy.get()));
     trans_cpy->make_srow();
     return std::move(trans_cpy);
 }
@@ -359,7 +359,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::row_permute(
                     this->get_strategy());
 
     exec->run(
-        csr::make_row_permute(permutation_indices, permute_cpy.get(), this));
+        csr::make_row_permute(permutation_indices, this, permute_cpy.get()));
     permute_cpy->make_srow();
     return std::move(permute_cpy);
 }
@@ -376,7 +376,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::column_permute(
                     this->get_strategy());
 
     exec->run(
-        csr::make_column_permute(permutation_indices, permute_cpy.get(), this));
+        csr::make_column_permute(permutation_indices, this, permute_cpy.get()));
     permute_cpy->make_srow();
     return std::move(permute_cpy);
 }
@@ -393,8 +393,8 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_row_permute(
         Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
                     this->get_strategy());
 
-    exec->run(csr::make_inverse_row_permute(inverse_permutation_indices,
-                                            inverse_permute_cpy.get(), this));
+    exec->run(csr::make_inverse_row_permute(inverse_permutation_indices, this,
+                                            inverse_permute_cpy.get()));
     inverse_permute_cpy->make_srow();
     return std::move(inverse_permute_cpy);
 }
@@ -412,7 +412,7 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_column_permute(
                     this->get_strategy());
 
     exec->run(csr::make_inverse_column_permute(
-        inverse_permutation_indices, inverse_permute_cpy.get(), this));
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
     inverse_permute_cpy->make_srow();
     return std::move(inverse_permute_cpy);
 }
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 365758d7e2e..b9cdd3e8c68 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -77,30 +77,30 @@ namespace kernels {
                          const matrix::Csr<ValueType, IndexType> *d,  \
                          matrix::Csr<ValueType, IndexType> *c)
 
-#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType)  \
-    void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Coo<ValueType, IndexType> *result,   \
-                        const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType)  \
-    void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Ell<ValueType, IndexType> *result,   \
-                        const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType)   \
-    void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,  \
-                           matrix::Hybrid<ValueType, IndexType> *result, \
-                           const matrix::Csr<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType)  \
-    void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Sellp<ValueType, IndexType> *result, \
-                          const matrix::Csr<ValueType, IndexType> *source)
+#define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType)      \
+    void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Csr<ValueType, IndexType> *source, \
+                        matrix::Coo<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType)      \
+    void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Csr<ValueType, IndexType> *source, \
+                        matrix::Ell<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType)      \
+    void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,     \
+                           const matrix::Csr<ValueType, IndexType> *source, \
+                           matrix::Hybrid<ValueType, IndexType> *result)
+
+#define GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType)      \
+    void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Csr<ValueType, IndexType> *source, \
+                          matrix::Sellp<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_CSR_CALCULATE_TOTAL_COLS_KERNEL(ValueType, IndexType)      \
     void calculate_total_cols(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -108,40 +108,40 @@ namespace kernels {
                               size_type *result, size_type stride_factor,      \
                               size_type slice_size)
 
-#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)  \
-    void transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                   matrix::Csr<ValueType, IndexType> *trans,    \
-                   const matrix::Csr<ValueType, IndexType> *orig)
-
-#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)  \
-    void conj_transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *trans,    \
-                        const matrix::Csr<ValueType, IndexType> *orig)
-
-#define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType)      \
-    void row_permute(std::shared_ptr<const DefaultExecutor> exec,     \
-                     const Array<IndexType> *permutation_indices,     \
-                     matrix::Csr<ValueType, IndexType> *row_permuted, \
-                     const matrix::Csr<ValueType, IndexType> *orig)
-
-#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)         \
-    void column_permute(std::shared_ptr<const DefaultExecutor> exec,        \
-                        const Array<IndexType> *permutation_indices,        \
-                        matrix::Csr<ValueType, IndexType> *column_permuted, \
-                        const matrix::Csr<ValueType, IndexType> *orig)
-
-#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)      \
-    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,     \
-                             const Array<IndexType> *permutation_indices,     \
-                             matrix::Csr<ValueType, IndexType> *row_permuted, \
-                             const matrix::Csr<ValueType, IndexType> *orig)
+#define GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)    \
+    void transpose(std::shared_ptr<const DefaultExecutor> exec,   \
+                   const matrix::Csr<ValueType, IndexType> *orig, \
+                   matrix::Csr<ValueType, IndexType> *trans)
+
+#define GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType)    \
+    void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,   \
+                        const matrix::Csr<ValueType, IndexType> *orig, \
+                        matrix::Csr<ValueType, IndexType> *trans)
+
+#define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                     const Array<IndexType> *permutation_indices,   \
+                     const matrix::Csr<ValueType, IndexType> *orig, \
+                     matrix::Csr<ValueType, IndexType> *row_permuted)
+
+#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void column_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                        const Array<IndexType> *permutation_indices,   \
+                        const matrix::Csr<ValueType, IndexType> *orig, \
+                        matrix::Csr<ValueType, IndexType> *column_permuted)
+
+#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                             const Array<IndexType> *permutation_indices,   \
+                             const matrix::Csr<ValueType, IndexType> *orig, \
+                             matrix::Csr<ValueType, IndexType> *row_permuted)
 
 #define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
     void inverse_column_permute(                                            \
         std::shared_ptr<const DefaultExecutor> exec,                        \
         const Array<IndexType> *permutation_indices,                        \
-        matrix::Csr<ValueType, IndexType> *column_permuted,                 \
-        const matrix::Csr<ValueType, IndexType> *orig)
+        const matrix::Csr<ValueType, IndexType> *orig,                      \
+        matrix::Csr<ValueType, IndexType> *column_permuted)
 
 #define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \
     void calculate_max_nnz_per_row(                                            \
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 3546a5a950d..abf28c2175f 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -101,7 +101,7 @@ inline void conversion_helper(Coo<ValueType, IndexType> *result,
     exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
     auto tmp = Coo<ValueType, IndexType>::create(exec, source->get_size(),
                                                  num_stored_nonzeros);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -119,7 +119,7 @@ inline void conversion_helper(Csr<ValueType, IndexType> *result,
         auto tmp = Csr<ValueType, IndexType>::create(exec, source->get_size(),
                                                      num_stored_nonzeros,
                                                      result->get_strategy());
-        exec->run(op(tmp.get(), source));
+        exec->run(op(source, tmp.get()));
         tmp->move_to(result);
     }
     // If source is empty, there is no need to copy data or to call kernels
@@ -145,7 +145,7 @@ inline void conversion_helper(Ell<ValueType, IndexType> *result,
     const auto stride = std::max(result->get_stride(), source->get_size()[0]);
     auto tmp = Ell<ValueType, IndexType>::create(exec, source->get_size(),
                                                  max_nnz_per_row, stride);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -170,7 +170,7 @@ inline void conversion_helper(Hybrid<ValueType, IndexType> *result,
     auto tmp = Hybrid<ValueType, IndexType>::create(
         exec, source->get_size(), max_nnz_per_row, stride, coo_nnz,
         result->get_strategy());
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -192,7 +192,7 @@ inline void conversion_helper(Sellp<ValueType, IndexType> *result,
                                                stride_factor, slice_size));
     auto tmp = Sellp<ValueType, IndexType>::create(
         exec, source->get_size(), slice_size, stride_factor, total_cols);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -208,7 +208,7 @@ inline void conversion_helper(SparsityCsr<ValueType, IndexType> *result,
     exec->run(dense::make_count_nonzeros(source, &num_stored_nonzeros));
     auto tmp = SparsityCsr<ValueType, IndexType>::create(
         exec, source->get_size(), num_stored_nonzeros);
-    exec->run(op(tmp.get(), source));
+    exec->run(op(source, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -288,8 +288,8 @@ void Dense<ValueType>::convert_to(Coo<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_coo<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_coo<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -305,8 +305,8 @@ void Dense<ValueType>::convert_to(Coo<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_coo<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_coo<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -322,8 +322,8 @@ void Dense<ValueType>::convert_to(Csr<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_csr<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_csr<const Dense<ValueType> *&,
+                                            decltype(result)>);
     result->make_srow();
 }
 
@@ -340,8 +340,8 @@ void Dense<ValueType>::convert_to(Csr<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_csr<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_csr<const Dense<ValueType> *&,
+                                            decltype(result)>);
     result->make_srow();
 }
 
@@ -358,8 +358,8 @@ void Dense<ValueType>::convert_to(Ell<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_ell<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_ell<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -375,8 +375,8 @@ void Dense<ValueType>::convert_to(Ell<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_ell<decltype(result),
-                                            const Dense<ValueType> *&>);
+        dense::template make_convert_to_ell<const Dense<ValueType> *&,
+                                            decltype(result)>);
 }
 
 
@@ -392,8 +392,8 @@ void Dense<ValueType>::convert_to(Hybrid<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_hybrid<decltype(result),
-                                               const Dense<ValueType> *&>);
+        dense::template make_convert_to_hybrid<const Dense<ValueType> *&,
+                                               decltype(result)>);
 }
 
 
@@ -409,8 +409,8 @@ void Dense<ValueType>::convert_to(Hybrid<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_hybrid<decltype(result),
-                                               const Dense<ValueType> *&>);
+        dense::template make_convert_to_hybrid<const Dense<ValueType> *&,
+                                               decltype(result)>);
 }
 
 
@@ -426,8 +426,8 @@ void Dense<ValueType>::convert_to(Sellp<ValueType, int32> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_sellp<decltype(result),
-                                              const Dense<ValueType> *&>);
+        dense::template make_convert_to_sellp<const Dense<ValueType> *&,
+                                              decltype(result)>);
 }
 
 
@@ -443,8 +443,8 @@ void Dense<ValueType>::convert_to(Sellp<ValueType, int64> *result) const
 {
     conversion_helper(
         result, this,
-        dense::template make_convert_to_sellp<decltype(result),
-                                              const Dense<ValueType> *&>);
+        dense::template make_convert_to_sellp<const Dense<ValueType> *&,
+                                              decltype(result)>);
 }
 
 
@@ -458,9 +458,10 @@ void Dense<ValueType>::move_to(Sellp<ValueType, int64> *result)
 template <typename ValueType>
 void Dense<ValueType>::convert_to(SparsityCsr<ValueType, int32> *result) const
 {
-    conversion_helper(result, this,
-                      dense::template make_convert_to_sparsity_csr<
-                          decltype(result), const Dense<ValueType> *&>);
+    conversion_helper(
+        result, this,
+        dense::template make_convert_to_sparsity_csr<const Dense<ValueType> *&,
+                                                     decltype(result)>);
 }
 
 
@@ -474,9 +475,10 @@ void Dense<ValueType>::move_to(SparsityCsr<ValueType, int32> *result)
 template <typename ValueType>
 void Dense<ValueType>::convert_to(SparsityCsr<ValueType, int64> *result) const
 {
-    conversion_helper(result, this,
-                      dense::template make_convert_to_sparsity_csr<
-                          decltype(result), const Dense<ValueType> *&>);
+    conversion_helper(
+        result, this,
+        dense::template make_convert_to_sparsity_csr<const Dense<ValueType> *&,
+                                                     decltype(result)>);
 }
 
 
@@ -577,7 +579,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::transpose() const
     auto exec = this->get_executor();
     auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size()));
 
-    exec->run(dense::make_transpose(trans_cpy.get(), this));
+    exec->run(dense::make_transpose(this, trans_cpy.get()));
 
     return std::move(trans_cpy);
 }
@@ -589,7 +591,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::conj_transpose() const
     auto exec = this->get_executor();
     auto trans_cpy = Dense::create(exec, gko::transpose(this->get_size()));
 
-    exec->run(dense::make_conj_transpose(trans_cpy.get(), this));
+    exec->run(dense::make_conj_transpose(this, trans_cpy.get()));
     return std::move(trans_cpy);
 }
 
@@ -603,7 +605,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::row_permute(
     auto permute_cpy = Dense::create(exec, this->get_size());
 
     exec->run(
-        dense::make_row_permute(permutation_indices, permute_cpy.get(), this));
+        dense::make_row_permute(permutation_indices, this, permute_cpy.get()));
 
     return std::move(permute_cpy);
 }
@@ -617,8 +619,8 @@ std::unique_ptr<LinOp> Dense<ValueType>::column_permute(
     auto exec = this->get_executor();
     auto permute_cpy = Dense::create(exec, this->get_size());
 
-    exec->run(dense::make_column_permute(permutation_indices, permute_cpy.get(),
-                                         this));
+    exec->run(dense::make_column_permute(permutation_indices, this,
+                                         permute_cpy.get()));
 
     return std::move(permute_cpy);
 }
@@ -633,7 +635,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::row_permute(
     auto permute_cpy = Dense::create(exec, this->get_size());
 
     exec->run(
-        dense::make_row_permute(permutation_indices, permute_cpy.get(), this));
+        dense::make_row_permute(permutation_indices, this, permute_cpy.get()));
 
     return std::move(permute_cpy);
 }
@@ -647,8 +649,8 @@ std::unique_ptr<LinOp> Dense<ValueType>::column_permute(
     auto exec = this->get_executor();
     auto permute_cpy = Dense::create(exec, this->get_size());
 
-    exec->run(dense::make_column_permute(permutation_indices, permute_cpy.get(),
-                                         this));
+    exec->run(dense::make_column_permute(permutation_indices, this,
+                                         permute_cpy.get()));
 
     return std::move(permute_cpy);
 }
@@ -663,8 +665,8 @@ std::unique_ptr<LinOp> Dense<ValueType>::inverse_row_permute(
     auto exec = this->get_executor();
     auto inverse_permute_cpy = Dense::create(exec, this->get_size());
 
-    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices,
-                                              inverse_permute_cpy.get(), this));
+    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this,
+                                              inverse_permute_cpy.get()));
 
     return std::move(inverse_permute_cpy);
 }
@@ -680,7 +682,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::inverse_column_permute(
     auto inverse_permute_cpy = Dense::create(exec, this->get_size());
 
     exec->run(dense::make_inverse_column_permute(
-        inverse_permutation_indices, inverse_permute_cpy.get(), this));
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
 
     return std::move(inverse_permute_cpy);
 }
@@ -695,8 +697,8 @@ std::unique_ptr<LinOp> Dense<ValueType>::inverse_row_permute(
     auto exec = this->get_executor();
     auto inverse_permute_cpy = Dense::create(exec, this->get_size());
 
-    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices,
-                                              inverse_permute_cpy.get(), this));
+    exec->run(dense::make_inverse_row_permute(inverse_permutation_indices, this,
+                                              inverse_permute_cpy.get()));
 
     return std::move(inverse_permute_cpy);
 }
@@ -712,7 +714,7 @@ std::unique_ptr<LinOp> Dense<ValueType>::inverse_column_permute(
     auto inverse_permute_cpy = Dense::create(exec, this->get_size());
 
     exec->run(dense::make_inverse_column_permute(
-        inverse_permutation_indices, inverse_permute_cpy.get(), this));
+        inverse_permutation_indices, this, inverse_permute_cpy.get()));
 
     return std::move(inverse_permute_cpy);
 }
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 2b432fa3fcf..20f98e92824 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -75,33 +75,33 @@ namespace kernels {
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec)        \
     void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Coo<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Coo<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(_type, _prec)        \
     void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Csr<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(_type, _prec)        \
     void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Ell<_type, _prec> *other,            \
-                        const matrix::Dense<_type> *source)
+                        const matrix::Dense<_type> *source,          \
+                        matrix::Ell<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(_type, _prec)        \
     void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec, \
-                           matrix::Hybrid<_type, _prec> *other,         \
-                           const matrix::Dense<_type> *source)
+                           const matrix::Dense<_type> *source,          \
+                           matrix::Hybrid<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(_type, _prec)        \
     void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Sellp<_type, _prec> *other,          \
-                          const matrix::Dense<_type> *source)
+                          const matrix::Dense<_type> *source,          \
+                          matrix::Sellp<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(_type, _prec)        \
     void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                                 matrix::SparsityCsr<_type, _prec> *other,    \
-                                 const matrix::Dense<_type> *source)
+                                 const matrix::Dense<_type> *source,          \
+                                 matrix::SparsityCsr<_type, _prec> *other)
 
 #define GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL(_type)               \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec, \
@@ -125,37 +125,37 @@ namespace kernels {
 
 #define GKO_DECLARE_TRANSPOSE_KERNEL(_type)                     \
     void transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                   matrix::Dense<_type> *trans,                 \
-                   const matrix::Dense<_type> *orig)
+                   const matrix::Dense<_type> *orig,            \
+                   matrix::Dense<_type> *trans)
 
 #define GKO_DECLARE_CONJ_TRANSPOSE_KERNEL(_type)                     \
     void conj_transpose(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Dense<_type> *trans,                 \
-                        const matrix::Dense<_type> *orig)
+                        const matrix::Dense<_type> *orig,            \
+                        matrix::Dense<_type> *trans)
 
 #define GKO_DECLARE_ROW_PERMUTE_KERNEL(_vtype, _itype)            \
     void row_permute(std::shared_ptr<const DefaultExecutor> exec, \
                      const Array<_itype> *permutation_indices,    \
-                     matrix::Dense<_vtype> *row_permuted,         \
-                     const matrix::Dense<_vtype> *orig)
+                     const matrix::Dense<_vtype> *orig,           \
+                     matrix::Dense<_vtype> *row_permuted)
 
 #define GKO_DECLARE_COLUMN_PERMUTE_KERNEL(_vtype, _itype)            \
     void column_permute(std::shared_ptr<const DefaultExecutor> exec, \
                         const Array<_itype> *permutation_indices,    \
-                        matrix::Dense<_vtype> *column_permuted,      \
-                        const matrix::Dense<_vtype> *orig)
+                        const matrix::Dense<_vtype> *orig,           \
+                        matrix::Dense<_vtype> *column_permuted)
 
 #define GKO_DECLARE_INVERSE_ROW_PERMUTE_KERNEL(_vtype, _itype)            \
     void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec, \
                              const Array<_itype> *permutation_indices,    \
-                             matrix::Dense<_vtype> *row_permuted,         \
-                             const matrix::Dense<_vtype> *orig)
+                             const matrix::Dense<_vtype> *orig,           \
+                             matrix::Dense<_vtype> *row_permuted)
 
 #define GKO_DECLARE_INVERSE_COLUMN_PERMUTE_KERNEL(_vtype, _itype)            \
     void inverse_column_permute(std::shared_ptr<const DefaultExecutor> exec, \
                                 const Array<_itype> *permutation_indices,    \
-                                matrix::Dense<_vtype> *column_permuted,      \
-                                const matrix::Dense<_vtype> *orig)
+                                const matrix::Dense<_vtype> *orig,           \
+                                matrix::Dense<_vtype> *column_permuted)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                        \
     template <typename ValueType>                                           \
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index f307a7bc790..80d570ebaad 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -113,7 +113,7 @@ void Ell<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(ell::make_convert_to_dense(tmp.get(), this));
+    exec->run(ell::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -136,7 +136,7 @@ void Ell<ValueType, IndexType>::convert_to(
 
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_elements, result->get_strategy());
-    exec->run(ell::make_convert_to_csr(tmp.get(), this));
+    exec->run(ell::make_convert_to_csr(this, tmp.get()));
 
     tmp->make_srow();
     tmp->move_to(result);
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
index 56405bbdbc6..cf84a2393c7 100644
--- a/core/matrix/ell_kernels.hpp
+++ b/core/matrix/ell_kernels.hpp
@@ -56,15 +56,15 @@ namespace kernels {
                        const matrix::Dense<ValueType> *beta,        \
                        matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)  \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec, \
-                          matrix::Dense<ValueType> *result,            \
-                          const matrix::Ell<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)  \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec, \
-                        matrix::Csr<ValueType, IndexType> *result,   \
-                        const matrix::Ell<ValueType, IndexType> *source)
+#define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
+                          const matrix::Ell<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_ELL_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const matrix::Ell<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_ELL_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,     \
diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
index e7908767ed7..bac25255867 100644
--- a/core/matrix/hybrid.cpp
+++ b/core/matrix/hybrid.cpp
@@ -114,7 +114,7 @@ void Hybrid<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(hybrid::make_convert_to_dense(tmp.get(), this));
+    exec->run(hybrid::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -137,7 +137,7 @@ void Hybrid<ValueType, IndexType>::convert_to(
 
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_elements, result->get_strategy());
-    exec->run(hybrid::make_convert_to_csr(tmp.get(), this));
+    exec->run(hybrid::make_convert_to_csr(this, tmp.get()));
 
     tmp->make_srow();
     tmp->move_to(result);
diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp
index 2db1c0fda01..df4029ac893 100644
--- a/core/matrix/hybrid_kernels.hpp
+++ b/core/matrix/hybrid_kernels.hpp
@@ -42,15 +42,15 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,   \
-                          matrix::Dense<ValueType> *result,              \
-                          const matrix::Hybrid<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,   \
-                        matrix::Csr<ValueType, IndexType> *result,     \
-                        const matrix::Hybrid<ValueType, IndexType> *source)
+#define GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,        \
+                          const matrix::Hybrid<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,        \
+                        const matrix::Hybrid<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_HYBRID_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,        \
diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp
index 860d09f1eab..688bd108f51 100644
--- a/core/matrix/sellp.cpp
+++ b/core/matrix/sellp.cpp
@@ -127,7 +127,7 @@ void Sellp<ValueType, IndexType>::convert_to(Dense<ValueType> *result) const
 {
     auto exec = this->get_executor();
     auto tmp = Dense<ValueType>::create(exec, this->get_size());
-    exec->run(sellp::make_convert_to_dense(tmp.get(), this));
+    exec->run(sellp::make_convert_to_dense(this, tmp.get()));
     tmp->move_to(result);
 }
 
@@ -149,7 +149,7 @@ void Sellp<ValueType, IndexType>::convert_to(
     exec->run(sellp::make_count_nonzeros(this, &num_stored_nonzeros));
     auto tmp = Csr<ValueType, IndexType>::create(
         exec, this->get_size(), num_stored_nonzeros, result->get_strategy());
-    exec->run(sellp::make_convert_to_csr(tmp.get(), this));
+    exec->run(sellp::make_convert_to_csr(this, tmp.get()));
     tmp->make_srow();
     tmp->move_to(result);
 }
diff --git a/core/matrix/sellp_kernels.hpp b/core/matrix/sellp_kernels.hpp
index 57cf9e49a19..368f40342d5 100644
--- a/core/matrix/sellp_kernels.hpp
+++ b/core/matrix/sellp_kernels.hpp
@@ -56,15 +56,15 @@ namespace kernels {
                        const matrix::Dense<ValueType> *beta,         \
                        matrix::Dense<ValueType> *c)
 
-#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \
-    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,  \
-                          matrix::Dense<ValueType> *result,             \
-                          const matrix::Sellp<ValueType, IndexType> *source)
-
-#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \
-    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,  \
-                        matrix::Csr<ValueType, IndexType> *result,    \
-                        const matrix::Sellp<ValueType, IndexType> *source)
+#define GKO_DECLARE_SELLP_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
+    void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,       \
+                          const matrix::Sellp<ValueType, IndexType> *source, \
+                          matrix::Dense<ValueType> *result)
+
+#define GKO_DECLARE_SELLP_CONVERT_TO_CSR_KERNEL(ValueType, IndexType)      \
+    void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,       \
+                        const matrix::Sellp<ValueType, IndexType> *source, \
+                        matrix::Csr<ValueType, IndexType> *result)
 
 #define GKO_DECLARE_SELLP_COUNT_NONZEROS_KERNEL(ValueType, IndexType)      \
     void count_nonzeros(std::shared_ptr<const DefaultExecutor> exec,       \
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 8d3a0a34184..ec741cb165b 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -149,14 +149,16 @@ std::unique_ptr<LinOp> SparsityCsr<ValueType, IndexType>::transpose() const
     auto trans_cpy = SparsityCsr::create(exec, gko::transpose(this->get_size()),
                                          this->get_num_nonzeros());
 
-    exec->run(sparsity_csr::make_transpose(trans_cpy.get(), this));
+    exec->run(sparsity_csr::make_transpose(this, trans_cpy.get()));
     return std::move(trans_cpy);
 }
 
 
 template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> SparsityCsr<ValueType, IndexType>::conj_transpose() const
-    GKO_NOT_IMPLEMENTED;
+{
+    return this->transpose();  // assumes conj is a no-op here -- TODO confirm
+}
 
 
 template <typename ValueType, typename IndexType>
@@ -174,7 +176,7 @@ SparsityCsr<ValueType, IndexType>::to_adjacency_matrix() const
                             this->get_num_nonzeros() - num_diagonal_elements);
 
     exec->run(sparsity_csr::make_remove_diagonal_elements(
-        adj_mat.get(), this->get_const_row_ptrs(), this->get_const_col_idxs()));
+        this->get_const_row_ptrs(), this->get_const_col_idxs(), adj_mat.get()));
     return std::move(adj_mat);
 }
 
diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp
index 1d90d83d646..62485d77f2e 100644
--- a/core/matrix/sparsity_csr_kernels.hpp
+++ b/core/matrix/sparsity_csr_kernels.hpp
@@ -60,8 +60,8 @@ namespace kernels {
                                                                  IndexType) \
     void remove_diagonal_elements(                                          \
         std::shared_ptr<const DefaultExecutor> exec,                        \
-        matrix::SparsityCsr<ValueType, IndexType> *matrix,                  \
-        const IndexType *row_ptrs, const IndexType *col_idxs)
+        const IndexType *row_ptrs, const IndexType *col_idxs,               \
+        matrix::SparsityCsr<ValueType, IndexType> *matrix)
 
 #define GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL(ValueType, \
                                                                     IndexType) \
@@ -70,10 +70,10 @@ namespace kernels {
         const matrix::SparsityCsr<ValueType, IndexType> *matrix,               \
         size_type *num_diagonal_elements)
 
-#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType) \
-    void transpose(std::shared_ptr<const DefaultExecutor> exec,         \
-                   matrix::SparsityCsr<ValueType, IndexType> *trans,    \
-                   const matrix::SparsityCsr<ValueType, IndexType> *orig)
+#define GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL(ValueType, IndexType)   \
+    void transpose(std::shared_ptr<const DefaultExecutor> exec,           \
+                   const matrix::SparsityCsr<ValueType, IndexType> *orig, \
+                   matrix::SparsityCsr<ValueType, IndexType> *trans)
 
 #define GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \
     void sort_by_column_index(                                              \
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
index a329ea54572..cea458a3d36 100644
--- a/cuda/matrix/coo_kernels.cu
+++ b/cuda/matrix/coo_kernels.cu
@@ -197,8 +197,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -217,8 +217,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 2a9766470e1..4cb097b1af7 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -597,8 +597,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -614,8 +614,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -643,8 +643,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -696,8 +696,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     const auto source_values = source->get_const_values();
     const auto source_row_ptrs = source->get_const_row_ptrs();
@@ -777,8 +777,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
@@ -800,8 +800,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         const dim3 block_size(default_block_size, 1, 1);
@@ -833,8 +833,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const CudaExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Csr<ValueType, IndexType> *row_permuted,
-                 const matrix::Csr<ValueType, IndexType> *orig)
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -844,8 +844,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const CudaExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Csr<ValueType, IndexType> *column_permuted,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -855,8 +855,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Csr<ValueType, IndexType> *row_permuted,
-                         const matrix::Csr<ValueType, IndexType> *orig)
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -866,8 +866,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Csr<ValueType, IndexType> *column_permuted,
-                            const matrix::Csr<ValueType, IndexType> *orig)
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -910,8 +910,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto ell_val = result->get_ell_values();
     auto ell_col = result->get_ell_col_idxs();
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index e6ba96867b4..23135a0785b 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -237,8 +237,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -269,8 +269,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -303,8 +303,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -329,8 +329,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -339,8 +339,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto stride = source->get_stride();
     const auto num_rows = result->get_size()[0];
@@ -384,8 +384,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const CudaExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -510,8 +510,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
     if (cublas::is_supported<ValueType>::value) {
         auto handle = exec->get_cublas_handle();
@@ -535,8 +535,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
     if (cublas::is_supported<ValueType>::value) {
         auto handle = exec->get_cublas_handle();
@@ -561,8 +561,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const CudaExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Dense<ValueType> *row_permuted,
-                 const matrix::Dense<ValueType> *orig) GKO_NOT_IMPLEMENTED;
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 
@@ -570,8 +570,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const CudaExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Dense<ValueType> *column_permuted,
-                    const matrix::Dense<ValueType> *orig) GKO_NOT_IMPLEMENTED;
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
@@ -580,8 +581,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Dense<ValueType> *row_permuted,
-                         const matrix::Dense<ValueType> *orig)
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -591,8 +592,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Dense<ValueType> *column_permuted,
-                            const matrix::Dense<ValueType> *orig)
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 41e01cda5bf..c477ae96583 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -258,8 +258,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -288,8 +288,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu
index bae18d874b8..53b8eaa26bf 100644
--- a/cuda/matrix/hybrid_kernels.cu
+++ b/cuda/matrix/hybrid_kernels.cu
@@ -70,9 +70,9 @@ constexpr int warps_in_block = 4;
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_dense(
-    std::shared_ptr<const CudaExecutor> exec, matrix::Dense<ValueType> *result,
-    const matrix::Hybrid<ValueType, IndexType> *source) GKO_NOT_IMPLEMENTED;
+void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL);
@@ -80,8 +80,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     auto coo_offset = Array<IndexType>(exec, num_rows + 1);
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index e3aabfd5de2..9db96e462f3 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -111,8 +111,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto num_cols = source->get_size()[1];
@@ -151,8 +151,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto slice_size = source->get_slice_size();
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index ac060ebe213..84e2861142e 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -98,10 +98,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void remove_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
-                              const IndexType *row_ptrs,
-                              const IndexType *col_idxs) GKO_NOT_IMPLEMENTED;
+void remove_diagonal_elements(
+    std::shared_ptr<const CudaExecutor> exec, const IndexType *row_ptrs,
+    const IndexType *col_idxs,
+    matrix::SparsityCsr<ValueType, IndexType> *matrix) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
@@ -109,8 +109,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
     GKO_NOT_IMPLEMENTED;
 
 
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index 7c317ee4560..c4563376545 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -319,8 +319,8 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
                         cuda_solve_struct->solve_info, b->get_const_values(),
                         b->get_size()[0], x->get_values(), x->get_size()[0]);
                 } else {
-                    dense::transpose(exec, trans_b, b);
-                    dense::transpose(exec, trans_x, x);
+                    dense::transpose(exec, b, trans_b);
+                    dense::transpose(exec, x, trans_x);
                     cusparse::csrsm_solve(
                         handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                         matrix->get_size()[0], trans_b->get_size()[0], &one,
@@ -331,7 +331,7 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
                         cuda_solve_struct->solve_info, trans_b->get_values(),
                         trans_b->get_size()[1], trans_x->get_values(),
                         trans_x->get_size()[1]);
-                    dense::transpose(exec, x, trans_x);
+                    dense::transpose(exec, trans_x, x);
                 }
             }
 
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
index 4485ab2d16c..5096c5c41b7 100644
--- a/hip/matrix/coo_kernels.hip.cpp
+++ b/hip/matrix/coo_kernels.hip.cpp
@@ -207,8 +207,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const HipExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -227,8 +227,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 00382a4fa78..482193ad1ee 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -690,8 +690,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const HipExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -707,8 +707,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -738,8 +738,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -793,8 +793,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     const auto source_values = source->get_const_values();
     const auto source_row_ptrs = source->get_const_row_ptrs();
@@ -880,8 +880,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const HipExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     if (hipsparse::is_supported<ValueType, IndexType>::value) {
         hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
@@ -903,8 +903,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     if (hipsparse::is_supported<ValueType, IndexType>::value) {
         const dim3 block_size(default_block_size, 1, 1);
@@ -936,8 +936,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const HipExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Csr<ValueType, IndexType> *row_permuted,
-                 const matrix::Csr<ValueType, IndexType> *orig)
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -947,8 +947,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const HipExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Csr<ValueType, IndexType> *column_permuted,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -958,8 +958,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Csr<ValueType, IndexType> *row_permuted,
-                         const matrix::Csr<ValueType, IndexType> *orig)
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -969,8 +969,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const HipExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Csr<ValueType, IndexType> *column_permuted,
-                            const matrix::Csr<ValueType, IndexType> *orig)
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1016,8 +1016,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto ell_val = result->get_ell_values();
     auto ell_col = result->get_ell_col_idxs();
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index c9c6d746769..e96b8b461f6 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -247,8 +247,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -281,8 +281,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -317,8 +317,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -344,8 +344,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -354,8 +354,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     const auto stride = source->get_stride();
     const auto num_rows = result->get_size()[0];
@@ -401,8 +401,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const HipExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -532,8 +532,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const HipExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
     if (hipblas::is_supported<ValueType>::value) {
         auto handle = exec->get_hipblas_handle();
@@ -557,8 +557,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
     if (hipblas::is_supported<ValueType>::value) {
         auto handle = exec->get_hipblas_handle();
@@ -583,8 +583,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const HipExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Dense<ValueType> *row_permuted,
-                 const matrix::Dense<ValueType> *orig) GKO_NOT_IMPLEMENTED;
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 
@@ -592,8 +592,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const HipExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Dense<ValueType> *column_permuted,
-                    const matrix::Dense<ValueType> *orig) GKO_NOT_IMPLEMENTED;
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
+    GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COLUMN_PERMUTE_KERNEL);
@@ -602,8 +603,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Dense<ValueType> *row_permuted,
-                         const matrix::Dense<ValueType> *orig)
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -613,8 +614,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const HipExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Dense<ValueType> *column_permuted,
-                            const matrix::Dense<ValueType> *orig)
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index b60f4a819d9..e76ae023abc 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -261,8 +261,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = result->get_size()[0];
     const auto num_cols = result->get_size()[1];
@@ -293,8 +293,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp
index 9bfeca3a81e..dc61a03a9be 100644
--- a/hip/matrix/hybrid_kernels.hip.cpp
+++ b/hip/matrix/hybrid_kernels.hip.cpp
@@ -73,9 +73,9 @@ constexpr int warps_in_block = 4;
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_dense(
-    std::shared_ptr<const HipExecutor> exec, matrix::Dense<ValueType> *result,
-    const matrix::Hybrid<ValueType, IndexType> *source) GKO_NOT_IMPLEMENTED;
+void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_HYBRID_CONVERT_TO_DENSE_KERNEL);
@@ -83,8 +83,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     auto coo_offset = Array<IndexType>(exec, num_rows + 1);
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index fb30422fe43..79c1a230201 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -116,8 +116,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto num_cols = source->get_size()[1];
@@ -157,8 +157,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto slice_size = source->get_slice_size();
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index 54ab4a7c69f..9fe2a046374 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -98,10 +98,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void remove_diagonal_elements(std::shared_ptr<const HipExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
-                              const IndexType *row_ptrs,
-                              const IndexType *col_idxs) GKO_NOT_IMPLEMENTED;
+void remove_diagonal_elements(
+    std::shared_ptr<const HipExecutor> exec, const IndexType *row_ptrs,
+    const IndexType *col_idxs,
+    matrix::SparsityCsr<ValueType, IndexType> *matrix) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_REMOVE_DIAGONAL_ELEMENTS_KERNEL);
@@ -109,8 +109,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const HipExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
     GKO_NOT_IMPLEMENTED;
 
 
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index 044bc1e5167..493df623b3f 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -213,8 +213,8 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                         x->get_values(), hip_solve_struct->policy,
                         hip_solve_struct->factor_work_vec);
                 } else {
-                    dense::transpose(exec, trans_b, b);
-                    dense::transpose(exec, trans_x, x);
+                    dense::transpose(exec, b, trans_b);
+                    dense::transpose(exec, x, trans_x);
                     for (IndexType i = 0; i < trans_b->get_size()[0]; i++) {
                         hipsparse::csrsv2_solve(
                             handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
@@ -230,7 +230,7 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                             hip_solve_struct->policy,
                             hip_solve_struct->factor_work_vec);
                     }
-                    dense::transpose(exec, x, trans_x);
+                    dense::transpose(exec, trans_x, x);
                 }
             }
         } else {
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index c708b6f8ca3..71eb4c93a45 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -156,8 +156,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -176,8 +176,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto coo_val = source->get_const_values();
     auto coo_col = source->get_const_col_idxs();
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 3552046a491..e7904d248c5 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -333,8 +333,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -350,8 +350,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -378,8 +378,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -388,8 +388,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -441,8 +441,8 @@ void transpose_and_transform(std::shared_ptr<const OmpExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -453,8 +453,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -485,8 +485,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -603,8 +603,8 @@ void row_permute_impl(const Array<IndexType> *permutation_indices,
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const OmpExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Csr<ValueType, IndexType> *row_permuted,
-                 const matrix::Csr<ValueType, IndexType> *orig)
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
 {
     row_permute_impl(permutation_indices, row_permuted, orig);
 }
@@ -616,8 +616,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Csr<ValueType, IndexType> *row_permuted,
-                         const matrix::Csr<ValueType, IndexType> *orig)
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     Array<IndexType> inv_perm(*permutation_indices);
@@ -666,8 +666,8 @@ void column_permute_impl(const Array<IndexType> *permutation_indices,
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const OmpExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Csr<ValueType, IndexType> *column_permuted,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     Array<IndexType> inv_perm(*permutation_indices);
@@ -686,8 +686,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const OmpExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Csr<ValueType, IndexType> *column_permuted,
-                            const matrix::Csr<ValueType, IndexType> *orig)
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
 {
     column_permute_impl(permutation_indices, column_permuted, orig);
 }
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 92577f6c1c3..c8349183be9 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -214,8 +214,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -249,8 +249,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -285,8 +285,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -318,8 +318,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -387,8 +387,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -459,8 +459,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const OmpExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -593,8 +593,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
 #pragma omp parallel for
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -609,8 +609,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
 #pragma omp parallel for
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -626,8 +626,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const OmpExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Dense<ValueType> *row_permuted,
-                 const matrix::Dense<ValueType> *orig)
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
 #pragma omp parallel for
@@ -644,8 +644,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const OmpExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Dense<ValueType> *column_permuted,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
 #pragma omp parallel for
@@ -663,8 +663,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Dense<ValueType> *row_permuted,
-                         const matrix::Dense<ValueType> *orig)
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
 #pragma omp parallel for
@@ -682,8 +682,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const OmpExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Dense<ValueType> *column_permuted,
-                            const matrix::Dense<ValueType> *orig)
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
 #pragma omp parallel for
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index 97e4dc37a5f..3aa1673812e 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -114,8 +114,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -139,8 +139,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp
index 84477356de6..8282d7c7ab8 100644
--- a/omp/matrix/hybrid_kernels.cpp
+++ b/omp/matrix/hybrid_kernels.cpp
@@ -59,8 +59,8 @@ namespace hybrid {
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Hybrid<ValueType, IndexType> *source)
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -99,8 +99,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto csr_val = result->get_values();
     auto csr_col_idxs = result->get_col_idxs();
diff --git a/omp/matrix/sellp_kernels.cpp b/omp/matrix/sellp_kernels.cpp
index 51be43f2cba..023dd7f5249 100644
--- a/omp/matrix/sellp_kernels.cpp
+++ b/omp/matrix/sellp_kernels.cpp
@@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const OmpExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp
index 9127faabf58..22987b55287 100644
--- a/omp/matrix/sparsity_csr_kernels.cpp
+++ b/omp/matrix/sparsity_csr_kernels.cpp
@@ -148,9 +148,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void remove_diagonal_elements(std::shared_ptr<const OmpExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
                               const IndexType *row_ptrs,
-                              const IndexType *col_idxs)
+                              const IndexType *col_idxs,
+                              matrix::SparsityCsr<ValueType, IndexType> *matrix)
 {
     auto num_rows = matrix->get_size()[0];
     auto adj_ptrs = matrix->get_row_ptrs();
@@ -221,8 +221,8 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const OmpExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig);
 }
diff --git a/omp/test/matrix/sparsity_csr_kernels.cpp b/omp/test/matrix/sparsity_csr_kernels.cpp
index 80a5ecfc207..5ea0ccbbf26 100644
--- a/omp/test/matrix/sparsity_csr_kernels.cpp
+++ b/omp/test/matrix/sparsity_csr_kernels.cpp
@@ -245,10 +245,10 @@ TEST_F(SparsityCsr, RemovesDiagElementsKernelIsEquivalentToRef)
                              dmtx->get_num_nonzeros() - num_diags);
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        ref, tmp.get(), mtx->get_const_row_ptrs(), mtx->get_const_col_idxs());
+        ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), tmp.get());
     gko::kernels::omp::sparsity_csr::remove_diagonal_elements(
-        omp, d_tmp.get(), dmtx->get_const_row_ptrs(),
-        dmtx->get_const_col_idxs());
+        omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(),
+        d_tmp.get());
 
     GKO_ASSERT_MTX_NEAR(tmp.get(), d_tmp.get(), 0.0);
 }
diff --git a/reference/matrix/coo_kernels.cpp b/reference/matrix/coo_kernels.cpp
index 84b174e65fa..74a0355be68 100644
--- a/reference/matrix/coo_kernels.cpp
+++ b/reference/matrix/coo_kernels.cpp
@@ -145,8 +145,8 @@ void convert_row_idxs_to_ptrs(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Coo<ValueType, IndexType> *source)
+                    const matrix::Coo<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -165,8 +165,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Coo<ValueType, IndexType> *source)
+                      const matrix::Coo<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto coo_val = source->get_const_values();
     auto coo_col = source->get_const_col_idxs();
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index de0abeef99f..c086950aba6 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -328,8 +328,8 @@ void convert_row_ptrs_to_idxs(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
 
@@ -344,8 +344,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -370,8 +370,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Csr<ValueType, IndexType> *source)
+                      const matrix::Csr<ValueType, IndexType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -475,8 +475,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Csr<ValueType, IndexType> *source)
+                    const matrix::Csr<ValueType, IndexType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto num_cols = source->get_size()[1];
@@ -549,8 +549,8 @@ void transpose_and_transform(std::shared_ptr<const ReferenceExecutor> exec,
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::Csr<ValueType, IndexType> *trans,
-               const matrix::Csr<ValueType, IndexType> *orig)
+               const matrix::Csr<ValueType, IndexType> *orig,
+               matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return x; });
@@ -561,8 +561,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *trans,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *trans)
 {
     transpose_and_transform(exec, trans, orig,
                             [](const ValueType x) { return conj(x); });
@@ -594,8 +594,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Csr<ValueType, IndexType> *source)
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -690,8 +690,8 @@ void row_permute_impl(const Array<IndexType> *permutation_indices,
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Csr<ValueType, IndexType> *row_permuted,
-                 const matrix::Csr<ValueType, IndexType> *orig)
+                 const matrix::Csr<ValueType, IndexType> *orig,
+                 matrix::Csr<ValueType, IndexType> *row_permuted)
 {
     row_permute_impl(permutation_indices, row_permuted, orig);
 }
@@ -703,8 +703,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Csr<ValueType, IndexType> *row_permuted,
-                         const matrix::Csr<ValueType, IndexType> *orig)
+                         const matrix::Csr<ValueType, IndexType> *orig,
+                         matrix::Csr<ValueType, IndexType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     Array<IndexType> inv_perm(*permutation_indices);
@@ -751,8 +751,8 @@ void column_permute_impl(const Array<IndexType> *permutation_indices,
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const ReferenceExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Csr<ValueType, IndexType> *column_permuted,
-                    const matrix::Csr<ValueType, IndexType> *orig)
+                    const matrix::Csr<ValueType, IndexType> *orig,
+                    matrix::Csr<ValueType, IndexType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     Array<IndexType> inv_perm(*permutation_indices);
@@ -770,8 +770,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Csr<ValueType, IndexType> *column_permuted,
-                            const matrix::Csr<ValueType, IndexType> *orig)
+                            const matrix::Csr<ValueType, IndexType> *orig,
+                            matrix::Csr<ValueType, IndexType> *column_permuted)
 {
     column_permute_impl(permutation_indices, column_permuted, orig);
 }
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 38e96b4ae9d..28d981dbbb3 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -197,8 +197,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Coo<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Coo<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -228,8 +228,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -260,8 +260,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_ell(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Ell<ValueType, IndexType> *result,
-                    const matrix::Dense<ValueType> *source)
+                    const matrix::Dense<ValueType> *source,
+                    matrix::Ell<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -292,8 +292,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
-                       matrix::Hybrid<ValueType, IndexType> *result,
-                       const matrix::Dense<ValueType> *source)
+                       const matrix::Dense<ValueType> *source,
+                       matrix::Hybrid<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -347,8 +347,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sellp(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Sellp<ValueType, IndexType> *result,
-                      const matrix::Dense<ValueType> *source)
+                      const matrix::Dense<ValueType> *source,
+                      matrix::Sellp<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -418,8 +418,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_sparsity_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                             matrix::SparsityCsr<ValueType, IndexType> *result,
-                             const matrix::Dense<ValueType> *source)
+                             const matrix::Dense<ValueType> *source,
+                             matrix::SparsityCsr<ValueType, IndexType> *result)
 {
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
@@ -547,8 +547,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 
 template <typename ValueType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::Dense<ValueType> *trans,
-               const matrix::Dense<ValueType> *orig)
+               const matrix::Dense<ValueType> *orig,
+               matrix::Dense<ValueType> *trans)
 {
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
@@ -562,8 +562,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_TRANSPOSE_KERNEL);
 
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Dense<ValueType> *trans,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *trans)
 {
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
@@ -578,8 +578,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CONJ_TRANSPOSE_KERNEL);
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const ReferenceExecutor> exec,
                  const Array<IndexType> *permutation_indices,
-                 matrix::Dense<ValueType> *row_permuted,
-                 const matrix::Dense<ValueType> *orig)
+                 const matrix::Dense<ValueType> *orig,
+                 matrix::Dense<ValueType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -595,8 +595,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ROW_PERMUTE_KERNEL);
 template <typename ValueType, typename IndexType>
 void column_permute(std::shared_ptr<const ReferenceExecutor> exec,
                     const Array<IndexType> *permutation_indices,
-                    matrix::Dense<ValueType> *column_permuted,
-                    const matrix::Dense<ValueType> *orig)
+                    const matrix::Dense<ValueType> *orig,
+                    matrix::Dense<ValueType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     for (size_type j = 0; j < orig->get_size()[1]; ++j) {
@@ -613,8 +613,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
                          const Array<IndexType> *permutation_indices,
-                         matrix::Dense<ValueType> *row_permuted,
-                         const matrix::Dense<ValueType> *orig)
+                         const matrix::Dense<ValueType> *orig,
+                         matrix::Dense<ValueType> *row_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
@@ -631,8 +631,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
                             const Array<IndexType> *permutation_indices,
-                            matrix::Dense<ValueType> *column_permuted,
-                            const matrix::Dense<ValueType> *orig)
+                            const matrix::Dense<ValueType> *orig,
+                            matrix::Dense<ValueType> *column_permuted)
 {
     auto perm = permutation_indices->get_const_data();
     for (size_type j = 0; j < orig->get_size()[1]; ++j) {
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 07f6bac2ed6..0f21a6c2f3a 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -106,8 +106,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Ell<ValueType, IndexType> *source)
+                      const matrix::Ell<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -130,8 +130,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Ell<ValueType, IndexType> *source)
+                    const matrix::Ell<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     const auto num_rows = source->get_size()[0];
     const auto max_nnz_per_row = source->get_num_stored_elements_per_row();
diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp
index 3891b21ecb9..74e126334e2 100644
--- a/reference/matrix/hybrid_kernels.cpp
+++ b/reference/matrix/hybrid_kernels.cpp
@@ -58,8 +58,8 @@ namespace hybrid {
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Hybrid<ValueType, IndexType> *source)
+                      const matrix::Hybrid<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -93,8 +93,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Hybrid<ValueType, IndexType> *source)
+                    const matrix::Hybrid<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto csr_val = result->get_values();
     auto csr_col_idxs = result->get_col_idxs();
diff --git a/reference/matrix/sellp_kernels.cpp b/reference/matrix/sellp_kernels.cpp
index 1263d62b0aa..43e01b51fb1 100644
--- a/reference/matrix/sellp_kernels.cpp
+++ b/reference/matrix/sellp_kernels.cpp
@@ -125,8 +125,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(std::shared_ptr<const ReferenceExecutor> exec,
-                      matrix::Dense<ValueType> *result,
-                      const matrix::Sellp<ValueType, IndexType> *source)
+                      const matrix::Sellp<ValueType, IndexType> *source,
+                      matrix::Dense<ValueType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto num_cols = source->get_size()[1];
@@ -161,8 +161,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const ReferenceExecutor> exec,
-                    matrix::Csr<ValueType, IndexType> *result,
-                    const matrix::Sellp<ValueType, IndexType> *source)
+                    const matrix::Sellp<ValueType, IndexType> *source,
+                    matrix::Csr<ValueType, IndexType> *result)
 {
     auto num_rows = source->get_size()[0];
     auto slice_size = source->get_slice_size();
diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp
index ff7be054061..70ab3b15aff 100644
--- a/reference/matrix/sparsity_csr_kernels.cpp
+++ b/reference/matrix/sparsity_csr_kernels.cpp
@@ -143,9 +143,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void remove_diagonal_elements(std::shared_ptr<const ReferenceExecutor> exec,
-                              matrix::SparsityCsr<ValueType, IndexType> *matrix,
                               const IndexType *row_ptrs,
-                              const IndexType *col_idxs)
+                              const IndexType *col_idxs,
+                              matrix::SparsityCsr<ValueType, IndexType> *matrix)
 {
     auto num_rows = matrix->get_size()[0];
     auto adj_ptrs = matrix->get_row_ptrs();
@@ -193,8 +193,8 @@ inline void convert_sparsity_to_csc(size_type num_rows,
 template <typename ValueType, typename IndexType>
 void transpose_and_transform(
     std::shared_ptr<const ReferenceExecutor> exec,
-    matrix::SparsityCsr<ValueType, IndexType> *trans,
-    const matrix::SparsityCsr<ValueType, IndexType> *orig)
+    const matrix::SparsityCsr<ValueType, IndexType> *orig,
+    matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
     auto trans_row_ptrs = trans->get_row_ptrs();
     auto orig_row_ptrs = orig->get_const_row_ptrs();
@@ -216,10 +216,10 @@ void transpose_and_transform(
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const ReferenceExecutor> exec,
-               matrix::SparsityCsr<ValueType, IndexType> *trans,
-               const matrix::SparsityCsr<ValueType, IndexType> *orig)
+               const matrix::SparsityCsr<ValueType, IndexType> *orig,
+               matrix::SparsityCsr<ValueType, IndexType> *trans)
 {
-    transpose_and_transform(exec, trans, orig);
+    transpose_and_transform(exec, orig, trans);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index 1b7b3783de6..9cd5009d50d 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -314,8 +314,8 @@ TEST_F(SparsityCsr, RemovesDiagonalElementsForFullRankMatrix)
     tmp_mtx->copy_from(mtx2.get());
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(),
-        mtx2->get_const_col_idxs());
+        exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(),
+        tmp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0);
 }
@@ -338,8 +338,8 @@ TEST_F(SparsityCsr, RemovesDiagonalElementsForIncompleteRankMatrix)
     tmp_mtx->copy_from(mtx2.get());
 
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
-        exec, tmp_mtx.get(), mtx2->get_const_row_ptrs(),
-        mtx2->get_const_col_idxs());
+        exec, mtx2->get_const_row_ptrs(), mtx2->get_const_col_idxs(),
+        tmp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(tmp_mtx.get(), mtx_s.get(), 0.0);
 }