From 21efcbc592a1192682bcd5531c1dd12dcaa87052 Mon Sep 17 00:00:00 2001 From: Ivana Date: Sun, 1 Dec 2024 00:07:44 +0800 Subject: [PATCH] Delete stale files Deleted files were replaced by the files in use. --- include/Tensor.old.hpp | 1958 ---------------- include/linalg.hpp.old | 1066 --------- src/BlockUniTensor.cpp.old | 1985 ----------------- src/Tensor.old.cpp | 1390 ------------ .../utils_internal_cpu/GetElems_cpu.cpp.new | 242 -- .../utils_internal_cpu/GetElems_cpu.hpp.new | 33 - src/linalg/Trace.cpp.old | 392 ---- 7 files changed, 7066 deletions(-) delete mode 100644 include/Tensor.old.hpp delete mode 100644 include/linalg.hpp.old delete mode 100644 src/BlockUniTensor.cpp.old delete mode 100644 src/Tensor.old.cpp delete mode 100644 src/backend/utils_internal_cpu/GetElems_cpu.cpp.new delete mode 100644 src/backend/utils_internal_cpu/GetElems_cpu.hpp.new delete mode 100644 src/linalg/Trace.cpp.old diff --git a/include/Tensor.old.hpp b/include/Tensor.old.hpp deleted file mode 100644 index 709d7dc0a..000000000 --- a/include/Tensor.old.hpp +++ /dev/null @@ -1,1958 +0,0 @@ -#ifndef CYTNX_TENSOR_OLD_H_ -#define CYTNX_TENSOR_OLD_H_ - -#include "Type.hpp" -#include "cytnx_error.hpp" -#include "backend/Storage.hpp" -#include "Device.hpp" -#include "intrusive_ptr_base.hpp" -#include -#include -#include "utils/vec_range.hpp" -#include "utils/vec_cast.hpp" -#include "utils/dynamic_arg_resolver.hpp" -// #include "linalg.hpp" -#include "Accessor.hpp" -#include -#include -#include -#include -#include "backend/Scalar.hpp" - -namespace cytnx { - - ///@cond - // real implementation - class Tensor_impl : public intrusive_ptr_base { - private: - // Interface: - Storage_init_interface __SII; - - // Memory: - Storage _storage; - - // tensor shape - std::vector _shape; - - // pseudo-perm info - std::vector _mapper; - std::vector _invmapper; - bool _contiguous; - - public: - friend class Tensor; - boost::intrusive_ptr _clone_meta_only() const { - boost::intrusive_ptr out(new Tensor_impl()); - out->_mapper = this->_mapper; - out->_invmapper = this->_invmapper; - out->_shape = this->_shape; - out->_contiguous = this->_contiguous; - return out; - } - Tensor_impl() : _contiguous(true){}; - - void Init(const std::vector &shape, const unsigned int &dtype = Type.Double, - int device = -1, const bool &init_zero = true); - void Init(const Storage &in); - // void Init(const Storage &in, const std::vector &shape, - // const unsigned int &dtype, int device); - /* - template - void From_vec(const T &ndvec){ - cytnx_error_msg(std::string(typeid(T).name()).find("vector") == - std::string::npos,"[ERROR][Tensor][From_vec] the input argument should be a nd vector.%s","\n"); - //dispatch the rank!: - - - - } - */ - // clone&assignment constr., use intrusive_ptr's - Tensor_impl(const Tensor_impl &rhs); - Tensor_impl &operator=(const Tensor_impl &rhs); // add const - - unsigned int dtype() const { return this->_storage.dtype(); } - int device() const { return this->_storage.device(); } - - std::string dtype_str() const { return Type.getname(this->_storage.dtype()); } - std::string device_str() const { return Device.getname(this->_storage.device()); } - - const std::vector &shape() const { return _shape; } - - const bool &is_contiguous() const { return this->_contiguous; } - - const std::vector &mapper() const { return this->_mapper; } - const std::vector &invmapper() const { return this->_invmapper; } - Storage &storage() { return _storage; } - - const Storage &storage() const { return _storage; } - - boost::intrusive_ptr 
clone() const { - boost::intrusive_ptr out = this->_clone_meta_only(); - out->_storage = this->_storage.clone(); - return out; - } - - void to_(const int &device) { this->_storage.to_(device); } - boost::intrusive_ptr to(const int &device) { - if (this->device() == device) { - // boost::intrusive_ptr out(this); - return this; - } else { - boost::intrusive_ptr out = this->_clone_meta_only(); - out->_storage = this->_storage.to(device); - return out; - } - } - - void permute_(const std::vector &rnks); - - boost::intrusive_ptr permute(const std::vector &rnks); - - template - T &at(const std::vector &locator) const { - cytnx_error_msg(locator.size() != this->_shape.size(), "%s", - "The input index does not match Tensor's rank."); - - cytnx_uint64 RealRank, mtplyr; - // std::vector c_shape(this->_shape.size()); - // std::vector c_loc(this->_shape.size()); - cytnx_uint64 c_shape, c_loc; - - RealRank = 0; - mtplyr = 1; - - for (cytnx_int64 i = this->_shape.size() - 1; i >= 0; i--) { - if (locator[i] >= this->_shape[i]) { - cytnx_error_msg(true, "%s", "Attempting to access out-of-bound index in Tensor."); - } - // c_shape[i] = this->_shape[this->_invmapper[i]]; - // c_loc[i] = locator[this->_invmapper[i]]; - c_shape = this->_shape[this->_invmapper[i]]; - c_loc = locator[this->_invmapper[i]]; - RealRank += mtplyr * c_loc; - mtplyr *= c_shape; - } - return this->_storage.at(RealRank); - } - - const Scalar::Sproxy at(const std::vector &locator) const { - cytnx_error_msg(locator.size() != this->_shape.size(), "%s", - "The input index does not match Tensor's rank."); - - cytnx_uint64 RealRank, mtplyr; - // std::vector c_shape(this->_shape.size()); - // std::vector c_loc(this->_shape.size()); - - cytnx_uint64 c_shape, c_loc; - RealRank = 0; - mtplyr = 1; - - for (cytnx_int64 i = this->_shape.size() - 1; i >= 0; i--) { - if (locator[i] >= this->_shape[i]) { - cytnx_error_msg(true, "%s", "Attempting to access out-of-bound index in Tensor."); - } - // c_shape[i] = this->_shape[this->_invmapper[i]]; - // c_loc[i] = locator[this->_invmapper[i]]; - c_shape = this->_shape[this->_invmapper[i]]; - c_loc = locator[this->_invmapper[i]]; - RealRank += mtplyr * c_loc; - mtplyr *= c_shape; - } - return this->_storage.at(RealRank); - } - - Scalar::Sproxy at(const std::vector &locator) { - cytnx_error_msg(locator.size() != this->_shape.size(), "%s", - "The input index does not match Tensor's rank."); - - cytnx_uint64 RealRank, mtplyr; - // std::vector c_shape(this->_shape.size()); - // std::vector c_loc(this->_shape.size()); - cytnx_uint64 c_shape, c_loc; - - RealRank = 0; - mtplyr = 1; - - for (cytnx_int64 i = this->_shape.size() - 1; i >= 0; i--) { - if (locator[i] >= this->_shape[i]) { - cytnx_error_msg(true, "%s", "Attempting to access out-of-bound index in Tensor."); - } - // c_shape[i] = this->_shape[this->_invmapper[i]]; - // c_loc[i] = locator[this->_invmapper[i]]; - c_shape = this->_shape[this->_invmapper[i]]; - c_loc = locator[this->_invmapper[i]]; - RealRank += mtplyr * c_loc; - mtplyr *= c_shape; - } - return this->_storage.at(RealRank); - } - - boost::intrusive_ptr get(const std::vector &accessors); - boost::intrusive_ptr get_deprecated(const std::vector &accessors); - void set(const std::vector &accessors, - const boost::intrusive_ptr &rhs); - - template - void set(const std::vector &accessors, const T &rc); - - void set(const std::vector &accessors, const Scalar::Sproxy &rc); - - template - void fill(const Tx &val) { - this->storage().fill(val); - } - - boost::intrusive_ptr contiguous() { - // return new 
instance if act on non-contiguous tensor - // return self if act on contiguous tensor - if (this->_contiguous) { - boost::intrusive_ptr out(this); - // out->_storage = this->_storage; - return out; - } else { - boost::intrusive_ptr out(new Tensor_impl()); - std::vector oldshape(this->_shape.size()); - for (cytnx_uint64 i = 0; i < this->_shape.size(); i++) { - oldshape[i] = this->_shape[this->_invmapper[i]]; - } - - out->_storage._impl = - this->_storage._impl->Move_memory(oldshape, this->_mapper, this->_invmapper); - // this->_storage._impl->Move_memory_(oldshape, this->_mapper, this->_invmapper); - // out->_storage._impl = this->_storage._impl; - // std::cout << out->_storage << std::endl; - out->_invmapper = vec_range(this->_invmapper.size()); - out->_mapper = out->_invmapper; - out->_shape = this->_shape; - out->_contiguous = true; - return out; - } - } - - void contiguous_() { - // return new instance if act on non-contiguous tensor - // return self if act on contiguous tensor - if (!this->_contiguous) { - std::vector oldshape(this->_shape.size()); - for (cytnx_uint64 i = 0; i < this->_shape.size(); i++) { - oldshape[i] = this->_shape[this->_invmapper[i]]; - } - - this->_storage._impl = - this->_storage._impl->Move_memory(oldshape, this->_mapper, this->_invmapper); - // this->_storage._impl->Move_memory_(oldshape, this->_mapper, this->_invmapper); - // this->_mapper = vec_range(this->_invmapper.size()); - vec_range_(this->_mapper, this->invmapper().size()); - this->_invmapper = this->_mapper; - this->_contiguous = true; - } - } - - void reshape_(const std::vector &new_shape) { - if (!this->_contiguous) { - this->contiguous_(); - } - // std::vector result_shape(new_shape.size()); - cytnx_uint64 new_N = 1; - bool has_undetermine = false; - unsigned int Udet_id = 0; - // this->_shape = vec_cast(new_shape); - this->_shape.resize(new_shape.size()); - for (cytnx_uint64 i = 0; i < new_shape.size(); i++) { - this->_shape[i] = new_shape[i]; - } - for (int i = 0; i < new_shape.size(); i++) { - if (new_shape[i] < 0) { - if (new_shape[i] != -1) - cytnx_error_msg( - new_shape[i] != -1, "%s", - "[ERROR] reshape can only have dimension > 0 and one undetermine rank specify as -1"); - if (has_undetermine) - cytnx_error_msg( - new_shape[i] != -1, "%s", - "[ERROR] reshape can only have dimension > 0 and one undetermine rank specify as -1"); - Udet_id = i; - has_undetermine = true; - } else { - new_N *= new_shape[i]; - // result_shape[i] = new_shape[i]; - } - } - - if (has_undetermine) { - cytnx_error_msg(new_N > this->_storage.size(), "%s", - "[ERROR] new shape exceed the total number of elements."); - cytnx_error_msg(this->_storage.size() % new_N, "%s", - "[ERROR] unmatch size when reshape with undetermine dimension"); - // result_shape[Udet_id] = this->_storage.size() / new_N; - this->_shape[Udet_id] = this->_storage.size() / new_N; - } else { - cytnx_error_msg(new_N != this->_storage.size(), "%s", - "[ERROR] new shape does not match the number of elements."); - } - - // this->_shape = result_shape; - // this->_mapper = std::move(vec_range(new_shape.size())); - this->_mapper.resize(new_shape.size()); - vec_range_(this->_mapper, new_shape.size()); - this->_invmapper = this->_mapper; - } - - boost::intrusive_ptr reshape(const std::vector &new_shape) { - boost::intrusive_ptr out(new Tensor_impl()); - if (this->is_contiguous()) { - out = this->_clone_meta_only(); - out->_storage = this->_storage; - } else { - out = this->contiguous(); - } - // out = this->clone(); - - out->reshape_(new_shape); - return 
out; - } - - boost::intrusive_ptr astype(const int &new_type) { - // boost::intrusive_ptr out(new Tensor_impl()); - // out->_storage = this->_storage.astype(new_type); - if (this->dtype() == new_type) { - return this; - } else { - boost::intrusive_ptr out = this->_clone_meta_only(); - out->_storage = this->_storage.astype(new_type); - return out; - } - } - }; - ///@endcond - - class Tensor; - - ///@cond - // [Note] these are fwd from linalg.hpp - template - Tensor operator+(const Tensor &lhs, const T &rc); - template - Tensor operator-(const Tensor &lhs, const T &rhs); - template - Tensor operator*(const Tensor &lhs, const T &rhs); - template - Tensor operator/(const Tensor &lhs, const T &rhs); - ///@endcond - - /// @brief an tensor (multi-dimensional array) - class Tensor { - private: - public: - /// @cond - // this is a proxy class to allow get/set element using [] as python! - struct Tproxy { - boost::intrusive_ptr _insimpl; - std::vector _accs; - Tproxy(boost::intrusive_ptr _ptr, const std::vector &accs) - : _insimpl(std::move(_ptr)), _accs(accs) {} - - // when used to set elems: - const Tensor &operator=(const Tensor &rhs) { - this->_insimpl->set(_accs, rhs._impl); - return rhs; - } - - template - const T &operator=(const T &rc) { - this->_insimpl->set(_accs, rc); - return rc; - } - const Tproxy &operator=(const Tproxy &rc) { - Tensor tmp = Tensor(rc); - this->_insimpl->set(_accs, tmp._impl); - return rc; - } - - template - Tensor operator+=(const T &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - self += rc; - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor operator+=(const Tproxy &rc); - - template - Tensor operator-=(const T &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - self -= rc; - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor operator-=(const Tproxy &rc); - - template - Tensor operator/=(const T &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - self /= rc; - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor operator/=(const Tproxy &rc); - - template - Tensor operator*=(const T &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - self *= rc; - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor operator*=(const Tproxy &rc); - - // alias to resolve conflict with op ovld for rc=Tensor - /* - template - Tensor _operatorADD(const T &rc) const{ - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - */ - Tensor operator+(const cytnx_complex128 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_complex64 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_double &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_float &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_uint64 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_int64 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_uint32 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_int32 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_uint16 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_int16 &rc) const; //{return this->_operatorADD(rc);}; - Tensor operator+(const cytnx_bool &rc) const; //{return this->_operatorADD(rc);}; - Tensor 
operator+(const Tproxy &rc) const; - - /* - template - Tensor _operatorSUB(const T &rc) const{ - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - */ - Tensor operator-(const cytnx_complex128 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_complex64 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_double &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_float &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_uint64 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_int64 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_uint32 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_int32 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_uint16 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_int16 &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const cytnx_bool &rc) const; //{return this->_operatorSUB(rc);}; - Tensor operator-(const Tproxy &rc) const; - - Tensor operator-() const; - - /* - template - Tensor _operatorMUL(const T &rc) const{ - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - */ - Tensor operator*(const cytnx_complex128 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_complex64 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_double &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_float &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_uint64 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_int64 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_uint32 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_int32 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_uint16 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_int16 &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const cytnx_bool &rc) const; //{return this->_operatorMUL(rc);}; - Tensor operator*(const Tproxy &rc) const; - - /* - template - Tensor _operatorDIV(const T &rc) const{ - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - */ - Tensor operator/(const cytnx_complex128 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_complex64 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_double &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_float &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_uint64 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_int64 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_uint32 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_int32 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_uint16 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_int16 &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const cytnx_bool &rc) const; //{return this->_operatorDIV(rc);}; - Tensor operator/(const Tproxy &rc) const; - - template - T item() const { - Tensor out; - out._impl = _insimpl->get(_accs); - return 
out.item(); - } - - Scalar::Sproxy item() const { - Tensor out; - out._impl = _insimpl->get(_accs); - return out.item(); - } - - // when used to get elems: - operator Tensor() const { - Tensor out; - out._impl = _insimpl->get(_accs); - return out; - } - - Storage storage() const { - Tensor out; - out._impl = _insimpl->get(_accs); - return out.storage(); - } - - }; // proxy class of Tensor. - - /// @endcond - - /// @cond - // these two are using the python way! - //---------------------------------------- - template - Tproxy operator()(const std::string &e1, const Ts &...elems) { - // std::cout << e1 << std::endl; - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - template - Tproxy operator()(const cytnx_int64 &e1, const Ts &...elems) { - // std::cout << e1<< std::endl; - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - template - Tproxy operator()(const cytnx::Accessor &e1, const Ts &...elems) { - // std::cout << e1 << std::endl; - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - template - const Tproxy operator()(const std::string &e1, const Ts &...elems) const { - // std::cout << e1 << std::endl; - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - template - const Tproxy operator()(const cytnx_int64 &e1, const Ts &...elems) const { - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - template - const Tproxy operator()(const cytnx::Accessor &e1, const Ts &...elems) const { - std::vector tmp = Indices_resolver(e1, elems...); - return (*this)[tmp]; - } - - //----------------------------------------- - - Tproxy operator[](const std::initializer_list &accs) { - std::vector tmp = accs; - return (*this)[tmp]; - } - Tproxy operator[](const std::vector &accs) { - return Tproxy(this->_impl, accs); - } - - const Tproxy operator[](const std::vector &accs) const { - return Tproxy(this->_impl, accs); - } - const Tproxy operator[](const std::initializer_list &accs) const { - std::vector tmp = accs; - return (*this)[tmp]; - } - - Tproxy operator[](const std::initializer_list &accs) { - std::vector tmp = accs; - return (*this)[tmp]; - } - Tproxy operator[](const std::vector &accs) { - std::vector acc_in; - for (int i = 0; i < accs.size(); i++) { - acc_in.push_back(cytnx::Accessor(accs[i])); - } - return Tproxy(this->_impl, acc_in); - } - const Tproxy operator[](const std::initializer_list &accs) const { - std::vector tmp = accs; - return (*this)[tmp]; - } - const Tproxy operator[](const std::vector &accs) const { - std::vector acc_in; - for (int i = 0; i < accs.size(); i++) { - acc_in.push_back(cytnx::Accessor(accs[i])); - } - return Tproxy(this->_impl, acc_in); - } - const Tproxy operator[](const std::vector &accs) const { - std::vector acc_in; - for (int i = 0; i < accs.size(); i++) { - acc_in.push_back(cytnx::Accessor(accs[i])); - } - return Tproxy(this->_impl, acc_in); - } - ///@endcond - //------------------------------------------- - - /// @cond - void _Save(std::fstream &f) const; - void _Load(std::fstream &f); - - /// @endcond - /** - @brief Save current Tensor to file - @param[in] fname file name (without file extension) - - @details - save the Tensor to file with file path specify with input param \p fname with postfix - ".cytn" - @see Load(const std::string &fname) - */ - void Save(const std::string &fname) const; - /** - * @see Save(const std::string &fname) const - */ - void Save(const char *fname) const; - - /** - * @brief Save current Tensor to 
the binary file - * @details This function will save the Tensor to the binary file with file - * name \p fname . - * @param fname[in] the file name of the binary file. - * @pre The file name @p fname must be valid. - * @see cytnx::Tensor::Fromfile - */ - void Tofile(const std::string &fname) const; - - /** - * @see Tofile(const std::string &fname) const - */ - void Tofile(const char *fname) const; - - /** - * @see Tofile(const std::string &fname) const - */ - void Tofile(std::fstream &f) const; - - /** - @brief Load current Tensor from file - @param fname[in] file name - @details - load the Storage from file with file path specify with input param 'fname' - @pre the file must be a Tensor object which is saved by cytnx::Tensor::Save. - */ - - static Tensor Load(const std::string &fname); - /** - * @see Load(const std::string &fname) - */ - static Tensor Load(const char *fname); - - /** - * @brief Load current Tensor from the binary file - * @details This function will load the Tensor from the binary file which is saved by - * cytnx::Tensor::Tofile. Given the file name \p fname , data type \p dtype and - * number of elements \p count, this function will load the first \p count elements - * from the binary file \p fname with data type \p dtype. - * @param fname[in] the file name of the binary file. - * @param dtype[in] the data type of the binary file. This can be any of the type defined in - * cytnx::Type. - * @param count[in] the number of elements to be loaded from the binary file. If set to -1, - * all elements in the binary file will be loaded. - * @return Tensor - * @pre - * 1. The @p dtype cannot be Type.Void. - * 2. The @p dtype must be the same as the data type of the binary file. - * 3. The @p Nelem cannot be 0. - * 4. The @p Nelem cannot be larger than the number of elements in the binary file. - * 5. The file name @p fname must be valid. - * @see cytnx::Tensor::Tofile - */ - static Tensor Fromfile(const std::string &fname, const unsigned int &dtype, - const cytnx_int64 &count = -1); - static Tensor Fromfile(const char *fname, const unsigned int &dtype, - const cytnx_int64 &count = -1); - - // static Tensor Frombinary(const std::string &fname); - - ///@cond - boost::intrusive_ptr _impl; - Tensor() : _impl(new Tensor_impl()){}; - Tensor(const Tensor &rhs) { _impl = rhs._impl; } - - /* - template - Tensor(const std::initializer_list &rhs){ - Storage stmp = std::vector(rhs); - boost::intrusive_ptr tmp(new Tensor_impl()); - tmp->Init(stmp); - this->_impl = tmp; - } - */ - - Tensor &operator=(const Tensor &rhs) { - _impl = rhs._impl; - return *this; - } - - void operator=(const Tproxy &rhsp) { // this is used to handle proxy assignment - this->_impl = rhsp._insimpl->get(rhsp._accs); - } - ///@endcond - - //@{ - // default device==Device.cpu (-1) - /** - @brief initialize a Tensor - @param[in] shape the shape of tensor. - @param[in] dtype the dtype of tensor. This can be any of type defined in cytnx::Type - @param[in] device the device that tensor to be created. This can be cytnx::Device.cpu or - @param[in] init_zero if true, the content of Tensor will be initialized to zero. if false, the - content of Tensor will be un-initialize. - cytnx::Device.cuda+, see cytnx::Device for more detail. - - @note - The content of Tensor created will be un-initialize! See \link cytnx::zeros - zeros()\endlink, \link cytnx::ones ones() \endlink or \link cytnx::arange arange() \endlink for - generating an Tensor. 
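// A minimal construction sketch for the Tensor initialization described above,
// assuming the public cytnx API as documented in this header; the umbrella header
// "cytnx.hpp", the helper-function name, and the exact overloads are assumptions.
// #include "cytnx.hpp"
void tensor_init_sketch() {
  cytnx::Tensor A({3, 4});                              // rank-2, default dtype Type.Double on CPU
  cytnx::Tensor B({2, 2}, cytnx::Type.ComplexDouble);   // explicit dtype taken from cytnx::Type
  cytnx::Tensor C = cytnx::zeros({3, 4});               // zeros()/ones()/arange() return initialized data
  A.fill(1.5);                                          // assign the same value to every element
}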
- - ## Example: - ### c++ API: - \include example/Tensor/Init.cpp - #### output> - \verbinclude example/Tensor/Init.cpp.out - ### python API: - \include example/Tensor/Init.py - #### output> - \verbinclude example/Tensor/Init.py.out - */ - void Init(const std::vector &shape, const unsigned int &dtype = Type.Double, - const int &device = -1, const bool &init_zero = true) { - boost::intrusive_ptr tmp(new Tensor_impl()); - this->_impl = tmp; - this->_impl->Init(shape, dtype, device, init_zero); - } - // void Init(const Storage& storage) { - // boost::intrusive_ptr tmp(new Tensor_impl()); - // this->_impl = tmp; - // this->_impl->Init(storage); - // } - // void Init(const Storage& storage, const std::vector &shape, - // const unsigned int &dtype = Type.Double, const int &device = -1) { - // boost::intrusive_ptr tmp(new Tensor_impl()); - // this->_impl = tmp; - // this->_impl->Init(storage, shape, dtype, device); - // } - - /** - * @brief Construct a new Tensor object - * @details This is the constructor of Tensor. It will call - * cytnx::Tensor::Init() to initialize the Tensor. - * @param[in] shape the shape of tensor - * @param[in] dtype the dtype of tensor. This can be any of type defined in cytnx::Type. - * @param[in] device the device that tensor to be created. This can be cytnx::Device.cpu or - * cytnx::Device.cuda+, see cytnx::Device for more detail. - * @param[in] init_zero if true, the content of Tensor will be initialized to zero. If false, - * the content of Tensor will be un-initialized. - * @see cytnx::Tensor::Init - */ - Tensor(const std::vector &shape, const unsigned int &dtype = Type.Double, - const int &device = -1, const bool &init_zero = 1) - : _impl(new Tensor_impl()) { - this->Init(shape, dtype, device, init_zero); - } - // Tensor(const Storage& storage) - // : _impl(new Tensor_impl()) { - // this->Init(storage); - // } - // Tensor(const Storage& storage, const std::vector &shape, - // const unsigned int &dtype = Type.Double, const int &device = -1) - // : _impl(new Tensor_impl()) { - // this->Init(storage, shape, dtype, device); - // } - //@} - - /** - @brief Convert a Storage to Tensor - @param[in] in the Storage to be converted - @return [Tensor] a Tensor with the same dtype and device as the input Storage - */ - static Tensor from_storage(const Storage &in) { - Tensor out; - boost::intrusive_ptr tmp(new Tensor_impl()); - out._impl = tmp; - out._impl->Init(in); - return out; - } - - /** - @brief the dtype-id of the Tensor - @see cytnx::Type - @return [unsigned int] the dtype_id of the Tensor - */ - unsigned int dtype() const { return this->_impl->dtype(); } - - /** - @brief the device-id of the Tensor - @see cytnx::Device - @return [int] the device_id of the Tensor - */ - int device() const { return this->_impl->device(); } - - /** - @brief the dtype (in string) of the Tensor - @see cytnx::Type, dtype() const - @return [std::string] the dtype of the Tensor - */ - std::string dtype_str() const { return this->_impl->dtype_str(); } - - /** - @brief the device (in string) of the Tensor - @see cytnx::Device, device() const - @return [std::string] the device of the Tensor - */ - std::string device_str() const { return this->_impl->device_str(); } - - /** - @brief the shape of the Tensor - @return [std::vector] the shape of the Tensor - */ - const std::vector &shape() const { return this->_impl->shape(); } - - /** - @brief the rank of the Tensor - @return [cytnx_uint64] the rank of the Tensor - */ - cytnx_uint64 rank() const { return this->_impl->shape().size(); } - - /** - 
@brief return a clone of the current Tensor. - @return [Tensor] - @details - In C++ API, the behavior of assignment operator is designed to have same behavior as - python,\n to have a copy of the current tensor, we call clone to return a copy. - - ## Example: - ### c++ API: - \include example/Tensor/clone.cpp - #### output> - \verbinclude example/Tensor/clone.cpp.out - ### python API: - \include example/Tensor/clone.py - #### output> - \verbinclude example/Tensor/clone.py.out - */ - Tensor clone() const { - Tensor out; - out._impl = this->_impl->clone(); - return out; - } - - /** - @brief copy a tensor to new device - @param[in] device the device-id that is moving to. it can be any device defined in cytnx::Device - @return [Tensor] - - description:\n - if the device-id is the same as current Tensor's device, then return self.\n - otherwise, return a copy of instance that located on the target device. \n - see also: \link cytnx::Tensor::to_ Tensor.to_ \endlink \n - - ## Example: - ### c++ API: - \include example/Tensor/to.cpp - #### output> - \verbinclude example/Tensor/to.cpp.out - ### python API: - \include example/Tensor/to.py - #### output> - \verbinclude example/Tensor/to.py.out - */ - Tensor to(const int &device) const { - Tensor out; - out._impl = this->_impl->to(device); - return out; - } - - /** - @brief move the current Tensor to the device. - @param[in] device the device-id that is moving to. it can be any device defined in cytnx::Device - - description:\n - see also: \link cytnx::Tensor::to Tensor.to \endlink\n - - ## Example: - ### c++ API: - \include example/Tensor/to_.cpp - #### output> - \verbinclude example/Tensor/to_.cpp.out - ### python API: - \include example/Tensor/to_.py - #### output> - \verbinclude example/Tensor/to_.py.out - */ - void to_(const int &device) { this->_impl->to_(device); } - - /** - @brief return whether the Tensor is contiguous or not. - @return [bool] true if the Tensor is contiguous, false otherwise. - */ - const bool &is_contiguous() const { return this->_impl->is_contiguous(); } - - Tensor permute_(const std::vector &rnks) { - this->_impl->permute_(rnks); - return *this; - } - /// @cond - template - Tensor permute_(const cytnx_uint64 &e1, const Ts &...elems) { - std::vector argv = dynamic_arg_uint64_resolver(e1, elems...); - this->_impl->permute_(argv); - return *this; - } - /// @endcond - - /** - @brief perform tensor permute on the cytnx::Tensor and return a new instance. - @param[in] rnks the permute indices, should have No. of elements equal to the rank of tensor. - @return [Tensor] a permuted new Tensor - @pre - 1. The size of input and output Tensor should be the same. - 2. \p rnks cannot contain duplicated elements. - - ## Example: - ### c++ API: - \include example/Tensor/permute.cpp - #### output> - \verbinclude example/Tensor/permute.cpp.out - ### python API: - \include example/Tensor/permute.py - #### output> - \verbinclude example/Tensor/permute.py.out - */ - Tensor permute(const std::vector &rnks) const { - Tensor out; - out._impl = this->_impl->permute(rnks); - return out; - } - /// @cond - template - Tensor permute(const cytnx_uint64 &e1, const Ts &...elems) const { - std::vector argv = dynamic_arg_uint64_resolver(e1, elems...); - return this->permute(argv); - } - /// @endcond - - /** - @brief Make the Tensor contiguous by coalescing the memory (storage). - @return [Tensor] a new Tensor that is with contiguous memory (storage). 
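// A hedged usage sketch of the permute()/contiguous() relationship described above:
// permute() only changes the index mapping, so the result may be non-contiguous
// until contiguous() (or contiguous_()) coalesces the storage. Function and variable
// names are illustrative; the overloads follow the declarations in this header.
void permute_contiguous_sketch() {
  cytnx::Tensor A = cytnx::zeros({2, 3, 4});
  cytnx::Tensor B = A.permute({2, 0, 1});     // shape becomes {4, 2, 3}; storage is untouched
  bool c = B.is_contiguous();                 // typically false right after a non-trivial permute
  cytnx::Tensor C = B.contiguous();           // new instance with coalesced memory
  B.contiguous_();                            // or coalesce in place
}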
- @see \link Tensor::contiguous_ Tensor::contiguous_() \endlink - - ## Example: - ### c++ API: - \include example/Tensor/contiguous.cpp - #### output> - \verbinclude example/Tensor/contiguous.cpp.out - ### python API: - \include example/Tensor/contiguous.py - #### output> - \verbinclude example/Tensor/contiguous.py.out - */ - Tensor contiguous() const { - Tensor out; - out._impl = this->_impl->contiguous(); - return out; - } - - /** - @brief Make the Tensor contiguous by coalescing the memory (storage), inplacely - @see \link Tensor::contiguous Tensor::contiguous() \endlink - - ## Example: - ### c++ API: - \include example/Tensor/contiguous_.cpp - #### output> - \verbinclude example/Tensor/contiguous_.cpp.out - ### python API: - \include example/Tensor/contiguous_.py - #### output> - \verbinclude example/Tensor/contiguous_.py.out - */ - Tensor contiguous_() { - this->_impl->contiguous_(); - return *this; - } - - /** - @brief reshape the Tensor, inplacely - @param[in] new_shape the new shape of the Tensor. - @pre - 1. The size of input and output Tensor should be the same. - 2. \p new_shape cannot be empty. - @see \link Tensor::reshape Tensor::reshape() \endlink - @note - Compare to reshape(), this function will not create a new Tensor, - but reshape the current Tensor inplacely. - - ## Example: - ### c++ API: - \include example/Tensor/reshape_.cpp - #### output> - \verbinclude example/Tensor/reshape_.cpp.out - ### python API: - \include example/Tensor/reshape_.py - #### output> - \verbinclude example/Tensor/reshape_.py.out - */ - void reshape_(const std::vector &new_shape) { this->_impl->reshape_(new_shape); } - /// @cond - void reshape_(const std::vector &new_shape) { - std::vector shape(new_shape.begin(), new_shape.end()); - this->_impl->reshape_(shape); - } - void reshape_(const std::initializer_list &new_shape) { - std::vector shape = new_shape; - this->_impl->reshape_(shape); - } - template - void reshape_(const cytnx_int64 &e1, const Ts... elems) { - std::vector shape = dynamic_arg_int64_resolver(e1, elems...); - // std::cout << shape << std::endl; - this->_impl->reshape_(shape); - } - /// @endcond - - /** - @brief return a new Tensor that is reshaped. - @param[in] new_shape the new shape of the Tensor. - @return [Tensor] - @pre - 1. The size of input and output Tensor should be the same. - 2. \p new_shape cannot be empty. - @note - 1. This function will not change the original Tensor. - 2. You can use Tensor::reshape_() to reshape the Tensor inplacely. - 3. You can set \p new_shape to -1, which will be automatically determined - by the size of the Tensor. The behavior is the same as numpy.reshape(). 
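// A small sketch of the "-1" reshape behaviour noted above (one dimension may be -1
// and is inferred from the total element count, as in numpy.reshape). Hedged against
// the documented API; names are illustrative.
void reshape_sketch() {
  cytnx::Tensor A = cytnx::arange(12);        // rank-1, 12 elements
  cytnx::Tensor B = A.reshape(3, -1);         // -1 is inferred as 4
  A.reshape_({2, 2, 3});                      // in-place variant; total size must stay 12
}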
- @see \link Tensor::reshape_ Tensor::reshape_() \endlink - - ## Example: - ### c++ API: - \include example/Tensor/reshape.cpp - #### output> - \verbinclude example/Tensor/reshape.cpp.out - ### python API: - \include example/Tensor/reshape.py - #### output> - \verbinclude example/Tensor/reshape.py.out - */ - Tensor reshape(const std::vector &new_shape) const { - Tensor out; - out._impl = this->_impl->reshape(new_shape); - return out; - } - - /** - * @see reshape(const std::vector &new_shape) const - */ - Tensor reshape(const std::vector &new_shape) const { - std::vector tmp(new_shape.size()); - memcpy(&tmp[0], &new_shape[0], sizeof(cytnx_uint64) * new_shape.size()); - Tensor out; - out._impl = this->_impl->reshape(tmp); - return out; - } - - /** - * @see reshape(const std::vector &new_shape) const - */ - Tensor reshape(const std::initializer_list &new_shape) const { - return this->reshape(std::vector(new_shape)); - } - - /// @cond - template - Tensor reshape(const cytnx_int64 &e1, const Ts &...elems) const { - std::vector argv = dynamic_arg_int64_resolver(e1, elems...); - return this->reshape(argv); - } - /// @endcond - - /** - @brief return a new Tensor that cast to different dtype. - @param[in] new_type the new dtype. It can be any type defined in cytnx::Type - @return [Tensor] - @note - If the new_type is the same as dtype of the current Tensor, return self. - @attention - This function cannot convert complex type to real type, please use - Tensor::real() or Tensor::imag() to get the real or imaginary part of - the complex Tensor instead. - - ## Example: - ### c++ API: - \include example/Tensor/astype.cpp - #### output> - \verbinclude example/Tensor/astype.cpp.out - ### python API: - \include example/Tensor/astype.py - #### output> - \verbinclude example/Tensor/astype.py.out - */ - Tensor astype(const int &new_type) const { - Tensor out; - out._impl = this->_impl->astype(new_type); - return out; - } - - // Tensor diagonal(){ - // for(unsigned int i=0;ishape().size();i++){ - // if(this->shape()[i] != this->shape()[0],"[ERROR] Tensor.diagonal() can only be called - // when the subject has equal dimension in each rank.%s","\n"); - // } - // - // } - - /** - @brief Get an element at specific location. - @details This function is used to get an element at specific location. If the template type is - not given, the return will be a Scalar. - @param[in] locator the location of the element - @return [ref] - - @note - 1. This is for C++ API only! - 2. need template instantiation to resolve the type, which should be consist with - the dtype of the Tensor. An error will be issued if the template type is inconsist - with the current dtype of Tensor. - 3. For python API, use [] directly to get element. 
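// A hedged sketch of element access with at<T>() as documented above: the template
// type must agree with the Tensor's dtype, and the returned reference is writable.
// Names are illustrative only.
void at_sketch() {
  cytnx::Tensor A = cytnx::zeros({3, 4});     // dtype Type.Double
  double v = A.at<double>({1, 2});            // read a single element
  A.at<double>(1, 2) = 7.0;                   // variadic overload; write through the reference
  // A.at<float>({1, 2}) would raise an error: template type mismatches the dtype
}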
- - ## Example: - ### c++ API: - \include example/Tensor/at.cpp - #### output> - \verbinclude example/Tensor/at.cpp.out - */ - template - T &at(const std::vector &locator) { - return this->_impl->at(locator); - } - - /** - * @see at(const std::vector &locator) - */ - template - const T &at(const std::vector &locator) const { - return this->_impl->at(locator); - } - /// @cond - template - const T &at(const cytnx_uint64 &e1, const Ts &...elems) const { - std::vector argv = dynamic_arg_uint64_resolver(e1, elems...); - return this->at(argv); - } - template - T &at(const cytnx_uint64 &e1, const Ts &...elems) { - std::vector argv = dynamic_arg_uint64_resolver(e1, elems...); - return this->at(argv); - } - - const Scalar::Sproxy at(const std::vector &locator) const { - return this->_impl->at(locator); - } - - Scalar::Sproxy at(const std::vector &locator) { return this->_impl->at(locator); } - /// @endcond - - /** - @brief get the element from a rank-0 Tensor. - @details This function is used to get the element from a rank-0 Tensor. If the template type is - not given, the return will be a Scalar. - @return [T] - - @note - 1. This can only be called on a rank-0 Tensor (scalar). For C++ API, a template - instantiation of type is needed to resolve the type, which should be connsist with the dtype of - the Tensor. An error will be issued if the template type if inconsist with the current dtype of - Tensor. - 2. Although the return is by reference in C++ part, the return in python is not. - 3. From 2., We recommend user to use at (C++ API) and [] (python API) to modify the value - of the element to have consistant syntax across two languages. - - ## Example: - ### c++ API: - \include example/Tensor/item.cpp - #### output> - \verbinclude example/Tensor/item.cpp.out - ### python API: - \include example/Tensor/item.py - #### output> - \verbinclude example/Tensor/item.py.out - */ - template - T &item() { - cytnx_error_msg(this->_impl->storage().size() != 1, "[ERROR][Tensor.item]%s", - "item can only be called from a Tensor with only one element\n"); - return this->_impl->storage().at(0); - } - - ///@cond - template - const T &item() const { - cytnx_error_msg(this->_impl->storage().size() != 1, "[ERROR][Tensor.item]%s", - "item can only be called from a Tensor with only one element\n"); - return this->_impl->storage().at(0); - } - - const Scalar::Sproxy item() const { - Scalar::Sproxy out(this->storage()._impl, 0); - return out; - } - - Scalar::Sproxy item() { - Scalar::Sproxy out(this->storage()._impl, 0); - return out; - } - - ///@endcond - - /** - @brief get elements using Accessor (C++ API) / slices (python API) - @param[in] accessors the Accessor (C++ API) / slices (python API) to get the elements. - @return [Tensor] - @see \link cytnx::Accessor Accessor\endlink for cordinate with Accessor in C++ API. - @note - 1. the return will be a new Tensor instance, which not share memory with the current Tensor. - - ## Equivalently: - One can also using more intruisive way to get the slice using [] operator. 
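// A hedged slicing sketch for get()/operator() as documented above. Accessor::all()
// and Accessor::range() are assumed from the cytnx Accessor API, and the string
// slice syntax mirrors the std::string overloads declared earlier in this header;
// the returned slice is a new Tensor that does not share memory with the source.
void slicing_sketch() {
  cytnx::Tensor A = cytnx::arange(12).reshape(3, 4);
  cytnx::Tensor row = A.get({cytnx::Accessor(1), cytnx::Accessor::all()});        // row 1, shape {4}
  cytnx::Tensor blk = A(cytnx::Accessor::range(0, 2, 1), cytnx::Accessor::all()); // rows 0..1
  cytnx::Tensor s   = A("0:2", ":");                                              // python-style strings
}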
- - ## Example: - ### c++ API: - \include example/Tensor/get.cpp - #### output> - \verbinclude example/Tensor/get.cpp.out - ### python API: - \include example/Tensor/get.py - #### output> - \verbinclude example/Tensor/get.py.out - */ - Tensor get(const std::vector &accessors) const { - Tensor out; - out._impl = this->_impl->get(accessors); - return out; - } - - /* - Tensor get_v2(const std::vector &accessors) const{ - Tensor out; - out._impl = this->_impl->get_v2(accessors); - return out; - } - */ - - /** - @brief set elements with the input Tensor using Accessor (C++ API) / slices (python API) - @param[in] accessors the list(vector) of accessors. - @param rhs [Tensor] - @note: - the shape of the input Tensor should be the same as the shape that indicated using Accessor. - The memory is not shared with the input Tensor. - - ## Example: - ### c++ API: - \include example/Tensor/set.cpp - #### output> - \verbinclude example/Tensor/set.cpp.out - ### python API: - \include example/Tensor/set.py - #### output> - \verbinclude example/Tensor/set.py.out - */ - void set(const std::vector &accessors, const Tensor &rhs) { - this->_impl->set(accessors, rhs._impl); - } - - /** - @brief set elements with the input constant using Accessor (C++ API) / slices (python API) - @param[in] accessors the list(vector) of accessors. - @param rc [Const] - - @see \link cytnx::Tensor::fill Tensor::fill \endlink for filling all elements with assigned - constant. - - ## Example: - ### c++ API: - \include example/Tensor/set.cpp - #### output> - \verbinclude example/Tensor/set.cpp.out - ### python API: - \include example/Tensor/set.py - #### output> - \verbinclude example/Tensor/set.py.out - */ - template - void set(const std::vector &accessors, const T &rc) { - this->_impl->set(accessors, rc); - } - ///@cond - template - void set(const std::initializer_list &accessors, const T &rc) { - std::vector args = accessors; - this->set(args, rc); - } - ///@endcond - - /** - @brief return the storage of current Tensor. - @return [Storage] - - @note - The return storage shares the same instance of the storage of current Tensor. Use - Storage.clone() to create a new instance of the returned Storage. - - */ - Storage &storage() const { return this->_impl->storage(); } - - /** - @brief fill all the element of current Tensor with the value. - @param[in] val the assigned value - - ## Example: - ### c++ API: - \include example/Tensor/fill.cpp - #### output> - \verbinclude example/Tensor/fill.cpp.out - ### python API - \include example/Tensor/fill.py - #### output> - \verbinclude example/Tensor/fill.py.out - */ - template - void fill(const T &val) { - this->_impl->fill(val); - } - - /** - * @brief compare the shape of two tensors. - * @param[in] rhs the tensor to be compared. - */ - bool equivshape(const Tensor &rhs) { - if (this->shape() != rhs.shape()) return false; - return true; - } - - /** - * @brief return the real part of the tensor. - * @return [Tensor] the real part of the tensor. - * @pre the tensor must be complex type (Type.ComplexDouble or - * Type.ComplexFloat). - * @see cytnx::Type - */ - Tensor real(); - - /** - * @brief return the imaginary part of the tensor. - * @return [Tensor] the imaginary part of the tensor. - * @pre the tensor must be complex type (Type.ComplexDouble or - * Type.ComplexFloat). - * @see cytnx::Type - */ - Tensor imag(); - - // Arithmic: - /** - * @brief addition assignment operator with a Tensor or a scalar. - * @details This function will add the template type to the current tensor, inplacely. 
- * The template can be either a scalar or a tensor. If the template is a - * scalar, then the scalar will be added to all the elements of the - * current tensor. If the template is a tensor, then the shape of the - * template tensor must be the same as the current tensor. The supported - * type of the template are Tensor, Scalar or any scalar type (see - * \ref cytnx_complex128, \ref cytnx_complex64, \ref cytnx_double, \ref cytnx_float, - * \ref cytnx_int64, \ref cytnx_int32, \ref cytnx_int16, - * \ref cytnx_uint64, \ref cytnx_uint32, \ref cytnx_uint16, \ref cytnx_bool). - * @param[in] rc the added Tensor or scalar. - * @pre - * If the template type is Tensor, then the shape of the template tensor - * must be the same as the current tensor. - */ - template - Tensor &operator+=(const T &rc); - - /** - * @brief subtraction assignment operator with a Tensor or a scalar. - * @details This function will subtract the template type to the current tensor, inplacely. - * The template can be either a scalar or a tensor. If the template is a - * scalar, then the scalar will be subtracted to all the elements of the - * current tensor. If the template is a tensor, then the shape of the - * template tensor must be the same as the current tensor. The supported - * type of the template are Tensor, Scalar or any scalar type (see - * \ref cytnx_complex128, \ref cytnx_complex64, \ref cytnx_double, \ref cytnx_float, - * \ref cytnx_int64, \ref cytnx_int32, \ref cytnx_int16, - * \ref cytnx_uint64, \ref cytnx_uint32, \ref cytnx_uint16, \ref cytnx_bool). - * @param[in] rc the subtracted Tensor or scalar. - * @pre - * If the template type is Tensor, then the shape of the template tensor - * must be the same as the current tensor. - */ - template - Tensor &operator-=(const T &rc); - - /** - * @brief multiplication assignment operator with a Tensor or a scalar. - * @details This function will multiply the template type to the current tensor, inplacely. - * The template can be either a scalar or a tensor. If the template is a - * scalar, then the scalar will be multiplied to all the elements of the - * current tensor. If the template is a tensor, then the shape of the - * template tensor must be the same as the current tensor. The supported - * type of the template are Tensor, Scalar or any scalar type (see - * \ref cytnx_complex128, \ref cytnx_complex64, \ref cytnx_double, \ref cytnx_float, - * \ref cytnx_int64, \ref cytnx_int32, \ref cytnx_int16, - * \ref cytnx_uint64, \ref cytnx_uint32, \ref cytnx_uint16, \ref cytnx_bool). - * @param[in] rc the multiplied Tensor or scalar. - * @pre - * If the template type is Tensor, then the shape of the template tensor - * must be the same as the current tensor. - */ - template - Tensor &operator*=(const T &rc); - - /** - * @brief division assignment operator with a Tensor or a scalar. - * @details This function will divide the template type to the current tensor, inplacely. - * The template can be either a scalar or a tensor. If the template is a - * scalar, then the scalar will be divided to all the elements of the - * current tensor. If the template is a tensor, then the shape of the - * template tensor must be the same as the current tensor. The supported - * type of the template are Tensor, Scalar or any scalar type (see - * \ref cytnx_complex128, \ref cytnx_complex64, \ref cytnx_double, \ref cytnx_float, - * \ref cytnx_int64, \ref cytnx_int32, \ref cytnx_int16, - * \ref cytnx_uint64, \ref cytnx_uint32, \ref cytnx_uint16, \ref cytnx_bool). 
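// A compact sketch of the arithmetic operators documented above: element-wise with a
// same-shape Tensor, broadcast against a scalar, and the op= forms modify the Tensor
// in place. Hedged against the documented API; names are illustrative.
void arithmetic_sketch() {
  cytnx::Tensor A = cytnx::ones({2, 2});
  cytnx::Tensor B = cytnx::ones({2, 2});
  cytnx::Tensor C = A + B;                    // same as A.Add(B); shapes must match
  A += 3.0;                                   // scalar applied to every element (same as A.Add_(3.0))
  cytnx::Tensor D = A * 2.0 - B / 4.0;        // scalar divisor must be non-zero
}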
- * @param[in] rc the divided Tensor or scalar. - * @pre - * 1. If the template type is Tensor, then the shape of the template tensor - * must be the same as the current tensor. - * 2. \p rc cannot be zero. - */ - template - Tensor &operator/=(const T &rc); - - // Tensor &operator+=(const Tproxy &rc); - // Tensor &operator-=(const Tproxy &rc); - // Tensor &operator*=(const Tproxy &rc); - // Tensor &operator/=(const Tproxy &rc); - /* - Tensor operator+(const Tproxy &rc){ - return *this + Tensor(rc); - } - Tensor operator-(const Tproxy &rc){ - return *this - Tensor(rc); - } - Tensor operator*(const Tproxy &rc){ - return *this * Tensor(rc); - } - Tensor operator/(const Tproxy &rc){ - return *this / Tensor(rc); - } - */ - /** - * @brief Addition function with a Tensor or a scalar. Same as - * cytnx::operator+(const Tensor &self, const T &rhs). - * @param[in] rhs the added Tensor or scalar. - */ - template - Tensor Add(const T &rhs) { - return *this + rhs; - } - - /** - * @brief Addition function with a Tensor or a scalar, inplacely. - * Same as operator+=(const T &rhs). - * @param[in] rhs the added Tensor or scalar. - */ - template - Tensor &Add_(const T &rhs) { - return *this += rhs; - } - - /** - * @brief Subtraction function with a Tensor or a scalar. Same as - * cytnx::operator-(const Tensor &self, const T &rhs). - * @param[in] rhs the subtracted Tensor or scalar. - */ - template - Tensor Sub(const T &rhs) { - return *this - rhs; - } - - /** - * @brief Subtraction function with a Tensor or a scalar, inplacely. - * Same as operator-=(const T &rhs). - * @param[in] rhs the subtracted Tensor or scalar. - */ - template - Tensor &Sub_(const T &rhs) { - return *this -= rhs; - } - - /** - * @brief Multiplication function with a Tensor or a scalar. Same as - * cytnx::operator*(const Tensor &self, const T &rhs). - * @param[in] rhs the multiplied Tensor or scalar. - */ - template - Tensor Mul(const T &rhs) { - return *this * rhs; - } - - /** - * @brief Multiplication function with a Tensor or a scalar, inplacely. - * Same as operator*=(const T &rhs). - * @param[in] rhs the multiplied Tensor or scalar. - */ - template - Tensor &Mul_(const T &rhs) { - return *this *= rhs; - } - - /** - * @brief Division function with a Tensor or a scalar. Same as - * cytnx::operator/(const Tensor &self, const T &rhs). - * @param[in] rhs the divided Tensor or scalar. - * @attension \p rhs cannot be zero. - */ - template - Tensor Div(const T &rhs) { - return *this / rhs; - } - - /** - * @brief Division function with a Tensor or a scalar, inplacely. - * Same as operator/=(const T &rhs). - * @param[in] rhs the divided Tensor or scalar. - * @attension \p rhs cannot be zero. - */ - template - Tensor &Div_(const T &rhs) { - return *this /= rhs; - } - - /** - * @brief The comparison function. - * @details This function is the comparison function. Same as - * cytnx::operator==(const Tensor &self, const T &rhs). - * @param[in] rhs the compared object. - */ - template - Tensor Cpr(const T &rhs) { - return *this == rhs; - } - - // /** - // * @brief Compare each element of the current tensor with the input tensor. - // * @details This function Compare each element of the current tensor with the input tensor. - // * @param[in] rhs the compared tensor. 
- // */ - // bool approx_eq(const Tensor &rhs, const cytnx_double tol = 0) { - // if (this->device() != rhs.device()) { - // if (User_debug) - // std::cout << "[approx_eq] Tensor device " << this->device() - // << "not equal to rhs tensor device " << rhs.device() << std::endl; - // return false; - // } - // // if (this->dtype() != rhs.dtype()) { - // // std::cout << "[approx_eq] Tensor dtype " << this->dtype() - // // << "not equal to rhs tensor dtype " << rhs.dtype() << std::endl; - // // return false; - // // } - // if (this->shape() != rhs.shape()) { - // if (User_debug) - // std::cout << "[approx_eq] Tensor shape " << this->shape() - // << "not equal to rhs tensor shape " << rhs.shape() << std::endl; - // return false; - // } - // if (this->is_contiguous() != rhs.is_contiguous()) { - // if (User_debug) - // std::cout << "[AreNearlyEqTensor] Tensor contiguous flag " << this->is_contiguous() - // << "not equal to rhs tensor flag " << rhs.is_contiguous() << std::endl; - // return false; - // } - // return this->_impl->_storage.approx_eq(rhs._impl->_storage._impl, tol); - // } - - // template - // Tensor& Cpr_(const T &rhs){ - // - // return *this == rhs; - // } - - template - Tensor Mod(const T &rhs) { - return *this % rhs; - } - - /** - * @brief The negation function. - * @details This function is the negation function. Namely, if the current - * tensor is \f$A\f$, then the output tensor is \f$-A\f$. - * @return The negation of the current tensor. - */ - Tensor operator-() { return this->Mul(-1.); } - - /** - * @brief The flatten function. - * @details This function is the flatten function. It will clone (deep copy) - * , contiguos the current tensor and reshape it to 1-rank Tensor. - * @note compare to the flatten_() function, this function will return a new - * tensor and the current tensor will not be changed. - */ - Tensor flatten() const { - Tensor out = this->clone(); - out.contiguous_(); - out.reshape_({-1}); - return out; - } - - /** - * @brief The flatten function, inplacely. - * @details This function is the flatten function, inplacely. It will - * contiguos the current tensor and reshape it to 1-rank Tensor. - * @note compare to the flatten() function, this is an inplacely function, - * the current tensor will be changed. - */ - void flatten_() { - this->contiguous_(); - this->reshape_({-1}); - } - - /** - * @brief the append function. - * @details This function is the append function. It will append the \p rhs - * tensor to the current tensor. The \p rhs tensor must have the same shape - * as the current tensor, except the first dimension. For example, if the - * current tensor is \f$A(i,j,k)\f$ and the \p rhs tensor is \f$B(j,k)\f$, then - * the output tensor is \f$C(i,j,k)\f$ where - * \f[ - * C(i,j,k) = \begin{cases} - * A(i,j,k) & \text{if } i \neq N \\ - * B(j,k) & \text{if } i = N - * \end{cases} - * \f] - * where \f$N\f$ is the number of the first dimension of the current tensor. - * Here indices \f$i\f$, \f$j\f$ and \f$k\f$ start from 0. - * @param[in] rhs the appended tensor. - * @return The appended tensor. - * @pre - * 1. The \p rhs tensor and the current tensor cannot be empty. - * 2. The \p rhs tensor must have the same shape as the current tensor, - * except the first dimension. Namely, rhs.shape()[i] == this->shape()[i+1] - * and rhs.shape().size() == this->shape().size()-1. - * @note If the dtype of the \p rhs is different from the current tensor, - * the \p rhs will be casted to the dtype of the current tensor. 
- * @see append(const Storage &rhs) - */ - void append(const Tensor &rhs) { - // Tensor in; - if (!this->is_contiguous()) this->contiguous_(); - - // check Tensor in shape: - cytnx_error_msg(rhs.shape().size() == 0 || this->shape().size() == 0, - "[ERROR] try to append a null Tensor.%s", "\n"); - cytnx_error_msg(rhs.shape().size() != (this->shape().size() - 1), - "[ERROR] try to append a Tensor with rank not match.%s", "\n"); - cytnx_uint64 Nelem = 1; - for (unsigned int i = 0; i < rhs.shape().size(); i++) { - cytnx_error_msg(rhs.shape()[i] != this->shape()[i + 1], - "[ERROR] dimension mismatch @ rhs.rank: [%d] this: [%d] rhs: [%d]\n", i, - this->shape()[i + 1], rhs.shape()[i]); - Nelem *= rhs.shape()[i]; - } - - // check type: - Tensor in; - if (rhs.dtype() != this->dtype()) { - in = rhs.astype(this->dtype()); - if (!in.is_contiguous()) in.contiguous_(); - } else { - if (!in.is_contiguous()) - in = rhs.contiguous(); - else - in = rhs; - } - this->_impl->_shape[0] += 1; - cytnx_uint64 oldsize = this->_impl->_storage.size(); - this->_impl->_storage.resize(oldsize + Nelem); - memcpy(((char *)this->_impl->_storage.data()) + - oldsize * Type.typeSize(this->dtype()) / sizeof(char), - in._impl->_storage.data(), Type.typeSize(in.dtype()) * Nelem); - } - /** - * @brief the append function of the Storage. - * @details This function is the append function of the Storage. It will - * append the \p srhs Storage to the current tensor. The current tensor must - * be rank-2 and the \p srhs Storage must have the same size as the second - * dimension of the current tensor. For example, if the current tensor is - * \f$A\f$ with size \f$M \times N\f$ and the \p srhs Storage is \f$B\f$ - * with size \f$N\f$, then the output tensor is \f$C\f$ with size \f$M \times - * (N+1)\f$ where - * \f[ - * C(i,j) = \begin{cases} - * A(i,j) & \text{if } j \neq N \\ - * B(i) & \text{if } j = N - * \end{cases} - * \f] - * Here indices \f$i\f$ and \f$j\f$ start from 0. - * @param[in] srhs the appended Storage. - * @return The appended tensor. - * @pre - * 1. The \p srhs Storage and the current tensor cannot be empty. - * 2. The current tensor must be rank-2. - * 3. The \p srhs Storage must have the same size as the second dimension of - * the current tensor. Namely, srhs.size() == this->shape()[1]. - * @note If the dtype of the \p srhs is different from the current tensor, - * the \p srhs will be casted to the dtype of the current tensor. - * @see append(const Tensor &rhs) - */ - void append(const Storage &srhs) { - if (!this->is_contiguous()) this->contiguous_(); - - // check Tensor in shape: - cytnx_error_msg(srhs.size() == 0 || this->shape().size() == 0, - "[ERROR] try to append a null Tensor.%s", "\n"); - cytnx_error_msg((this->shape().size() - 1) != 1, - "[ERROR] append a storage to Tensor can only accept rank-2 Tensor.%s", "\n"); - cytnx_error_msg(this->shape().back() != srhs.size(), "[ERROR] Tensor dmension mismatch!%s", - "\n"); - - // check type: - Storage in; - if (srhs.dtype() != this->dtype()) { - in = srhs.astype(this->dtype()); - } else { - in = srhs; - } - this->_impl->_shape[0] += 1; - cytnx_uint64 oldsize = this->_impl->_storage.size(); - this->_impl->_storage.resize(oldsize + in.size()); - memcpy(((char *)this->_impl->_storage.data()) + - oldsize * Type.typeSize(this->dtype()) / sizeof(char), - in._impl->Mem, Type.typeSize(in.dtype()) * in.size()); - } - /* - void append(const Tensor &rhs){ - // convert to the same type. 
- Tensor in; - if(rhs.dtype() != this->dtype()){ - in = rhs.astype(this->dtype()); - }else{ - in = rhs; - } - - // 1) check rank - if(this->shape().size()==1){ - // check if rhs is a scalar tensor (only one element) - cytnx_error_msg(!(rhs.shape().size()==1 && rhs.shape()[0]==1),"[ERROR] trying to append - a scalar into multidimentional Tensor is not allow.\n Only rank-1 Tensor can accept scalar - append.%s","\n"); this->_impl->_shape[0]+=1; this->_impl->_storage.append(0); - - }else{ - cytnx_error_msg(rhs.shape().size() != this->shape().size()-1,"[ERROR] try to append a - Tensor with rank not match.%s","\n"); - - } - cytnx_error_msg(!this->is_contiguous(),"[ERROR] append require the Tensor to be contiguous. - suggestion: call contiguous() or contiguous_() first.","\n"); - } - */ - /** - * @brief the append function of the scalar. - * @details This function is the append function of the scalar. It can only append - * scalar into rank-1 Tensor. - * @param[in] rhs the appended scalar. - * @return The appended tensor. - * @pre - * 1. The current Tensor must be rank-1. (1D array) - * 2. The current Tensor must be contiguous. - * 3. \p rhs must be a scalar. - */ - template - void append(const T &rhs) { - cytnx_error_msg(this->shape().size() != 1, - "[ERROR] trying to append a scalar into multidimentional Tensor is not " - "allow.\n Only rank-1 Tensor can accept scalar append.%s", - "\n"); - cytnx_error_msg(!this->is_contiguous(), - "[ERROR] append require the Tensor to be contiguous. suggestion: call " - "contiguous() or contiguous_() first.", - "\n"); - this->_impl->_shape[0] += 1; - this->_impl->_storage.append(rhs); - } - - /** - * @brief Check whether two tensors share the same internal memory. - * @details This function will check whether two tensors share the same - * internal memory. If the two tensors share the same internal memory, then - * the function will return true. Otherwise, it will return false. See user - * guide for more details. - * @param[in] rhs the tensor to be compared. - */ - bool same_data(const Tensor &rhs) const; - - // linalg: - /** - * @brief the SVD member function. Same as - * \ref cytnx::linalg::Svd(const Tensor &Tin, const bool &is_UvT) - * , where \p Tin is the current Tensor. - */ - std::vector Svd(const bool &is_UvT = true) const; - - /** - * @brief the Eigh member function. Same as - * \ref cytnx::linalg::Eigh(const Tensor &Tin, const bool &is_V, const bool &row_v) - * , where \p Tin is the current Tensor. - */ - std::vector Eigh(const bool &is_V = true, const bool &row_v = false) const; - - /** - * @brief the InvM_ member function. Same as - * \ref cytnx::linalg::InvM_(Tensor &Tin), where \p Tin is the current Tensor. - */ - Tensor &InvM_(); - - /** - * @brief the InvM member function. Same as - * \ref cytnx::linalg::InvM(const Tensor &Tin), where \p Tin is the current Tensor. - */ - Tensor InvM() const; - - /** - * @brief the Inv_ member function. Same as - * \ref cytnx::linalg::Inv_(Tensor &Tin, const double &clip) - */ - Tensor &Inv_(const double &clip); - - /** - * @brief the Inv member function. Same as - * \ref cytnx::linalg::Inv(const Tensor &Tin, const double &clip) - */ - Tensor Inv(const double &clip) const; - - /** - * @brief the Conj_ member function. Same as - * \ref cytnx::linalg::Conj_(Tensor &Tin), where \p Tin is the current Tensor. - */ - Tensor &Conj_(); - - /** - * @brief the Conj member function. Same as - * \ref cytnx::linalg::Conj(const Tensor &Tin), where \p Tin is the current Tensor. 
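For the member-function wrappers declared in this block, a brief sketch (arange()/reshape() are assumed from the public Tensor/generator API; the {S, U, vT} ordering follows the linalg::Svd description later in this patch):

    cytnx::Tensor M = cytnx::arange(24).reshape({4, 6});
    std::vector<cytnx::Tensor> usv = M.Svd();   // {S, U, vT} with is_UvT = true
    cytnx::Tensor S = usv[0];                   // rank-1 tensor of singular values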
- */ - Tensor Conj() const; - - /** - * @brief the Exp_ member function. Same as linalg::Exp_(Tensor &Tin), where \p Tin is the - * current Tensor. - */ - Tensor &Exp_(); - - /** - * @brief the Exp member function. Same as linalg::Exp(const Tensor &Tin), where \p Tin is the - * current Tensor. - */ - Tensor Exp() const; - - /** - * @brief the Norm member function. Same as linalg::Norm(const Tensor &Tin), where \p Tin is - * the current Tensor. - */ - Tensor Norm() const; - - /** - * @brief the Pow member function. Same as linalg::Pow(const Tensor &Tin, const cytnx_double - * &p), where \p Tin is the current Tensor. - */ - Tensor Pow(const cytnx_double &p) const; - - /** - * @brief the Pow_ member function. Same as linalg::Pow_(Tensor &Tin, const cytnx_double - * &p), where \p Tin is the current Tensor. - */ - Tensor &Pow_(const cytnx_double &p); - - /** - * @brief the Trace member function. Same as linalg::Trace(const Tensor &Tin, const - * cytnx_uint64 &a, const cytnx_uint64 &b), where \p Tin is the current Tensor. - */ - Tensor Trace(const cytnx_uint64 &a = 0, const cytnx_uint64 &b = 1) const; - - /** - * @brief the Abs member function. Same as linalg::Abs(const Tensor &Tin), where \p Tin is the - * current Tensor. - */ - Tensor Abs() const; - - /** - * @brief the Abs_ member function. Same as linalg::Abs_(Tensor &Tin), where \p Tin is the - * current Tensor. - */ - Tensor &Abs_(); - - /** - * @brief the Max member function. Same as linalg::Max(const Tensor &Tin), - * where \p Tin is the current Tensor. - */ - Tensor Max() const; - - /** - * @brief the Min member function. Same as linalg::Min(const Tensor &Tin), - * where \p Tin is the current Tensor. - */ - Tensor Min() const; - - }; // class Tensor - - Tensor operator+(const Tensor &lhs, const Tensor::Tproxy &rhs); - Tensor operator-(const Tensor &lhs, const Tensor::Tproxy &rhs); - Tensor operator*(const Tensor &lhs, const Tensor::Tproxy &rhs); - Tensor operator/(const Tensor &lhs, const Tensor::Tproxy &rhs); - - Tensor operator+(const Tensor &lhs, const Scalar::Sproxy &rhs); - Tensor operator-(const Tensor &lhs, const Scalar::Sproxy &rhs); - Tensor operator*(const Tensor &lhs, const Scalar::Sproxy &rhs); - Tensor operator/(const Tensor &lhs, const Scalar::Sproxy &rhs); - - ///@cond - std::ostream &operator<<(std::ostream &os, const Tensor &in); - std::ostream &operator<<(std::ostream &os, const Tensor::Tproxy &in); - ///@endcond - //{ os << Tensor(in);}; -} // namespace cytnx - -#endif // CYTNX_TENSOR_OLD_H_ diff --git a/include/linalg.hpp.old b/include/linalg.hpp.old deleted file mode 100644 index 9825e8ddd..000000000 --- a/include/linalg.hpp.old +++ /dev/null @@ -1,1066 +0,0 @@ -#ifndef CYTNX_LINALG_HPP_H_ -#define CYTNX_LINALG_HPP_H_ - -#include "Type.hpp" -#include "cytnx_error.hpp" -#include "Tensor.hpp" -#include "backend/Storage.hpp" -#include "UniTensor.hpp" -#include "Scalar.hpp" -#include "LinOp.hpp" -#include - -namespace cytnx { - // class Tensor; //fwd - // class UniTensor; //fwd - // class LinOp; //fwd - - /** - @namespace cytnx::linalg - @brief linear algebra related functions. 
- */ - namespace linalg { - - // Add: - //================================================== - /** - @brief element-wise add - */ - cytnx::UniTensor Add(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Add(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Add(const cytnx::UniTensor &Lt, const T &rc); - - // Sub: - //================================================== - /** - @brief element-wise subtract - */ - cytnx::UniTensor Sub(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Sub(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Sub(const cytnx::UniTensor &Lt, const T &rc); - - // Mul: - //================================================== - /** - @brief element-wise subtract - */ - cytnx::UniTensor Mul(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Mul(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Mul(const cytnx::UniTensor &Lt, const T &rc); - - // Div: - //================================================== - /** - @brief element-wise divide - */ - cytnx::UniTensor Div(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Div(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Div(const cytnx::UniTensor &Lt, const T &rc); - - // Mod: - //================================================== - /** - @brief element-wise modulo - */ - cytnx::UniTensor Mod(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Mod(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor Mod(const cytnx::UniTensor &Lt, const T &rc); - - std::vector Svd(const cytnx::UniTensor &Tin, const bool &is_U = true, - const bool &is_vT = true); - std::vector Svd_truncate(const cytnx::UniTensor &Tin, - const cytnx_uint64 &keepdim, const double &err = 0, - const bool &is_U = true, const bool &is_vT = true, - const bool &return_err = false); - std::vector Hosvd( - const cytnx::UniTensor &Tin, const std::vector &mode, - const bool &is_core = true, const bool &is_Ls = false, - const std::vector &trucate_dim = std::vector()); - - template - cytnx::UniTensor ExpH(const cytnx::UniTensor &Tin, const T &a, const T &b = 0); - template - cytnx::UniTensor ExpM(const cytnx::UniTensor &Tin, const T &a, const T &b = 0); - - cytnx::UniTensor ExpH(const cytnx::UniTensor &Tin); - cytnx::UniTensor ExpM(const cytnx::UniTensor &Tin); - - - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const cytnx_int64 &a = 0, - const cytnx_int64 &b = 1); - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const std::string &a, const std::string &b); - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const cytnx_int64 &a = 0, - const cytnx_int64 &b = 1, const bool &by_label = false); - std::vector Qr(const cytnx::UniTensor &Tin, const bool &is_tau = false); - std::vector Qdr(const cytnx::UniTensor &Tin, const bool &is_tau = false); - - // Pow: - //================================================== - /** - @brief take power p on all the elements in UniTensor. - @param p, the power - @return - [UniTensor] - - */ - UniTensor Pow(const UniTensor &Tin, const double &p); - - /** - @brief inplace perform power on all the elements in UniTensor. - @param Tin, the input UniTensor. - @param p, the power. - - description: - on return, the elements in Tin will be modified to it's exponetial value. 
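A short sketch of the UniTensor-level routines declared above (the UniTensor(Tensor) converting constructor and ones() are assumptions here, not part of this header):

    cytnx::UniTensor uA(cytnx::ones({2, 3}));            // dense UniTensor wrapping a Tensor
    cytnx::UniTensor uB = cytnx::linalg::Pow(uA, 2.0);   // element-wise square
    cytnx::UniTensor uC = cytnx::linalg::Add(uA, uB);    // element-wise sum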
- */ - void Pow_(UniTensor &Tin, const double &p); - - } // namespace linalg - - cytnx::UniTensor operator+(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator+(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator+(const cytnx::UniTensor &Lt, const T &rc); - - cytnx::UniTensor operator-(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator-(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator-(const cytnx::UniTensor &Lt, const T &rc); - - cytnx::UniTensor operator*(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator*(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator*(const cytnx::UniTensor &Lt, const T &rc); - - cytnx::UniTensor operator/(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator/(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator/(const cytnx::UniTensor &Lt, const T &rc); - - cytnx::UniTensor operator%(const cytnx::UniTensor &Lt, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator%(const T &lc, const cytnx::UniTensor &Rt); - template - cytnx::UniTensor operator%(const cytnx::UniTensor &Lt, const T &rc); -} // namespace cytnx - -//==================================================================================== -//==================================================================================== -//==================================================================================== -namespace cytnx { - - namespace linalg { - Tensor Add(const Tensor &Lt, const Tensor &Rt); - template - Tensor Add(const T &lc, const Tensor &Rt); - template - Tensor Add(const Tensor &Lt, const T &rc); - - void iAdd(Tensor &Lt, const Tensor &Rt); - - // Sub: - //================================================== - /** - @brief element-wise subtract - */ - Tensor Sub(const Tensor &Lt, const Tensor &Rt); - template - Tensor Sub(const T &lc, const Tensor &Rt); - template - Tensor Sub(const Tensor &Lt, const T &rc); - - void iSub(Tensor &Lt, const Tensor &Rt); - - // Mul: - //================================================== - /** - @brief element-wise subtract - */ - Tensor Mul(const Tensor &Lt, const Tensor &Rt); - template - Tensor Mul(const T &lc, const Tensor &Rt); - template - Tensor Mul(const Tensor &Lt, const T &rc); - - void iMul(Tensor &Lt, const Tensor &Rt); - - // Div: - //================================================== - /** - @brief element-wise divide - */ - Tensor Div(const Tensor &Lt, const Tensor &Rt); - template - Tensor Div(const T &lc, const Tensor &Rt); - template - Tensor Div(const Tensor &Lt, const T &rc); - - void iDiv(Tensor &Lt, const Tensor &Rt); - - // Mod: - //================================================== - /** - @brief element-wise divide - */ - Tensor Mod(const Tensor &Lt, const Tensor &Rt); - template - Tensor Mod(const T &lc, const Tensor &Rt); - template - Tensor Mod(const Tensor &Lt, const T &rc); - - // Cpr: - //================================================== - /** - @brief element-wise compare - */ - Tensor Cpr(const Tensor &Lt, const Tensor &Rt); - template - Tensor Cpr(const T &lc, const Tensor &Rt); - template - Tensor Cpr(const Tensor &Lt, const T &rc); - - // Norm: - //================================================= - /** - @brief calculate the norm of a tensor. - @param Tl input Tensor - @return Tensor - - [Note] - 1. 
if the input tensor is rank-1, the frobenius norm is calculated. - 2. if the input tensor is rank-N with N>=2, the tensor will be flatten to 1d first, and - calculate the frobenius norm. - */ - Tensor Norm(const Tensor &Tl); - - // Det: - //================================================= - /** - @brief calculate the determinant of a tensor. - @param Tl input Tensor - @return Tensor - - [Note] - 1. input tensor should be a NxN rank-2 Tensor. - */ - Tensor Det(const Tensor &Tl); - - // Svd: - //================================================== - /** - @brief Perform Singular-Value decomposition on a rank-2 Tensor. - @param Tin a \link cytnx::Tensor Tensor \endlink, it should be a rank-2 tensor (matrix) - @param is_U if return a left uniform matrix. - @param is_vT if return a right uniform matrix. - @return [std::vector] - - 1. the first tensor is a 1-d tensor contanin the singular values - 2. the second tensor is the left uniform matrix [U], a 2-d tensor (matrix). It only return - when is_U=true. - 3. the third tensor is the right uniform matrix [vT], a 2-d tensor (matrix). It only return - when is_vT=true. - */ - std::vector Svd(const Tensor &Tin, const bool &is_U = true, const bool &is_vT = true); - - // Svd_truncate: - //================================================== - std::vector Svd_truncate(const Tensor &Tin, const cytnx_uint64 &keepdim, - const double &err = 0, const bool &is_U = true, - const bool &is_vT = true, const bool &return_err = false); - - // Hosvd: - std::vector Hosvd( - const Tensor &Tin, const std::vector &mode, const bool &is_core = true, - const bool &is_Ls = false, - const std::vector &trucate_dim = std::vector()); - - // Qr: - //================================================== - /** - @brief Perform QR decomposition on a rank-2 Tensor. - @param Tin a \link cytnx::Tensor Tensor \endlink, it should be a rank-2 tensor (matrix) - @param is_tau if return the tau that contains the Householder reflectors that generate q along - with r. The tau array contains scaling factors for the reflectors - @return [std::vector] - - 1. the first tensor is the orthomormal matrix [Q], a 2-d tensor (matrix) - 2. the second tensor is the right-upper triangular matrix [R], a 2-d tensor (matrix). - 3. the third tensor is the Householder reflectors [H], a 1-d tensor (vector). It only return - when is_tau=true. - */ - std::vector Qr(const Tensor &Tin, const bool &is_tau = false); - - // Qdr: - //================================================== - /** - @brief Perform QDR decomposition on a rank-2 Tensor. - @param Tin a \link cytnx::Tensor Tensor \endlink, it should be a rank-2 tensor (matrix) - @param is_tau if return the tau that contains the Householder reflectors that generate q along - with r. The tau array contains scaling factors for the reflectors - @return [std::vector] - - 1. the first tensor is the orthomormal matrix [Q], a 2-d tensor (matrix) - 2. the second tensor is the diagonal matrix [D], a 1-d tensor (matrix). - 3. the third tensor is the right-upper triangular matrix [R], a 2-d tensor (matrix). - 4. the forth tensor is the Householder reflectors [H], a 1-d tensor (matrix). It only return - when is_tau=true. - */ - std::vector Qdr(const Tensor &Tin, const bool &is_tau = false); - - // Eigh: - //================================================== - /** - @brief eigen-value decomposition for Hermitian matrix - @param Tin The Tensor - @param is_V return eigen vectors - @param row_V if set to ture, the return eigen vectors will be row form. 
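A compact sketch of the factorizations declared above, assuming arange()/reshape() from the generator/Tensor API:

    cytnx::Tensor M = cytnx::arange(24).reshape({4, 6});
    auto full  = cytnx::linalg::Svd(M);              // {S, U, vT} with the default flags
    auto trunc = cytnx::linalg::Svd_truncate(M, 3);  // keep at most 3 singular values
    auto qr    = cytnx::linalg::Qr(M);               // {Q, R}; is_tau = true also returns tau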
- [Note] the Tin should be a rank-2 Tensor. - */ - std::vector Eigh(const Tensor &Tin, const bool &is_V = true, const bool &row_v = false); - - // Eig: - //================================================== - /** - @brief eigen-value decomposition for generic square matrix - @param Tin The Tensor - @param is_V return eigen vectors - @param row_V if set to ture, the return eigen vectors will be row form. - - [Note] the Tin should be a rank-2 Tensor. - */ - std::vector Eig(const Tensor &Tin, const bool &is_V = true, const bool &row_v = false); - - // Trace: - //================================================== - /** - @brief perform trace over index. - - [Note] the Tn should be at-least rank-2 Tensor. - */ - Tensor Trace(const Tensor &Tn, const cytnx_uint64 &axisA = 0, const cytnx_uint64 &axisB = 1); - - // Min: - //================================================== - /** - @brief get the minimum element. - - [Note] For complex TN, only real part is compared. - */ - Tensor Min(const Tensor &Tn); - - // Max: - //================================================== - /** - @brief get the maximum element. - - [Note] For complex TN, only real part is compared. - */ - Tensor Max(const Tensor &Tn); - - // Sum: - //================================================== - /** - @brief get the sum of all the elements. - - */ - Tensor Sum(const Tensor &Tn); - - // Matmul: - //================================================== - /** - @brief perform matrix multiplication on two tensors. - - [Note] the TL and TR should be both rank-2 Tensor. - */ - Tensor Matmul(const Tensor &TL, const Tensor &TR); - - // Matmul: - //================================================== - /** - @brief perform matrix multiplication on two Tensors with one rank-1 and the other rank-2 where - the rank-1 represent the diagonal elements of the specific tensor. - - [Note] the TL and TR one of them should be rank-1 Tensor and the other should be rank-2 Tensor. - */ - Tensor Matmul_dg(const Tensor &Tl, const Tensor &Tr); - - // InvM: - //================================================== - /** - @brief Matrix inverse. - @return - [Tensor] - - [Note] the Tin should be a rank-2 Tensor. - */ - Tensor InvM(const Tensor &Tin); - /** - @brief inplace perform Matrix inverse. - - description: - on return, the Tin will be modified to it's inverse. - - [Note] the Tin should be a rank-2 Tensor. - */ - void InvM_(Tensor &Tin); - - // Inv: - //================================================== - /** - @brief Element-wise inverse with clip. - @return - [Tensor] - - description: - Performs Elementwise inverse with clip. if A[i] < clip, then 1/A[i] = 0 will be set. - - [Note] For complex type Tensors, the square norm is used to determine the clip. - - */ - Tensor Inv(const Tensor &Tin, const double &clip); - - /** - @brief inplace perform Element-wise inverse with clip. - @return - [Tensor] - - description: - 1. Performs Elementwise inverse with clip. if A[i] < clip, then 1/A[i] = 0 will be set. - 2. on return, all the elements will be modified to it's inverse. if Tin is integer type, it - will automatically promote to Type.Double. - - [Note] For complex type Tensors, the square norm is used to determine the clip. - - */ - void Inv_(Tensor &Tin, const double &clip); - - // Conj: - //================================================== - /** - @brief Conjugate all the element in Tensor. - @return - [Tensor] - - [Note] - 1. if the input Tensor is complex, then return a new Tensor with all the elements are - conjugated. - 2. 
if the input Tensor is real, then return a copy of input Tensor. - */ - Tensor Conj(const Tensor &Tin); - /** - @brief inplace perform Conjugate on all the element in Tensor. - - [Note] - 1. if the input Tensor is complex, the elements of input Tensor will all be conjugated. - 2. if the input Tensor is real, then nothing act. - */ - void Conj_(Tensor &Tin); - - // Exp: - //================================================== - /** - @brief Exponential all the element in Tensor. - @return - [Double Tensor] or [ComplexDouble Tensor] - - */ - Tensor Exp(const Tensor &Tin); - - /** - @brief Exponential all the element in Tensor. - @return - [Float Tensor] or [ComplexFloat Tensor] - - */ - Tensor Expf(const Tensor &Tin); - - /** - @brief inplace perform Exponential on all the element in Tensor. - @param Tin, the input Tensor. - - description: - 1. on return, the elements in Tin will be modified to it's exponetial value. - 2. For Real, if the type is not Double, change the type of the input tensor to Double. - 3. For Complex, if input is ComplexFloat, promote to ComplexDouble. - */ - void Exp_(Tensor &Tin); - - /** - @brief inplace perform Exponential on all the element in Tensor. - @param Tin, the input Tensor. - - description: - 1. on return, the elements in Tin will be modified to it's exponetial value. - 2. For Real, if the type is not Float, change the type of the input tensor to Float. - 3. For Complex, if input is ComplexDouble, promote to ComplexFloat. - */ - void Expf_(Tensor &Tin); - - // Pow: - //================================================== - /** - @brief take power p on all the elements in Tensor. - @param p, the power - @return - [Tensor] - - */ - Tensor Pow(const Tensor &Tin, const double &p); - - /** - @brief inplace perform power on all the elements in Tensor. - @param Tin, the input Tensor. - @param p, the power. - - description: - on return, the elements in Tin will be modified to it's exponetial value. - */ - void Pow_(Tensor &Tin, const double &p); - - // Abs: - //================================================== - /** - @brief Elementwise absolute value. - @param Tin tensor. - @return - [Tensor] - - */ - Tensor Abs(const Tensor &Tin); - - /** - @brief inplace perform elementwiase absolute value. - @param Tin, the input Tensor. - - description: - on return, the elements in Tin will be modified to it's absolute value. Note that if the - input tensor is complex, it will be modified to real type. - */ - void Abs_(Tensor &Tin); - - // Diag: - //================================================== - /** - @brief return a diagonal tensor with diagonal elements provided as Tin. - @return - [Tensor] - - description: - the return Tensor will be rank-2, with shape=(L, L); where L is the number of elements in - Tin. - - - [Note] Tin should be a rank-1 Tensor. - - */ - Tensor Diag(const Tensor &Tin); - - // Tensordot: - //================================================== - /** - @brief perform tensor dot by sum out the indices assigned of two Tensors. - @param Tl Tensor #1 - @param Tr Tensor #2 - @param idxl the indices of rank of Tensor #1 that is going to sum with Tensor #2 - @param idxr the indices of rank of Tensor #2 that is going to sum with Tensor #1 - @param cacheL cache Tensor #1 (See user-guide for details) - @param cacheR cache Tensor #2 (See user-guide for details) - @return - [Tensor] - - [Note] - 1. the elements in idxl and idxr have one to one correspondence. - 2. two tensors should on same device. 
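For example, contracting one pair of axes as described in the notes above (ones() is assumed from the generator API):

    cytnx::Tensor A = cytnx::ones({2, 3, 4});
    cytnx::Tensor B = cytnx::ones({4, 5});
    cytnx::Tensor C = cytnx::linalg::Tensordot(A, B, {2}, {0});  // result shape (2, 3, 5)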
- */ - Tensor Tensordot(const Tensor &Tl, const Tensor &Tr, const std::vector &idxl, - const std::vector &idxr, const bool &cacheL = false, - const bool &cacheR = false); - - // Tensordot_dg: - //================================================== - /** - @brief perform tensor dot by sum out the indices assigned of two Tensors, with either one of - them to be a rank-2 diagonal tensor represented by a rank-2 tensor. - @param Tl Tensor #1 - @param Tr Tensor #2 - @param idxl the indices of rank of Tensor #1 that is going to sum with Tensor #2 - @param idxr the indices of rank of Tensor #2 that is going to sum with Tensor #1 - @param diag_L if Tl(true)/Tr(false) is a diagnal matrix, represented by a rank-1 tensor. - @return - [Tensor] - - [Note] - 1. the elements in idxl and idxr have one to one correspondence. - 2. two tensors should on same device. - 3. if diag_L=true, Tl should be a rank-1 tensor as the diagonal elements of a diagonal - matrix. if false, Tr should be a rank-1 tensor - */ - Tensor Tensordot_dg(const Tensor &Tl, const Tensor &Tr, const std::vector &idxl, - const std::vector &idxr, const bool &diag_L); - - // Outer: - //================================================== - /** - @brief perform outer produces of two rank-1 Tensor. - @param Tl rank-1 Tensor #1 - @param Tr rank-1 Tensor #2 - @return - [Tensor] - - description: - if the Tensor #1 has [shape_1], and Tensor #2 has [shape_2]; then the return Tensor will - have shape: concate(shape_1,shape_2) - - [Note] - two tensor should on same device. - - */ - Tensor Outer(const Tensor &Tl, const Tensor &Tr); - - // Kron: - //================================================== - /** - @brief perform kronecker produces of two Tensor. - @param Tl rank-n Tensor #1 - @param Tr rank-m Tensor #2 - @param Tl_pad_left The padding scheme for Tl if Tl.rank != Tr.rank - @param Tr_pad_left The padding scheme for Tr if Tl.rank != Tr.rank - @return - [Tensor] - - description: - The function assume two tensor has the same rank. In case where two tensors have different - ranks, the small one will be extend by adding redundant dimension to the beginning of axis - (T_pad_right=true) or by adding redundant dim to the last axis (if T_pad_left=false - [default]). if the Tensor #1 has shape=(i1,j1,k1,l1...), and Tensor #2 has - shape=(i2,j2,k2,l2...); then the return Tensor will have shape=(i1*i2,j1*j2,k1*k2...) - - [Note] - two tensor should on same device. - - */ - Tensor Kron(const Tensor &Tl, const Tensor &Tr, const bool &Tl_pad_left = false, - const bool &Tr_pad_left = false); - - - // Directsum: - //================================================== - /** - @brief perform directsum of two Tensor. - @param T1 rank-n Tensor #1 - @param T2 rank-n Tensor #2 - @param shared_axes The axes that are shared by two tensors - @return - [Tensor] - - description: - The function assume two tensor has the same rank, and axes indicated in are the same for both T1 and T2. - The out put tensors will have same rank as T1 and T2, with the dimension of rest of the axes being the sum of dimensions of T1 and T2. - e.g., the out put shape = (i1+i2,j1+j2, share_axis_1, k1+k2, share_axis_2, ...); where T1.shape = (i1,j1,share_axis_1,k1,share_axis_2 ...) - and T2.shape = (i2,j2,share_axis_1,k2,share_axis_2 ...) - - - [Note] - two tensor should on same device. 
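A small sketch of the Kronecker-product shape rule stated above (equal ranks, so no padding is involved; ones() is assumed):

    cytnx::Tensor A = cytnx::ones({2, 3});
    cytnx::Tensor B = cytnx::ones({4, 5});
    cytnx::Tensor K = cytnx::linalg::Kron(A, B);  // shape (2*4, 3*5) = (8, 15)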
- - */ - Tensor Directsum(const Tensor &T1, const Tensor &T2, const std::vector &shared_axes); - - - - - // VectorDot: - //================================================= - /** - @brief perform inner product of vectors - @param Tl Tensor #1 - @param Tr Tensor #2 - @param if the Tl should be conjugated (only work for complex. For real Tensor, no function), - default: false - @return - [Tensor] Rank-0 - - description: - two Tensors must be Rank-1, with same length. - - [Note] - performance tune: This function have better performance when two vectors with same types, - and are one of following type: cytnx_double, cytnx_float, cytnx_complex64 or cytnx_complex128. - - */ - Tensor Vectordot(const Tensor &Tl, const Tensor &Tr, const bool &is_conj = false); - - // Dot: - //================================================= - /** - @brief dot product of two arrays. - @param Tl Tensor #1 - @param Tr Tensor #2 - @return - [Tensor] - - description: - 1. if both Tl and Tr are 1d arrays, it is inner product of vectors (no complex conj), it - calls linalg.Vectordot with is_conj=false. - 2. if both Tl and Tr are 2d arrays, it calls linalg.Matmul to compute the matrix - multiplication - 3. if Tl is Nd array (with N>=2, and Tr is 1-D array, it is sum product over the last axis - of a with b - - [Note] - performance tune: This function have better performance when two arrays with same types, and - are one of following type: cytnx_double, cytnx_float, cytnx_complex64 or cytnx_complex128. - - [Python] - In Python API, operator@ is overloaded as a shorthand of linalg::Dot. - */ - Tensor Dot(const Tensor &Tl, const Tensor &Tr); - - // Tridiag: - //=========================================== - /** - @brief perform diagonalization of symmetric tri-diagnoal matrix. - @param Diag Tensor #1 - @param Sub_diag Tensor #2 - @param is_V: if calculate the eigen value. - @param k: Return k lowest eigen vector if is_V=True - @param throw_excp: Whether to throw exception when error occurs in Tridiag internal function - @return - [vector] if is_V = True, the first tensor is the eigen value, and second tensor is - eigenvector of shape [k,L]. - - description: - two Tensors must be Rank-1, with length of Diag = L and Sub_diag length = L-1. - - [Note] - performance tune: This function have better performance when two vectors with same types, - and are one of following type: cytnx_double, cytnx_float. In general all real type can be use as - input, which will be promote to floating point type for calculation. - - */ - std::vector Tridiag(const Tensor &Diag, const Tensor &Sub_diag, const bool &is_V = true, - const bool &is_row = false, bool throw_excp = false); - - // ExpH: - //=========================================== - /** - @brief perform matrix exponential for Hermitian matrix - @param in input Tensor, should be Hermitian - @param a rescale factor - @param b bias - @return - [Tensor] - - description: - perform matrix exponential with \f$O = \exp{aM + b}\f$. - - */ - template - Tensor ExpH(const Tensor &in, const T &a, const T &b =0); - Tensor ExpH(const Tensor &in); - - - - // ExpM: - //=========================================== - /** - @brief perform matrix exponential for generic matrix - @param in input Tensor, should be a square rank-2. - @param a rescale factor - @param b bias - @return - [Tensor] - - description: - perform matrix exponential with \f$O = \exp{aM + b}\f$. 
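A sketch of the Hermitian-matrix exponential described above; arange()/reshape()/permute() from the Tensor API are assumed, and the input is made symmetric by construction:

    cytnx::Tensor M = cytnx::arange(16).reshape({4, 4});
    cytnx::Tensor H = M + M.permute({1, 0});           // real symmetric, hence Hermitian
    cytnx::Tensor G = cytnx::linalg::ExpH(H, -0.05);   // O = exp(a*H + b) with a = -0.05, b = 0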
- - */ - template - Tensor ExpM(const Tensor &in, const T &a, const T &b = 0); - - Tensor ExpM(const Tensor &in); - - // Lanczos: - //=========================================== - /** - @brief perform Lanczos for hermitian/symmetric matrices or linear function. - @param Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). - @param Tin the initial vector, this should be rank-1. - @param method the desired Lanczos method to use, can be 'ER' or 'Gnd'. - @param CvgCrit the convergence criterion of the energy. - @param maxiter the maximum interation steps for each k. - @param k the number of lowest k eigen values. - @param is_V if set to true, the eigen vectors will be returned. - @param is_row whether the return eigen vectors should be in row-major form. - @param max_krydim the maximum krylov subspace dimension for each iteration. - @param verbose print out iteration info. - @return - [eigvals (Tensor), eigvecs (Tensor)(option)] - #description: - This function calculate the eigen value problem using explicitly restarted Lanczos. - #Performance tune: - For small linear dimension, try to reduce max_krydim. - #[Note] - To use, define a linear operator with LinOp class either by assign a custom function or - create a class that inherit LinOp (see LinOp for further details) - */ - std::vector Lanczos(LinOp *Hop, const Tensor &Tin = Tensor(), - const std::string method = "Gnd", const double &CvgCrit = 1.0e-14, - const unsigned int &Maxiter = 10000, const cytnx_uint64 &k = 1, - const bool &is_V = true, const bool &is_row = false, - const cytnx_uint32 &max_krydim = 0, const bool &verbose = false); - - // Lanczos: - //=========================================== - /** - @brief perform Lanczos for hermitian/symmetric matrices or linear function. - @param Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). - @param Tin the initial vector, this should be a UniTensor. - @param method the desired Lanczos method to use, can be 'ER' or 'Gnd'. - @param CvgCrit the convergence criterion of the energy. - @param maxiter the maximum interation steps for each k. - @param k the number of lowest k eigen values. - @param is_V if set to true, the eigen vectors will be returned. - @param is_row whether the return eigen vectors should be in row-major form. - @param max_krydim the maximum krylov subspace dimension for each iteration. - @param verbose print out iteration info. - @return - [eigvals (Tensor), eigvecs (Tensor)(option)] - #description: - This function calculate the eigen value problem using explicitly restarted Lanczos. - #Performance tune: - For small linear dimension, try to reduce max_krydim. - #[Note] - To use, define a linear operator with LinOp class either by assign a custom function or - create a class that inherit LinOp (see LinOp for further details) - */ - std::vector Lanczos(LinOp *Hop, const UniTensor &Tin = UniTensor(), - const std::string method = "Gnd", - const double &CvgCrit = 1.0e-14, - const unsigned int &Maxiter = 10000, const cytnx_uint64 &k = 1, - const bool &is_V = true, const bool &is_row = false, - const cytnx_uint32 &max_krydim = 4, const bool &verbose = false); - - - // Lanczos: - //=========================================== - /** - @brief perform Lanczos for hermitian/symmetric matrices or linear function. - @param Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). - @param k the number of lowest k eigen values. - @param is_V if set to true, the eigen vectors will be returned. 
- @param maxiter the maximum interation steps for each k. - @param CvgCrit the convergence criterion of the energy. - @param is_row whether the return eigen vectors should be in row-major form. - @param Tin the initial vector, this should be rank-1 - @param max_krydim the maximum krylov subspace dimension for each iteration. - @param verbose print out iteration info. - @return - [eigvals (Tensor), eigvecs (Tensor)(option)] - - #description: - This function calculate the eigen value problem using explicitly restarted Lanczos. - - #Performance tune: - For small linear dimension, try to reduce max_krydim. - - #[Note] - To use, define a linear operator with LinOp class either by assign a custom function or - create a class that inherit LinOp (see LinOp for further details) - */ - std::vector Lanczos_ER(LinOp *Hop, const cytnx_uint64 &k = 1, const bool &is_V = true, - const cytnx_uint64 &maxiter = 10000, - const double &CvgCrit = 1.0e-14, const bool &is_row = false, - const Tensor &Tin = Tensor(), const cytnx_uint32 &max_krydim = 4, - const bool &verbose = false); - - // Lanczos: - //=========================================== - /** - @brief perform Lanczos for hermitian/symmetric matrices or linear function to get ground state - and lowest eigen value - @param Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). - @param CvgCrit the convergence criterion of the energy. - @param is_V if set to true, the eigen vectors will be returned. - @param Tin the initial vector, this should be rank-1 - @param verbose print out iteration info. - @param maxiter the maximum interation steps for each k. - @return - [eigvals (Tensor), eigvecs (Tensor)(option)] - - #description: - This function calculate the eigen value problem using naive Lanczos to get ground state and - lowest eigen value. - - - #[Note] - To use, define a linear operator with LinOp class either by assign a custom function or - create a class that inherit LinOp (see LinOp for further details) - */ - std::vector Lanczos_Gnd(LinOp *Hop, const double &CvgCrit = 1.0e-14, - const bool &is_V = true, const Tensor &Tin = Tensor(), - const bool &verbose = false, - const unsigned int &Maxiter = 100000); - - // Lanczos: - //=============================================== - /** - @brief perform Lanczos for hermitian/symmetric matrices or linear function to get ground state - and lowest eigen value - @param Hop the Linear Operator defined by LinOp class or it's inheritance (see LinOp). - @param CvgCrit the convergence criterion of the energy. - @param is_V if set to true, the eigen vectors will be returned. - @param Tin the initial vector, this should be a UniTensor. - @param verbose print out iteration info. - @param maxiter the maximum interation steps for each k. - @return - [eigvals (UniTensor::Dense), eigvecs (UniTensor)(option)] - - #description: - This function calculate the eigen value problem using naive Lanczos to get ground state and - lowest eigen value. - - - #[Note] - To use, define a linear operator with LinOp class either by assign a custom function or - create a class that inherit LinOp (see LinOp for further details) - */ - std::vector Lanczos_Gnd_Ut(LinOp *Hop, const UniTensor &Tin, - const double &CvgCrit = 1.0e-14, const bool &is_V = true, - const bool &verbose = false, - const unsigned int &Maxiter = 100000); - - // Lstsq: - //=========================================== - /** - @brief Return the least-squares solution to a linear matrix equation. - @param A “Coefficient” matrix, must be two-dimensional. 
- @param b Ordinate or “dependent variable” values, must be two-dimensional, the least-squares - solution is calculated for each of the K columns of b. - @param rcond Cut-off ratio for small singular values of a. For the purposes of rank - determination, singular values are treated as zero if they are smaller than rcond times the - largest singular value of A, If it is negative, the machine precision is used. - @return [std::vector] - - 1. the first tensor is least-squares solutions in the K columns. - 2. the second tensor is the sums of squared residuals: Squared Euclidean 2-norm for each - column in b - a @ x. If the rank of a is < N or M <= N, this is a zero Tensor. - 3. the third tensor is the rank of matrix A. - 4. the forth tensor is singular values of A. - - #description: - Computes the vector x that approximatively solves the equation A @ x = b. The equation may - be under-, well-, or over-determined independent columns. If a is square and of full rank, then - x (but for round-off error) is the “exact” solution of the equation. Else, x minimizes the - Euclidean 2-norm || b - a x ||. - - [Ke] - */ - std::vector Lstsq(const Tensor &A, const Tensor &b, const float &rcond = -1); - - - /** - @brief Blas Axpy, performing return = a*x + y - @param a Scalar. - @param x Tensor, can be any rank - @param y Tensor, can be any rank - @return - [Tensor] - - #description: - This function performs a*x+y where x,y are Tensor and a is a Scalar. The dtype of return - Tensor will be the strongest among x,y and a. - - If y is not specify, then it performs a*x -> return - - #[Note] - This will return a new tensor. - - */ - Tensor Axpy(const Scalar &a, const Tensor &x, const Tensor &y = Tensor()); - - void Axpy_(const Scalar &a, const Tensor &x, Tensor &y); - - /** - @brief Blas Ger, performing return = a*vec(x)*vec(y)^T - @param x Tensor, rank-1 with size nx - @param y Tensor, rank-1 with size ny - @param a Scalar, if not provided a = 1. - @return - [Tensor with shape (nx,ny)] - - #description: - This function performs a*x*y^T where x,y are rank-1 Tensor with dimension nx and ny respectively; and a is a Scalar. The dtype of return - Tensor will be the strongest among x,y and a. - - - #[Note] - This will return a new tensor. 
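A least-squares sketch matching the description above (arange()/ones() are assumed; with M > N the solution minimizes the Euclidean 2-norm of b - A x):

    cytnx::Tensor A = cytnx::arange(15).reshape({5, 3});  // 5 equations, 3 unknowns
    cytnx::Tensor b = cytnx::ones({5, 1});
    auto out = cytnx::linalg::Lstsq(A, b);                // {x, residuals, rank, singular values of A}
    cytnx::Tensor x = out[0];                             // least-squares solution, shape (3, 1)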
- - */ - Tensor Ger(const Tensor &x, const Tensor &y, const Scalar &a=Scalar()); - - - - - } // namespace linalg - - // operators: - Tensor operator+(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator+(const T &lc, const Tensor &Rt); - template - Tensor operator+(const Tensor &Lt, const T &rc); - - //------------------------------------ - Tensor operator-(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator-(const T &lc, const Tensor &Rt); - template - Tensor operator-(const Tensor &Lt, const T &rc); - - //----------------------------------- - Tensor operator*(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator*(const T &lc, const Tensor &Rt); - template - Tensor operator*(const Tensor &Lt, const T &rc); - - //---------------------------------- - Tensor operator/(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator/(const T &lc, const Tensor &Rt); - template - Tensor operator/(const Tensor &Lt, const T &rc); - - //---------------------------------- - Tensor operator%(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator%(const T &lc, const Tensor &Rt); - template - Tensor operator%(const Tensor &Lt, const T &rc); - - //---------------------------------- - Tensor operator==(const Tensor &Lt, const Tensor &Rt); - template - Tensor operator==(const T &lc, const Tensor &Rt); - template - Tensor operator==(const Tensor &Lt, const T &rc); - -} // namespace cytnx - -#endif // CYTNX_LINALG_HPP_H_ diff --git a/src/BlockUniTensor.cpp.old b/src/BlockUniTensor.cpp.old deleted file mode 100644 index 4cda25b9b..000000000 --- a/src/BlockUniTensor.cpp.old +++ /dev/null @@ -1,1985 +0,0 @@ -#include "UniTensor.hpp" -#include "Accessor.hpp" -#include "utils/utils.hpp" -#include "utils/utils_internal_interface.hpp" -#include "linalg.hpp" -#include "Generator.hpp" -#include -#include "utils/vec_print.hpp" -#include "utils/vec_concatenate.hpp" -#include -#include -#include -#ifdef UNI_OMP - #include -#endif -#include "backend/lapack_wrapper.hpp" - -using namespace std; -namespace cytnx { - typedef Accessor ac; - void BlockUniTensor::Init(const std::vector &bonds, const std::vector &in_labels, - const cytnx_int64 &rowrank, const unsigned int &dtype, - const int &device, const bool &is_diag, const bool &no_alloc, const std::string &name) { - this->_name = name; - // the entering is already check all the bonds have symmetry. - // need to check: - // 1. the # of symmetry and their type across all bonds - // 2. check if all bonds are non regular: - - // check Symmetry for all bonds - cytnx_uint32 N_symmetry = bonds[0].Nsym(); - vector tmpSyms = bonds[0].syms(); - - cytnx_uint32 N_ket = 0; - for (cytnx_uint64 i = 0; i < bonds.size(); i++) { - // check - cytnx_error_msg( - bonds[i].type() == BD_REG, - "[ERROR][BlockUniTensor] All bonds must be tagged for UniTensor with symmetries.%s", "\n"); - - - cytnx_error_msg( - bonds[i]._impl->_degs.size() == 0, - "[ERROR][BlockUniTensor] All bonds must be in new format for BlockUniTensor!.%s", "\n"); - - // check rank-0 bond: - cytnx_error_msg(bonds[i].dim() == 0, - "[ERROR][BlockUniTensor] All bonds must have dimension >=1%s", "\n"); - // check symmetry and type: - cytnx_error_msg(bonds[i].Nsym() != N_symmetry, - "[ERROR][BlockUniTensor] inconsistant # of symmetry at bond: %d. 
# of " - "symmetry should be %d\n", - i, N_symmetry); - for (cytnx_uint32 n = 0; n < N_symmetry; n++) { - cytnx_error_msg(bonds[i].syms()[n] != tmpSyms[n], - "[ERROR][BlockUniTensor] symmetry mismatch at bond: %d, %s != %s\n", n, - bonds[i].syms()[n].stype_str().c_str(), tmpSyms[n].stype_str().c_str()); - } - N_ket += cytnx_uint32(bonds[i].type() == bondType::BD_KET); - } - - // check rowrank: - cytnx_error_msg((N_ket < 1) || (N_ket > bonds.size() - 1), - "[ERROR][BlockUniTensor] must have at least one ket-bond and one bra-bond.%s", - "\n"); - - - if (rowrank == -1) { - this->_rowrank = N_ket; - //this->_inner_rowrank = N_ket; - } else { - if(is_diag){ - cytnx_error_msg(rowrank != 1, - "[ERROR][BlockUniTensor] rowrank must be = 1 when is_diag = true.%s", "\n"); - }else{ - cytnx_error_msg((rowrank < 0) || (rowrank > bonds.size() ), - "[ERROR][BlockUniTensor] rowrank must be >=0 and <=rank.%s", "\n"); - } - this->_rowrank = rowrank; - //this->_inner_rowrank = rowrank; - // update braket_form >>> - } - - - // check labels: - if (in_labels.size() == 0) { - for (cytnx_int64 i = 0; i < bonds.size(); i++) this->_labels.push_back(to_string(i)); - - } else { - // check bonds & labels dim - cytnx_error_msg(bonds.size() != in_labels.size(), "%s", - "[ERROR] labels must have same lenth as # of bonds."); - - std::vector tmp = vec_unique(in_labels); - cytnx_error_msg(tmp.size() != in_labels.size(), - "[ERROR] labels cannot contain duplicated elements.%s", "\n"); - this->_labels = in_labels; - } - - //cytnx_error_msg(is_diag,"[ERROR][BlockUniTensor] Cannot set is_diag=true when the UniTensor is with symmetry.%s","\n"); - if(is_diag){ - cytnx_error_msg(bonds.size()!=2,"[ERROR][BlockUniTensor] is_diag = true must be rank-2 with one in-bond and one out-bond.%s","\n"); - cytnx_error_msg(bonds[0].type()== bonds[1].type(), "[ERROR][BlockUniTensor] is_diag=true must have one in-bond and oue out-bond.%s","\n"); - if(rowrank != 1, "[ERROR][BlockUniTensor] is_diag = true must have rowrank=1.%s","\n"); - - //checking basis! - cytnx_error_msg(bonds[0].redirect() != bonds[1],"[ERROR][BlockUniTensor] is_diag=true the in-bond and out-bond basis must match!%s","\n"); - - } - this->_is_diag = is_diag; - - // copy bonds, otherwise it will share objects: - this->_bonds = vec_clone(bonds); - this->_is_braket_form = this->_update_braket(); - - // vector blocklens; - // vector> blocksizes; - // cytnx_uint64 totblocksize = 0; - - if(this->_is_diag){ - for(int b=0;b_bonds[0].qnums().size();b++){ - this->_inner_to_outer_idx.push_back({(cytnx_uint64)b,(cytnx_uint64)b}); - if(!no_alloc){ - this->_blocks.push_back(zeros(this->_bonds[0]._impl->_degs[b],dtype,device)); - }else{ - this->_blocks.push_back(Tensor({this->_bonds[0]._impl->_degs[b]},dtype,device,false)); - } - } - - }else{ - // checking how many blocks are there, and the size: - std::vector Loc(this->_bonds.size(),0); - std::vector tot_qns(this->_bonds[0].Nsym()); // use first bond to determine symmetry size - std::vector size(this->_bonds.size()); - bool fin=false; - while(1){ - - //get elem - //cout << "start!" << endl; - //cytnx::vec_print_simple(std::cout , Loc); - this->_fx_get_total_fluxs(Loc, this->_bonds[0].syms(),tot_qns); - - //std::cout << "Loc: "; - //cytnx::vec_print_simple(std::cout, Loc); - //std::cout << "tot_flx: "; - //cytnx::vec_print_simple(std::cout, tot_qns); - - //if exists: - if( std::all_of(tot_qns.begin(),tot_qns.end(), [](const int &i){return i==0;}) ){ - //get size & init block! 
- if(!no_alloc){ - // cytnx_uint64 blockNelem = 1; - for(cytnx_int32 i=0;i_bonds[i]._impl->_degs[Loc[i]]; - // blockNelem *= size[i]; - } - this->_blocks.push_back(zeros(size,dtype,device)); - // blocklens.push_back(blockNelem); - // blocksizes.push_back(size); - // totblocksize += blockNelem; - }else{ - for(cytnx_int32 i=0;i_bonds[i]._impl->_degs[Loc[i]]; - } - this->_blocks.push_back(Tensor(size,dtype,device,false)); - } - // push its loc - this->_inner_to_outer_idx.push_back(Loc); - - } - - while(Loc.size()!=0){ - if(Loc.back()==this->_bonds[Loc.size()-1]._impl->_qnums.size()-1){ - Loc.pop_back(); - continue; - } - else{ - Loc.back()+=1; - //cout << "+1 at loc:" << Loc.size()-1 <_bonds.size()){ - Loc.push_back(0); - } - break; - } - } - - if(Loc.size()==0) break; - } - - // if(!no_alloc){ - // cytnx_uint64 offset=0; - - // char* ptr = (char*)utils_internal::Calloc_cpu( - // totblocksize+blocklens.size()*STORAGE_DEFT_SZ, - // Type.typeSize(dtype)); - // for(cytnx_int64 k=0;k_blocks.push_back(Tensor(Storage(ptr+(offset*Type.typeSize(dtype)), - // blocklens[k],dtype,device,true,cap),blocksizes[k],dtype,device)); - // offset+=cap; - // } - // } - }// is_diag? - - } - - void beauty_print_block(std::ostream &os, const cytnx_uint64 &Nin, const cytnx_uint64 &Nout, const std::vector &qn_indices, const std::vector &bonds, const Tensor &block){ - cytnx_uint64 Total_line = Nin < Nout ? Nout:Nin; - - std::vector Lside(Total_line); - std::vector Rside(Total_line); - std::vector MidL(Total_line); - std::vector MidR(Total_line); - cytnx_uint64 Lmax = 0; - cytnx_uint64 mL = 0; - cytnx_uint64 mR = 0; - - for(int i=0;i_syms[s].stype_str() + "(" + to_string(bonds[i]._impl->_qnums[qn_indices[i]][s]) + ")"; - } - if(Lmax < Lside[i].size()) Lmax = Lside[i].size(); - - MidL[i] += to_string(block.shape()[i]); - if(mL < MidL[i].size()) mL = MidL[i].size(); - } - - //Rside: - if(i_syms[s].stype_str() + "(" + to_string(bonds[Nin+i]._impl->_qnums[qn_indices[Nin+i]][s]) + ")"; - } - // check if is_diag = true: - if(block.shape().size()==1 && bonds.size()==2) - MidR[i] += to_string(block.shape()[i]); - else - MidR[i] += to_string(block.shape()[Nin+i]); - if(mR < MidR[i].size()) mR = MidR[i].size(); - } - - } - - //filling space: - for(int i=0;i= this->_blocks.size()),"[ERROR] index [%d] out of bound. 
should be >0 and < number of available blocks %d\n",idx,this->_blocks.size()); - - std::ostream &os = std::cout; - - os << "========================\n"; - if(this->_is_diag) os << " *is_diag: True\n"; - os << "BLOCK [#" << idx << "]\n"; - /* - os << " |-Qn indices for each axis:\n {\t"; - for(int s=0;s_inner_to_outer_idx[idx].size();s++){ - os << this->_inner_to_outer_idx[idx][s] << "\t"; - } - os << "}" << endl; - os << "\t"; - for(int s=0;s_bonds.size();s++){ - os << ((this->_bonds[s].type()>0)?"OUT":"IN") << "\t"; - } - os << endl; - os << " |-Qn for each axis:\n"; - for(int s=0;s_bonds[0].Nsym();s++){ - os << " " <_bonds[0]._impl->_syms[s].stype_str() << ":\t"; - for(int l=0;l_blocks[idx].shape().size();l++){ - os << std::showpos << this->_bonds[l]._impl->_qnums[this->_inner_to_outer_idx[idx][l]][s] << "\t"; - } - os << std::noshowpos << endl; - } - */ - os << " |- [] : Qn index \n"; - os << " |- Sym(): Qnum of correspond symmetry\n"; - beauty_print_block(os, this->_rowrank, this->_labels.size() - this->_rowrank, this->_inner_to_outer_idx[idx], this->_bonds, this->_blocks[idx]); - - - if(full_info) - os << this->_blocks[idx]; - else{ - os << " |-dtype:\t" << Type.getname(this->_blocks[idx].dtype()) << endl; - os << " |-device:\t" << Device.getname(this->_blocks[idx].device()) << endl; - os << " |-contiguous:\t" << (this->_blocks[idx].is_contiguous()? "True" : "False") << endl; - os << " |-shape:\t"; - vec_print_simple(os,this->_blocks[idx].shape()); - - } - - } - - void BlockUniTensor::print_blocks(const bool &full_info) const{ - std::ostream &os = std::cout; - - os << "-------- start of print ---------\n"; - char *buffer = (char *)malloc(sizeof(char) * 10240); - sprintf(buffer, "Tensor name: %s\n", this->_name.c_str()); - os << std::string(buffer); - if (this->_is_tag) sprintf(buffer, "braket_form : %s\n", this->_is_braket_form ? "True" : "False"); - os << std::string(buffer); - sprintf(buffer, "is_diag : %s\n", this->_is_diag ? "True" : "False"); - os << std::string(buffer); - sprintf(buffer, "[OVERALL] contiguous : %s\n", this->is_contiguous() ? "True" : "False"); - os << std::string(buffer); - - /* - os << "Symmetries: "; - for(int s=0;s_bonds[0].Nsym();s++) - os << this->_bonds[0]._impl->_syms[s].stype_str() << " "; - os << endl; - */ - - // print each blocks with its qnum! - for(int b=0;b_blocks.size();b++){ - this->print_block(b,full_info); - } - - /* - auto tmp_qnums = in.get_blocks_qnums(); - std::vector tmp = in.get_blocks_(true); - sprintf(buffer, "BLOCKS:: %s", "\n"); - os << std::string(buffer); - os << "=============\n"; - - if (!in.is_contiguous()) { - cytnx_warning_msg( - true, - "[WARNING][Symmetric] cout/print UniTensor on a non-contiguous UniTensor. 
the blocks " - "appears here could be different than the current shape of UniTensor.%s", - "\n"); - } - for (cytnx_uint64 i = 0; i < tmp.size(); i++) { - os << "Qnum:" << tmp_qnums[i] << std::endl; - os << tmp[i] << std::endl; - os << "=============\n"; - } - os << "-------- end of print ---------\n"; - */ - free(buffer); - } - - void BlockUniTensor::print_diagram(const bool &bond_info) { - char *buffer = (char *)malloc(10240 * sizeof(char)); - unsigned int BUFFsize = 100; - - sprintf(buffer, "-----------------------%s", "\n"); - std::cout << std::string(buffer); - sprintf(buffer, "tensor Name : %s\n", this->_name.c_str()); - std::cout << std::string(buffer); - sprintf(buffer, "tensor Rank : %d\n", this->_labels.size()); - std::cout << std::string(buffer); - //sprintf(buffer, "block_form : true%s", "\n"); - //std::cout << std::string(buffer); - sprintf(buffer, "contiguous : %s\n", this->is_contiguous() ? "True" : "False"); - std::cout << std::string(buffer); - sprintf(buffer, "valid blocks : %d\n", this->_blocks.size()); - std::cout << std::string(buffer); - sprintf(buffer, "is diag : %s\n", this->is_diag() ? "True" : "False"); - std::cout << std::string(buffer); - sprintf(buffer, "on device : %s\n", this->device_str().c_str()); - std::cout << std::string(buffer); - - cytnx_uint64 Nin = this->_rowrank; - cytnx_uint64 Nout = this->_labels.size() - this->_rowrank; - cytnx_uint64 vl; - if (Nin > Nout) - vl = Nin; - else - vl = Nout; - - std::string bks; - char *l = (char *)malloc(BUFFsize * sizeof(char)); - char *llbl = (char *)malloc(BUFFsize * sizeof(char)); - char *r = (char *)malloc(BUFFsize * sizeof(char)); - char *rlbl = (char *)malloc(BUFFsize * sizeof(char)); - - int Space_Llabel_max=0, Space_Ldim_max=0, Space_Rdim_max =0; - //quickly checking the size for each line, only check the largest! 
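    // The widths gathered below are the longest in-bond label, the widest in-bond
    // dimension string, and the widest out-bond dimension string; they set the padding
    // that keeps labels and dimensions aligned in the ASCII diagram printed afterwards.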
- - for (cytnx_uint64 i = 0; i < vl; i++) { - if(i_labels[i].size()) Space_Llabel_max = this->_labels[i].size(); - if(Space_Ldim_max < to_string(this->_bonds[i].dim()).size()) Space_Ldim_max = to_string(this->_bonds[i].dim()).size(); - } - if(i_bonds[Nin+i].dim()).size()) Space_Rdim_max = to_string(this->_bonds[Nin+i].dim()).size(); - } - } - string LallSpace = (string(" ")*(Space_Llabel_max+3+1)); - string MallSpace = string(" ")*(1 + Space_Ldim_max + 5 + Space_Rdim_max+1); - string M_dashes = string("-")*(1 + Space_Ldim_max + 5 + Space_Rdim_max+1); - - std::string tmpss; - sprintf(buffer, "%s row %s col %s",LallSpace.c_str(),MallSpace.c_str(),"\n"); - std::cout << std::string(buffer); - sprintf(buffer, "%s -%s- %s",LallSpace.c_str(),M_dashes.c_str(),"\n"); - std::cout << std::string(buffer); - for (cytnx_uint64 i = 0; i < vl; i++) { - sprintf(buffer, "%s |%s| %s",LallSpace.c_str(),MallSpace.c_str(),"\n"); - std::cout << std::string(buffer); - - if (i < Nin) { - if (this->_bonds[i].type() == bondType::BD_KET) - bks = " -->"; - else - bks = "*<--"; - memset(l, 0, sizeof(char) * BUFFsize); - memset(llbl, 0, sizeof(char) * BUFFsize); - tmpss = this->_labels[i] + std::string(" ")*(Space_Llabel_max-this->_labels[i].size()); - sprintf(l, "%s %s", tmpss.c_str(), bks.c_str()); - tmpss = to_string(this->_bonds[i].dim()) + std::string(" ")*(Space_Ldim_max-to_string(this->_bonds[i].dim()).size()); - sprintf(llbl, "%s", tmpss.c_str()); - } else { - memset(l, 0, sizeof(char) * BUFFsize); - memset(llbl, 0, sizeof(char) * BUFFsize); - tmpss = std::string(" ")*(Space_Llabel_max+5); - sprintf(l, "%s",tmpss.c_str()); - tmpss = std::string(" ")*(Space_Ldim_max); - sprintf(llbl, "%s",tmpss.c_str()); - } - if (i < Nout) { - if (this->_bonds[Nin + i].type() == bondType::BD_KET) - bks = "<--*"; - else - bks = "--> "; - memset(r, 0, sizeof(char) * BUFFsize); - memset(rlbl, 0, sizeof(char) * BUFFsize); - - sprintf(r, "%s %s", bks.c_str(), this->_labels[Nin + i].c_str()); - - tmpss = to_string(this->_bonds[Nin+i].dim()) + std::string(" ")*(Space_Rdim_max-to_string(this->_bonds[Nin+i].dim()).size()); - sprintf(rlbl, "%s", tmpss.c_str()); - - } else { - memset(r, 0, sizeof(char) * BUFFsize); - memset(rlbl, 0, sizeof(char) * BUFFsize); - sprintf(r, "%s", " "); - tmpss = std::string(" ")*Space_Rdim_max; - sprintf(rlbl, "%s",tmpss.c_str()); - } - sprintf(buffer, " %s| %s %s |%s\n", l, llbl, rlbl, r); - std::cout << std::string(buffer); - } - sprintf(buffer, "%s |%s| %s",LallSpace.c_str(),MallSpace.c_str(),"\n"); - std::cout << std::string(buffer); - sprintf(buffer, "%s -%s- %s",LallSpace.c_str(),M_dashes.c_str(),"\n"); - std::cout << std::string(buffer); - sprintf(buffer, "%s", "\n"); - std::cout << std::string(buffer); - - if (bond_info) { - for (cytnx_uint64 i = 0; i < this->_bonds.size(); i++) { - // sprintf(buffer, "lbl:%d ", this->_labels[i]); - sprintf(buffer, "lbl:%s ", this->_labels[i].c_str()); - std::cout << std::string(buffer); - std::cout << this->_bonds[i] << std::endl; - } - } - - fflush(stdout); - free(l); - free(llbl); - free(r); - free(rlbl); - free(buffer); - } - - boost::intrusive_ptr BlockUniTensor::contiguous() { - if(this->is_contiguous()){ - boost::intrusive_ptr out(this); - return out; - } else{ - BlockUniTensor *tmp = new BlockUniTensor(); - tmp = this->clone_meta(true,true); - tmp->_blocks.resize(this->_blocks.size()); - for(unsigned int b=0;b_blocks.size();b++){ - if(this->_blocks[b].is_contiguous()){ - tmp->_blocks[b] = this->_blocks[b].clone(); - }else{ - tmp->_blocks[b] = 
this->_blocks[b].contiguous(); - } - } - boost::intrusive_ptr out(tmp); - return out; - } - } - - std::vector BlockUniTensor::syms() const { return this->_bonds[0].syms(); } - - - boost::intrusive_ptr BlockUniTensor::permute( - const std::vector &mapper, const cytnx_int64 &rowrank, const bool &by_label) { - - BlockUniTensor *out_raw = this->clone_meta(true,true); - out_raw ->_blocks.resize(this->_blocks.size()); - - std::vector mapper_u64; - if (by_label) { - // cytnx_error_msg(true,"[Developing!]%s","\n"); - std::vector::iterator it; - for (cytnx_uint64 i = 0; i < mapper.size(); i++) { - it = std::find(out_raw->_labels.begin(), out_raw->_labels.end(), std::to_string(mapper[i])); - cytnx_error_msg(it == out_raw->_labels.end(), - "[ERROR] label %d does not exist in current UniTensor.\n", mapper[i]); - mapper_u64.push_back(std::distance(out_raw->_labels.begin(), it)); - } - - } else { - mapper_u64 = std::vector(mapper.begin(), mapper.end()); - //checking: - for(int i=0;i= this->rank(), "[ERROR] index %d out of bound!\n",mapper_u64[i]); - } - - } - - - out_raw->_bonds = vec_map(vec_clone(out_raw->bonds()), mapper_u64); // this will check validity - out_raw->_labels = vec_map(out_raw->labels(), mapper_u64); - - - if(out_raw->_is_diag){ - //cytnx_error_msg(true,"[ERROR][BlockUniTensor] currently do not support permute for is_diag=true for BlockUniTensor!%s","\n"); - if(rowrank >= 0) - cytnx_error_msg(rowrank != 1, "[ERROR][BlockUniTensor] is_diag=true must have rowrank=1.%s","\n"); - out_raw->_is_braket_form = out_raw->_update_braket(); - - }else{ - //inner_to_outer permute! - for(cytnx_int64 b=0;b_inner_to_outer_idx.size();b++){ - out_raw->_inner_to_outer_idx[b] = vec_map(out_raw->_inner_to_outer_idx[b], mapper_u64); - out_raw->_blocks[b] = this->_blocks[b].permute(mapper_u64); - } - - if(rowrank >=0){ - cytnx_error_msg((rowrank >= out_raw->_bonds.size()) || (rowrank < 1), - "[ERROR][BlockUniTensor] rowrank cannot exceed the rank of UniTensor-1, and should be >=1.%s", - "\n"); - out_raw->_rowrank = rowrank; - - } - out_raw->_is_braket_form = out_raw->_update_braket(); - } - boost::intrusive_ptr out(out_raw); - - return out; - } - - boost::intrusive_ptr BlockUniTensor::permute( - const std::vector &mapper, const cytnx_int64 &rowrank) { - - BlockUniTensor *out_raw = this->clone_meta(true,true); - out_raw ->_blocks.resize(this->_blocks.size()); - - std::vector mapper_i64; - // cytnx_error_msg(true,"[Developing!]%s","\n"); - std::vector::iterator it; - for (cytnx_int64 i = 0; i < mapper.size(); i++) { - it = std::find(out_raw->_labels.begin(), out_raw->_labels.end(), mapper[i]); - cytnx_error_msg(it == out_raw->_labels.end(), - "[ERROR] label %s does not exist in current UniTensor.\n", mapper[i].c_str()); - mapper_i64.push_back(std::distance(out_raw->_labels.begin(), it)); - } - - return this->permute(mapper_i64,rowrank,false); - - - } - - void BlockUniTensor::permute_(const std::vector &mapper, const cytnx_int64 &rowrank, - const bool &by_label) { - std::vector mapper_u64; - if (by_label) { - // cytnx_error_msg(true,"[Developing!]%s","\n"); - std::vector::iterator it; - for (cytnx_uint64 i = 0; i < mapper.size(); i++) { - it = std::find(this->_labels.begin(), this->_labels.end(), std::to_string(mapper[i])); - cytnx_error_msg(it == this->_labels.end(), - "[ERROR] label %d does not exist in current UniTensor.\n", mapper[i]); - mapper_u64.push_back(std::distance(this->_labels.begin(), it)); - } - - } else { - mapper_u64 = std::vector(mapper.begin(), mapper.end()); - //checking: - for(int i=0;i= 
this->rank(), "[ERROR] index %d out of bound!\n",mapper_u64[i]); - } - } - - this->_bonds = vec_map(vec_clone(this->bonds()), mapper_u64); // this will check validity - this->_labels = vec_map(this->labels(), mapper_u64); - - if(this->_is_diag){ - - if(rowrank >= 0) - cytnx_error_msg(rowrank != 1, "[ERROR][BlockUniTensor] is_diag=true must have rowrank=1.%s","\n"); - this->_is_braket_form = this->_update_braket(); - - }else{ - //inner_to_outer permute! - for(cytnx_int64 b=0;b_inner_to_outer_idx.size();b++){ - this->_inner_to_outer_idx[b] = vec_map(this->_inner_to_outer_idx[b], mapper_u64); - this->_blocks[b].permute_(mapper_u64); - } - - if (rowrank >= 0) { - cytnx_error_msg((rowrank >= this->_bonds.size()) || (rowrank < 1), - "[ERROR][BlockUniTensor] rowrank cannot exceed the rank of UniTensor-1, and should be >=1.%s", - "\n"); - this->_rowrank = rowrank; - } - this->_is_braket_form = this->_update_braket(); - } - - } - - void BlockUniTensor::permute_(const std::vector &mapper, - const cytnx_int64 &rowrank) { - - std::vector mapper_i64; - // cytnx_error_msg(true,"[Developing!]%s","\n"); - std::vector::iterator it; - for (cytnx_uint64 i = 0; i < mapper.size(); i++) { - it = std::find(this->_labels.begin(), this->_labels.end(), mapper[i]); - cytnx_error_msg(it == this->_labels.end(), - "[ERROR] label %d does not exist in current UniTensor.\n", mapper[i].c_str()); - mapper_i64.push_back(std::distance(this->_labels.begin(), it)); - } - - this->permute_(mapper_i64,rowrank,false); - - } - - boost::intrusive_ptr BlockUniTensor::relabels( - const std::vector &new_labels) { - BlockUniTensor *tmp = this->clone_meta(true, true); - tmp->_blocks = this->_blocks; - tmp->set_labels(new_labels); - boost::intrusive_ptr out(tmp); - return out; - } - boost::intrusive_ptr BlockUniTensor::relabels( - const std::vector &new_labels) { - vector vs(new_labels.size()); - transform(new_labels.begin(), new_labels.end(), vs.begin(), - [](cytnx_int64 x) -> string { return to_string(x); }); - //std::cout << "entry" << endl; - return relabels(vs); - } - - boost::intrusive_ptr BlockUniTensor::relabel(const cytnx_int64 &inx, - const cytnx_int64 &new_label, - const bool &by_label) { - BlockUniTensor *tmp = this->clone_meta(true, true); - tmp->_blocks = this->_blocks; - tmp->set_label(inx, new_label, by_label); - boost::intrusive_ptr out(tmp); - return out; - } - boost::intrusive_ptr BlockUniTensor::relabel(const cytnx_int64 &inx, - const string &new_label) { - BlockUniTensor *tmp = this->clone_meta(true, true); - tmp->_blocks = this->_blocks; - tmp->set_label(inx, new_label); - boost::intrusive_ptr out(tmp); - return out; - } - boost::intrusive_ptr BlockUniTensor::relabel(const string &inx, - const string &new_label) { - BlockUniTensor *tmp = this->clone_meta(true, true); - tmp->_blocks = this->_blocks; - tmp->set_label(inx, new_label); - boost::intrusive_ptr out(tmp); - return out; - } - boost::intrusive_ptr BlockUniTensor::relabel(const cytnx_int64 &inx, - const cytnx_int64 &new_label) { - BlockUniTensor *tmp = this->clone_meta(true, true); - tmp->_blocks = this->_blocks; - tmp->set_label(inx, new_label); - boost::intrusive_ptr out(tmp); - return out; - } - - - - boost::intrusive_ptr BlockUniTensor::contract( - const boost::intrusive_ptr &rhs, const bool &mv_elem_self, - const bool &mv_elem_rhs){ - // checking type - cytnx_error_msg(rhs->uten_type() != UTenType.Block, - "[ERROR] cannot contract symmetry-block UniTensor with other type of UniTensor%s", - "\n"); - - //checking symmetry: - cytnx_error_msg(this->syms() != 
rhs->syms(), - "[ERROR] two UniTensor have different symmetry type cannot contract.%s", "\n"); - - - // get common labels: - std::vector comm_labels; - std::vector comm_idx1, comm_idx2; - vec_intersect_(comm_labels, this->labels(), rhs->labels(), comm_idx1, comm_idx2); - - - - if (comm_idx1.size() == 0) { - - // output instance; - BlockUniTensor *tmp = new BlockUniTensor(); - BlockUniTensor *Rtn = (BlockUniTensor*)rhs.get(); - std::vector out_labels; - std::vector out_bonds; - cytnx_int64 out_rowrank; - - - //no-common label: - vec_concatenate_(out_labels, this->labels(), rhs->labels()); - for (cytnx_uint64 i = 0; i < this->_bonds.size(); i++) - out_bonds.push_back(this->_bonds[i].clone()); - for (cytnx_uint64 i = 0; i < rhs->_bonds.size(); i++) - out_bonds.push_back(rhs->_bonds[i].clone()); - - out_rowrank = this->rowrank() + rhs->rowrank(); - vec_concatenate_(out_labels, this->_labels, rhs->_labels); - - //cout << out_bonds; - tmp->Init(out_bonds,out_labels, out_rowrank, this->dtype(), this->device(),false); - - //tmp->_name = this->_name + "+" + rhs->_name; - - //check each valid block: - std::vector Lidx(this->_bonds.size()); //buffer - std::vector Ridx(rhs->_bonds.size()); //buffer - for(cytnx_int32 b=0;b_blocks.size();b++){ - memcpy(&Lidx[0], &tmp->_inner_to_outer_idx[b][0],sizeof(cytnx_uint64)*this->_bonds.size()); - memcpy(&Ridx[0], &tmp->_inner_to_outer_idx[b][this->_bonds.size()],sizeof(cytnx_uint64)*rhs->_bonds.size()); - - auto IDL = vec_argwhere(this->_inner_to_outer_idx,Lidx); - auto IDR = vec_argwhere(Rtn->_inner_to_outer_idx,Ridx); - - /* - cout << b << endl; - //vec_print_simple(std::cout,tmp->_inner_to_outer_idx[b]); - //vec_print_simple(std::cout,Lidx); - //vec_print_simple(std::cout,Ridx); - vec_print_simple(std::cout,IDL); - vec_print_simple(std::cout,IDR); - */ - if(User_debug){ - if(IDL.size()==IDR.size()){ - cytnx_error_msg(IDL.size()>1,"[ERROR][BlockUniTensor] IDL has more than two ambiguous location!%s","\n"); - cytnx_error_msg(IDR.size()>1,"[ERROR][BlockUniTensor] IDL has more than two ambiguous location!%s","\n"); - - }else{ - cytnx_error_msg(true,"[ERROR] duplication, something wrong!%s","\n"); - - } - } - if(IDL.size()){ - - auto tmpR = Rtn->is_diag()?linalg::Diag(Rtn->_blocks[IDR[0]]):Rtn->_blocks[IDR[0]]; - auto tmpL = this->is_diag()?linalg::Diag(this->_blocks[IDL[0]]):this->_blocks[IDL[0]]; - std::vector shape_L = - vec_concatenate(tmpL.shape(), std::vector(tmpR.shape().size(), 1)); - - tmpL = tmpL.reshape(shape_L); - auto Ott = linalg::Kron(tmpL,tmpR,false,true); - //checking: - cytnx_error_msg(Ott.shape()!=tmp->_blocks[b].shape(),"[ERROR] mismatching shape!%s","\n"); - tmp->_blocks[b] = Ott; - } - - } - - boost::intrusive_ptr out(tmp); - return out; - }else{ - //first, get common index! - - // check qnums & type: - for (int i = 0; i < comm_labels.size(); i++) { - if (User_debug){ - cytnx_error_msg(this->_bonds[comm_idx1[i]].qnums() != rhs->_bonds[comm_idx2[i]].qnums(), - "[ERROR] contract bond @ label %s have qnum mismatch.\n", comm_labels[i].c_str()); - cytnx_error_msg(this->_bonds[comm_idx1[i]].getDegeneracies() != rhs->_bonds[comm_idx2[i]].getDegeneracies(), - "[ERROR] contract bond @ label %s have degeneracies mismatch.\n", comm_labels[i].c_str()); - } - cytnx_error_msg(this->_bonds[comm_idx1[i]].type() + rhs->_bonds[comm_idx2[i]].type(), - "[ERROR] BRA can only contract with KET. 
invalid @ label: %s\n", - comm_labels[i].c_str()); - } - - // proc meta, labels: - std::vector non_comm_idx1 = - vec_erase(utils_internal::range_cpu(this->rank()), comm_idx1); - std::vector non_comm_idx2 = - vec_erase(utils_internal::range_cpu(rhs->rank()), comm_idx2); - - if ((non_comm_idx1.size() == 0) && (non_comm_idx2.size() == 0)) { - std::vector _shadow_comm_idx1(comm_idx1.size()), _shadow_comm_idx2(comm_idx2.size()); - memcpy(_shadow_comm_idx1.data(),comm_idx1.data(),sizeof(cytnx_int64)*comm_idx1.size()); - memcpy(_shadow_comm_idx2.data(),comm_idx2.data(),sizeof(cytnx_int64)*comm_idx2.size()); - // All the legs are contracted, the return will be a scalar - - // output instance; - DenseUniTensor *tmp = new DenseUniTensor(); - - boost::intrusive_ptr Lperm = this->permute(_shadow_comm_idx1); - boost::intrusive_ptr Rperm = rhs->permute(_shadow_comm_idx2); - - BlockUniTensor *Lperm_raw = (BlockUniTensor*)Lperm.get(); - BlockUniTensor *Rperm_raw = (BlockUniTensor*)Rperm.get(); - - - //pair the block and contract using vectordot! - // naive way! - for(unsigned int b=0;b_blocks.size();b++){ - for(unsigned int a=0;a_blocks.size();a++){ - if(Lperm_raw->_inner_to_outer_idx[b] == Rperm_raw->_inner_to_outer_idx[a]){ - if(tmp->_block.dtype()==Type.Void) - tmp->_block = linalg::Vectordot(Lperm_raw->_blocks[b].flatten(),Rperm_raw->_blocks[a].flatten()); - else - tmp->_block += linalg::Vectordot(Lperm_raw->_blocks[b].flatten(),Rperm_raw->_blocks[a].flatten()); - - // std::cout << b << " " << a << endl; - - - } - } - } - - tmp->_rowrank = 0; - tmp->_is_tag = false; - /* - if(mv_elem_self){ - // calculate reverse mapper: - std::vector inv_mapperL(comm_idx1.size()); - for (int i = 0; i < comm_idx1.size(); i++) { - inv_mapperL[comm_idx1[i]] = i; - } - for(unsigned int b=0;b_blocks.size();b++){ - this->_blocks[b].permute_(comm_idx1); - this->_blocks[b].contiguous_(); - this->_blocks[b].permute_(inv_mapperL); - } - } - - if(mv_elem_rhs){ - BlockUniTensor *Rtn = (BlockUniTensor*)rhs.get(); - // calculate reverse mapper: - std::vector inv_mapperR(comm_idx2.size()); - for (int i = 0; i < comm_idx2.size(); i++) { - inv_mapperR[comm_idx2[i]] = i; - } - for(unsigned int b=0;b_blocks.size();b++){ - Rtn->_blocks[b].permute_(comm_idx2); - Rtn->_blocks[b].contiguous_(); - Rtn->_blocks[b].permute_(inv_mapperR); - } - } - */ - boost::intrusive_ptr out(tmp); - return out; - - - }else{ - //cytnx_error_msg(true,"developing!%s","\n"); - BlockUniTensor *tmp = new BlockUniTensor(); - BlockUniTensor *Rtn = (BlockUniTensor*)rhs.get(); - std::vector out_labels; - std::vector out_bonds; - cytnx_int64 out_rowrank; - - // these two cannot omp parallel, due to intrusive_ptr - for (cytnx_uint64 i = 0; i < non_comm_idx1.size(); i++) - out_bonds.push_back(this->_bonds[non_comm_idx1[i]].clone()); - for (cytnx_uint64 i = 0; i < non_comm_idx2.size(); i++) - out_bonds.push_back(rhs->_bonds[non_comm_idx2[i]].clone()); - - vec_concatenate_(out_labels, vec_clone(this->_labels, non_comm_idx1), - vec_clone(rhs->_labels, non_comm_idx2)); - - out_rowrank = this->rowrank() + rhs->rowrank(); - for (cytnx_uint64 i = 0; i < comm_idx1.size(); i++) - if (comm_idx1[i] < this->_rowrank) out_rowrank--; - for (cytnx_uint64 i = 0; i < comm_idx2.size(); i++) - if (comm_idx2[i] < rhs->_rowrank) out_rowrank--; - - // Initialize!! 
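A note on the branch that follows (a reading of this code, with the last Init flag assumed to control zero-filling of block storage): the generic Tensordot/Kron path accumulates into the output blocks with +=, so those blocks must start zeroed, while the batched-GEMM path used for plain non-diagonal floating-point blocks overwrites each output block on its first contribution (beta = 0) and accumulates afterwards (beta = 1), so pre-zeroing can be skipped there. A tiny sketch of that beta convention:

  #include <cstddef>
  #include <vector>

  // C = A*B + beta*C : beta is 0 on the first contribution to a block and 1 afterwards,
  // so the output block never needs to be zero-initialized up front
  void accumulate(std::vector<double> &C, const std::vector<double> &contrib, bool first_touch) {
    double beta = first_touch ? 0.0 : 1.0;
    for (std::size_t i = 0; i < C.size(); ++i) C[i] = beta * C[i] + contrib[i];
  }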
- if((this->dtype()!=Type.Double and this->dtype()!=Type.ComplexDouble) and - (this->dtype()!=Type.Float and this->dtype()!=Type.ComplexFloat) or - this->is_diag() or Rtn->is_diag()){ - // cout<<"IM IN!!!"<Init(out_bonds,out_labels, out_rowrank, this->dtype(), this->device(), false, false); - } else { - tmp->Init(out_bonds,out_labels, out_rowrank, this->dtype(), this->device(), false, true); - } - - // now, build the itoi table: - std::vector< std::vector > itoiL_common(this->_blocks.size()), itoiR_common(Rtn->_blocks.size()); - // std::vector< std::vector > Bkk; - - for(cytnx_int64 a=0;a_blocks.size();a++){ - itoiL_common[a] = vec_clone(this->_inner_to_outer_idx[a],comm_idx1); - } - - // std::unordered_map, std::vector, VectorHasher> mp; - // std::unordered_map, cytnx_uint64, VectorHasher> mpC; - boost::unordered_map, std::vector > mp; - boost::unordered_map, cytnx_uint64> mpC; - - for(cytnx_int64 b=0;b_blocks.size();b++){ - itoiR_common[b] = vec_clone(Rtn->_inner_to_outer_idx[b],comm_idx2); - if(!mp[itoiR_common[b]].size()) - mp[itoiR_common[b]] = std::vector(1,b); - else mp[itoiR_common[b]].push_back(b); - } - for(cytnx_int64 b=0;b_blocks.size();b++){ - mpC[tmp->_inner_to_outer_idx[b]] = b; - } - - std::vector Lgbuffer; - std::vector itoiR_idx; - std::vector oldshapeL; - std::vector> oldshapeR(Rtn->_blocks.size(),std::vector()); - std::vector> oldshapeC; - // smallvec reshaped(tmp->_blocks.size(),false); - std::vector reshaped(tmp->_blocks.size(),false); - // smallvec calculated(tmp->_blocks.size(),false); - for(cytnx_int64 a=0;a_blocks.size();a++){ - oldshapeC.push_back(tmp->_blocks[a].shape()); - } - // std::vector non_contract_l,non_contract_r; - std::vector mapperL,inv_mapperL(this->_blocks[0].shape().size()); - std::vector mapperR,inv_mapperR(Rtn->_blocks[0].shape().size()); - vec_concatenate_(mapperL, non_comm_idx1, comm_idx1); - vec_concatenate_(mapperR, comm_idx2, non_comm_idx2); - for (int aa = 0; aa < mapperL.size(); aa++) { - inv_mapperL[mapperL[aa]] = aa; - } - for (int aa = 0; aa < mapperR.size(); aa++) { - inv_mapperR[mapperR[aa]] = aa; - } - // std::vector> inv_mapperR(Rtn->_blocks.size(),std::vector(Rtn->_blocks[0].shape().size())); - - if(this->is_diag()!=Rtn->is_diag()){ - for(cytnx_int64 a=0;a_blocks.size();a++){ - cytnx_int64 comm_dim = 1; - itoiR_idx = mp[itoiL_common[a]]; - for(cytnx_uint64 b : itoiR_idx){ - Lgbuffer.resize(non_comm_idx1.size()+non_comm_idx2.size()); - for(cytnx_uint64 cc=0;cc_inner_to_outer_idx[a][non_comm_idx1[cc]]; - } - for(cytnx_uint64 cc=non_comm_idx1.size();cc_inner_to_outer_idx[b][non_comm_idx2[cc-non_comm_idx1.size()]]; - } - // vec_concatenate_(Lgbuffer, vec_clone(this->_inner_to_outer_idx[a],non_comm_idx1) - // , vec_clone(Rtn->_inner_to_outer_idx[b],non_comm_idx2)); - // auto it = std::find(tmp->_inner_to_outer_idx.begin(),tmp->_inner_to_outer_idx.end(),Lgbuffer); - // cytnx_int64 targ_b = it - tmp->_inner_to_outer_idx.begin(); - cytnx_int64 targ_b = mpC[Lgbuffer]; - tmp->_blocks[targ_b] += linalg::Tensordot_dg(this->_blocks[a], Rtn->_blocks[b], comm_idx1, comm_idx2, this->is_diag()); - } - } - }else{ - // smallvec transs(Rtn->_blocks.size(), 'N'); - // smallvec ms(Rtn->_blocks.size(),0),ns(Rtn->_blocks.size(),0),ks(Rtn->_blocks.size(),0); - // smallvec doublealpha(Rtn->_blocks.size(),1.0); - // smallvec doublebeta(Rtn->_blocks.size(),0.0); - // smallvec floatalpha(Rtn->_blocks.size(),1.0); - // smallvec floatbeta(Rtn->_blocks.size(),0.0); - // smallvec complexalpha(Rtn->_blocks.size(),1.0); - // smallvec 
complexbeta(Rtn->_blocks.size(),0.0); - // smallvec complexalpha_f(Rtn->_blocks.size(),1.0); - // smallvec complexbeta_f(Rtn->_blocks.size(),0.0); - // smallvec LMems(Rtn->_blocks.size(),0),RMems(Rtn->_blocks.size(),0),CMems(Rtn->_blocks.size(),0); - // smallvec group_size(Rtn->_blocks.size(),1); - - std::vector transs(Rtn->_blocks.size(), 'N'); - std::vector ms(Rtn->_blocks.size(),0),ns(Rtn->_blocks.size(),0),ks(Rtn->_blocks.size(),0); - std::vector doublealpha(Rtn->_blocks.size(),1.0); - std::vector doublebeta(Rtn->_blocks.size(),0.0); - std::vector floatalpha(Rtn->_blocks.size(),1.0); - std::vector floatbeta(Rtn->_blocks.size(),0.0); - std::vector complexalpha(Rtn->_blocks.size(),1.0); - std::vector complexbeta(Rtn->_blocks.size(),0.0); - std::vector complexalpha_f(Rtn->_blocks.size(),1.0); - std::vector complexbeta_f(Rtn->_blocks.size(),0.0); - std::vector LMems(Rtn->_blocks.size(),0),RMems(Rtn->_blocks.size(),0),CMems(Rtn->_blocks.size(),0); - std::vector group_size(Rtn->_blocks.size(),1); - - for(cytnx_int64 a=0;a_blocks.size();a++){ - cytnx_int64 comm_dim = 1; - itoiR_idx = mp[itoiL_common[a]]; - for (cytnx_uint64 aa = 0; aa < comm_idx1.size(); aa++) { - comm_dim *= this->_blocks[a].shape()[comm_idx1[aa]]; - } - // vec_concatenate_(mapperL, non_comm_idx1, comm_idx1); - // for (int aa = 0; aa < mapperL.size(); aa++) { - // inv_mapperL[mapperL[aa]] = aa; - // } - this->_blocks[a].permute_(mapperL); - oldshapeL = this->_blocks[a].shape(); - this->_blocks[a].reshape_({-1, comm_dim}); - - for(cytnx_uint64 binx = 0;binx_blocks[b].permute_(mapperR); - // oldshapeR = Rtn->_blocks[b].shape(); - oldshapeR[b] = Rtn->_blocks[b].shape(); - Rtn->_blocks[b].reshape_({comm_dim, -1}); - Lgbuffer.resize(non_comm_idx1.size()+non_comm_idx2.size()); - for(cytnx_uint64 cc=0;cc_inner_to_outer_idx[a][non_comm_idx1[cc]]; - } - for(cytnx_uint64 cc=non_comm_idx1.size();cc_inner_to_outer_idx[b][non_comm_idx2[cc-non_comm_idx1.size()]]; - } - // vec_concatenate_(Lgbuffer, vec_clone(this->_inner_to_outer_idx[a],non_comm_idx1) - // , vec_clone(Rtn->_inner_to_outer_idx[b],non_comm_idx2)); - - // auto it = std::find(tmp->_inner_to_outer_idx.begin(),tmp->_inner_to_outer_idx.end(),Lgbuffer); - // cytnx_int64 targ_b = it - tmp->_inner_to_outer_idx.begin(); - cytnx_int64 targ_b = mpC[Lgbuffer]; - doublebeta[binx]=1.0; - complexbeta[binx]=1.0; - floatbeta[binx]=1.0; - complexbeta_f[binx]=1.0; - if(!reshaped[targ_b]){ - tmp->_blocks[targ_b].reshape_({(cytnx_int64)this->_blocks[a].shape()[0], (cytnx_int64)Rtn->_blocks[b].shape()[1]}); - reshaped[targ_b] = true; - doublebeta[binx]=0.0; - complexbeta[binx]=0.0; - floatbeta[binx]=0.0; - complexbeta_f[binx]=0.0; - // if(tmp->dtype()==Type.Double and this->dtype()==Type.Double and Rtn->dtype()==Type.Double){ - // doublebeta[binx]=0.0; - // }else if(tmp->dtype()==Type.ComplexDouble and this->dtype()==Type.ComplexDouble and Rtn->dtype()==Type.ComplexDouble){ - // complexbeta[binx]=0.0; - // } - } - if((tmp->dtype()==Type.Double and this->dtype()==Type.Double and Rtn->dtype()==Type.Double) or - (tmp->dtype()==Type.ComplexDouble and this->dtype()==Type.ComplexDouble and Rtn->dtype()==Type.ComplexDouble) or - (tmp->dtype()==Type.Float and this->dtype()==Type.Float and Rtn->dtype()==Type.Float) or - (tmp->dtype()==Type.ComplexFloat and this->dtype()==Type.ComplexFloat and Rtn->dtype()==Type.ComplexFloat) - ){ - ms[binx] = this->_blocks[a].shape()[0]; - ns[binx] = Rtn->_blocks[b].shape()[1]; - ks[binx] = comm_dim; - LMems[binx] = this->_blocks[a].storage()._impl->Mem; - RMems[binx] = 
Rtn->_blocks[b].storage()._impl->Mem; - CMems[binx] = tmp->_blocks[targ_b].storage()._impl->Mem; - // linalg::d_Matmul(this->_blocks[a], Rtn->_blocks[b], tmp->_blocks[targ_b], 1.0, 1.0, false); - } else { - tmp->_blocks[targ_b] += linalg::Matmul(this->_blocks[a], Rtn->_blocks[b]).reshape(tmp->_blocks[targ_b].shape()); - } - // Rtn->_blocks[b].reshape_(oldshapeR); - // Rtn->_blocks[b].permute_(inv_mapperR); - } - - if(tmp->dtype()==Type.Double and this->dtype()==Type.Double and Rtn->dtype()==Type.Double){ - blas_int group_count = itoiR_idx.size(); - // std::vector group_size(group_count,1); - group_size.resize(group_count,1); - dgemm_batch(transs.data(),transs.data(),ns.data(),ms.data(),ks.data(),doublealpha.data(), - (const cytnx_double**)RMems.data(),ns.data(),(const cytnx_double**)LMems.data(), - ks.data(),doublebeta.data(),(cytnx_double**)CMems.data(),ns.data(),&group_count,group_size.data()); - }else if(tmp->dtype()==Type.ComplexDouble and this->dtype()==Type.ComplexDouble and Rtn->dtype()==Type.ComplexDouble){ - blas_int group_count = itoiR_idx.size(); - // std::vector group_size(group_count,1); - group_size.resize(group_count,1); - zgemm_batch(transs.data(),transs.data(),ns.data(),ms.data(),ks.data(),complexalpha.data(), - (const cytnx_complex128**)RMems.data(),ns.data(),(const cytnx_complex128**)LMems.data(), - ks.data(),complexbeta.data(),(cytnx_complex128**)CMems.data(),ns.data(),&group_count,group_size.data()); - }else if(tmp->dtype()==Type.Float and this->dtype()==Type.Float and Rtn->dtype()==Type.Float){ - blas_int group_count = itoiR_idx.size(); - // std::vector group_size(group_count,1); - group_size.resize(group_count,1); - sgemm_batch(transs.data(),transs.data(),ns.data(),ms.data(),ks.data(),floatalpha.data(), - (const cytnx_float**)RMems.data(),ns.data(),(const cytnx_float**)LMems.data(), - ks.data(),floatbeta.data(),(cytnx_float**)CMems.data(),ns.data(),&group_count,group_size.data()); - }else if(tmp->dtype()==Type.ComplexFloat and this->dtype()==Type.ComplexFloat and Rtn->dtype()==Type.ComplexFloat){ - blas_int group_count = itoiR_idx.size(); - // std::vector group_size(group_count,1); - group_size.resize(group_count,1); - cgemm_batch(transs.data(),transs.data(),ns.data(),ms.data(),ks.data(),complexalpha_f.data(), - (const cytnx_complex64**)RMems.data(),ns.data(),(const cytnx_complex64**)LMems.data(), - ks.data(),complexbeta_f.data(),(cytnx_complex64**)CMems.data(),ns.data(),&group_count,group_size.data()); - } - - for(cytnx_uint64 binx = 0;binx_blocks[b].reshape_(oldshapeR[b]); - Rtn->_blocks[b].permute_(inv_mapperR); - } - - this->_blocks[a].reshape_(oldshapeL); - this->_blocks[a].permute_(inv_mapperL); - } - - for(cytnx_int64 a=0;a_blocks.size();a++){ - tmp->_blocks[a].reshape_(oldshapeC[a]); - if(!reshaped[a]){ - // cout<<"IM ININININ"<_blocks[a].storage().print_info(); - // tmp->_blocks[a].storage().print(); - tmp->_blocks[a].storage().set_zeros(); - // cout<<"-----------"<_blocks[a].storage().print_info(); - // tmp->_blocks[a].storage().print(); - // cout<<"IM OUTOUTOUT"< out(tmp); - return out; - - - - } // does it contract all the bond? - - cytnx_error_msg(true,"something wrong!%s","\n"); - - } // does it contract all the bond? 
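For reference, the pairing logic above matches blocks through a hash map keyed by the quantum indices of the contracted bonds, so each left-hand block finds its compatible right-hand blocks without scanning the whole list. A stripped-down model of that lookup (simplified types, std::unordered_map in place of boost::unordered_map):

  #include <cstddef>
  #include <unordered_map>
  #include <vector>

  using QIdx = std::vector<std::size_t>;  // one quantum-sector index per contracted bond

  struct QIdxHash {
    std::size_t operator()(const QIdx &v) const {
      std::size_t h = 0;
      for (auto x : v) h = h * 131 + std::hash<std::size_t>{}(x);
      return h;
    }
  };

  // group right-hand blocks by their quantum indices on the contracted bonds
  std::unordered_map<QIdx, std::vector<std::size_t>, QIdxHash> group_by_qidx(
      const std::vector<QIdx> &right_qidx) {
    std::unordered_map<QIdx, std::vector<std::size_t>, QIdxHash> mp;
    for (std::size_t b = 0; b < right_qidx.size(); ++b) mp[right_qidx[b]].push_back(b);
    return mp;
  }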
- - - - }; - - - void BlockUniTensor::Transpose_(){ - // modify tag - for (int i = 0; i < this->bonds().size(); i++) { - this->bonds()[i].redirect_(); - // this->bonds()[i].qnums() = this->bonds()[i].calc_reverse_qnums(); - } - - }; - - void BlockUniTensor::normalize_(){ - Scalar out(0,this->dtype()); - for(auto &block: this->_blocks){ - out += Scalar(linalg::Pow(linalg::Norm(block),2).item()); - } - out = sqrt(out); - for(auto &block: this->_blocks){ - block/=out; - } - }; - - void BlockUniTensor::Trace_(const cytnx_int64 &a, const cytnx_int64 &b, const bool &by_label){ - - // 1) from label to indx. - cytnx_int64 ida, idb; - - if (by_label) { - ida = vec_where(this->_labels, std::to_string(a)); - idb = vec_where(this->_labels, std::to_string(b)); - } else { - cytnx_error_msg(a < 0 || b < 0, "[ERROR] invalid index a, b%s", "\n"); - cytnx_error_msg(a >= this->rank() || b >= this->rank(), "[ERROR] index out of bound%s", "\n"); - ida = a; - idb = b; - } - - this->Trace_(ida,idb); - - } - - void BlockUniTensor::Trace_(const std::string &a, const std::string &b){ - // 1) from label to indx. - cytnx_int64 ida, idb; - - ida = vec_where(this->_labels, a); - idb = vec_where(this->_labels, b); - - this->Trace_(ida,idb); - } - void BlockUniTensor::Trace_(const cytnx_int64 &a, const cytnx_int64 &b){ - - cytnx_int64 ida = a; - cytnx_int64 idb = b; - - // check if indices are the same: - cytnx_error_msg(a < 0 || b < 0, "[ERROR] invalid index a, b%s", "\n"); - cytnx_error_msg(a >= this->rank() || b >= this->rank(), "[ERROR] index out of bound%s", "\n"); - - cytnx_error_msg(ida == idb, - "[ERROR][BlockUniTensor::Trace_] index a and index b should not be the same.%s", - "\n"); - - // check if two bonds type are contractable: - cytnx_error_msg(this->_bonds[ida].type() == this->_bonds[idb].type(),"[ERROR] BD_BRA/BD_OUT can only contract with BD_KET/BD_IN%s","\n"); - - // check if two bonds dimension matches: - cytnx_error_msg( - this->_bonds[ida]._impl->_degs != this->_bonds[idb]._impl->_degs, - "[ERROR][BlockUniTensor::Trace_] The dimension of two bond for trace does not match!%s", - "\n"); - - // check if two bonds qnum matches: - cytnx_error_msg( - this->_bonds[ida]._impl->_qnums != this->_bonds[idb]._impl->_qnums, - "[ERROR][BlockUniTensor::Trace_] The quantum numbers of two bond for trace does not match!%s", - "\n"); - - - // update rowrank: - cytnx_int64 tmpRk = this->_rowrank; - if (ida < tmpRk) this->_rowrank--; - if (idb < tmpRk) this->_rowrank--; - - // 1) remove the bond, labels: - if (ida > idb) std::swap(ida, idb); - this->_bonds.erase(this->_bonds.begin() + idb); - this->_bonds.erase(this->_bonds.begin() + ida); - this->_labels.erase(this->_labels.begin() + idb); - this->_labels.erase(this->_labels.begin() + ida); - - //trace the block! - std::vector new_blocks; - vec2d new_itoi; - if(this->_labels.size()==0){ - // if there is no leg left, leaving only one block, and let API to handle the BlockUniTensor->DenseUniTensor! 
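// In this full-trace branch only blocks whose quantum index on bond `ida` equals the one on bond `idb` contribute; their traces (linalg::Sum of the stored diagonal when is_diag() is true, the block's Trace otherwise) are summed into the single one-element block created below.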
- new_blocks.push_back(zeros(1,this->dtype(),this->device())); - for(cytnx_int64 i=0;i_blocks.size();i++){ - if(this->_inner_to_outer_idx[i][ida] == this->_inner_to_outer_idx[i][idb]){ - if(this->is_diag()) new_blocks.back()+=linalg::Sum(this->_blocks[i]); - else new_blocks.back() += this->_blocks[i].Trace(ida,idb); - } - } - - }else{ - std::map , cytnx_uint64> tmap; - std::map , cytnx_uint64>::iterator itr; - for(cytnx_int64 i=0;i_blocks.size();i++){ - //std::cout << "blk: " << i << std::endl; - if(this->_inner_to_outer_idx[i][ida] == this->_inner_to_outer_idx[i][idb]){ - auto s = this->_inner_to_outer_idx[i]; - s.erase(s.begin() + idb); - s.erase(s.begin() + ida); - auto itr = tmap.find(s); - if(itr!=tmap.end()) - new_blocks[itr->second] += this->_blocks[i].Trace(ida,idb); - else{ - tmap[s] = new_blocks.size(); - new_blocks.push_back(this->_blocks[i].Trace(ida,idb)); - new_itoi.push_back(s); - } - } - } - - } - - this->_blocks = new_blocks; - this->_inner_to_outer_idx = new_itoi; - - } - - - Tensor BlockUniTensor::Norm() const{ - Scalar t; - if (this->_blocks.size()) { - t = linalg::Norm(this->_blocks[0]).item(); - t *= t; - for (int blk = 1; blk < this->_blocks.size(); blk++) { - Scalar tmp = linalg::Norm(this->_blocks[blk]).item(); - t += tmp * tmp; - } - - } else { - t = Scalar(0, Type.Double); - } - - t = sqrt(t); - Tensor R({1}, t.dtype()); - - R(0) = t; - return R; - } - - - // helper function: - void BlockUniTensor::_fx_locate_elem(cytnx_int64 &bidx, std::vector &loc_in_T,const std::vector &locator) const { - // 1. check if out of range: - cytnx_error_msg(locator.size() != this->_bonds.size(), - "[ERROR] len(locator) does not match the rank of tensor.%s", "\n"); - - - for (int i = 0; i < this->_bonds.size(); i++) { - cytnx_error_msg(locator[i] >= this->_bonds[i].dim(), - "[ERROR][BlockUniTensor][elem_exists] locator @index: %d out of range.\n", - i); - } - - // 2. calculate the location is in which qindices: - if(this->is_diag()){ - if(locator[0]!=locator[1]) bidx = -1; - else{ - loc_in_T.push_back(locator[0]); - std::vector qindices(2); - // its diag, so we can just use single bond! 
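The walk that follows converts a flat index on the bond into a (quantum sector, offset inside that sector) pair by peeling off degeneracies one sector at a time. A standalone sketch with made-up numbers: for degeneracies {2, 3}, flat index 4 resolves to sector 1, offset 2:

  #include <cstddef>
  #include <vector>

  int main() {
    std::vector<std::size_t> degs = {2, 3};  // per-sector degeneracies on the bond
    std::size_t off = 4;                     // flat index along the bond
    std::size_t sector = 0;
    while (sector < degs.size() && off >= degs[sector]) { off -= degs[sector]; ++sector; }
    // here sector == 1 and off == 2: the element sits at position 2 inside sector 1
    return 0;
  }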
- for(int d=0;d_bonds[0]._impl->_degs.size();d++){ - if(loc_in_T[0] >= this->_bonds[0]._impl->_degs[d]) loc_in_T[0] -= this->_bonds[0]._impl->_degs[d]; - else{qindices[0] = qindices[1] = d; break;} - } - auto it = std::find(this->_inner_to_outer_idx.begin(),this->_inner_to_outer_idx.end(),qindices); - if(it == this->_inner_to_outer_idx.end()) bidx = -1; - else bidx = it - this->_inner_to_outer_idx.begin(); - - } - - }else{ - loc_in_T = locator; - std::vector qindices(loc_in_T.size()); - for(int i=0;i_bonds.size();i++){ - for(int d=0;d_bonds[i]._impl->_degs.size();d++){ - if(loc_in_T[i] >= this->_bonds[i]._impl->_degs[d]) loc_in_T[i] -= this->_bonds[i]._impl->_degs[d]; - else{qindices[i] = d; break;} - } - } - - auto it = std::find(this->_inner_to_outer_idx.begin(),this->_inner_to_outer_idx.end(),qindices); - - if(it == this->_inner_to_outer_idx.end()) bidx = -1; - else bidx = it - this->_inner_to_outer_idx.begin(); - } - } - - - - bool BlockUniTensor::elem_exists(const std::vector &locator) const{ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return !(bidx < 0); - } - - //------------------------------------------- - // at_for_sparse - Scalar::Sproxy BlockUniTensor::at_for_sparse(const std::vector &locator){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - if(bidx<0){ - return Scalar::Sproxy(this->NullRefTensor.storage()._impl,0); - }else{ - return this->_blocks[bidx].at(loc_in_T); - } - } - cytnx_complex128 &BlockUniTensor::at_for_sparse(const std::vector &locator, - const cytnx_complex128 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_complex64 &BlockUniTensor::at_for_sparse(const std::vector &locator, - const cytnx_complex64 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - - } - cytnx_double &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_double &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - - } - cytnx_float &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_float &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_uint64 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint64 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_int64 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int64 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_uint32 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint32 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_int32 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int32 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_uint16 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint16 &aux){ - cytnx_int64 bidx; 
- std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - cytnx_int16 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int16 &aux){ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - - - const Scalar::Sproxy BlockUniTensor::at_for_sparse(const std::vector &locator) const{ - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - if(bidx<0){ - return Scalar::Sproxy(this->NullRefTensor.storage()._impl,0); - }else{ - return this->_blocks[bidx].at(loc_in_T); - } - } - const cytnx_complex128 &BlockUniTensor::at_for_sparse(const std::vector &locator, - const cytnx_complex128 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_complex64 &BlockUniTensor::at_for_sparse(const std::vector &locator, - const cytnx_complex64 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - - } - const cytnx_double &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_double &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - - } - const cytnx_float &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_float &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_uint64 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint64 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_int64 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int64 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_uint32 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint32 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_int32 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int32 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_uint16 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_uint16 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - const cytnx_int16 &BlockUniTensor::at_for_sparse(const std::vector &locator, const cytnx_int16 &aux)const { - cytnx_int64 bidx; - std::vector loc_in_T; - this->_fx_locate_elem(bidx,loc_in_T,locator); - return this->_blocks[bidx].at(loc_in_T); - } - - - void BlockUniTensor::_save_dispatch(std::fstream &f) const { - // cytnx_error_msg(true,"[ERROR] Save for SparseUniTensor is under developing!!%s","\n"); - - cytnx_uint64 Nblocks = this->_blocks.size(); - f.write((char *)&Nblocks, sizeof(cytnx_uint64)); - - // save inner_to_outer_idx: - for(unsigned int 
b=0;b_inner_to_outer_idx[b][0],sizeof(cytnx_uint64)*this->_bonds.size()); - } - for (unsigned int i = 0; i < this->_blocks.size(); i++) { - this->_blocks[i]._Save(f); - } - } - - void BlockUniTensor::_load_dispatch(std::fstream &f) { - // cytnx_error_msg(true,"[ERROR] Save for SparseUniTensor is under developing!!%s","\n"); - - cytnx_uint64 Nblocks; - f.read((char *)&Nblocks, sizeof(cytnx_uint64)); - - this->_inner_to_outer_idx = std::vector< std::vector >(Nblocks,std::vector(this->_bonds.size())); - // read inner_to_outer_idx: - for(unsigned int b=0;b_inner_to_outer_idx[b][0],sizeof(cytnx_uint64)*this->_bonds.size()); - } - this->_blocks.resize(Nblocks); - - for (unsigned int i = 0; i < this->_blocks.size(); i++) { - this->_blocks[i]._Load(f); - } - } - - - void BlockUniTensor::truncate_(const cytnx_int64 &bond_idx, const cytnx_uint64 &q_index, - const bool &by_label){ - - cytnx_error_msg(this->is_diag(),"[ERROR][BlockUniTensor][truncate_] cannot use truncate_ when is_diag() = true.%s","\n"); - cytnx_int64 bidx = bond_idx; - if(by_label){ - auto it = std::find(this->_labels.begin(), this->_labels.end(), to_string(bond_idx)); - cytnx_error_msg(it == this->_labels.end(), - "[ERROR] label [%d] does not exist in current UniTensor.\n", bond_idx); - bidx = it - this->_labels.begin(); - } - - cytnx_error_msg((bidx>=this->_labels.size())|| (bidx < 0), "[ERROR][BlockUniTensor][truncate_] bond_idx out of bound.%s","\n"); - cytnx_error_msg(q_index >= this->_bonds[bidx].qnums().size(), "[ERROR][BlockUniTensor][truncate_] q_index out of bound @ specify Bond @[%d].\n",bidx); - - cytnx_error_msg(this->_bonds[bidx].qnums().size()==1,"[ERROR][BlockUniTensor][truncate_] cannot remove the only qnums on a given Bond!%s","\n"); - - this->_bonds[bidx]._impl->_rm_qnum(q_index); - - //traversal all blocks, find all blocks that need to remove: - std::vector locs; - for(cytnx_int64 b=0;b_blocks.size();b++){ - if(this->_inner_to_outer_idx[b][bidx] == q_index) locs.push_back(b); - } - - //remove! - vec_erase_(this->_inner_to_outer_idx,locs); - vec_erase_(this->_blocks,locs); - - - - } - void BlockUniTensor::truncate_(const std::string &bond_idx, const cytnx_uint64 &q_index){ - auto it = std::find(this->_labels.begin(), this->_labels.end(), bond_idx); - cytnx_error_msg(it == this->_labels.end(), - "[ERROR] label [%s] does not exist in current UniTensor.\n", bond_idx.c_str()); - - cytnx_int64 idx = it - this->_labels.begin(); - this->truncate_(idx,q_index,false); - } - void BlockUniTensor::truncate_(const cytnx_int64 &bond_idx, const cytnx_uint64 &q_index){ - this->truncate_(bond_idx,q_index,false); - } - - - - void BlockUniTensor::Mul_(const Scalar &rhs) { - // cytnx_error_msg(true,"[ERROR] cannot perform arithmetic on all tagged tensor, @spase - // unitensor%s","\n"); - for (cytnx_int64 i = 0; i < this->_blocks.size(); i++) { - this->_blocks[i] *= rhs; - } - } - - void BlockUniTensor::Div_(const Scalar &rhs) { - // cytnx_error_msg(true,"[ERROR] cannot perform arithmetic on all tagged tensor, @spase - // unitensor%s","\n"); - for (cytnx_int64 i = 0; i < this->_blocks.size(); i++) { - this->_blocks[i] /= rhs; - } - } - - - void BlockUniTensor::Add_(const boost::intrusive_ptr &rhs){ - //checking Type: - cytnx_error_msg(rhs->uten_type()!=UTenType.Block,"[ERROR] cannot add two UniTensor with different type/format.%s","\n"); - - BlockUniTensor* Rtn = (BlockUniTensor*)rhs.get(); - - // 1) check each bond. 
- cytnx_error_msg(this->_bonds.size()!=Rtn->_bonds.size(),"[ERROR] cannot add two BlockUniTensor with different rank!%s","\n"); - for(cytnx_int64 i=0;i_bonds.size();i++){ - cytnx_error_msg(this->_bonds[i] != Rtn->_bonds[i],"[ERROR] Bond @ index: %d does not match. Therefore cannot perform Add of two UniTensor\n",i); - } - - cytnx_error_msg(this->is_diag()!=Rtn->is_diag(),"[ERROR] cannot add BlockUniTensor with is_diag=true and is_diag=false.%s","\n"); - - // 2) finding the blocks (they might be not in the same order! - for(cytnx_int64 b=0;b_blocks.size();b++){ - for(cytnx_int64 a=0;a_blocks.size();a++){ - if(this->_inner_to_outer_idx[b] == Rtn->_inner_to_outer_idx[(b+a)%Rtn->_blocks.size()]){ - this->_blocks[b] += Rtn->_blocks[(b+a)%Rtn->_blocks.size()]; - break; - } - } - } - - } - - void BlockUniTensor::Mul_(const boost::intrusive_ptr &rhs){ - //checking Type: - cytnx_error_msg(rhs->uten_type()!=UTenType.Block,"[ERROR] cannot add two UniTensor with different type/format.%s","\n"); - - BlockUniTensor* Rtn = (BlockUniTensor*)rhs.get(); - - // 1) check each bond. - cytnx_error_msg(this->_bonds.size()!=Rtn->_bonds.size(),"[ERROR] cannot add two BlockUniTensor with different rank!%s","\n"); - for(cytnx_int64 i=0;i_bonds.size();i++){ - cytnx_error_msg(this->_bonds[i] != Rtn->_bonds[i],"[ERROR] Bond @ index: %d does not match. Therefore cannot perform Add of two UniTensor\n",i); - } - - cytnx_error_msg(this->is_diag()!=Rtn->is_diag(),"[ERROR] cannot add BlockUniTensor with is_diag=true and is_diag=false.%s","\n"); - - // 2) finding the blocks (they might be not in the same order! - for(cytnx_int64 b=0;b_blocks.size();b++){ - for(cytnx_int64 a=0;a_blocks.size();a++){ - if(this->_inner_to_outer_idx[b] == Rtn->_inner_to_outer_idx[(b+a)%Rtn->_blocks.size()]){ - this->_blocks[b] *= Rtn->_blocks[(b+a)%Rtn->_blocks.size()]; - break; - } - } - } - - } - - void BlockUniTensor::Sub_(const boost::intrusive_ptr &rhs){ - //checking Type: - cytnx_error_msg(rhs->uten_type()!=UTenType.Block,"[ERROR] cannot add two UniTensor with different type/format.%s","\n"); - - BlockUniTensor* Rtn = (BlockUniTensor*)rhs.get(); - - // 1) check each bond. - cytnx_error_msg(this->_bonds.size()!=Rtn->_bonds.size(),"[ERROR] cannot add two BlockUniTensor with different rank!%s","\n"); - for(cytnx_int64 i=0;i_bonds.size();i++){ - cytnx_error_msg(this->_bonds[i] != Rtn->_bonds[i],"[ERROR] Bond @ index: %d does not match. Therefore cannot perform Add of two UniTensor\n",i); - } - - cytnx_error_msg(this->is_diag()!=Rtn->is_diag(),"[ERROR] cannot add BlockUniTensor with is_diag=true and is_diag=false.%s","\n"); - - // 2) finding the blocks (they might be not in the same order! 
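// The (b + a) % Nblocks probe below starts each search at position b, so when both operands store their blocks in the same order the match is found on the first iteration; only mismatched orderings pay the full quadratic scan.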
- for(cytnx_int64 b=0;b_blocks.size();b++){ - for(cytnx_int64 a=0;a_blocks.size();a++){ - if(this->_inner_to_outer_idx[b] == Rtn->_inner_to_outer_idx[(b+a)%Rtn->_blocks.size()]){ - this->_blocks[b] -= Rtn->_blocks[(b+a)%Rtn->_blocks.size()]; - break; - } - } - } - - } - - void BlockUniTensor::_fx_group_duplicates(const std::vector &dup_bond_idxs, const std::vector > &idx_mappers){ - - //checking the bonds that are duplicates - //auto mod_idxs = dup_bond_idxs; std::sort(mod_idx.begin(),mod_idx.end()); - - //generating new inner_to_outer_idx: - std::vector > tmp_inner_to_outer_idx; - - - - //process one by one: - for(cytnx_int64 bn=0;bn_inner_to_outer_idx; - - for(cytnx_int64 i=0;i_inner_to_outer_idx.size();i++){ - tmp_inner_to_outer_idx[i][dup_bond_idxs[bn]] = idx_mappers[bn][ this->_inner_to_outer_idx[i][dup_bond_idxs[bn]] ]; - } - - std::vector mask(this->_blocks.size()); - std::vector new_blocks; - std::vector > new_inner_to_outer_idx; - - std::vector no_combine; // same for each bond! - for(cytnx_uint64 i=0;irank();i++){ - if(i!=dup_bond_idxs[bn]) no_combine.push_back(i); - } - - for(cytnx_int64 b=0;b_blocks.size();b++){ - if(mask[b]==1) continue; - mask[b] = 1; - new_blocks.push_back(this->_blocks[b]); - new_inner_to_outer_idx.push_back(tmp_inner_to_outer_idx[b]); - for(cytnx_int64 a=b+1;a_blocks.size();a++){ - if(mask[a]==1) continue; - if(tmp_inner_to_outer_idx[a] == tmp_inner_to_outer_idx[b]){ - // need to combine two! - // checking which bonds does not need to combine! - mask[a] = 1; - /* - std::cout << "CALL DS:\n"; - std::cout << no_combine << std::endl; - std::cout << "targ: old/new itoi:\n"; - std::cout << this->_inner_to_outer_idx[b] << std::endl; - std::cout << tmp_inner_to_outer_idx[b] << std::endl; - std::cout << "----------\n" << std::endl; - std::cout << "src: old/new itoi:\n"; - std::cout << this->_inner_to_outer_idx[a] << std::endl; - std::cout << tmp_inner_to_outer_idx[a] << std::endl; - std::cout << "----------\n" << std::endl; - std::cout << new_blocks.back().shape() << std::endl; - std::cout << this->_blocks[a].shape() << std::endl; - std::cout << "=============\n" << std::endl; - */ - new_blocks.back() = linalg::Directsum(new_blocks.back(),this->_blocks[a],no_combine); - - } - - } - }// traversal each block! - - this->_blocks = new_blocks; - this->_inner_to_outer_idx = new_inner_to_outer_idx; - - } - - } - - void BlockUniTensor::group_basis_(){ - - std::vector has_dup; - std::vector > idx_mappers; - for(cytnx_uint64 i=0;i_bonds.size();i++){ - if(this->_bonds[i].has_duplicate_qnums()){ - has_dup.push_back(i); - idx_mappers.push_back(this->_bonds[i].group_duplicates_()); - } - } - - - // this modify _inner_to_outer_idx and blocks! - this->_fx_group_duplicates(has_dup,idx_mappers); - - } - - - void BlockUniTensor::combineBonds(const std::vector &indicators, - const bool &force) { - cytnx_error_msg(this->is_diag(),"[ERROR][BlockUniTensor] cannot combineBonds when is_diag = true!%s","\n"); - - cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", - "\n"); - std::vector::iterator it; - std::vector idx_mapper; idx_mapper.reserve(this->rank()); - //std::vector new_shape_aft_perm; new_shape_aft_perm.reserve(this->rank()-indicators.size()+1); - - //idx_mapper = std::vector(indicators.begin(), indicators.end()); - - cytnx_error_msg(this->_is_diag, - "[ERROR] cannot combineBond on a is_diag=True UniTensor. 
suggestion: try " - "UniTensor.to_dense()/to_dense_() first.%s [NOTE] this is BlockUniTensor, so currently under developing!\n", - "\n"); - - - //get the mapper: - int cnt = 0; - int idor; - for(int i=0;irank();i++){ - if(cnt==indicators.size()){ - idx_mapper.push_back(i); - //new_shape_aft_perm.push_back(0); - }else{ - if(std::find(indicators.begin(),indicators.end(),i)==indicators.end()){ - idx_mapper.push_back(i); - //new_shape_aft_perm.push_back(0); - }else{ - if(i==indicators[0]){ - //new_shape_aft_perm.push_back(-1); - idor = idx_mapper.size(); //new_shape_aft_perm.size(); - for(int j=0;jpermute_(idx_mapper); - this->contiguous_(); - - //group bonds: - std::vector new_bonds; - std::vector cb_stride(indicators.size()); - //std::cout << "idor" << idor << std::endl; - //std::cout << "rank" << this->rank() << std::endl; - for(int i=0;irank();i++){ - if(i==idor){ - Bond tmp = this->_bonds[i]; - cb_stride[0] = this->_bonds[i].qnums().size(); - for(int j=1;j_bonds[i+j].qnums().size(); - if(force) tmp._impl->force_combineBond_(this->_bonds[i+j]._impl,false); // no grouping - else tmp.combineBond_(this->_bonds[i+j],false); // no grouping - } - new_bonds.push_back(tmp); - i += indicators.size()-1; - - }else{ - new_bonds.push_back(this->_bonds[i]); - } - } - - // remove labels: - this->_labels.erase(this->_labels.begin()+idor+1,this->_labels.begin()+idor+1+indicators.size()-1); - this->_bonds = new_bonds; - - - //reshape each blocks, and update_inner_to_outer_idx: - //process stride: - memcpy(&cb_stride[0],&cb_stride[1],sizeof(cytnx_uint64)*(cb_stride.size()-1)); - // for(int i=cb_stride.size()-2;i>=0;i--){ - // cb_stride[i] = cb_stride[i+1]; - // } - cb_stride.back()=1; - for(int i=cb_stride.size()-2;i>=0;i--){ - cb_stride[i]*=cb_stride[i+1]; - } - - std::vector new_shape; new_shape.reserve(this->rank()); - for(int b=0;b_blocks.size();b++){ - new_shape.clear(); - for(int i=0;i_blocks[b].shape().size();i++){ - if(i==idor){ - i+=indicators.size()-1; - new_shape.push_back(-1); - }else{ - new_shape.push_back(this->_blocks[b].shape()[i]); - } - } - this->_blocks[b].reshape_(new_shape); - } - - // cout<<"AAAAAAAAAAAAAAAAAAAAAAA"<get_qindices(2)<bonds()<_blocks.size();b++){ - this->_inner_to_outer_idx[b][idor] *= cb_stride[0]; - for(int i=idor+1;i_inner_to_outer_idx[b][idor]+= this->_inner_to_outer_idx[b][i] * cb_stride[i-idor]; - } - if(idor+indicators.size()_inner_to_outer_idx[b].size()){ - memcpy(&this->_inner_to_outer_idx[b][idor+1],&this->_inner_to_outer_idx[b][idor+indicators.size()],sizeof(cytnx_uint64)*(this->_inner_to_outer_idx[b].size()-idor-indicators.size())); - } - this->_inner_to_outer_idx[b].resize(this->rank()); - } - //std::cout << this->_inner_to_outer_idx << std::endl; - - //check rowrank: - if(this->_rowrank >= this->rank()) this->_rowrank = this->rank(); - - this->_is_braket_form = this->_update_braket(); - - // cout<<"BBBBBBBBBBBBBBBBBBBBBBB"<get_qindices(2)<bonds()<group_basis_(); - } - - - void BlockUniTensor::combineBonds(const std::vector &indicators, - const bool &force) { - cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", - "\n"); - std::vector::iterator it; - std::vector idx_mapper; - // find the index of label: - for (cytnx_uint64 i = 0; i < indicators.size(); i++) { - it = std::find(this->_labels.begin(), this->_labels.end(), indicators[i]); - cytnx_error_msg(it == this->_labels.end(), "[ERROR] labels not found in current UniTensor%s", - "\n"); - idx_mapper.push_back(std::distance(this->_labels.begin(), it)); - } - 
this->combineBonds(idx_mapper,force); - } - - void BlockUniTensor::combineBonds(const std::vector &indicators, - const bool &force, const bool &by_label) { - cytnx_error_msg(indicators.size() < 2, "[ERROR] the number of bonds to combine must be > 1%s", - "\n"); - std::vector::iterator it; - std::vector idx_mapper; - if (by_label) { - // find the index of label: - for (cytnx_uint64 i = 0; i < indicators.size(); i++) { - it = std::find(this->_labels.begin(), this->_labels.end(), std::to_string(indicators[i])); - cytnx_error_msg(it == this->_labels.end(), - "[ERROR] labels not found in current UniTensor%s", "\n"); - idx_mapper.push_back(std::distance(this->_labels.begin(), it)); - } - - } else { - idx_mapper = indicators; - } - this->combineBonds(idx_mapper,force); - - } - - - -} // namespace cytnx diff --git a/src/Tensor.old.cpp b/src/Tensor.old.cpp deleted file mode 100644 index d83e1a385..000000000 --- a/src/Tensor.old.cpp +++ /dev/null @@ -1,1390 +0,0 @@ -#include -#include "Tensor.hpp" -#include "utils/utils_internal_interface.hpp" -#include "linalg.hpp" -#include "utils/is.hpp" -#include "Type.hpp" -using namespace std; - -namespace cytnx { - - //---------------------------------------------- - // Tproxy - - Tensor Tensor::Tproxy::operator+=(const Tensor::Tproxy &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - // self += Tensor(rc); - cytnx::linalg::iAdd(self, Tensor(rc)); - - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor Tensor::Tproxy::operator-=(const Tensor::Tproxy &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - // self += Tensor(rc); - cytnx::linalg::iSub(self, Tensor(rc)); - - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor Tensor::Tproxy::operator/=(const Tensor::Tproxy &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - // self += Tensor(rc); - cytnx::linalg::iDiv(self, Tensor(rc)); - - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - Tensor Tensor::Tproxy::operator*=(const Tensor::Tproxy &rc) { - Tensor self; - self._impl = _insimpl->get(_accs); - // self += Tensor(rc); - cytnx::linalg::iMul(self, Tensor(rc)); - - _insimpl->set(_accs, self._impl); - self._impl = this->_insimpl; - return self; - } - - // ADD - Tensor Tensor::Tproxy::operator+( - const cytnx_complex128 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_complex64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_double &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_float &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_uint64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_int64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_uint32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = 
_insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_int32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_uint16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_int16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - Tensor Tensor::Tproxy::operator+( - const cytnx_bool &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Add(rc); - } - - Tensor Tensor::Tproxy::operator+(const Tproxy &rc) const { - Tensor out; - out._impl = _insimpl->get(_accs); - return cytnx::linalg::Add(out, Tensor(rc)); - } - - // SUB: - Tensor Tensor::Tproxy::operator-( - const cytnx_complex128 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_complex64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_double &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_float &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_uint64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_int64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_uint32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_int32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_uint16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_int16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-( - const cytnx_bool &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Sub(rc); - } - Tensor Tensor::Tproxy::operator-(const Tproxy &rc) const { - Tensor out; - out._impl = _insimpl->get(_accs); - return cytnx::linalg::Sub(out, Tensor(rc)); - } - Tensor Tensor::Tproxy::operator-() const { - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(-1); - } - - // MUL - Tensor Tensor::Tproxy::operator*( - const cytnx_complex128 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_complex64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - 
out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_double &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_float &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_uint64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_int64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_uint32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_int32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_uint16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_int16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*( - const cytnx_bool &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Mul(rc); - } - Tensor Tensor::Tproxy::operator*(const Tproxy &rc) const { - Tensor out; - out._impl = _insimpl->get(_accs); - return cytnx::linalg::Mul(out, Tensor(rc)); - } - - // DIV - Tensor Tensor::Tproxy::operator/( - const cytnx_complex128 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_complex64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_double &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_float &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_uint64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_int64 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_uint32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_int32 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_uint16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const 
cytnx_int16 &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/( - const cytnx_bool &rc) const { //{return this->_operatorADD(rc);}; - Tensor out; - out._impl = _insimpl->get(_accs); - return out.Div(rc); - } - Tensor Tensor::Tproxy::operator/(const Tproxy &rc) const { - Tensor out; - out._impl = _insimpl->get(_accs); - return cytnx::linalg::Div(out, Tensor(rc)); - } - - //----------------------------------------------- - void Tensor_impl::Init(const std::vector &shape, const unsigned int &dtype, - int device, const bool &init_zero) { - // check: - cytnx_error_msg(dtype >= N_Type, "%s", "[ERROR] invalid argument: dtype"); - cytnx_error_msg(shape.size() == 0, "%s", - "[ERROR] invalid argument: shape. Must at least have one element."); - cytnx_uint64 Nelem = 1; - for (int i = 0; i < shape.size(); i++) { - cytnx_error_msg(shape[i] == 0, "%s", "[ERROR] shape cannot have 0 dimension in any rank."); - Nelem *= shape[i]; - } - // this->_storage = __SII.USIInit[dtype](); - this->_storage.Init(Nelem, dtype, device, init_zero); - this->_shape = shape; - this->_mapper = vec_range(shape.size()); - this->_invmapper = this->_mapper; - this->_contiguous = true; - // cout << shape << endl; - } - void Tensor_impl::Init(const Storage &in) { - cytnx_error_msg(in.dtype() == Type.Void, - "[ERROR] cannot init Tensor using un-initialized Storage%s", "\n"); - this->_storage = in; - this->_shape.clear(); - this->_shape.push_back(in.size()); - this->_mapper.clear(); - this->_mapper.push_back(0); - this->_invmapper = this->_mapper; - this->_contiguous = true; - } - // void Tensor_impl::Init(const Storage &in, const std::vector &shape, - // const unsigned int &dtype, int device) { - // cytnx_error_msg(in.dtype() == Type.Void, - // "[ERROR] cannot init Tensor using un-initialized Storage%s", "\n"); - // // check: - // cytnx_error_msg(dtype >= N_Type, "%s", "[ERROR] invalid argument: dtype"); - // cytnx_error_msg(shape.size() == 0, "%s", - // "[ERROR] invalid argument: shape. 
Must at least have one element."); - // cytnx_uint64 Nelem = 1; - // for (int i = 0; i < shape.size(); i++) { - // cytnx_error_msg(shape[i] == 0, "%s", "[ERROR] shape cannot have 0 dimension in any rank."); - // Nelem *= shape[i]; - // } - // this->_storage = in; - // // this->_storage = __SII.USIInit[dtype](); - // this->_shape = shape; - // this->_mapper = vec_range(shape.size()); - // this->_invmapper = this->_mapper; - // this->_contiguous = true; - // } - - boost::intrusive_ptr Tensor_impl::permute(const std::vector &rnks) { - // check:: - if (rnks.size() != this->_shape.size()) { - cytnx_error_msg(true, "%s", - "reshape a tensor with a specify shape that does not match with the shape of " - "the incident tensor."); - } - - if (vec_unique(rnks).size() != rnks.size()) { - cytnx_error_msg(true, "%s", "tensor permute with duplicated index.\n"); - } - - std::vector new_fwdmap(this->_shape.size()); - std::vector new_shape(this->_shape.size()); - std::vector new_idxmap(this->_shape.size()); - - // for(int i=0;i_shape.size();i++) - // std::cout << this->_mapper[i] << " " << this->_invmapper[i] << std::endl; - - boost::intrusive_ptr out(new Tensor_impl()); - - for (cytnx_uint32 i = 0; i < rnks.size(); i++) { - if (rnks[i] >= rnks.size()) { - cytnx_error_msg(1, "%s", "reshape a tensor with invalid rank index."); - } - // std::cout << this->_mapper[rnks[i]] << " " << i << std::endl; - new_idxmap[this->_mapper[rnks[i]]] = i; - new_fwdmap[i] = this->_mapper[rnks[i]]; - new_shape[i] = this->_shape[rnks[i]]; - } - - out->_invmapper = std::move(new_idxmap); - out->_shape = std::move(new_shape); - out->_mapper = std::move(new_fwdmap); - - /// checking if permute back to contiguous: - bool iconti = true; - for (cytnx_uint32 i = 0; i < rnks.size(); i++) { - // if (new_fwdmap[i] != new_idxmap[i]) { - // iconti = false; - // break; - // } - if (out->_mapper[i] != i) { - iconti = false; - break; - } - } - out->_contiguous = iconti; - - // ref storage - out->_storage = this->_storage; - return out; - } - - void Tensor_impl::permute_(const std::vector &rnks) { - // check:: - if (rnks.size() != this->_shape.size()) { - cytnx_error_msg(true, "%s", - "reshape a tensor with a specify shape that does not match with the shape of " - "the incident tensor."); - } - - if (vec_unique(rnks).size() != rnks.size()) { - cytnx_error_msg(true, "%s", "tensor permute with duplicated index.\n"); - } - - // std::vector new_fwdmap(this->_shape.size()); - // std::vector new_shape(this->_shape.size()); - // std::vector new_idxmap(this->_shape.size()); - - // smallvec new_fwdmap(this->_shape.size()); - // smallvec new_shape(this->_shape.size()); - // smallvec new_idxmap(this->_shape.size()); - std::vector new_fwdmap(this->_shape.size()); - std::vector new_shape(this->_shape.size()); - std::vector new_idxmap(this->_shape.size()); - - // for(int i=0;i_shape.size();i++) - // std::cout << this->_mapper[i] << " " << this->_invmapper[i] << std::endl; - - for (cytnx_uint32 i = 0; i < rnks.size(); i++) { - if (rnks[i] >= rnks.size()) { - cytnx_error_msg(1, "%s", "reshape a tensor with invalid rank index."); - } - // std::cout << this->_mapper[rnks[i]] << " " << i << std::endl; - // new_idxmap[this->_mapper[rnks[i]]] = i; - this->_invmapper[this->_mapper[rnks[i]]] = i; - new_fwdmap[i] = this->_mapper[rnks[i]]; - new_shape[i] = this->_shape[rnks[i]]; - } - - // this->_invmapper = std::move(new_idxmap); - for (cytnx_uint64 i = 0; i < this->_shape.size(); i++) { - this->_shape[i] = new_shape[i]; - this->_mapper[i] = new_fwdmap[i]; - } - - // 
this->_shape = std::move(new_shape); - // this->_mapper = std::move(new_fwdmap); - - /// checking if permute back to contiguous: - bool iconti = true; - for (cytnx_uint32 i = 0; i < rnks.size(); i++) { - // if (this->_mapper[i] != this->_invmapper[i]) { - // iconti = false; - // break; - // } - if (this->_mapper[i] != i) { - iconti = false; - break; - } - } - this->_contiguous = iconti; - } - - // shadow new: - // - - boost::intrusive_ptr Tensor_impl::get( - const std::vector &accessors) { - cytnx_error_msg(accessors.size() > this->_shape.size(), "%s", - "The input indexes rank is out of range! (>Tensor's rank)."); - - std::vector acc = accessors; - for (int i = 0; i < this->_shape.size() - accessors.size(); i++) { - acc.push_back(Accessor::all()); - } - - /* - cout << "acc type bef" << endl; - for(int i=0;i_invmapper); // contiguous. - /* - cout << "acc type aft" << endl; - for(int i=0;i_shape, this->_invmapper); - // cout << "curr_shape" << endl; - // cout << curr_shape << endl; - - //[2] from back to front, check until last all: - cytnx_uint64 Nunit = 1; - int tmpidx = 0; - while (tmpidx < curr_shape.size()) { - if (acc.back().type() == Accessor::All) { - Nunit *= curr_shape[curr_shape.size() - 1 - tmpidx]; - tmpidx++; - acc.pop_back(); - } else { - break; - } - } - // cout << "tmpidx" << tmpidx << endl; - // cout << "Nunit" << Nunit << endl; - // cout << acc.size() << endl; - - // acc-> locators - - std::vector get_shape(acc.size()); - std::vector> locators(acc.size()); - for (cytnx_uint32 i = 0; i < acc.size(); i++) { - cytnx_error_msg(acc[i].type() == Accessor::Qns, - "[ERROR] Tensor cannot accept accessor with qnum list.%s", "\n"); - acc[i].get_len_pos(curr_shape[i], get_shape[i], locators[i]); - } - // cout << "get_shape" << endl; - // cout << get_shape << endl; - - // create Tensor: - for (cytnx_uint64 i = 0; i < tmpidx; i++) { - get_shape.push_back(curr_shape[acc.size() + i]); - } - boost::intrusive_ptr out(new Tensor_impl()); - out->Init(get_shape, this->dtype(), this->device()); - // cout << get_shape << endl; - - if (locators.size() == 0) { - locators.resize(1); - locators[0].push_back(0); - } - - // call storage - this->storage()._impl->GetElem_byShape_v2(out->storage()._impl, curr_shape, locators, Nunit); - - // permute back: - std::vector new_mapper(this->_mapper.begin(), this->_mapper.end()); - std::vector new_shape; - std::vector remove_id; - for (unsigned int i = 0; i < out->_shape.size(); i++) { - if (out->shape()[i] == 1 && (acc[i].type() == Accessor::Singl)) - remove_id.push_back(this->_mapper[this->_invmapper[i]]); - else - new_shape.push_back(out->shape()[i]); - } - - // cout << "mapper" << endl; - // cout << new_mapper << endl; - // cout << "inv_mapper" << endl; - // cout << this->_invmapper << endl; - - // cout << "remove_id" << endl; - // cout << remove_id << endl; - // cout << "out shape raw" << endl; - // cout << out->shape() << endl; - - // cout << "perm" << endl; - // cout << perm << endl; - // cout << new_shape << endl; - if (new_shape.size()) { // exclude the case where only single element exists! - - out->reshape_(new_shape); // remove size-1 axis - - std::vector perm; - for (unsigned int i = 0; i < new_mapper.size(); i++) { - perm.push_back(new_mapper[i]); - for (unsigned int j = 0; j < remove_id.size(); j++) { - if (new_mapper[i] > remove_id[j]) - perm.back() -= 1; - else if (new_mapper[i] == remove_id[j]) { - perm.pop_back(); - break; - } - } - } - out->permute_(perm); - } else { - out->reshape_({1}); // if it is only one element. 
- } - - return out; - } - - boost::intrusive_ptr Tensor_impl::get_deprecated( - const std::vector &accessors) { - cytnx_error_msg(accessors.size() > this->_shape.size(), "%s", - "The input indexes rank is out of range! (>Tensor's rank)."); - - std::vector acc = accessors; - for (int i = 0; i < this->_shape.size() - accessors.size(); i++) { - acc.push_back(Accessor::all()); - } - - vector get_shape(acc.size()); - - // vector new_shape; - std::vector> locators(this->_shape.size()); - for (cytnx_uint32 i = 0; i < acc.size(); i++) { - acc[i].get_len_pos(this->_shape[i], get_shape[i], locators[i]); - // std::cout << this->_shape[i] << " " << get_shape[i] << "|"; - // for(int j=0;j out(new Tensor_impl()); - out->Init(get_shape, this->dtype(), this->device()); - - this->storage()._impl->GetElem_byShape(out->storage()._impl, this->shape(), this->_mapper, - get_shape, locators); - - vector new_shape; - for (cytnx_uint32 i = 0; i < acc.size(); i++) - if (get_shape[i] != 1) new_shape.push_back(get_shape[i]); - - if (new_shape.size() == 0) - out->reshape_({1}); - else - out->reshape_(new_shape); - return out; - } - - void Tensor_impl::set(const std::vector &accessors, - const boost::intrusive_ptr &rhs) { - // cout << "calling set" << endl; - cytnx_error_msg(accessors.size() > this->_shape.size(), "%s", - "The input indexes rank is out of range! (>Tensor's rank)."); - - vector acc = accessors; - for (int i = 0; i < this->_shape.size() - accessors.size(); i++) { - acc.push_back(Accessor::all()); - } - - // vector get_shape(acc.size()); - acc = vec_map(acc, this->_invmapper); // contiguous. - - //[1] curr_shape: - auto curr_shape = vec_map(this->_shape, this->_invmapper); - - //[2] from back to front, check until last all: - cytnx_uint64 Nunit = 1; - int tmpidx = 0; - while (tmpidx < curr_shape.size()) { - if (acc.back().type() == Accessor::All) { - Nunit *= curr_shape[curr_shape.size() - 1 - tmpidx]; - tmpidx++; - acc.pop_back(); - } else { - break; - } - } - - std::vector get_shape(acc.size()); - std::vector> locators(acc.size()); - for (cytnx_uint32 i = 0; i < acc.size(); i++) { - cytnx_error_msg(acc[i].type() == Accessor::Qns, - "[ERROR] Tensor cannot accept accessor with qnum list.%s", "\n"); - acc[i].get_len_pos(curr_shape[i], get_shape[i], locators[i]); - } - - /// checking if its scalar assign! - if (rhs->storage().size() == 1) { - this->storage()._impl->SetElem_byShape_v2(rhs->storage()._impl, curr_shape, locators, Nunit, - true); - // std::cout << "Scalar" << endl; - - } else { - for (cytnx_uint64 i = 0; i < tmpidx; i++) { - get_shape.push_back(curr_shape[acc.size() + i]); - } - - // std::cout << get_shape << endl; - - // permute input to currect pos - std::vector new_mapper(this->_mapper.begin(), this->_mapper.end()); - std::vector new_shape; - std::vector remove_id; - for (unsigned int i = 0; i < get_shape.size(); i++) { - if (acc[i].type() == Accessor::Singl) - remove_id.push_back(this->_mapper[this->_invmapper[i]]); - else - new_shape.push_back(get_shape[i]); - } - - if (new_shape.size() == 0) new_shape.push_back(1); - - // use current size to infer rhs permutation. 
- std::vector perm; - for (unsigned int i = 0; i < new_mapper.size(); i++) { - perm.push_back(new_mapper[i]); - - for (unsigned int j = 0; j < remove_id.size(); j++) { - if (new_mapper[i] > remove_id[j]) - perm.back() -= 1; - else if (new_mapper[i] == remove_id[j]) { - perm.pop_back(); - break; - } - } - } - - std::vector iperm(perm.size()); - for (unsigned int i = 0; i < iperm.size(); i++) iperm[perm[i]] = i; - - // std::cout << new_shape << endl; - boost::intrusive_ptr tmp; - // std::cout << iperm << std::endl; - tmp = rhs->permute(iperm)->contiguous(); - cytnx_error_msg(new_shape != tmp->shape(), "[ERROR][Tensor.set_elems]%s", - "inconsistent shape"); - this->storage()._impl->SetElem_byShape_v2(tmp->storage()._impl, curr_shape, locators, Nunit, - false); - } - } - - template - void Tensor_impl::set(const std::vector &accessors, const T &rc) { - cytnx_error_msg(accessors.size() > this->_shape.size(), "%s", - "The input indexes rank is out of range! (>Tensor's rank)."); - - std::vector acc = accessors; - for (int i = 0; i < this->_shape.size() - accessors.size(); i++) { - acc.push_back(Accessor::all()); - } - - acc = vec_map(acc, this->_invmapper); // contiguous. - - //[1] curr_shape: - auto curr_shape = vec_map(this->_shape, this->_invmapper); - - //[2] from back to front, check until last all: - cytnx_uint64 Nunit = 1; - int tmpidx = 0; - while (tmpidx < curr_shape.size()) { - if (acc.back().type() == Accessor::All) { - Nunit *= curr_shape[curr_shape.size() - 1 - tmpidx]; - tmpidx++; - acc.pop_back(); - } else { - break; - } - } - // cout << "tmpidx" << tmpidx << endl; - // cout << "Nunit" << Nunit << endl; - // cout << acc.size() << endl; - - // acc-> locators - - std::vector get_shape(acc.size()); - std::vector> locators(acc.size()); - for (cytnx_uint32 i = 0; i < acc.size(); i++) { - cytnx_error_msg(acc[i].type() == Accessor::Qns, - "[ERROR] Tensor cannot accept accessor with qnum list.%s", "\n"); - acc[i].get_len_pos(curr_shape[i], get_shape[i], locators[i]); - } - // cout << "get_shape" << endl; - // cout << get_shape << endl; - - // call storage - Scalar c = rc; - - Storage tmp(1, c.dtype(), this->device()); - tmp.set_item(0, rc); - this->storage()._impl->SetElem_byShape_v2(tmp._impl, curr_shape, locators, Nunit, true); - } - template void Tensor_impl::set(const std::vector &, - const cytnx_complex128 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_complex64 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_double &); - template void Tensor_impl::set(const std::vector &, - const cytnx_float &); - template void Tensor_impl::set(const std::vector &, - const cytnx_int64 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_uint64 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_int32 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_uint32 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_int16 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_uint16 &); - template void Tensor_impl::set(const std::vector &, - const cytnx_bool &); - template void Tensor_impl::set(const std::vector &, const Scalar &); - - void Tensor_impl::set(const std::vector &accessors, const Scalar::Sproxy &rc) { - this->set(accessors, Scalar(rc)); - } - - std::ostream &operator<<(std::ostream &os, const Tensor &in) { - if (in.is_contiguous()) - in._impl->storage()._impl->PrintElem_byShape(os, in.shape()); - else - in._impl->storage()._impl->PrintElem_byShape(os, 
in.shape(), in._impl->invmapper()); - return os; - } - std::ostream &operator<<(std::ostream &os, const Tensor::Tproxy &in) { - os << Tensor(in) << std::endl; - return os; - } - //=================================================================== - // wrapper - - void Tensor::Tofile(const std::string &fname) const { - if (!this->is_contiguous()) { - auto A = this->contiguous(); - A.storage().Tofile(fname); - } else { - this->_impl->_storage.Tofile(fname); - } - } - void Tensor::Tofile(const char *fname) const { - if (!this->is_contiguous()) { - auto A = this->contiguous(); - A.storage().Tofile(fname); - } else { - this->_impl->_storage.Tofile(fname); - } - } - void Tensor::Tofile(fstream &f) const { - if (!this->is_contiguous()) { - auto A = this->contiguous(); - A.storage().Tofile(f); - } else { - this->_impl->_storage.Tofile(f); - } - } - void Tensor::Save(const std::string &fname) const { - fstream f; - f.open((fname + ".cytn"), ios::out | ios::trunc | ios::binary); - if (!f.is_open()) { - cytnx_error_msg(true, "[ERROR] invalid file path for save.%s", "\n"); - } - this->_Save(f); - f.close(); - } - void Tensor::Save(const char *fname) const { - fstream f; - string ffname = string(fname) + ".cytn"; - f.open(ffname, ios::out | ios::trunc | ios::binary); - if (!f.is_open()) { - cytnx_error_msg(true, "[ERROR] invalid file path for save.%s", "\n"); - } - this->_Save(f); - f.close(); - } - void Tensor::_Save(fstream &f) const { - // header - // check: - cytnx_error_msg(!f.is_open(), "[ERROR] invalid fstream!.%s", "\n"); - - unsigned int IDDs = 888; - f.write((char *)&IDDs, sizeof(unsigned int)); - cytnx_uint64 shp = this->shape().size(); - cytnx_uint64 Conti = this->is_contiguous(); - f.write((char *)&shp, sizeof(cytnx_uint64)); - - f.write((char *)&Conti, sizeof(cytnx_uint64)); - f.write((char *)&this->_impl->_shape[0], sizeof(cytnx_uint64) * shp); - f.write((char *)&this->_impl->_mapper[0], sizeof(cytnx_uint64) * shp); - f.write((char *)&this->_impl->_invmapper[0], sizeof(cytnx_uint64) * shp); - - // pass to storage for save: - this->_impl->_storage._Save(f); - } - - Tensor Tensor::Fromfile(const std::string &fname, const unsigned int &dtype, - const cytnx_int64 &count) { - return Tensor::from_storage(Storage::Fromfile(fname, dtype, count)); - } - Tensor Tensor::Fromfile(const char *fname, const unsigned int &dtype, const cytnx_int64 &count) { - return Tensor::from_storage(Storage::Fromfile(fname, dtype, count)); - } - Tensor Tensor::Load(const std::string &fname) { - Tensor out; - fstream f; - f.open(fname, ios::in | ios::binary); - if (!f.is_open()) { - cytnx_error_msg(true, "[ERROR] invalid file path for load.%s", "\n"); - } - out._Load(f); - f.close(); - return out; - } - Tensor Tensor::Load(const char *fname) { - Tensor out; - fstream f; - f.open(fname, ios::in | ios::binary); - if (!f.is_open()) { - cytnx_error_msg(true, "[ERROR] invalid file path for load.%s", "\n"); - } - out._Load(f); - f.close(); - return out; - } - void Tensor::_Load(fstream &f) { - // header - // check: - cytnx_error_msg(!f.is_open(), "[ERROR] invalid fstream!.%s", "\n"); - - unsigned int tmpIDDs; - f.read((char *)&tmpIDDs, sizeof(unsigned int)); - cytnx_error_msg(tmpIDDs != 888, "[ERROR] the object is not a cytnx tensor!%s", "\n"); - - cytnx_uint64 shp; - cytnx_uint64 Conti; - f.read((char *)&shp, sizeof(cytnx_uint64)); - f.read((char *)&Conti, sizeof(cytnx_uint64)); - this->_impl->_contiguous = Conti; - - this->_impl->_shape.resize(shp); - this->_impl->_mapper.resize(shp); - 
this->_impl->_invmapper.resize(shp); - f.read((char *)&this->_impl->_shape[0], sizeof(cytnx_uint64) * shp); - f.read((char *)&this->_impl->_mapper[0], sizeof(cytnx_uint64) * shp); - f.read((char *)&this->_impl->_invmapper[0], sizeof(cytnx_uint64) * shp); - - // pass to storage for save: - this->_impl->_storage._Load(f); - } - - Tensor Tensor::real() { - Tensor out; - out._impl = this->_impl->_clone_meta_only(); - out._impl->_storage = this->_impl->_storage.real(); - return out; - }; - - Tensor Tensor::imag() { - Tensor out; - out._impl = this->_impl->_clone_meta_only(); - out._impl->_storage = this->_impl->_storage.imag(); - return out; - } - - ///@cond - // += - template <> - Tensor &Tensor::operator+=(const Tensor &rc) { - cytnx::linalg::iAdd(*this, rc); - return *this; - } - template <> - Tensor &Tensor::operator+=(const Tensor::Tproxy &rc) { - cytnx::linalg::iAdd(*this, Tensor(rc)); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_complex128 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_complex64 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_double &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_float &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_int64 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_uint64 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_int32 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_uint32 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_int16 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_uint16 &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const cytnx_bool &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const Scalar &rc) { - this->_impl->storage() = cytnx::linalg::Add(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator+=(const Scalar::Sproxy &rc) { - return this->operator+=(Scalar(rc)); - } - // -= - template <> - Tensor &Tensor::operator-=(const Tensor &rc) { - cytnx::linalg::iSub(*this, rc); - return *this; - } - template <> - Tensor &Tensor::operator-=(const Tensor::Tproxy &rc) { - cytnx::linalg::iSub(*this, Tensor(rc)); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_complex128 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_complex64 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, 
rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_double &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_float &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_int64 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_uint64 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_int32 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_uint32 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_int16 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_uint16 &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const cytnx_bool &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const Scalar &rc) { - this->_impl->storage() = cytnx::linalg::Sub(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator-=(const Scalar::Sproxy &rc) { - return this->operator-=(Scalar(rc)); - } - // *= - template <> - Tensor &Tensor::operator*=(const Tensor &rc) { - cytnx::linalg::iMul(*this, rc); - return *this; - } - template <> - Tensor &Tensor::operator*=(const Tensor::Tproxy &rc) { - cytnx::linalg::iMul(*this, Tensor(rc)); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_complex128 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_complex64 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_double &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_float &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_int64 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_uint64 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_int32 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_uint32 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_int16 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor 
&Tensor::operator*=(const cytnx_uint16 &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const cytnx_bool &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const Scalar &rc) { - this->_impl->storage() = cytnx::linalg::Mul(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator*=(const Scalar::Sproxy &rc) { - return this->operator*=(Scalar(rc)); - } - - // /= - template <> - Tensor &Tensor::operator/=(const Tensor &rc) { - cytnx::linalg::iDiv(*this, rc); - return *this; - } - template <> - Tensor &Tensor::operator/=(const Tensor::Tproxy &rc) { - cytnx::linalg::iDiv(*this, Tensor(rc)); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_complex128 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_complex64 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_double &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_float &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_int64 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_uint64 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_int32 &rc) { - // std::cout << "entry /= int32" << std::endl; - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_uint32 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_int16 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_uint16 &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const cytnx_bool &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const Scalar &rc) { - this->_impl->storage() = cytnx::linalg::Div(*this, rc)._impl->storage(); - return *this; - } - template <> - Tensor &Tensor::operator/=(const Scalar::Sproxy &rc) { - return this->operator/=(Scalar(rc)); - } - ///@endcond - - // std::vector Tensor::Svd(const bool &is_U, const bool &is_vT) const { - // return linalg::Svd(*this, is_U, is_vT); - // } - std::vector Tensor::Svd(const bool &is_UvT) const { return linalg::Svd(*this, is_UvT); } - std::vector Tensor::Eigh(const bool &is_V, const bool &row_v) const { - return linalg::Eigh(*this, is_V, row_v); - } - - Tensor &Tensor::InvM_() { - linalg::InvM_(*this); - return *this; - } - Tensor Tensor::InvM() const { return linalg::InvM(*this); } - Tensor &Tensor::Inv_(const double &clip) { - linalg::Inv_(*this, clip); - return *this; - } - Tensor 
Tensor::Inv(const double &clip) const { return linalg::Inv(*this, clip); } - - Tensor &Tensor::Conj_() { - linalg::Conj_(*this); - return *this; - } - Tensor Tensor::Conj() const { return linalg::Conj(*this); } - - Tensor &Tensor::Exp_() { - linalg::Exp_(*this); - return *this; - } - Tensor Tensor::Exp() const { return linalg::Exp(*this); } - Tensor Tensor::Norm() const { return linalg::Norm(*this); } - - Tensor Tensor::Pow(const cytnx_double &p) const { return linalg::Pow(*this, p); } - - Tensor &Tensor::Pow_(const cytnx_double &p) { - linalg::Pow_(*this, p); - return *this; - } - - Tensor &Tensor::Abs_() { - linalg::Abs_(*this); - return *this; - } - Tensor Tensor::Abs() const { return linalg::Abs(*this); } - Tensor Tensor::Max() const { return linalg::Max(*this); } - Tensor Tensor::Min() const { return linalg::Min(*this); } - - Tensor Tensor::Trace(const cytnx_uint64 &a, const cytnx_uint64 &b) const { - Tensor out = linalg::Trace(*this, a, b); - return out; - } - - bool Tensor::same_data(const Tensor &rhs) const { - return is(this->_impl->storage(), rhs.storage()); - } - - //=========================== - // Tensor am Tproxy - Tensor operator+(const Tensor &lhs, const Tensor::Tproxy &rhs) { - return cytnx::linalg::Add(lhs, Tensor(rhs)); - } - Tensor operator-(const Tensor &lhs, const Tensor::Tproxy &rhs) { - return cytnx::linalg::Sub(lhs, Tensor(rhs)); - } - Tensor operator*(const Tensor &lhs, const Tensor::Tproxy &rhs) { - return cytnx::linalg::Mul(lhs, Tensor(rhs)); - } - Tensor operator/(const Tensor &lhs, const Tensor::Tproxy &rhs) { - return cytnx::linalg::Div(lhs, Tensor(rhs)); - } - - //=========================== - // Tensor am Sproxy - Tensor operator+(const Tensor &lhs, const Scalar::Sproxy &rhs) { - return cytnx::linalg::Add(lhs, Scalar(rhs)); - } - Tensor operator-(const Tensor &lhs, const Scalar::Sproxy &rhs) { - return cytnx::linalg::Sub(lhs, Scalar(rhs)); - } - Tensor operator*(const Tensor &lhs, const Scalar::Sproxy &rhs) { - return cytnx::linalg::Mul(lhs, Scalar(rhs)); - } - Tensor operator/(const Tensor &lhs, const Scalar::Sproxy &rhs) { - return cytnx::linalg::Div(lhs, Scalar(rhs)); - } - -} // namespace cytnx diff --git a/src/backend/utils_internal_cpu/GetElems_cpu.cpp.new b/src/backend/utils_internal_cpu/GetElems_cpu.cpp.new deleted file mode 100644 index 365780a89..000000000 --- a/src/backend/utils_internal_cpu/GetElems_cpu.cpp.new +++ /dev/null @@ -1,242 +0,0 @@ -#include "GetElems_cpu.hpp" - -#ifdef UNI_OMP -#include -#endif - -namespace cytnx{ - namespace utils_internal{ - - void GetElems_cpu_cd(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_complex128* elem_ptr_ = static_cast(in); - cytnx_complex128* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_complex128)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_cf(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const 
cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_complex64* elem_ptr_ = static_cast(in); - cytnx_complex64* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_complex64)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_d(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_double* elem_ptr_ = static_cast(in); - cytnx_double* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_double)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_f(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_float* elem_ptr_ = static_cast(in); - cytnx_float* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_float)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_i64(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_int64* elem_ptr_ = static_cast(in); - cytnx_int64* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_int64)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_u64(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_uint64* elem_ptr_ = static_cast(in); - 
cytnx_uint64* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_uint64)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_i32(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_int32* elem_ptr_ = static_cast(in); - cytnx_int32* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_int32)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_u32(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_uint32* elem_ptr_ = static_cast(in); - cytnx_uint32* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_uint32)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_i16(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_int16* elem_ptr_ = static_cast(in); - cytnx_int16* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_int16)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_u16(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_uint16* elem_ptr_ = static_cast(in); - cytnx_uint16* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - 
for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_uint16)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - void GetElems_cpu_b(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 &CommElem){ - //Start copy elem: - cytnx_bool* elem_ptr_ = static_cast(in); - cytnx_bool* new_elem_ptr_ = static_cast(out); - - #ifdef UNI_OMP - #pragma omp parallel for schedule(dynamic) - #endif - for(cytnx_uint64 n=0;n < TotalElem; n++){ - //map from mem loc of new tensor to old tensor - cytnx_uint64 Loc=0; - cytnx_uint64 tmpn = n; - for(cytnx_uint32 r=0;r < offj.size();r++){ - if(locators[r].size()) Loc += locators[r][tmpn/new_offj[r]]*offj[r]; - else Loc += cytnx_uint64(tmpn/new_offj[r])*offj[r]; - tmpn %= new_offj[r]; - } - mempcy(&new_elem_ptr_[n*CommElem],&elem_ptr_[Loc*CommElem],sizeof(cytnx_bool)*CommElem); - //new_elem_ptr_[n] = elem_ptr_[Loc]; - } - } - } -} diff --git a/src/backend/utils_internal_cpu/GetElems_cpu.hpp.new b/src/backend/utils_internal_cpu/GetElems_cpu.hpp.new deleted file mode 100644 index a08f7eec1..000000000 --- a/src/backend/utils_internal_cpu/GetElems_cpu.hpp.new +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef CYTNX_BACKEND_UTILS_INTERNAL_CPU_GETELEMS_CPU_HPP_H_ -#define CYTNX_BACKEND_UTILS_INTERNAL_CPU_GETELEMS_CPU_HPP_H_ - -#include -#include -#include -#include -#include -#include "Type.hpp" -#include "cytnx_error.hpp" -#include "backend/Storage.hpp" -#include "Type.hpp" - -namespace cytnx{ - namespace utils_internal{ - - - void GetElems_cpu_cd(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_cf(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_d(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_f(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - - void GetElems_cpu_i64(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_u64(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_i32(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_u32(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_i16(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const 
std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_u16(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - void GetElems_cpu_b(void* out, void *in,const std::vector &offj, const std::vector &new_offj, const std::vector >&locators, const cytnx_uint64 &TotalElem, const cytnx_uint64 & CommElem); - } -} - -#endif // CYTNX_BACKEND_UTILS_INTERNAL_CPU_GETELEMS_CPU_HPP_H_ diff --git a/src/linalg/Trace.cpp.old b/src/linalg/Trace.cpp.old deleted file mode 100644 index 765de434d..000000000 --- a/src/linalg/Trace.cpp.old +++ /dev/null @@ -1,392 +0,0 @@ -#include "linalg.hpp" -#include "utils/utils.hpp" -#include "Tensor.hpp" -#include "UniTensor.hpp" -#include "cytnx.hpp" -#ifdef UNI_OMP - #include -#endif - -using namespace std; -namespace cytnx { - namespace linalg { - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const cytnx_int64 &a, - const cytnx_int64 &b) { - return Tin.Trace(a, b); - } - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const std::string &a, - const std::string &b) { - return Tin.Trace(a, b); - } - cytnx::UniTensor Trace(const cytnx::UniTensor &Tin, const cytnx_int64 &a, const cytnx_int64 &b, - const bool &by_label) { - return Tin.Trace(a, b, by_label); - } - } // namespace linalg -} // namespace cytnx - -namespace cytnx { - - template - void _trace_2d(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag) { - T a = 0; - T *rawdata = Tn.storage().data(); - cytnx_uint64 Ldim = Tn.shape()[1]; - for (cytnx_uint64 i = 0; i < Ndiag; i++) a += rawdata[i * Ldim + i]; - out.storage().at(0) = a; - } - - template - void _trace_nd(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag, - const cytnx_uint64 &Nelem, const vector &accu, - const vector &remain_rank_id, const vector &shape, - const cytnx_uint64 &ax1, const cytnx_uint64 &ax2) { - UniTensor I = UniTensor(eye(Ndiag), false, -1); - I.set_labels({"0", "1"}); - UniTensor UTn = UniTensor(Tn, false, 2); - UTn.set_labels(vec_cast(vec_range(100, 100 + UTn.labels().size()))); - // UTn.set_label(ax1, "0"); - // UTn.set_label(ax2, "1"); - UTn._impl->_labels[ax1]="0"; - UTn._impl->_labels[ax2]="1"; - out = Contract(I, UTn).get_block_(); - - // vector indexer(Tn.shape().size(), 0); - // cytnx_uint64 tmp; - // for (cytnx_uint64 i = 0; i < Nelem; i++) { - // tmp = i; - // // calculate indexer - // for (int x = 0; x < shape.size(); x++) { - // indexer[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]); - // tmp %= accu[x]; - // } - - // for (cytnx_uint64 d = 0; d < Ndiag; d++) { - // indexer[ax1] = indexer[ax2] = d; - // out.storage().at(i) += Tn.at(indexer); - // } - // } - } - -#ifdef UNI_OMP - template - void _trace_2d_para(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag, const int &Nomp) { - T a = 0; - vector buffer(Nomp); - - #pragma omp parallel for schedule(dynamic) - for (cytnx_uint64 i = 0; i < Ndiag; i++) buffer[omp_get_thread_num()] += Tn.at({i, i}); - - for (int i = 1; i < Nomp; i++) buffer[0] += buffer[i]; - out.storage().at({0}) = buffer[0]; - } - - template - void _trace_nd_para(Tensor &out, const Tensor &Tn, const cytnx_uint64 &Ndiag, - const cytnx_uint64 &Nelem, const vector &accu, - const vector &remain_rank_id, const vector &shape, - const cytnx_uint64 &ax1, const cytnx_uint64 &ax2, const int &Nomp) { - // decide parallel Nelem or Ndiag: - if (false and Nelem < Ndiag) { - // each thread need it's own indexer: - vector> indexers(Nomp, 
vector(Tn.shape().size(), 0)); - // cout << "Ne < Nd" << endl; - #pragma omp parallel for schedule(dynamic) - for (cytnx_uint64 i = 0; i < Nelem; i++) { - cytnx_uint64 tmp = i; - // calculate indexer - for (int x = 0; x < shape.size(); x++) { - indexers[omp_get_thread_num()][remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]); - tmp %= accu[x]; - } - - for (cytnx_uint64 d = 0; d < Ndiag; d++) { - indexers[omp_get_thread_num()][ax1] = indexers[omp_get_thread_num()][ax2] = d; - out.storage().at(i) += Tn.at(indexers[omp_get_thread_num()]); - } - } - - } else { - #pragma omp parallel - { - vector indexers(Tn.shape().size(), 0); - #pragma omp for schedule(static) - for (cytnx_uint64 i = 0; i < Nelem; i++) { - cytnx_uint64 tmp; - tmp = i; - // calculate indexer - for (int x = 0; x < shape.size(); x++) { - indexers[remain_rank_id[x]] = cytnx_uint64(tmp / accu[x]); - tmp %= accu[x]; - } - - for (cytnx_uint64 d = 0; d < Ndiag; d++) { - indexers[ax1] = indexers[ax2] = d; - out.storage().at(i) += Tn.at(indexers); - } - } - } - } - } -#endif - - namespace linalg { - // dtype -1: default - // device -2: default. - Tensor Trace(const Tensor &Tn, const cytnx_uint64 &axisA, const cytnx_uint64 &axisB) { - // checking: - cytnx_error_msg(Tn.shape().size() < 2, "[ERROR] Tensor must have at least rank-2.%s", "\n"); - cytnx_error_msg(axisA >= Tn.shape().size(), "[ERROR] axisA out of bound.%s", "\n"); - cytnx_error_msg(axisB >= Tn.shape().size(), "[ERROR] axisB out of bound.%s", "\n"); - cytnx_error_msg(axisA == axisB, "[ERROR] axisB cannot be the same as axisA.%s", "\n"); - // cytnx_error_msg(dtype == Type.Void,"[ERROR] cannot have output type to be - // Type.Void.%s","\n"); vector indexer(Tn.shape().size()); - - cytnx_uint64 ax1, ax2; - if (axisA < axisB) { - ax1 = axisA; - ax2 = axisB; - } else { - ax1 = axisB; - ax2 = axisA; - } - - // int out_dtype = dtype==-1?Tn.dtype():dtype; - // int out_device = device==-2?Tn.device():device; - - // 1) get redundant rank: - vector shape(Tn.shape().begin(), Tn.shape().end()); - vector accu; - shape.erase(shape.begin() + ax2); - shape.erase(shape.begin() + ax1); - // 2) get out put elementsize. - cytnx_uint64 Nelem = 1; - for (int i = 0; i < shape.size(); i++) Nelem *= shape[i]; - // 3) get diagonal element numbers: - cytnx_uint64 Ndiag = Tn.shape()[ax1] < Tn.shape()[ax2] ? 
Tn.shape()[ax1] : Tn.shape()[ax2]; - - Tensor out = Tensor({Nelem}, Tn.dtype(), Tn.device()); - out.storage().set_zeros(); - -#ifdef UNI_OMP - int Nomp = 1; - #pragma omp parallel - { - if (omp_get_thread_num() == 0) Nomp = omp_get_num_threads(); - } - // std::cout << Nomp <(out, Tn, Ndiag, Nomp); - break; - case Type.ComplexFloat: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Double: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Float: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Uint64: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Int64: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Uint32: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Int32: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Int16: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Uint16: - _trace_2d_para(out, Tn, Ndiag, Nomp); - break; - case Type.Bool: - //_trace_2d_para(out,Tn,Ndiag,Nomp); - cytnx_error_msg( - true, - "[ERROR][Trace] Bool type cannot perform Trace, use .astype() to promote first.%s", - "\n"); - break; - default: - cytnx_error_msg(true, "[ERROR][Trace] invalid Type.%s", "\n"); - break; - } - } else { - vector remain_rank_id; - vector accu(shape.size()); - accu.back() = 1; - for (int i = shape.size() - 1; i > 0; i--) accu[i - 1] = accu[i] * shape[i]; - - for (cytnx_uint64 i = 0; i < Tn.shape().size(); i++) { - if (i != ax1 && i != ax2) remain_rank_id.push_back(i); - } - - switch (Tn.dtype()) { - case Type.ComplexDouble: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, - ax1, ax2, Nomp); - break; - case Type.ComplexFloat: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Double: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Float: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Uint64: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Int64: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Int32: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Uint32: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Uint16: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Int16: - _trace_nd_para(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2, Nomp); - break; - case Type.Bool: - cytnx_error_msg( - true, - "[ERROR][Trace] Bool type cannot perform Trace, use .astype() to promote first.%s", - "\n"); - break; - default: - cytnx_error_msg(true, "[ERROR][Trace] Invalid Type.%s", "\n"); - break; - } // switch - out.reshape_(shape); - } - -#else - - if (shape.size() == 0) { - switch (Tn.dtype()) { - case Type.ComplexDouble: - _trace_2d(out, Tn, Ndiag); - break; - case Type.ComplexFloat: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Double: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Float: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Uint64: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Int64: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Uint32: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Int32: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Int16: - _trace_2d(out, Tn, Ndiag); 
- break; - case Type.Uint16: - _trace_2d(out, Tn, Ndiag); - break; - case Type.Bool: - cytnx_error_msg( - true, - "[ERROR][Trace] Bool type cannot perform Trace, use .astype() to promote first.%s", - "\n"); - break; - default: - cytnx_error_msg(true, "[ERROR][Trace] invalid Type.%s", "\n"); - break; - } - } else { - vector remain_rank_id; - vector accu(shape.size()); - accu.back() = 1; - for (int i = shape.size() - 1; i > 0; i--) accu[i - 1] = accu[i] * shape[i]; - - for (cytnx_uint64 i = 0; i < Tn.shape().size(); i++) { - if (i != ax1 && i != ax2) remain_rank_id.push_back(i); - } - - switch (Tn.dtype()) { - case Type.ComplexDouble: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2); - break; - case Type.ComplexFloat: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, - ax2); - break; - case Type.Double: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Float: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Uint64: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Int64: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Int32: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Uint32: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Uint16: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Int16: - _trace_nd(out, Tn, Ndiag, Nelem, accu, remain_rank_id, shape, ax1, ax2); - break; - case Type.Bool: - cytnx_error_msg( - true, - "[ERROR][Trace] Bool type cannot perform Trace, use .astype() to promote first.%s", - "\n"); - break; - default: - cytnx_error_msg(true, "[ERROR][Trace] Invalid Type.%s", "\n"); - break; - } // switch - out.reshape_(shape); - } - -#endif - - return out; - } - - } // namespace linalg -} // namespace cytnx
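The element-access routines removed in this patch (Tensor_impl::at, the GetElems_cpu_* kernels, and the trace indexers) all rely on the same idea: a multi-dimensional index is folded into a flat row-major storage offset by accumulating index*stride from the last axis to the first. A minimal standalone sketch of that mapping, assuming a plain row-major layout; the function and parameter names below are illustrative only and are not part of the Cytnx API:

```cpp
// Illustrative sketch (not Cytnx code): row-major multi-index -> flat offset,
// mirroring the RealRank/Loc accumulation loops in the deleted sources.
#include <cstdint>
#include <cstddef>
#include <vector>

// Compute the row-major flat offset of `index` inside a tensor of dimensions `shape`.
std::uint64_t flat_offset(const std::vector<std::uint64_t>& shape,
                          const std::vector<std::uint64_t>& index) {
  std::uint64_t offset = 0;
  std::uint64_t stride = 1;
  // Walk from the last (fastest-varying) axis to the first, as the removed
  // at()/GetElems implementations did.
  for (std::size_t i = shape.size(); i-- > 0;) {
    offset += index[i] * stride;
    stride *= shape[i];
  }
  return offset;
}
```

For example, for shape {2, 3, 4} the index {1, 2, 3} maps to 1*12 + 2*4 + 3 = 23, which is the position the removed kernels would read from contiguous storage.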