diff --git a/ABACUS.develop/examples/H2O-deepks-lcao/INPUT b/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
index e9253e2028..94db0eb32e 100644
--- a/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
+++ b/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
@@ -27,3 +27,4 @@ mixing_beta             0.4
 out_band                0
 out_descriptor          1 
 lmax_descriptor			2
+newdm		1
\ No newline at end of file
diff --git a/ABACUS.develop/examples/H2O-deepks-lcao/STRU b/ABACUS.develop/examples/H2O-deepks-lcao/STRU
index 8dc2ac2824..6148031dd4 100644
--- a/ABACUS.develop/examples/H2O-deepks-lcao/STRU
+++ b/ABACUS.develop/examples/H2O-deepks-lcao/STRU
@@ -6,6 +6,10 @@ NUMERICAL_ORBITAL
 H_gga_8au_60Ry_2s1p.orb
 O_gga_7au_60Ry_2s2p1d.orb
 
+NUMERICAL_DESCRIPTOR
+jle.orb
+
+
 LATTICE_CONSTANT
 10
 
diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index bed1c5618c..10825db039 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -81,6 +81,7 @@ memory.o\
 print_info.o\
 mathzone.o\
 mathzone_add1.o\
+math_integral.o\
 integral.o \
 poission.o \
 polint.o \
@@ -153,6 +154,7 @@ LCAO_matrix.o\
 LCAO_nnr.o \
 LCAO_diago.o\
 LCAO_evolve.o\
+LCAO_descriptor.o\
 ylm.o\
 FORCE_STRESS.o\
 FORCE_gamma.o\
@@ -262,7 +264,6 @@ variable_cell.o\
 dftu.o\
 dftu_yukawa.o\
 dftu_relax.o\
-lscc.o\
 
 OBJS_COMMON=atom_spec.o \
 unitcell.o \
diff --git a/ABACUS.develop/source/README b/ABACUS.develop/source/README
index 191291c344..d75f606b16 100644
--- a/ABACUS.develop/source/README
+++ b/ABACUS.develop/source/README
@@ -1,29 +1,5 @@
+
 Currently we are working on optimizing the code structure of ABACUS,
 implementing new functions, and adding more autotests.
 
 -- mohan 2021-02-11
-
-URGENT:
-
-Ask Xiaohui Liu: all functions named with 'after_vc' should be reconstructed.
-The 'FINAL_SCF' global varialble should be removed. 
-(condition: need to reconstruct these codes within a given time)
-
-Ask Xiaohui Liu and Daye Zheng: We need test examples.
-
-Ask Fuxiang He: we need to remove all TDDFT-related global variables
-in global_variable.h, we need TDDFT examples.
-(condition: need to reconstruct these codes within a given time)
-
-Ask Daye Zheng: MD, force, stress modules need reconstruction
-
-NEED TO DO:
-
-Ask Xiaohui: we need to remove DQ and NQX in global_variable.h, 
-but the NQX is computed in ./src_pw/pseudopot_cell_vnl.cpp
-
-Ask Peize: exx_lip.h and related Exx codes
-
-QUESTION:
-
-* in pw_basis.cpp, why ggwfc2=ggwfc if gamma_only is used?
diff --git a/ABACUS.develop/source/input.cpp b/ABACUS.develop/source/input.cpp
index af16488f3f..25fa136250 100644
--- a/ABACUS.develop/source/input.cpp
+++ b/ABACUS.develop/source/input.cpp
@@ -115,6 +115,7 @@ void Input::Default(void)
     atom_file = "";//xiaohui modify 2015-02-01
     kpoint_file = "";//xiaohui modify 2015-02-01
     pseudo_dir = "";
+	read_file_dir = "auto";
     pseudo_type = "auto"; // mohan add 2013-05-20 (xiaohui add 2013-06-23)
 	wannier_card = "";
     latname = "test";
@@ -359,7 +360,7 @@ void Input::Default(void)
 	kernel_type="rpa";
 	eels_method=0;
 	absorption_method=0;
-	system="bulk";
+	system_type="bulk";
 	eta=0.05;
 	domega=0.01;
 	nomega=300;
@@ -456,7 +457,7 @@ void Input::Default(void)
 
 	cell_factor = 1.2; //LiuXh add 20180619
 
-	newDM=1; // Shen Yu add 2019/5/9
+	new_dm=1; // Shen Yu add 2019/5/9
 	mulliken=0;// qi feng add 2019/9/10
 
 //----------------------------------------------------------			//Peize Lin add 2020-04-04
@@ -949,6 +950,10 @@ bool Input::Read(const string &fn)
         {
             read_value(ifs, restart_mode);
         }
+		else if (strcmp("read_file_dir", word) == 0)
+		{
+			read_value(ifs, read_file_dir);
+		}
         else if (strcmp("start_wfc", word) == 0)
         {
             read_value(ifs, start_wfc);
@@ -1404,7 +1409,7 @@ bool Input::Read(const string &fn)
 	    }
 	    else if (strcmp("system", word) == 0)
 	    {
-	        read_value(ifs, system);
+	        read_value(ifs, system_type);
 	    }
 	    else if (strcmp("eta", word) == 0)
 	    {
@@ -1657,7 +1662,7 @@ bool Input::Read(const string &fn)
 		}
 		else if (strcmp("newdm", word) == 0)
 		{
-			read_value(ifs, newDM);
+			read_value(ifs, new_dm);
 		}
 //----------------------------------------------------------------------------------
 //         Xin Qu added on 2020-10-29 for DFT+U
@@ -2040,6 +2045,7 @@ void Input::Bcast()
     Parallel_Common::bcast_double( mixing_gg0 ); //mohan add 2014-09-27
 
     Parallel_Common::bcast_string( restart_mode );
+	Parallel_Common::bcast_string( read_file_dir);
     Parallel_Common::bcast_string( start_wfc );
 	Parallel_Common::bcast_int( mem_saver );
 	Parallel_Common::bcast_int( printe );
@@ -2161,7 +2167,7 @@ void Input::Bcast()
 	Parallel_Common::bcast_string( kernel_type );
 	Parallel_Common::bcast_int( eels_method );
 	Parallel_Common::bcast_int( absorption_method );
-    Parallel_Common::bcast_string( system );
+    Parallel_Common::bcast_string( system_type );
     Parallel_Common::bcast_double( eta );
     Parallel_Common::bcast_double( domega );
     Parallel_Common::bcast_int( nomega );
@@ -2259,7 +2265,7 @@ void Input::Bcast()
 	
 		//Parallel_Common::bcast_int( epsilon0_choice );
     Parallel_Common::bcast_double( cell_factor); //LiuXh add 20180619
-    Parallel_Common::bcast_int( newDM ); // Shen Yu add 2019/5/9
+    Parallel_Common::bcast_int( new_dm ); // Shen Yu add 2019/5/9
     Parallel_Common::bcast_bool( restart_save ); // Peize Lin add 2020.04.04
     Parallel_Common::bcast_bool( restart_load ); // Peize Lin add 2020.04.04
 
@@ -2646,7 +2652,8 @@ void Input::Check(void)
 			else if (ks_solver == "hpseps")
 			{
 #ifdef __MPI
-				ofs_warning << "It's a good choice to use hpseps!" << endl;
+				ofs_warning << "It's not a good choice to use hpseps!" << endl;
+				if(gamma_only) WARNING_QUIT("Input","hpseps can not be used for gamma_only.");
 #else
 				WARNING_QUIT("Input","hpseps can not be used for series version.");
 #endif
@@ -2814,7 +2821,7 @@ void Input::Check(void)
 	// pengfei 2016-12-14
 	if(spectral_type!="None")
 	{
-		if( system!="bulk" && system!="surface")
+		if( system_type!="bulk" && system_type!="surface")
 		{
 			WARNING_QUIT("Input","system must be bulk or surface");
 		}
@@ -2936,6 +2943,20 @@ void Input::Check(void)
 			}
 		}
 	}
+
+	// Resolve global_readin_dir. The path is single-quoted so 'test -d' copes with spaces (NOTE(review): a stat()-based check would avoid the shell entirely).
+	if(read_file_dir=="auto")
+	{
+		global_readin_dir = global_out_dir; // "auto": read from the output directory
+	}
+	else if( system( ("test -d '" + read_file_dir + "'").c_str() ) ) // non-zero status: not an existing directory
+	{
+		WARNING_QUIT("Input","please set right files directory for reading in.");
+	}
+	else
+	{
+		global_readin_dir = read_file_dir + '/'; // stored with a trailing separator
+	}
 	
     return;
 }
diff --git a/ABACUS.develop/source/input.h b/ABACUS.develop/source/input.h
index d0733bdba5..3385c146ac 100644
--- a/ABACUS.develop/source/input.h
+++ b/ABACUS.develop/source/input.h
@@ -29,6 +29,7 @@ class Input
     string suffix;			// suffix of out put dir
     string atom_file;		// file contains atomic positions -- xiaohui modify 2015-02-01
     string pseudo_dir;      // directory of pseudopotential
+	string read_file_dir;   // directory of files for reading
     string pseudo_type;     // the type of pseudopotential, mohan add 2013-05-20, ABACUS supports
 			    			// UPF format (default) and vwr format. (xiaohui add 2013-06-23)
     string kpoint_file;		// file contains k-points -- xiaohui modify 2015-02-01
@@ -296,7 +297,7 @@ class Input
 	int      absorption_method;      // 0: vasp's method  1: pwscf's method
 	//int		 epsilon_choice;         // 0: hilbert_transform method; 1: standard method
 	string   kernel_type;           // the kernel type: rpa, tdlda ...
-	string system;                 // bulk or surface
+	string system_type;                 // bulk or surface
 	double  eta;                   // unit(Ry)
 	double  domega;                // unit(Ry)
 	int     nomega;
@@ -398,7 +399,7 @@ class Input
 //  2: use new DM algorithm and only show key debug information
 //  3: use new DM algorithm and show all detail debug information
 //==========================================================
-    int newDM;
+    int new_dm;
 
 //==========================================================
 //    DFT+U       Xin Qu added on 2020-10-29
diff --git a/ABACUS.develop/source/input_conv.cpp b/ABACUS.develop/source/input_conv.cpp
index 0cc6c51edb..8903aaba06 100644
--- a/ABACUS.develop/source/input_conv.cpp
+++ b/ABACUS.develop/source/input_conv.cpp
@@ -274,7 +274,7 @@ void Input_Conv::Convert(void)
 		}
 		//chi0_hilbert.epsilon = INPUT.epsilon;
 		chi0_hilbert.kernel_type = INPUT.kernel_type;
-		chi0_hilbert.system = INPUT.system;
+		chi0_hilbert.system = INPUT.system_type;
 		chi0_hilbert.eta = INPUT.eta;
 		chi0_hilbert.domega = INPUT.domega;
 		chi0_hilbert.nomega = INPUT.nomega;
@@ -314,7 +314,7 @@ void Input_Conv::Convert(void)
 	{
 		//chi0_standard.epsilon = INPUT.epsilon;
 		chi0_standard.epsilon = true;
-		chi0_standard.system = INPUT.system;
+		chi0_standard.system = INPUT.system_type;
 		chi0_standard.eta = INPUT.eta;
 		chi0_standard.domega = INPUT.domega;
 		chi0_standard.nomega = INPUT.nomega;
@@ -397,6 +397,8 @@ void Input_Conv::Convert(void)
 #endif
 	}
 	else{
+		delete[] soc.m_loc;
+		soc.m_loc = new Vector3<double> [INPUT.ntype];
 		LSPINORB = false;
 		NONCOLIN = false;
 		DOMAG = false;
@@ -578,7 +580,7 @@ void Input_Conv::Convert(void)
 
     ppcell.cell_factor = INPUT.cell_factor; //LiuXh add 20180619
 
-    NEW_DM=INPUT.newDM;  // Shen Yu add 2019/5/9
+//    NEW_DM=INPUT.new_dm;  // Shen Yu add 2019/5/9
 
 //----------------------------------------------------------
 // main parameters / electrons / spin ( 2/16 )
@@ -625,10 +627,11 @@ void Input_Conv::Convert(void)
 //----------------------------------------------------------
 // About LCAO
 //----------------------------------------------------------
-	ORB.ecutwfc = INPUT.lcao_ecut;
-	ORB.dk = INPUT.lcao_dk;
-	ORB.dR = INPUT.lcao_dr;
-	ORB.Rmax = INPUT.lcao_rmax; 
+// mohan add 2021-04-16
+//	ORB.ecutwfc = INPUT.lcao_ecut;
+//	ORB.dk = INPUT.lcao_dk;
+//	ORB.dR = INPUT.lcao_dr;
+//	ORB.Rmax = INPUT.lcao_rmax; 
 
 	// mohan add 2021-02-16
 	berryphase::berry_phase_flag = INPUT.berry_phase;
diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index 8ac68eea46..d1e758f19e 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,15 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables();
+	hm.orb_con.set_orb_tables(
+		UOT, 
+		ORB,
+		INPUT.lcao_ecut,
+		INPUT.lcao_dk,
+		INPUT.lcao_dr,
+		INPUT.lcao_rmax, 
+		ucell.lat0, 
+		Exx_Abfs::Lmax);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile b/ABACUS.develop/source/src_external/GRID_api/Makefile
new file mode 100644
index 0000000000..91366ca57b
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile
@@ -0,0 +1,63 @@
+# This is the Makefile of ABACUS-GRID API
+
+include Makefile.system
+include Makefile.Objects
+
+VPATH=../../src_global\
+:../../src_lcao\
+:./\
+
+#==========================
+# Define HONG
+#==========================
+HONG= -DMETIS -DMKL_ILP64
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS_GDB = -g -W
+
+#==========================
+# OBJECTS NEEDED
+#==========================
+# Objects linked into the executable: main.o plus the lists named below.
+# NOTE(review): OBJS_ORBITAL is not defined by this directory's Makefile.Objects
+# (it defines OBJS_TRY and OBJS_GRID), so it expands to nothing here; confirm whether OBJS_GRID was intended.
+
+FP_OBJS_0=main.o\
+$(OBJS_TRY)\
+$(OBJS_ORBITAL)\
+
+FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
+PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
+PDIAG_MR=$(patsubst %.o, ${OBJ_DIR}/%.o, ${PDIAG_MR_0})
+
+#==========================
+# MAKING OPTIONS
+#==========================
+fp_mpi : # default target: prepare the directories, then build serial2 with $(NP) parallel jobs
+	@ $(MAKE) init
+	@ $(MAKE) -j $(NP) serial2 
+
+init : # create the object directory (with its README marker) and ../bin if they are missing
+	@ if [ ! -d $(OBJ_DIR) ]; then mkdir -p $(OBJ_DIR); fi
+	@ if [ ! -f $(OBJ_DIR)/README ]; then echo "This directory contains all of the .o files" > $(OBJ_DIR)/README; fi
+	@ if [ ! -d ../bin ]; then mkdir -p ../bin; fi
+
+serial : ${FP_OBJS} ${HEADERS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+serial2 : ${FP_OBJS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+#==========================
+# rules
+#==========================
+${OBJ_DIR}/%.o:%.cpp
+	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} -c ${HONG} $< -o $@
+${OBJ_DIR}/%.o:%.f
+	${FORTRAN} -c ${HONG} $< -o $@	 
+
+.PHONY:clean
+clean:
+	@ if [ -d $(OBJ_DIR) ]; then rm -rf $(OBJ_DIR); fi
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects b/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects
new file mode 100644
index 0000000000..57b3e6e618
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects
@@ -0,0 +1,26 @@
+#
+# This is a test makefile for Electronic-structure
+#
+# This particular makefile defines all the executables and object
+# files needed, what they depend on, and the compilation defaults.
+# Makefile.system (which includes Makefile.vars) is included by the Makefile.
+# Those files define the actual commands to use to run the C++
+# compiler, library options and directories, etc., all of which are
+# machine specific and depend on the local installation.
+#
+
+VERSION= ABACUS-GRID
+HEADERS= *.h
+
+OBJS_TRY=math_integral.o\
+complexarray.o\
+complexmatrix.o\
+matrix.o\
+
+OBJS_GRID=grid_base.o\
+grid_base_beta.o\
+grid_bigcell.o\
+grid_meshball.o\
+grid_meshk.o\
+grid_technique.o\
+
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.system b/ABACUS.develop/source/src_external/GRID_api/Makefile.system
new file mode 100644
index 0000000000..c9fa3891cf
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.system
@@ -0,0 +1,14 @@
+include Makefile.vars
+
+#==========================
+# LIBS and INCLUDES
+#==========================
+LIBS = -lifcore -lm -lpthread 
+
+INCLUDES = -I. -Icommands 
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS     = ${INCLUDES} -Ofast -std=c++11 -simd -march=native -m64 -Werror -Wall -pedantic -g
+#OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.vars b/ABACUS.develop/source/src_external/GRID_api/Makefile.vars
new file mode 100644
index 0000000000..f0e5a56adc
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.vars
@@ -0,0 +1,29 @@
+CPLUSPLUS      = icpc
+#CPLUSPLUS     = /public/intel2017/bin/icpc
+
+#CPLUSPLUS_MPI = mpiicpc
+CPLUSPLUS_MPI = icpc 
+
+LAPACK_DIR    = $(MKLROOT)
+#LAPACK_DIR = /public/intel2017/compilers_and_libraries_2017.1.132/linux/mkl
+#LAPACK_DIR = $(MKLROOT)
+#LAPACK_DIR    = /public/intel2017/mkl
+
+FFTW_DIR = /home/mohan/1_Software/impi_fftw-3.3.8
+#FFTW_DIR = /home/qianrui/intelcompile/impi_fftw
+#FFTW_DIR       = /public/udata/xiaohui/software/fftw2
+#FFTW_DIR       =/opt/fftw/3.3.6-p12/intel/2017.update4
+#FFTW_DIR      = /public/fftw-3.3.8
+
+BOOST_DIR = /home/mohan/1_Software/impi_boost-1.70.0
+#BOOST_DIR = /home/qianrui/intelcompile/impi_boost
+#BOOST_DIR      = /public/udata/xiaohui/software/boost_1_39_0
+#BOOST_DIR      = /opt/boost/1.64.0
+
+ELPA_DIR = /home/mohan/1_Software/impi_elpa-16.05.005
+#ELPA_DIR = /home/qianrui/intelcompile/impi_elpa
+#ELPA_DIR   = /public/udata/xiaohui/ELPA-2016.05.004
+#ELPA_DIR = /opt/elpa/intel_2017_update4
+
+OBJ_DIR = obj
+NP      = 14
diff --git a/ABACUS.develop/source/src_external/GRID_api/main.cpp b/ABACUS.develop/source/src_external/GRID_api/main.cpp
new file mode 100644
index 0000000000..4258a489ff
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/main.cpp
@@ -0,0 +1,45 @@
+//#include "timer.h"
+#include <ctime>
+
+void calculate();
+
+// Program entry point: ignores its command-line arguments, calls calculate() and returns 0.
+int main(int argc, char **argv)
+{
+    calculate();
+
+    return 0;
+}
+
+
+// Placeholder driver: the timing/reporting body below is commented out, so this function currently does nothing.
+void calculate()
+{
+/*
+	time_t time_start = std::time(NULL);
+
+//	timer::start();
+
+	//----------------------------------------------------------
+	// main program for doing electronic structure calculations
+	//----------------------------------------------------------
+//	Driver DD;
+//	DD.init();
+
+	time_t	time_finish= std::time(NULL);
+
+	// print out information before ABACUS ends
+	cout << "\n START  Time  : " << ctime(&time_start);
+	cout << " FINISH Time  : " << ctime(&time_finish);
+	cout << " TOTAL  Time  : " << difftime(time_finish, time_start) << endl;
+
+	double total_time = difftime(time_finish, time_start);
+	int hour = total_time / 3600;
+	int mins = ( total_time - 3600 * hour ) / 60;
+	int secs = total_time - 3600 * hour - 60 * mins ;
+	cout << " Total  Time  : " << hour << " h "
+	            << mins << " mins "
+	            << secs << " secs "<< endl;
+*/
+    return;
+}
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile b/ABACUS.develop/source/src_external/ORB_api/Makefile
new file mode 100644
index 0000000000..637c9b3617
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile
@@ -0,0 +1,63 @@
+# This is the Makefile of ABACUS-ORB API
+
+include Makefile.system
+include Makefile.Objects
+
+VPATH=../../src_global\
+:../../src_lcao\
+:./\
+
+#==========================
+# Define HONG
+#==========================
+HONG= -DMETIS -DMKL_ILP64 -D__NORMAL
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS_GDB = -g -W
+
+#==========================
+# OBJECTS NEEDED
+#==========================
+#FP_OBJS_0=$(OBJS_ORBITAL)\
+#$(OBJS_GLOBAL)\
+#main.o\
+
+FP_OBJS_0=main.o\
+$(OBJS_TRY)\
+$(OBJS_ORBITAL)\
+
+FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
+PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
+PDIAG_MR=$(patsubst %.o, ${OBJ_DIR}/%.o, ${PDIAG_MR_0})
+
+#==========================
+# MAKING OPTIONS
+#==========================
+fp_mpi : # default target: prepare the directories, then build serial2 with $(NP) parallel jobs
+	@ $(MAKE) init
+	@ $(MAKE) -j $(NP) serial2 
+
+init : # create the object directory (with its README marker) and ../bin if they are missing
+	@ if [ ! -d $(OBJ_DIR) ]; then mkdir -p $(OBJ_DIR); fi
+	@ if [ ! -f $(OBJ_DIR)/README ]; then echo "This directory contains all of the .o files" > $(OBJ_DIR)/README; fi
+	@ if [ ! -d ../bin ]; then mkdir -p ../bin; fi
+
+serial : ${FP_OBJS} ${HEADERS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+serial2 : ${FP_OBJS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+#==========================
+# rules
+#==========================
+${OBJ_DIR}/%.o:%.cpp
+	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} -c ${HONG} $< -o $@
+${OBJ_DIR}/%.o:%.f
+	${FORTRAN} -c ${HONG} $< -o $@	 
+
+.PHONY:clean
+clean:
+	@ if [ -d $(OBJ_DIR) ]; then rm -rf $(OBJ_DIR); fi
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
new file mode 100644
index 0000000000..cd8c50d06d
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
@@ -0,0 +1,35 @@
+#
+# This is a test makefile for Electronic-structure
+#
+# This particular makefile defines all the executables and object
+# files needed, what they depend on, and the compilation defaults.
+# Makefile.system (which includes Makefile.vars) is included by the Makefile.
+# Those files define the actual commands to use to run the C++
+# compiler, library options and directories, etc., all of which are
+# machine specific and depend on the local installation.
+#
+
+VERSION= ABACUS-ORB
+HEADERS= *.h
+
+OBJS_TRY=math_integral.o\
+complexarray.o\
+complexmatrix.o\
+matrix.o\
+
+OBJS_ORBITAL=ORB_control.o\
+ORB_read.o\
+ORB_atomic.o\
+ORB_atomic_lm.o\
+ORB_nonlocal.o\
+ORB_nonlocal_lm.o\
+ORB_gaunt_table.o\
+ORB_table_beta.o\
+ORB_table_phi.o\
+ORB_table_alpha.o\
+ORB_gen_tables.o\
+
+OBJS_GLOBAL=sph_bessel.o\
+sph_bessel_recursive-d1.o\
+sph_bessel_recursive-d2.o\
+timer.o\
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.system b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
new file mode 100644
index 0000000000..c9fa3891cf
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
@@ -0,0 +1,14 @@
+include Makefile.vars
+
+#==========================
+# LIBS and INCLUDES
+#==========================
+LIBS = -lifcore -lm -lpthread 
+
+INCLUDES = -I. -Icommands 
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS     = ${INCLUDES} -Ofast -std=c++11 -simd -march=native -m64 -Werror -Wall -pedantic -g
+#OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.vars b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
new file mode 100644
index 0000000000..f0e5a56adc
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
@@ -0,0 +1,29 @@
+CPLUSPLUS      = icpc
+#CPLUSPLUS     = /public/intel2017/bin/icpc
+
+#CPLUSPLUS_MPI = mpiicpc
+CPLUSPLUS_MPI = icpc 
+
+LAPACK_DIR    = $(MKLROOT)
+#LAPACK_DIR = /public/intel2017/compilers_and_libraries_2017.1.132/linux/mkl
+#LAPACK_DIR = $(MKLROOT)
+#LAPACK_DIR    = /public/intel2017/mkl
+
+FFTW_DIR = /home/mohan/1_Software/impi_fftw-3.3.8
+#FFTW_DIR = /home/qianrui/intelcompile/impi_fftw
+#FFTW_DIR       = /public/udata/xiaohui/software/fftw2
+#FFTW_DIR       =/opt/fftw/3.3.6-p12/intel/2017.update4
+#FFTW_DIR      = /public/fftw-3.3.8
+
+BOOST_DIR = /home/mohan/1_Software/impi_boost-1.70.0
+#BOOST_DIR = /home/qianrui/intelcompile/impi_boost
+#BOOST_DIR      = /public/udata/xiaohui/software/boost_1_39_0
+#BOOST_DIR      = /opt/boost/1.64.0
+
+ELPA_DIR = /home/mohan/1_Software/impi_elpa-16.05.005
+#ELPA_DIR = /home/qianrui/intelcompile/impi_elpa
+#ELPA_DIR   = /public/udata/xiaohui/ELPA-2016.05.004
+#ELPA_DIR = /opt/elpa/intel_2017_update4
+
+OBJ_DIR = obj
+NP      = 14
diff --git a/ABACUS.develop/source/src_external/ORB_api/main.cpp b/ABACUS.develop/source/src_external/ORB_api/main.cpp
new file mode 100644
index 0000000000..4258a489ff
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/main.cpp
@@ -0,0 +1,45 @@
+//#include "timer.h"
+#include <ctime>
+
+void calculate();
+
+// Program entry point: ignores its command-line arguments, calls calculate() and returns 0.
+int main(int argc, char **argv)
+{
+    calculate();
+
+    return 0;
+}
+
+
+// Placeholder driver: the timing/reporting body below is commented out, so this function currently does nothing.
+void calculate()
+{
+/*
+	time_t time_start = std::time(NULL);
+
+//	timer::start();
+
+	//----------------------------------------------------------
+	// main program for doing electronic structure calculations
+	//----------------------------------------------------------
+//	Driver DD;
+//	DD.init();
+
+	time_t	time_finish= std::time(NULL);
+
+	// print out information before ABACUS ends
+	cout << "\n START  Time  : " << ctime(&time_start);
+	cout << " FINISH Time  : " << ctime(&time_finish);
+	cout << " TOTAL  Time  : " << difftime(time_finish, time_start) << endl;
+
+	double total_time = difftime(time_finish, time_start);
+	int hour = total_time / 3600;
+	int mins = ( total_time - 3600 * hour ) / 60;
+	int secs = total_time - 3600 * hour - 60 * mins ;
+	cout << " Total  Time  : " << hour << " h "
+	            << mins << " mins "
+	            << secs << " secs "<< endl;
+*/
+    return;
+}
diff --git a/ABACUS.develop/source/src_global/complexarray.cpp b/ABACUS.develop/source/src_global/complexarray.cpp
index b3826c499f..7637a6f159 100644
--- a/ABACUS.develop/source/src_global/complexarray.cpp
+++ b/ABACUS.develop/source/src_global/complexarray.cpp
@@ -1,11 +1,3 @@
-/***********************************************************
-    DFT++ is a density functional package developed
-	by the research group
-    of Professor Tomas Arias
-
-    Copyright 1996-2003 Sohrab Ismail-Beigi
-************************************************************/
-
 #include <iostream>
 #include <fstream>
 #include <iomanip>
diff --git a/ABACUS.develop/source/src_global/complexarray.h b/ABACUS.develop/source/src_global/complexarray.h
index 838091e65c..28dc04066d 100644
--- a/ABACUS.develop/source/src_global/complexarray.h
+++ b/ABACUS.develop/source/src_global/complexarray.h
@@ -1,10 +1,3 @@
-/*
-    DFT++ is a density functional package developed by the research group
-    of Professor Tomas Arias
-
-    Copyright 1996-2003 Sohrab Ismail-Beigi
-*/
-
 #ifndef COMPLEX_ARRAY_H
 #define COMPLEX_ARRAY_H
 
diff --git a/ABACUS.develop/source/src_global/complexmatrix.cpp b/ABACUS.develop/source/src_global/complexmatrix.cpp
index 3df5bea358..057a1dc046 100644
--- a/ABACUS.develop/source/src_global/complexmatrix.cpp
+++ b/ABACUS.develop/source/src_global/complexmatrix.cpp
@@ -1,15 +1,14 @@
-//==========================================================
-// AUTHOR : Lixin He, Mohan Chen
-// LAST UPDATE : 2009-03-23 modify "=" operator
-//==========================================================
-
 #include <cassert>
 #include <new>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include "complexmatrix.h"
+
+#ifdef __NORMAL
+#else
 #include "lapack_connector.h"
+#endif
 
 // constructor with sizes
 ComplexMatrix::ComplexMatrix(const int nrows, const int ncols, const bool flag_zero)
@@ -75,7 +74,9 @@ ComplexMatrix::ComplexMatrix(const matrix &m)
 	{
 		c = new complex<double>[size];
 		for( int i=0; i<size; ++i)
+		{
 			c[i] = m.c[i];
+		}
 	}
 }
 
@@ -123,7 +124,7 @@ void ComplexMatrix::create(const int nr_in, const int nc_in, const bool flag_zer
 	}
 }
 
-void ComplexMatrix::set_as_identity_matrix()
+void ComplexMatrix::set_as_identity_matrix(void)
 {
 	for(int i=0; i<nr; i++)
 	{
@@ -165,23 +166,28 @@ ComplexMatrix operator*(const ComplexMatrix &m1, const ComplexMatrix &m2)
 	assert(m1.nc == m2.nr);
 	ComplexMatrix mprod(m1.nr, m2.nc);
 
+// mohan add 2021-04-05
+#ifdef __NORMAL
 	complex<double> z;
-//	for (int i = 0;i < m1.nr;i++)
-//	{
-//		for (int j = 0;j < m2.nc;j++)
-//		{
-//			z = complex<double>(0,0);
-//			for (int k = 0;k < m1.nc;k++)
-//			{
-//				z += m1(i, k) * m2(k, j);
-//			}
-//			mprod(i, j) = z;
-//		}
-//	}
+	for (int i = 0;i < m1.nr;i++)
+	{
+		for (int j = 0;j < m2.nc;j++)
+		{
+			z = complex<double>(0,0);
+			for (int k = 0;k < m1.nc;k++)
+			{
+				z += m1(i, k) * m2(k, j);
+			}
+			mprod(i, j) = z;
+		}
+	}
+#else
 	// Peize Lin accelerate 2017-10-27
 	LapackConnector::gemm('N', 'N', m1.nr, m2.nc, m1.nc,
 		1, m1.c, m1.nc, m2.c, m2.nc,
 		0, mprod.c, mprod.nc);
+#endif
+
 	return mprod;
 }
 
@@ -403,4 +409,4 @@ ComplexMatrix conj(const ComplexMatrix &m)
 	for(int i=0; i!=m.size; ++i)
 		cm.c[i] = conj(m.c[i]);
 	return cm;
-}
\ No newline at end of file
+}
diff --git a/ABACUS.develop/source/src_global/complexmatrix.h b/ABACUS.develop/source/src_global/complexmatrix.h
index 34fdfe2b47..d43afb658d 100644
--- a/ABACUS.develop/source/src_global/complexmatrix.h
+++ b/ABACUS.develop/source/src_global/complexmatrix.h
@@ -1,15 +1,10 @@
-//==========================================================
-// Author : Lixin He, Mohan Chen
-// Update : Peize Lin
-// Last Update : 2018-09-04
-//==========================================================
 #ifndef COMPLEXMATRIX_H
 #define COMPLEXMATRIX_H
 
 #include <complex>
 using namespace std;
 
-#include "src_global/matrix.h"
+#include "matrix.h"
 
 #ifdef _MCD_CHECK
 #include "src_parallel/mcd.h"
diff --git a/ABACUS.develop/source/src_global/constants.h b/ABACUS.develop/source/src_global/constants.h
index 6e0b31b71a..258358a774 100644
--- a/ABACUS.develop/source/src_global/constants.h
+++ b/ABACUS.develop/source/src_global/constants.h
@@ -1,7 +1,3 @@
-//==========================================================
-// AUTHOR : Lixin He,mohan
-// DATE : 2008-11-07
-//==========================================================
 #ifndef CONSTANT_H
 #define CONSTANT_H
 #include <complex>
diff --git a/ABACUS.develop/source/src_global/global_function-func_each_2.h b/ABACUS.develop/source/src_global/global_function-func_each_2.h
index 76561f3c3a..1d7b01d31f 100644
--- a/ABACUS.develop/source/src_global/global_function-func_each_2.h
+++ b/ABACUS.develop/source/src_global/global_function-func_each_2.h
@@ -1,23 +1,6 @@
 // AUTHOR:	Peize Lin
 // Date: 	2016-09-07
 
-
-/*˵����
-����
-	T tA;
-	const T tB;
-	const ��������(������);
-��
-	func( tA, tB, �������� );
-���
-	L1<L2<...<Ln<T>>...>> t_listA;
-	const L1<L2<...<Ln<T>>...>> t_listB;
-	������L1��L2��...LnΪvector��map��
-��
-	FUNC_EACH_2( t_listA, t_listB, func, �������� );
-*/
-
-
 #ifndef FUNC_EACH_2_H
 #define FUNC_EACH_2_H
 
@@ -63,4 +46,4 @@ void FUNC_EACH_2(
 	}
 }
 
-#endif // FUNC_EACH_2_H
\ No newline at end of file
+#endif // FUNC_EACH_2_H
diff --git a/ABACUS.develop/source/src_global/global_function.h b/ABACUS.develop/source/src_global/global_function.h
index fc9835478b..bcebfff51b 100644
--- a/ABACUS.develop/source/src_global/global_function.h
+++ b/ABACUS.develop/source/src_global/global_function.h
@@ -1,8 +1,3 @@
-//==========================================================
-// AUTHOR : mohan
-// LAST UPDATE : 2009-02-26
-// Add : READ_VALUE; SCAN_BEGIN; SCAN_END; 2009-02-26
-//==========================================================
 #ifndef GLOBAL_FUNCTION_H
 #define GLOBAL_FUNCTION_H
 
diff --git a/ABACUS.develop/source/src_global/global_variable.cpp b/ABACUS.develop/source/src_global/global_variable.cpp
index e7d0a7c5de..3bffb172c2 100644
--- a/ABACUS.develop/source/src_global/global_variable.cpp
+++ b/ABACUS.develop/source/src_global/global_variable.cpp
@@ -122,6 +122,7 @@ string	global_pseudo_dir = "./";
 string  global_pseudo_type = "upf"; // mohan add 2013-05-20, default is UPF, we can also use VWR (xiaohui add 2013-06-23)
 string	global_epm_pseudo_card;
 string	global_out_dir;
+string  global_readin_dir; //zhengdy modified
 
 ofstream ofs_running;
 ofstream ofs_warning;
@@ -176,5 +177,3 @@ int NPOL      = 1;
 int PRENSPIN  = 1;
 
 bool FINAL_SCF = false; //LiuXh add 20180619
-
-int NEW_DM=0;  // Shen Yu add 2019/5/9
diff --git a/ABACUS.develop/source/src_global/global_variable.h b/ABACUS.develop/source/src_global/global_variable.h
index b8b0b2a3e1..b82760862f 100644
--- a/ABACUS.develop/source/src_global/global_variable.h
+++ b/ABACUS.develop/source/src_global/global_variable.h
@@ -149,6 +149,7 @@ extern string	global_wannier_card;
 extern string	global_pseudo_dir;
 extern string   global_pseudo_type; // mohan add 2013-05-20 (xiaohui add 2013-06-23)
 extern string 	global_out_dir;
+extern string   global_readin_dir; //zhengdy modified
 
 extern ofstream ofs_running;
 extern ofstream ofs_warning;
@@ -195,6 +196,5 @@ extern int test_ion_dynamics;
 extern int test_deconstructor;
 
 extern bool FINAL_SCF; //LiuXh add 20180619
-extern int NEW_DM;  // Shen Yu add 2019/5/9
 
 #endif
diff --git a/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h b/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
index 1358260fcd..dd985feffd 100644
--- a/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
+++ b/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
@@ -10,6 +10,7 @@
 
 #include "mathzone.h"
 #include "lapack_connector.h"
+#include "math_integral.h" // mohan add 2021-04-03
 
 template<typename Func_Type, typename R_Type>
 Gram_Schmidt_Orth<Func_Type,R_Type>::Gram_Schmidt_Orth( const vector<R_Type> &rab_in, const Coordinate &coordinate_in )
@@ -75,13 +76,13 @@ Func_Type Gram_Schmidt_Orth<Func_Type,R_Type>::cal_norm( const vector<Func_Type>
 	{
 		case Coordinate::Cartesian:
 		{
-			Mathzone::Simpson_Integral( f.size(), VECTOR_TO_PTR(f), VECTOR_TO_PTR(rab), norm);		
+			Integral::Simpson_Integral( f.size(), VECTOR_TO_PTR(f), VECTOR_TO_PTR(rab), norm);		
 			break;
 		}
 		case Coordinate::Sphere:	
 		{
 			const vector<Func_Type> &&tmp_func = Mathzone::Pointwise_Product( f, radial_2 );
-			Mathzone::Simpson_Integral( f.size(), VECTOR_TO_PTR(tmp_func), VECTOR_TO_PTR(rab), norm);	
+			Integral::Simpson_Integral( f.size(), VECTOR_TO_PTR(tmp_func), VECTOR_TO_PTR(rab), norm);	
 			break;
 		}
 		default:
@@ -93,4 +94,4 @@ Func_Type Gram_Schmidt_Orth<Func_Type,R_Type>::cal_norm( const vector<Func_Type>
 	return norm;
 }
 
-#endif	// GRAM_SCHMIDT_ORTH_INL_H
\ No newline at end of file
+#endif	// GRAM_SCHMIDT_ORTH_INL_H
diff --git a/ABACUS.develop/source/src_global/integral.cpp b/ABACUS.develop/source/src_global/integral.cpp
index ea7430dccb..6638d256e0 100644
--- a/ABACUS.develop/source/src_global/integral.cpp
+++ b/ABACUS.develop/source/src_global/integral.cpp
@@ -5,12 +5,12 @@
 #include <cmath>
 using namespace std;
 
-int Integral::n_root = 512;
-bool Integral::calc_wx = false;
-double* Integral::gauleg_w;
-double* Integral::gauleg_x;
+int Integral_G::n_root = 512;
+bool Integral_G::calc_wx = false;
+double* Integral_G::gauleg_w;
+double* Integral_G::gauleg_x;
 
-double Integral::Gauss_Legendre
+double Integral_G::Gauss_Legendre
 (
 	const double &a,
 	const double &b,
@@ -27,7 +27,7 @@ double Integral::Gauss_Legendre
 	
 	if(!calc_wx) 
 	{
-		Integral::gauleg();
+		Integral_G::gauleg();
 		calc_wx = true;
 	}
 
@@ -55,7 +55,7 @@ double Integral::Gauss_Legendre
 	return sum * dab / 2.0;
 }
 
-void Integral::gauleg()
+void Integral_G::gauleg(void)
 {
 	int m, j, i;
 	double z1,z,xm,xl,pp,p3,p2,p1;
diff --git a/ABACUS.develop/source/src_global/integral.h b/ABACUS.develop/source/src_global/integral.h
index 7f1f3fde7e..f26ae67b59 100644
--- a/ABACUS.develop/source/src_global/integral.h
+++ b/ABACUS.develop/source/src_global/integral.h
@@ -1,11 +1,12 @@
-#ifndef INTEGRAL_H
-#define INTEGRAL_H
+#ifndef INTEGRAL_G_H
+#define INTEGRAL_G_H
 
-class Integral
+class Integral_G
 {
 	public:
-	Integral();
-	~Integral();
+
+	Integral_G();
+	~Integral_G();
 
 	static double Gauss_Legendre
 	(
diff --git a/ABACUS.develop/source/src_global/lapack_connector.h b/ABACUS.develop/source/src_global/lapack_connector.h
index 3494a65c18..19f6476027 100644
--- a/ABACUS.develop/source/src_global/lapack_connector.h
+++ b/ABACUS.develop/source/src_global/lapack_connector.h
@@ -1,18 +1,3 @@
-// =============================================================================
-//                          C++ Header File
-// Project:         LapackConnector
-// File:            LapackConnector.hpp
-// Author:          sltk
-// Comment:         LapackConnector provide the connector to the fortran Lapack routine.
-// Warning:
-// Start time:      2007-03-08
-// Last modified:   2008-08-12 ywcui : add zhegvx
-// 					2008-08-13 mohan : find bug,test.
-// 					2008-09-03 mohan : Add zgesv
-// 					2009-03-08 mohan : add ilaenv
-//					2010-01-22 spshu : add dgesvd
-// =============================================================================
-
 #ifndef LAPACKCONNECTOR_HPP
 #define LAPACKCONNECTOR_HPP
 
@@ -23,7 +8,7 @@
 #include "matrix.h"
 #include "complexmatrix.h"
 #include "blas_connector.h"
-#include "src_global/global_function.h"
+#include "global_function.h"
 
 
 extern "C"
diff --git a/ABACUS.develop/source/src_global/math_integral.cpp b/ABACUS.develop/source/src_global/math_integral.cpp
new file mode 100644
index 0000000000..1b7b994e63
--- /dev/null
+++ b/ABACUS.develop/source/src_global/math_integral.cpp
@@ -0,0 +1,220 @@
+#include "math_integral.h"
+#include <stddef.h> // use size_t
+#include <cassert>
+
+Integral::Integral(){} // nothing to set up: the class only exposes static routines
+
+Integral::~Integral(){} // nothing to release
+
+
+// Peize Lin accelerate 2017-10-02
+/*
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double &asum
+)
+{
+    //     simpson's rule integration. On input:
+    //     mesh = the number of grid points (should be odd)
+    //     func(i)= function to be integrated
+    //     rab(i) = r(i) * dr(i)/di * di
+    //     For the logarithmic grid not including r=0 :
+    //     r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
+    //     For the logarithmic grid including r=0 :
+    //     r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
+    //     Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
+    //     where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    
+    //  simpson's rule integrator for function stored on the
+    //  radial logarithmic mesh
+    //	routine assumes that mesh is an odd number so run check
+    if (mesh % 2 == 0)
+    {
+        cout << "\n error in subroutine simpson ";
+        cout << "\n routine assumes mesh is odd but mesh = "
+             << mesh << endl;
+        return;
+    }
+
+    asum = 0.00;
+    const double r12 = 1.00 / 12.00;
+    double f3 = func [0] * rab [0] * r12;
+    for (int i = 1;i < mesh;i += 2)
+    {
+        const double f1 = f3;
+        const double f2 = func [i] * rab [i] * r12;
+        f3 = func [i + 1] * rab [i + 1] * r12;
+        asum += 4.00 * f1 + 16.00 * f2 + 4.00 * f3;
+    }
+    return;
+}// end subroutine simpson
+*/
+
+
+// Peize Lin accelerate 2017-10-02
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double * const func,
+    const double * const rab,
+    double &asum
+)
+{
+    /*     Simpson's rule integration. On input:
+    !      mesh = the number of grid points (must be odd and >= 3)
+    !      func(i)= function to be integrated
+    !      rab(i) = r(i) * dr(i)/di * di
+    !      For the logarithmic grid not including r=0 :
+    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
+    !      For the logarithmic grid including r=0 :
+    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
+    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
+    !      where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    */
+    //  The interior sum below is accumulated as (2*odd + even) and then
+    //  doubled, which yields the 4/2 Simpson weights with fewer multiplies.
+    //	mesh < 3 would index func[mesh-2] out of range, so reject it as well.
+    assert(mesh >= 3 && (mesh&1));
+
+    asum = 0.00;
+	const size_t end = mesh-2;
+    for( size_t i=1; i!=end; i+=2 )
+    {
+		const double f1 = func[i]*rab[i];
+		asum += f1 + f1 + func[i+1]*rab[i+1];
+    }
+	const double f1 = func[mesh-2]*rab[mesh-2];
+	asum += f1+f1;
+	asum += asum;
+	asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
+	asum /= 3.0;
+    return;
+}// end subroutine simpson
+
+
+// Peize Lin accelerate 2017-10-02
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double * const func,
+    const double dr,
+    double &asum
+)
+{
+    /*     Simpson's rule integration with one common step. On input:
+    !      mesh = the number of grid points (must be odd and >= 3)
+    !      func(i)= function to be integrated
+    !      dr     = single step factor shared by all grid points
+    !      No per-point weight is applied inside the loop;
+    !      the common factor dr is multiplied in once at the end.
+    !      Output in asum = dr * \sum_i c_i f(i)
+    !      where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    !      (c_1 and c_mesh use 1-based numbering: they weight
+    !      func[0] and func[mesh-1] in the code below)
+    */
+    //  The interior sum below is accumulated as (2*odd + even) and then
+    //  doubled, which yields the 4/2 Simpson weights with fewer multiplies.
+    //	mesh < 3 would index func[mesh-2] out of range, so reject it as well.
+    assert(mesh >= 3 && (mesh&1));
+
+    asum = 0.00;
+	const size_t end = mesh-2;
+    for(size_t i=1; i!=end; i+=2 )
+    {
+		const double f1 = func[i];
+		asum += f1 + f1 + func[i+1];
+    }
+	const double f1 = func[mesh-2];
+	asum += f1+f1;
+	asum += asum;
+	asum += func[0] + func[mesh-1];
+	asum *= dr/3.0;
+    return;
+}// end subroutine simpson
+
+
+// Peize Lin add 2016-02-14
+void Integral::Simpson_Integral_0toall
+(
+    const int mesh,
+    const double * const func,
+    const double * const rab,
+    double * const asum
+)
+{
+    // asum(r) = \int_{r'=0}^{r} dr' f(r') 
+    // odd points take a trapezoid step, even points a full Simpson step
+    const double r2=1.00/2.00, r3=1.00/3.00;
+    asum[0] = 0.00;
+    double f3 = func [0] * rab [0];
+    for( int i=1; i<mesh; i+=2)
+    {
+        const double f1 = f3;
+        const double f2 = func[i] * rab[i] ;
+        asum[i] = asum[i-1] + r2*( f1 + f2);
+        if(i+1<mesh)
+        {
+            f3 = func[i+1] * rab[i+1] ; // read only when in range (even mesh ends at i)
+            asum[i+1] = asum[i-1] + r3*( f1 + 4.00*f2 + f3 );
+        }
+    }
+    return;
+}
+
+
+// Peize Lin add 2016-02-14
+// faster but still have bug
+/*void Integral::Simpson_Integral_alltoinf
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double *asum
+)
+{
+    // asum(r) = \int_{r'=r}^{+\infty} dr' f(r') 
+    //         = \int_{r'=r}^{mesh} dr' f(r')
+
+    const double r2=1.00/2.00, r3=1.00/3.00;
+    asum[mesh-1] = 0.00;
+    const int odd_mesh = (mesh-1)^~1;
+    double f1 = func[odd_mesh] * rab[odd_mesh];
+    for( size_t i=(mesh-3)|1; i>0; i-=2)
+    {
+        const double f3 = f1;   
+        if( i+3==mesh )
+        {
+            const double f4 = func[mesh-1] * rab[mesh-1];
+            asum[mesh-2] = r2*(f3 + f4);
+        }
+        const double f2 = func[i] * rab[i] ;
+        f1 = func[i-1] * rab[i-1] ;
+        asum[i-1] = asum[i+1] + r3*( f1 + 4.00*f2 + f3 );
+        asum[i] = asum[i-1] - r2*( f1 + f2);
+    }
+    return;
+}*/
+
+
+// Peize Lin add 2016-06-11
+// a little slower
+void Integral::Simpson_Integral_alltoinf
+(
+    const int mesh,
+    const double * const func,
+    const double * const rab,
+    double * const asum
+)
+{
+    // asum(r) = \int_{r'=r}^{mesh} dr' f(r') = total - \int_{r'=0}^{r} dr' f(r')
+    Integral::Simpson_Integral_0toall( mesh, func, rab, asum );
+    const double total = asum[mesh-1];
+    for (int ir = 0; ir < mesh; ++ir)
+    {
+        asum[ir] = total - asum[ir];
+    }
+    // every entry now holds the integral from its own grid point to the last one
+}
diff --git a/ABACUS.develop/source/src_global/math_integral.h b/ABACUS.develop/source/src_global/math_integral.h
new file mode 100644
index 0000000000..16c64f6a9e
--- /dev/null
+++ b/ABACUS.develop/source/src_global/math_integral.h
@@ -0,0 +1,51 @@
+#ifndef MATH_INTEGRAL_H
+#define MATH_INTEGRAL_H
+
+// mohan add 2021-04-03
+
+class Integral
+{
+	// Simpson-rule integration helpers; every routine below is static.
+	public:
+
+    Integral();
+    ~Integral();
+
+	// asum = Simpson sum of func[i]*rab[i]; mesh must be odd (Peize Lin accelerate 2017-10-02)
+    static void Simpson_Integral
+    (
+        const int mesh,
+        const double * const func,
+        const double * const rab,
+        double &asum
+    );
+
+	// asum = dr * Simpson sum of func[i]; mesh must be odd (Peize Lin accelerate 2017-10-02)
+	static void Simpson_Integral
+	(
+		const int mesh,
+		const double * const func,
+		const double dr,
+		double &asum
+	);
+
+    // asum[i] = running integral of func*rab from point 0 to point i (Peize Lin add 2016-02-14)
+    static void Simpson_Integral_0toall
+    (
+        const int mesh,
+        const double * const func,
+        const double * const rab,
+        double * const asum
+    );
+
+    // asum[i] = integral of func*rab from point i to point mesh-1 (Peize Lin add 2016-02-14)
+    static void Simpson_Integral_alltoinf
+    (
+        const int mesh,
+        const double * const func,
+        const double * const rab,
+        double * const asum
+    );     
+
+};
+#endif
diff --git a/ABACUS.develop/source/src_global/mathzone.cpp b/ABACUS.develop/source/src_global/mathzone.cpp
index 7a8f3176b1..5acca148be 100644
--- a/ABACUS.develop/source/src_global/mathzone.cpp
+++ b/ABACUS.develop/source/src_global/mathzone.cpp
@@ -1391,208 +1391,6 @@ int Mathzone::Semi_Fact(const int n)
     return semif;
 }
 
-// Peize Lin accelerate 2017-10-02
-/*
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double &asum
-)
-{
-    //     simpson's rule integration. On input:
-    //     mesh = mhe number of grid points (should be odd)
-    //     func(i)= function to be integrated
-    //     rab(i) = r(i) * dr(i)/di * di
-    //     For the logarithmic grid not including r=0 :
-    //     r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    //     For the logarithmic grid including r=0 :
-    //     r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    //     Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    //     where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    if (mesh % 2 == 0)
-    {
-        cout << "\n error in subroutine simpson ";
-        cout << "\n routine assumes mesh is odd but mesh = "
-             << mesh << endl;
-        return;
-    }
-
-    asum = 0.00;
-    const double r12 = 1.00 / 12.00;
-    double f3 = func [0] * rab [0] * r12;
-    for (int i = 1;i < mesh;i += 2)
-    {
-        const double f1 = f3;
-        const double f2 = func [i] * rab [i] * r12;
-        f3 = func [i + 1] * rab [i + 1] * r12;
-        asum += 4.00 * f1 + 16.00 * f2 + 4.00 * f3;
-    }
-    return;
-}// end subroutine simpson
-*/
-
-// Peize Lin accelerate 2017-10-02
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double &asum
-)
-{
-    /*     simpson's rule integration. On input:
-    !      mesh = mhe number of grid points (should be odd)
-    !      func(i)= function to be integrated
-    !      rab(i) = r(i) * dr(i)/di * di
-    !      For the logarithmic grid not including r=0 :
-    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    !      For the logarithmic grid including r=0 :
-    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    !      where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    */
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    assert(mesh&1);
-
-    asum = 0.00;
-	const size_t end = mesh-2;
-    for( size_t i=1; i!=end; i+=2 )
-    {
-		const double f1 = func[i]*rab[i];
-		asum += f1 + f1 + func[i+1]*rab[i+1];
-    }
-	const double f1 = func[mesh-2]*rab[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
-	asum /= 3.0;
-    return;
-}// end subroutine simpson
-
-// Peize Lin accelerate 2017-10-02
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double dr,
-    double &asum
-)
-{
-    /*     simpson's rule integration. On input:
-    !      mesh = mhe number of grid points (should be odd)
-    !      func(i)= function to be integrated
-    !      rab(i) = r(i) * dr(i)/di * di
-    !      For the logarithmic grid not including r=0 :
-    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    !      For the logarithmic grid including r=0 :
-    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    !      where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    */
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    assert(mesh&1);
-
-    asum = 0.00;
-	const size_t end = mesh-2;
-    for( size_t i=1; i!=end; i+=2 )
-    {
-		const double f1 = func[i];
-		asum += f1 + f1 + func[i+1];
-    }
-	const double f1 = func[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0] + func[mesh-1];
-	asum *= dr/3.0;
-    return;
-}// end subroutine simpson
-
-// Peize Lin add 2016-02-14
-void Mathzone::Simpson_Integral_0toall
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    // asum(r) = \int_{r'=0}^{r} dr' f(r') 
-
-    const double r2=1.00/2.00, r3=1.00/3.00;
-    asum[0] = 0.00;
-    double f3 = func [0] * rab [0];
-    for( int i=1; i<mesh; i+=2)
-    {
-        const double f1 = f3;
-        const double f2 = func[i] * rab[i] ;
-        f3 = func[i+1] * rab[i+1] ;
-        asum[i] = asum[i-1] + r2*( f1 + f2);
-        if(i+1<mesh)
-        {
-            asum[i+1] = asum[i-1] + r3*( f1 + 4.00*f2 + f3 );
-        }
-    }
-    return;
-}
-
-// Peize Lin add 2016-02-14
-// faster but still have bug
-/*void Mathzone::Simpson_Integral_alltoinf
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    // asum(r) = \int_{r'=r}^{+\infty} dr' f(r') 
-    //         = \inf_{r'=r}^{mesh} dr' f(r')
-
-    const double r2=1.00/2.00, r3=1.00/3.00;
-    asum[mesh-1] = 0.00;
-    const int odd_mesh = (mesh-1)^~1;
-    double f1 = func[odd_mesh] * rab[odd_mesh];
-    for( size_t i=(mesh-3)|1; i>0; i-=2)
-    {
-        const double f3 = f1;   
-        if( i+3==mesh )
-        {
-            const double f4 = func[mesh-1] * rab[mesh-1];
-            asum[mesh-2] = r2*(f3 + f4);
-        }
-        const double f2 = func[i] * rab[i] ;
-        f1 = func[i-1] * rab[i-1] ;
-        asum[i-1] = asum[i+1] + r3*( f1 + 4.00*f2 + f3 );
-        asum[i] = asum[i-1] - r2*( f1 + f2);
-    }
-    return;
-}*/
-
-// Peize Lin add 2016-06-11
-// a little lower
-void Mathzone::Simpson_Integral_alltoinf
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    Mathzone::Simpson_Integral_0toall( mesh, func, rab, asum );
-    const double asum_all = asum[mesh-1];
-    for (int i = 0;i < mesh; ++i)
-        asum[i] = asum_all - asum[i];
-}
 
 void Mathzone::To_Polar_Coordinate
 (
diff --git a/ABACUS.develop/source/src_global/mathzone.h b/ABACUS.develop/source/src_global/mathzone.h
index e6b58265b9..c9e369c8ca 100644
--- a/ABACUS.develop/source/src_global/mathzone.h
+++ b/ABACUS.develop/source/src_global/mathzone.h
@@ -2,9 +2,12 @@
 #define MATHZONE_H
 
 #include "realarray.h"
+#include "vector3.h"
+#include "matrix3.h"
 #include <vector>
 #include <map>
 #include <cassert>
+#include <complex>
 using namespace std;
 
 class Mathzone
@@ -144,41 +147,6 @@ class Mathzone
     static long double Fact(const int n);
     static int Semi_Fact(const int n);
 
-	// Peize Lin accelerate 2017-10-02
-    static void Simpson_Integral
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double &asum
-    );
-	// Peize Lin accelerate 2017-10-02
-	static void Simpson_Integral
-	(
-		const int mesh,
-		const double *func,
-		const double dr,
-		double &asum
-	);
-
-    // Peize Lin add 2016-02-14
-    static void Simpson_Integral_0toall
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
-    );
-
-    // Peize Lin add 2016-02-14
-    static void Simpson_Integral_alltoinf
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
-    );     
-	
 	// Peize Lin add 2016-08-03
 	template< typename Type >
 	static vector<Type> Pointwise_Product( const vector<Type> &f1, const vector<Type> &f2 )
diff --git a/ABACUS.develop/source/src_global/matrix.cpp b/ABACUS.develop/source/src_global/matrix.cpp
index db726d667e..7fbb7fc0a4 100644
--- a/ABACUS.develop/source/src_global/matrix.cpp
+++ b/ABACUS.develop/source/src_global/matrix.cpp
@@ -11,7 +11,11 @@
 
 using namespace std;
 #include "matrix.h"
+
+#ifdef __NORMAL
+#else
 #include "lapack_connector.h"
+#endif
 
 //*********************************************************
 // The init() function is the main initialization routine.
@@ -23,7 +27,13 @@ using namespace std;
 
 void matrixAlloc()
 {
+// mohan add 2021-04-25: stand-alone build (__NORMAL) has no WARNING_QUIT
+#ifdef __NORMAL
+	cout << "Allocation error for Matrix" << endl;
+	exit(1); // allocation failure is an error: do not report success (0)
+#else
 	WARNING_QUIT("matrix","Allocation error for Matrix");
+#endif
 }
 
 matrix::matrix( const int nrows, const int ncols, const bool flag_zero )
@@ -189,19 +199,28 @@ matrix operator*(const matrix &m1, const matrix &m2)
     // allocate the result and zero it out
     matrix mprod( m1.nr, m2.nc, false );
 
+#ifdef __NORMAL
+	mprod.zero_out();
     // do the multiply and return
-//    for (int i = 0;i < m1.nr;i++)
-//        for (int j = 0;j < m2.nc;j++)
-//            for (int k = 0;k < m1.nc;k++)
-//                //mprod(i, j) += m2(i, k) * m1(k, j);
-//                mprod(i, j) += m1(i, k) * m2(k, j);
-	
+    for (int i = 0;i < m1.nr;i++)
+	{
+        for (int j = 0;j < m2.nc;j++)
+		{
+            for (int k = 0;k < m1.nc;k++)
+			{
+                mprod(i, j) += m1(i, k) * m2(k, j);
+			}
+		}
+	}
+#else
 	// Peize Lin accelerate 2017-10-27
 	LapackConnector::gemm(
 		'N', 'N', 
 		m1.nr, m2.nc, m1.nc,
 		1, m1.c, m1.nc, m2.c, m2.nc, 
 		0, mprod.c, mprod.nc);
+#endif
+
 	return mprod;
 }
 
@@ -377,7 +396,9 @@ double matrix::min() const
 	double value = std::numeric_limits<double>::max();
 	const int size = nr * nc;
 	for( int i=0; i<size; ++i )
+	{
 		value = std::min( value, c[i] );
+	}
 	return value;
 }
 
@@ -387,11 +408,23 @@ double matrix::absmax() const
 	double value = 0;
 	const int size = nr * nc;
 	for( int i=0; i<size; ++i )
+	{
 		value = std::max( value, std::abs(c[i]) );
+	}
 	return value;
 }
 
 double matrix::norm() const
 {
+// mohan add 2021-04-25, no tests.
+#ifdef  __NORMAL
+	double nn = 0.0;
+	for(int i=0; i<nr*nc; ++i)
+	{
+		nn += c[i]*c[i];
+	}	
+	return sqrt(nn);
+#else
 	return LapackConnector::nrm2(nr*nc,c,1);
-}
\ No newline at end of file
+#endif
+}
diff --git a/ABACUS.develop/source/src_global/matrix.h b/ABACUS.develop/source/src_global/matrix.h
index 2f4200ff74..e6609bc1c7 100644
--- a/ABACUS.develop/source/src_global/matrix.h
+++ b/ABACUS.develop/source/src_global/matrix.h
@@ -16,6 +16,7 @@ class matrix
 {
 	/* data */
 public:
+
 	int nr=0;
 	int nc=0;   /* Number of rows and columns */
 	double *c=nullptr;    /* Holds the data */
diff --git a/ABACUS.develop/source/src_global/matrix3.cpp b/ABACUS.develop/source/src_global/matrix3.cpp
index 5495eb1de9..29371c0a77 100644
--- a/ABACUS.develop/source/src_global/matrix3.cpp
+++ b/ABACUS.develop/source/src_global/matrix3.cpp
@@ -23,6 +23,13 @@ void Matrix3::Identity(void)
 	e31 = 0;e32 = 0;e33 = 1;
 }
 
+void Matrix3::Zero(void)
+{
+	e11 = e12 = e13 = 0;	// e1x entries
+	e21 = e22 = e23 = 0;	// e2x entries
+	e31 = e32 = e33 = 0;	// e3x entries
+}
+
 double Matrix3::Det(void) const 
 {
 	return	e11*e22*e33 -
diff --git a/ABACUS.develop/source/src_global/matrix3.h b/ABACUS.develop/source/src_global/matrix3.h
index a22afc9144..7729fe5cde 100644
--- a/ABACUS.develop/source/src_global/matrix3.h
+++ b/ABACUS.develop/source/src_global/matrix3.h
@@ -9,8 +9,8 @@
 #include "../src_parallel/mcd.h"
 #endif
 
-#include "src_global/vector3.h"
-#include "src_global/matrix.h"
+#include "vector3.h"
+#include "matrix.h"
 
 class Matrix3
 {
@@ -26,6 +26,7 @@ class Matrix3
 
 	void Reset(void);
 	void Identity(void);
+	void Zero(void);
 	double Det(void) const ;
 	Matrix3	Transpose(void) const ;
 	Matrix3	Inverse(void) const ;
diff --git a/ABACUS.develop/source/src_global/poission.cpp b/ABACUS.develop/source/src_global/poission.cpp
index 378877899e..ac78f9b579 100644
--- a/ABACUS.develop/source/src_global/poission.cpp
+++ b/ABACUS.develop/source/src_global/poission.cpp
@@ -32,11 +32,11 @@ void Poission::SolPoissonEq
     //value at the beginning
     a = r[0];
     b = r[mesh-1];
-    pot[0] = Integral::Gauss_Legendre(a, b, rad_f2, r, mesh) * 4.0 * PI * e2;
+    pot[0] = Integral_G::Gauss_Legendre(a, b, rad_f2, r, mesh) * 4.0 * PI * e2;
 
     //value at the end
     assert(r[mesh-1] > tiny);
-    pot[mesh-1] = Integral::Gauss_Legendre(a, b, rad_f1, r, mesh) * 4.0 * PI / r[mesh-1] * e2;
+    pot[mesh-1] = Integral_G::Gauss_Legendre(a, b, rad_f1, r, mesh) * 4.0 * PI / r[mesh-1] * e2;
 	
 	//points in the interval
     for(int ir = 1; ir < mesh-1; ir++)
@@ -46,10 +46,10 @@ void Poission::SolPoissonEq
         c = r[mesh-1];
 
         //integrate inside
-        const double inside = Integral::Gauss_Legendre(a, b, rad_f1, r, mesh) / r[ir];
+        const double inside = Integral_G::Gauss_Legendre(a, b, rad_f1, r, mesh) / r[ir];
 
         //integrate outside
-        const double outside = Integral::Gauss_Legendre(b, c, rad_f2, r, mesh);
+        const double outside = Integral_G::Gauss_Legendre(b, c, rad_f2, r, mesh);
 
         //inside + outside
         pot[ir] = (inside + outside) * 4.0 * PI * e2;
diff --git a/ABACUS.develop/source/src_global/sph_bessel.h b/ABACUS.develop/source/src_global/sph_bessel.h
index 4b5a7391e1..c83dc6e9c4 100644
--- a/ABACUS.develop/source/src_global/sph_bessel.h
+++ b/ABACUS.develop/source/src_global/sph_bessel.h
@@ -1,7 +1,6 @@
 #ifndef SPH_BESSEL_H
 #define SPH_BESSEL_H
 
-#include "realarray.h"
 using namespace std;
 
 class Sph_Bessel
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
index 231a18e4b8..e65ce87ccc 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
@@ -4,18 +4,13 @@
 //==========================================================
 
 #include "sph_bessel_recursive.h"
-
-#include "src_global/constants.h"
+#include "constants.h"
 
 #include<cmath>
 #include<stdexcept>
 
-// Peize Lin test
-#include<iostream>
-#include<sys/time.h>
-using namespace std;
 
-vector<Sph_Bessel_Recursive::D1> Sph_Bessel_Recursive_Pool::D1::sb_pool;
+std::vector<Sph_Bessel_Recursive::D1> Sph_Bessel_Recursive_Pool::D1::sb_pool;
 
 void Sph_Bessel_Recursive::D1::set_dx( const double dx_in )
 {
@@ -28,7 +23,7 @@ void Sph_Bessel_Recursive::D1::set_dx( const double dx_in )
 	}
 }
 
-const vector<vector<double>> & Sph_Bessel_Recursive::D1::cal_jlx( const int lmax, const size_t ix_size )
+const std::vector<std::vector<double>> & Sph_Bessel_Recursive::D1::cal_jlx( const int lmax, const size_t ix_size )
 {
 	if(lmax<0)
 		throw std::invalid_argument("Sph_Bessel_Recursive::jlx l<0");
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
index 945f691f69..f012cd25e4 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
@@ -4,20 +4,12 @@
 //==========================================================
 
 #include "sph_bessel_recursive.h"
-
-#include "src_global/constants.h"
+#include "constants.h"
 
 #include<cmath>
 #include<stdexcept>
 
-// Peize Lin test
-#include<iostream>
-#include<sys/time.h>
-#include"src_external/src_test/src_ri/exx_lcao-test.h"
-#include"src_lcao/global_fp.h"
-using namespace std;
-
-vector<Sph_Bessel_Recursive::D2> Sph_Bessel_Recursive_Pool::D2::sb_pool;
+std::vector<Sph_Bessel_Recursive::D2> Sph_Bessel_Recursive_Pool::D2::sb_pool;
 
 void Sph_Bessel_Recursive::D2::set_dx( const double dx_in )
 {
@@ -30,7 +22,7 @@ void Sph_Bessel_Recursive::D2::set_dx( const double dx_in )
 	}
 }
 
-const vector<vector<vector<double>>> & Sph_Bessel_Recursive::D2::cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size )
+const std::vector<std::vector<std::vector<double>>> & Sph_Bessel_Recursive::D2::cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size )
 {
 	if(lmax<0)
 		throw std::invalid_argument("Sph_Bessel_Recursive::jlx l<0");
@@ -50,7 +42,7 @@ void Sph_Bessel_Recursive::D2::cal_jlx_0( const int l_size, const size_t ix1_siz
 		const double jlx0 = (0==l) ? 1.0 : 0.0;
 
 		if( jlx[l].size()<ix1_size )
-			jlx[l].resize(ix1_size,vector<double>(1,jlx0));
+			jlx[l].resize(ix1_size,std::vector<double>(1,jlx0));
 		
 		if( jlx[l][0].size()<ix2_size )
 			jlx[l][0].resize(ix2_size,jlx0);
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive.h b/ABACUS.develop/source/src_global/sph_bessel_recursive.h
index d8ee591e6b..219a638fe0 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive.h
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive.h
@@ -7,7 +7,7 @@
 #define SPH_BESSEL_RECURSIVE_H
 
 #include<vector>
-using namespace std;
+#include"stddef.h"
 
 class Sph_Bessel_Recursive
 {
@@ -21,14 +21,14 @@ class Sph_Bessel_Recursive
 class Sph_Bessel_Recursive::D1
 {
 public:	
-	const vector<vector<double>> & cal_jlx( const int lmax, const size_t ix_size );
-	const vector<vector<double>> & get_jlx()const{ return jlx; }
+	const std::vector<std::vector<double>> & cal_jlx( const int lmax, const size_t ix_size );
+	const std::vector<std::vector<double>> & get_jlx()const{ return jlx; }
 	
 	void set_dx(const double dx_in);
 	double get_dx()const{ return dx; }
 
 private:
-	vector<vector<double>> jlx;		// jlx[l][x]
+	std::vector<std::vector<double>> jlx;		// jlx[l][x]
 	double dx;
 	bool finish_set_dx = false;
 	
@@ -45,14 +45,14 @@ class Sph_Bessel_Recursive::D1
 class Sph_Bessel_Recursive::D2
 {
 public:	
-	const vector<vector<vector<double>>> & cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size );
-	const vector<vector<vector<double>>> & get_jlx()const{ return jlx; }
+	const std::vector<std::vector<std::vector<double>>> & cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size );
+	const std::vector<std::vector<std::vector<double>>> & get_jlx()const{ return jlx; }
 	
 	void set_dx(const double dx_in);
 	double get_dx()const{ return dx; }
 
 private:
-	vector<vector<vector<double>>> jlx;		// jlx[l][x1][x2]
+	std::vector<std::vector<std::vector<double>>> jlx;		// jlx[l][x1][x2]
 	double dx;
 	bool finish_set_dx = false;
 	
@@ -72,12 +72,12 @@ class Sph_Bessel_Recursive_Pool
 	class D1
 	{
 		public:
-		static vector<Sph_Bessel_Recursive::D1> sb_pool;
+		static std::vector<Sph_Bessel_Recursive::D1> sb_pool;
 	};
 	class D2
 	{
 		public:
-		static vector<Sph_Bessel_Recursive::D2> sb_pool;
+		static std::vector<Sph_Bessel_Recursive::D2> sb_pool;
 	};
 };
 
diff --git a/ABACUS.develop/source/src_global/vector3.h b/ABACUS.develop/source/src_global/vector3.h
index a80c3d03b7..d321ad53e4 100644
--- a/ABACUS.develop/source/src_global/vector3.h
+++ b/ABACUS.develop/source/src_global/vector3.h
@@ -1,6 +1,3 @@
-//==========================================================
-//
-//==========================================================
 #ifndef VECTOR3_H
 #define VECTOR3_H
 
@@ -42,20 +39,20 @@ class Vector3
 	void print(void)const ;		// mohan add 2009-11-29
 };
 
-template <class T> Vector3<T> operator+( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x+v.x, u.y+v.y, u.z+v.z ); }
-template <class T> Vector3<T> operator-( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x-v.x, u.y-v.y, u.z-v.z ); }
+template <class T> inline Vector3<T> operator+( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x+v.x, u.y+v.y, u.z+v.z ); }
+template <class T> inline Vector3<T> operator-( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x-v.x, u.y-v.y, u.z-v.z ); }
 //u.v=(ux*vx)+(uy*vy)+(uz*vz)                                                     
-template <class T> T          operator*( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
-template <class T> Vector3<T> operator*( const T &s,          const Vector3<T> &u ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); }
-template <class T> Vector3<T> operator*( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); } // mohan add 2009-5-10
-template <class T> Vector3<T> operator/( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x/s, u.y/s, u.z/s ); }
+template <class T> inline T          operator*( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
+template <class T> inline Vector3<T> operator*( const T &s,          const Vector3<T> &u ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); }
+template <class T> inline Vector3<T> operator*( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); } // mohan add 2009-5-10
+template <class T> inline Vector3<T> operator/( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x/s, u.y/s, u.z/s ); }
 //u.v=(ux*vx)+(uy*vy)+(uz*vz)
-template <class T> T          dot      ( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
+template <class T> inline T          dot      ( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
 // | i  j  k  |
 // | ux uy uz |
 // | vx vy vz |
 // u.v=(uy*vz-uz*vy)i+(-ux*vz+uz*vx)j+(ux*vy-uy*vx)k
-template <class T> Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
+template <class T> inline Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
 {	
 	return Vector3<T> ( u.y * v.z - u.z * v.y,
 	                   -u.x * v.z + u.z * v.x,
@@ -65,7 +62,7 @@ template <class T> Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
 // | ux uy uz |
 // | vx vy vz |
 // u.v=(uy*vz-uz*vy)i+(-ux*vz+uz*vx)j+(ux*vy-uy*vzx)k
-template <class T> Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
+template <class T> inline Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
 {
 	return Vector3<T> ( u.y * v.z - u.z * v.y,
 	                   -u.x * v.z + u.z * v.x,
@@ -80,9 +77,9 @@ template <class T> Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
 //}
 
 //whether m1 != m2
-template <class T> bool operator !=(const Vector3<T> &u, const Vector3<T> &v){ return !(u == v); }
+template <class T> inline bool operator !=(const Vector3<T> &u, const Vector3<T> &v){ return !(u == v); }
 //whether u == v
-template <class T> bool operator ==(const Vector3<T> &u, const Vector3<T> &v)
+template <class T> inline bool operator ==(const Vector3<T> &u, const Vector3<T> &v)
 {
 	if(u.x == v.x && u.y == v.y && u.z == v.z)
 		return true;
diff --git a/ABACUS.develop/source/src_global/ylm.cpp b/ABACUS.develop/source/src_global/ylm.cpp
index 1b0c2b7088..e1284e9c4b 100644
--- a/ABACUS.develop/source/src_global/ylm.cpp
+++ b/ABACUS.develop/source/src_global/ylm.cpp
@@ -1045,7 +1045,7 @@ void Ylm::grad_rl_sph_harm
 	return;
 }
 	
-void Ylm::set_coefficients ()
+void Ylm::set_coefficients(void)
 {
 	Ylm::ylmcoef[0] = 1.0 / sqrt(FOUR_PI);
 	Ylm::ylmcoef[1] = sqrt (3.0 / FOUR_PI);
@@ -1086,6 +1086,7 @@ void Ylm::set_coefficients ()
 	return;
 }
 
+
 void Ylm::test1 (void)
 {
 	Vector3<double> R (20.0, 0.0, 0.0);
@@ -1121,6 +1122,7 @@ void Ylm::test1 (void)
 	return;
 }
 
+
 void Ylm::test2 (void)
 {
 	Vector3<double> R (0.1,-0.2,0.5);
@@ -1515,7 +1517,8 @@ void Ylm::rlylm
 	return;
 }
 
-void Ylm::test()
+
+void Ylm::test(void)
 {
 	Vector3<double> R(0.0, 0.0, 1.0);
 	
@@ -1609,6 +1612,7 @@ void Ylm::test()
 	return;
 }
 
+
 void Ylm::ZEROS(double u[], const int& n)
 {
 	for(int i = 0; i < n; i++)
@@ -1618,6 +1622,7 @@ void Ylm::ZEROS(double u[], const int& n)
 	return;
 }
 
+
 //==========================================================
 // MEMBER FUNCTION : 
 // NAME : Fact ( n! )
@@ -1637,6 +1642,7 @@ long double Ylm::Fact(const int n)
 	return f;
 }
 
+
 int Ylm::Semi_Fact(const int n)
 {
 	int semif = 1;
@@ -1647,10 +1653,10 @@ int Ylm::Semi_Fact(const int n)
 	return semif;
 }
 
+
 double Ylm::sgn(const double x)
 {
 	if(x < 0.0) return -1.0;
 	if(x > 0.0) return 1.0;
 	return 0.0;
 }
-
diff --git a/ABACUS.develop/source/src_io/bessel_basis.cpp b/ABACUS.develop/source/src_io/bessel_basis.cpp
index 47f7a40d8e..5c0c44df3b 100644
--- a/ABACUS.develop/source/src_io/bessel_basis.cpp
+++ b/ABACUS.develop/source/src_io/bessel_basis.cpp
@@ -1,6 +1,7 @@
 #include "bessel_basis.h"
 #include "../src_pw/global.h"
 #include "../src_parallel/parallel_common.h"
+#include "../src_global/math_integral.h"
 
 Bessel_Basis::Bessel_Basis()
 {
@@ -306,7 +307,7 @@ void Bessel_Basis::init_TableOne(
 				}
 				
 				// make table value
-				Mathzone::Simpson_Integral(rmesh, function, rab, this->TableOne(l, ie, ik) );
+				Integral::Simpson_Integral(rmesh, function, rab, this->TableOne(l, ie, ik) );
 			}
 			
 		}// end ie
diff --git a/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp b/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
index ff803ecd56..31d94aa6d1 100644
--- a/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
+++ b/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
@@ -61,7 +61,7 @@ void cal_r_overlap_R::init()
 		ORB.get_dR(),// delta R, for making radial table
 		ORB.get_dk()); // delta k, for integration in k space
 		
-	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax, Exx_Abfs::Lmax);
 
 	Ylm::set_coefficients();
 
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 4f54e904cf..47f91e2950 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -326,8 +326,17 @@ void energy::perform_dos(void)
 				atom_arrange::set_sr_NL();
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
-				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables();
+				// mohan update 2021-04-16
+				hm.orb_con.set_orb_tables(
+						UOT, 
+						ORB,
+						INPUT.lcao_ecut,
+						INPUT.lcao_dk,
+						INPUT.lcao_dr,
+						INPUT.lcao_rmax, 
+						ucell.lat0, 
+						Exx_Abfs::Lmax);
+
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
@@ -409,7 +418,7 @@ void energy::perform_dos(void)
 				atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
 				// mohan update 2021-02-10
-				hm.orb_con.clear_after_ions();
+				hm.orb_con.clear_after_ions(UOT, ORB);
 			}//else
 
 		 MPI_Reduce(pdosk[is].c, pdos[is].c , NUM , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index a2b3fd15bf..1df4861cfc 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,20 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables();
+
+			// 2021-04-16
+			hm.orb_con.set_orb_tables(
+					UOT, 
+					ORB,
+					INPUT.lcao_ecut,
+					INPUT.lcao_dk,
+					INPUT.lcao_dr,
+					INPUT.lcao_rmax, 
+					ucell.lat0, 
+					Exx_Abfs::Lmax);
+
+
+
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
@@ -224,7 +237,7 @@ void Mulliken_Charge::cal_mulliken(void)
 #ifdef __MPI
 			atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
-			hm.orb_con.clear_after_ions();
+			hm.orb_con.clear_after_ions(UOT, ORB);
 
 		}//else                     
 		MPI_Reduce(MecMulP[is], DecMulP[is] , NLOCAL , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_io/read_atoms.cpp b/ABACUS.develop/source/src_io/read_atoms.cpp
index 3e8f51bf46..e568172bb7 100644
--- a/ABACUS.develop/source/src_io/read_atoms.cpp
+++ b/ABACUS.develop/source/src_io/read_atoms.cpp
@@ -32,6 +32,12 @@ void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 						<< setw(12) << atom_mass[i] 
 						<< setw(18) << pseudo_fn[i];
 			}
+
+			// Peize Lin test for bsse 2021.04.07
+			const string bsse_label = "empty";
+			this->atoms[i].flag_empty_element = 
+				(search( atom_label[i].begin(), atom_label[i].end(), bsse_label.begin(), bsse_label.end() ) != atom_label[i].end())
+				? true : false;
 		}
 	}
 
@@ -71,6 +77,10 @@ void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 				
 			}
 		}	
+		// caoyu add 2021-03-16
+		if (SCAN_BEGIN(ifa, "NUMERICAL_DESCRIPTOR")) {
+			ifa >> ORB.descriptor_file;
+		}
 	}
 
 	// Peize Lin add 2016-09-23
diff --git a/ABACUS.develop/source/src_io/read_pseudopot.cpp b/ABACUS.develop/source/src_io/read_pseudopot.cpp
index 99d3e4cbd1..645fdc3781 100644
--- a/ABACUS.develop/source/src_io/read_pseudopot.cpp
+++ b/ABACUS.develop/source/src_io/read_pseudopot.cpp
@@ -9,20 +9,13 @@
 //==========================================================
 void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 {
-	if(test_pseudo_cell) TITLE("UnitCell_pseudo","read_pseudopot");
+	TITLE("UnitCell_pseudo","read_pseudopot");
 //----------------------------------------------------------
 // EXPLAIN : setup reading log for pseudopot_upf
 //----------------------------------------------------------
 	stringstream ss;
 	ss << global_out_dir << "atom_pseudo.log";
 	
-//	ofstream ofs;
-	
-//	if(MY_RANK==0)
-//	{
-//		ofs.open( ss.str().c_str(), ios::out);
-//	}
-
 //----------------------------------------------------------
 // EXPLAIN : Read in the atomic pseudo potential
 //----------------------------------------------------------
@@ -37,10 +30,17 @@ void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 		if(MY_RANK==0)
 		{
 			pp_address = pp_dir + this->pseudo_fn[i];
-			//error = upf.read_pseudo_upf( pp_address ); xiaohui modify 2013-06-23
 			error = upf.init_pseudo_reader( pp_address ); //xiaohui add 2013-06-23
-			//average pseudopotential if needed
-			error_ap = upf.average_p(); //added by zhengdy 2020-10-20
+
+			if(error==0) // mohan add 2021-04-16
+			{
+				if(this->atoms[i].flag_empty_element)	// Peize Lin add for bsse 2021.04.07
+				{
+					upf.set_empty_element();			
+				}
+				//average pseudopotential if needed
+				error_ap = upf.average_p(); //added by zhengdy 2020-10-20
+			}
 		}
 
 #ifdef __MPI
@@ -54,15 +54,15 @@ void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 		{
 			cout << " Pseudopotential directory now is : " << pp_address << endl;
 			ofs_warning << " Pseudopotential directory now is : " << pp_address << endl;
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Couldn't find pseudopotential file.");
+			WARNING_QUIT("read_pseudopot","Couldn't find pseudopotential file.");
 		}
 		else if(error==2)
 		{
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Something in pseudopotential not match.");
+			WARNING_QUIT("read_pseudopot","Pseudopotential data do not match.");
 		}
 		else if(error==3)
 		{
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Please check the reference states in pseudopotential .vwr file.\n Also the norm of the read in pseudo wave functions\n explicitly please check S, P and D channels.\n If the norm of the wave function is \n unreasonable large (should be near 1.0), ABACUS would quit. \n The solution is to turn off the wave functions  \n and the corresponding non-local projectors together\n in .vwr pseudopotential file.");
+			WARNING_QUIT("read_pseudopot","Check the reference states in pseudopotential .vwr file.\n Also the norm of the read in pseudo wave functions\n explicitly please check S, P and D channels.\n If the norm of the wave function is \n unreasonable large (should be near 1.0), ABACUS would quit. \n The solution is to turn off the wave functions  \n and the corresponding non-local projectors together\n in .vwr pseudopotential file.");
 		}
 //		OUT(ofs_running,"PP_ERRROR",error);
 
diff --git a/ABACUS.develop/source/src_io/to_wannier90.cpp b/ABACUS.develop/source/src_io/to_wannier90.cpp
index 6601106b13..9d9685cfdf 100644
--- a/ABACUS.develop/source/src_io/to_wannier90.cpp
+++ b/ABACUS.develop/source/src_io/to_wannier90.cpp
@@ -1,6 +1,6 @@
 #include "to_wannier90.h"
 #include "../src_lcao/global_fp.h" // mohan add 2021-01-30, this module should be modified
- 
+#include "../src_global/math_integral.h" 
 
 
 toWannier90::toWannier90(int num_kpts, Matrix3 recip_lattice)
@@ -1430,7 +1430,7 @@ void toWannier90::integral(const int meshr, const double *psir, const double *r,
 	}
 	
 	double unit = 0.0;
-	Mathzone::Simpson_Integral(meshr, inner_part, rab, unit);
+	Integral::Simpson_Integral(meshr, inner_part, rab, unit);
 	delete[] inner_part;
 
 	double *aux = new double[meshr];
@@ -1445,7 +1445,7 @@ void toWannier90::integral(const int meshr, const double *psir, const double *r,
 		}
 		
 		double vqint = 0.0;
-		Mathzone::Simpson_Integral(meshr, vchi, rab, vqint);
+		Integral::Simpson_Integral(meshr, vchi, rab, vqint);
 
 		table[iq] =  vqint * pref;
 	}
diff --git a/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp b/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
index 82e6c6b57e..f9ccb7432d 100644
--- a/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
+++ b/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
@@ -69,7 +69,7 @@ void unkOverlap_lcao::init()
 		ORB.get_dR(),// delta R, for making radial table
 		ORB.get_dk()); // delta k, for integration in k space
 		
-	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax, Exx_Abfs::Lmax);
 
 	Ylm::set_coefficients ();
 
diff --git a/ABACUS.develop/source/src_io/write_input.cpp b/ABACUS.develop/source/src_io/write_input.cpp
index c715edc478..5db6d49b03 100644
--- a/ABACUS.develop/source/src_io/write_input.cpp
+++ b/ABACUS.develop/source/src_io/write_input.cpp
@@ -32,7 +32,6 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"nche_sto",nche_sto,"number of orders for Chebyshev expansion in stochastic DFT");
 	OUTP(ofs,"symmetry",symmetry,"turn symmetry on or off");	
 	OUTP(ofs,"nelec",nelec,"input number of electrons");
-	OUTP(ofs,"newdm",newDM,"");
 
 	ofs << "\n#Parameters (2.PW)" << endl;
 	OUTP(ofs,"ecutwfc",ecutwfc,"#energy cutoff for wave functions");
@@ -57,6 +56,7 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"out_band",out_band,"output energy and band structure");
 	OUTP(ofs,"restart_save",restart_save,"print to disk every step for restart");
 	OUTP(ofs,"restart_load",restart_load,"restart from disk");
+	OUTP(ofs,"read_file_dir",read_file_dir,"directory of files for reading");
 	OUTP(ofs,"nx",nx,"number of points along x axis for FFT grid");
 	OUTP(ofs,"ny",ny,"number of points along y axis for FFT grid");
 	OUTP(ofs,"nz",nz,"number of points along z axis for FFT grid");	
@@ -91,6 +91,7 @@ void Input::Print(const string &fn)const
 
 	ofs << "\n#Parameters (4.LCAO)" << endl;
 	OUTP(ofs,"basis_type",basis_type,"PW; LCAO in pw; LCAO");
+	OUTP(ofs,"new_dm",new_dm,"Type of density matrix; 0: old 1: new");
 	if(ks_solver=="HPSEPS" || ks_solver=="genelpa" || ks_solver=="scalapack_gvx")
 	{
 		OUTP(ofs,"nb2d",nb2d,"2d distribution of atoms");
@@ -208,7 +209,7 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"kernel_type",kernel_type,"the kernel type: rpa, tdlda ...");
 	OUTP(ofs,"eels_method",eels_method,"0: hilbert_transform method; 1: standard method");
 	OUTP(ofs,"absorption_method",absorption_method,"0: vasp's method  1: pwscf's method");
-	OUTP(ofs,"system",system,"the calculate system");
+	OUTP(ofs,"system",system_type,"the calculate system");
 	OUTP(ofs,"eta",eta,"eta(Ry)");
 	OUTP(ofs,"domega",domega,"domega(Ry)");
 	OUTP(ofs,"nomega",nomega,"nomega");
diff --git a/ABACUS.develop/source/src_ions/ions_move_basic.cpp b/ABACUS.develop/source/src_ions/ions_move_basic.cpp
index dbac3e1158..723b2e854a 100644
--- a/ABACUS.develop/source/src_ions/ions_move_basic.cpp
+++ b/ABACUS.develop/source/src_ions/ions_move_basic.cpp
@@ -36,19 +36,13 @@ void Ions_Move_Basic::setup_gradient(double* pos, double *grad, const matrix &fo
 	// the unit of pos: Bohr.
 	// the unit of force: Ry/Bohr.
 	// the unit of gradient: 
+	ucell.save_cartesian_position(pos);
 	int iat=0;
 	for(int it = 0;it < ucell.ntype;it++)
 	{
 		Atom* atom = &ucell.atoms[it];
 		for(int ia =0;ia< ucell.atoms[it].na;ia++)
 		{	
-			pos[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-			pos[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-			pos[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-			// mohan remove mbl constrain 2010-04-26
-			// mohan add mbl constrain 2010-07-11
-			// mohan add ucell.lat0 2010-07-27
 			if(atom->mbl[ia].x == 1)
 			{
 				grad[3*iat  ] = -force(iat, 0)*ucell.lat0;
@@ -77,15 +71,12 @@ void Ions_Move_Basic::move_atoms(double *move, double *pos)
 	assert(move!=NULL);
 	assert(pos!=NULL);
 
-	// unit: Bohr
-	int iat=0;
-
 	//------------------------
 	// for test only
 	//------------------------
 	if(test_ion_dynamics)
 	{
-		iat=0;
+		int iat=0;
 		ofs_running << "\n movement of ions (unit is Bohr) : " << endl;
 		ofs_running << " " << setw(12) << "Atom" << setw(15) << "x" << setw(15) << "y" << setw(15) << "z" << endl;
 		for(int it = 0;it < ucell.ntype;it++)
@@ -105,89 +96,20 @@ void Ions_Move_Basic::move_atoms(double *move, double *pos)
 		assert( iat == ucell.nat );
 	}
 
-	iat = 0;
-	double move_threshold = 1.0e-10;
-	for(int it = 0;it < ucell.ntype;it++)
+	const double move_threshold = 1.0e-10;
+	const int total_freedom = ucell.nat * 3;
+	for(int i =0;i<total_freedom;i++)
 	{
-		Atom* atom = &ucell.atoms[it];
-		for(int ia =0;ia< atom->na;ia++)
+		if( abs(move[i]) > move_threshold )
 		{
-			// mohan add 2010-08-06
-			// otherwise, there might be bug for
-			// sltk_grid, on system CO when C
-			// atom is put on (0,0,0)
-			for(int i=0; i<3; i++)
-			{
-				if( abs(move[3*iat+i]) < move_threshold )
-				{
-					move[3*iat+i] = 0.0;
-				}
-			}
-		
-			// mohan modify 2010-04-26
-			if(atom->mbl[ia].x!=0)
-			{
-				atom->tau[ia].x = (move[3*iat]+pos[3*iat])/ucell.lat0;
-			}
-			if(atom->mbl[ia].y!=0)
-			{
-				atom->tau[ia].y = (move[3*iat+1]+pos[3*iat+1])/ucell.lat0;
-			}
-			if(atom->mbl[ia].z!=0)
-			{
-				atom->tau[ia].z = (move[3*iat+2]+pos[3*iat+2])/ucell.lat0;
-			}
-
-			// the direct coordinates also need to be updated.
-			atom->taud[ia] = atom->tau[ia] * ucell.GT;
-//			cout << " tau=" << atom->tau[ia].x << " " << atom->tau[ia].y << " " << atom->tau[ia].z << endl;
-			iat++;
+			pos[i] += move[i];
 		}
 	}
-	assert(iat == ucell.nat);
-
-	//----------------------------------------------
-	// because of the periodic boundary condition
-	// we need to adjust the atom positions,
-	// first adjust direct coordinates,
-	// then update them into cartesian coordinates,
-	//----------------------------------------------
-	for(int it=0; it<ucell.ntype; it++)
-	{
-		Atom* atom = &ucell.atoms[it];
-		for(int ia=0; ia<atom->na; ia++)
-		{
-			// mohan update 2011-03-21
-			if(atom->taud[ia].x<0) atom->taud[ia].x += 1.0;
-			if(atom->taud[ia].y<0) atom->taud[ia].y += 1.0;
-			if(atom->taud[ia].z<0) atom->taud[ia].z += 1.0;
-			if(atom->taud[ia].x>=1.0) atom->taud[ia].x -= 1.0;
-			if(atom->taud[ia].y>=1.0) atom->taud[ia].y -= 1.0;
-			if(atom->taud[ia].z>=1.0) atom->taud[ia].z -= 1.0;
-
-			if(atom->taud[ia].x<0 || atom->taud[ia].y<0
-				|| atom->taud[ia].z<0 ||
-				atom->taud[ia].x>=1.0 ||
-				atom->taud[ia].y>=1.0 ||
-				atom->taud[ia].z>=1.0)
-			{
-				ofs_warning << " it=" << it+1 << " ia=" << ia+1 << endl;
-				ofs_warning << "d=" << atom->taud[ia].x << " " << 
-				atom->taud[ia].y << " " << atom->taud[ia].z << endl;
-				WARNING_QUIT("Ions_Move_Basic::move_ions","the movement of atom is larger than the length of cell.");
-			}
+	ucell.update_pos_tau(pos);
 
-			atom->tau[ia] = atom->taud[ia] * ucell.latvec;
-		}
-	}
-//2015-09-16
-#ifdef __MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-    for (int i=0;i<ucell.ntype;i++)
-    {
-        ucell.atoms[i].bcast_atom(); // bcast tau array
-    }
-#endif
+	ucell.periodic_boundary_adjustment();
+	
+	ucell.bcast_atoms_tau();
 
 	//--------------------------------------------
 	// Print out the structure file.
diff --git a/ABACUS.develop/source/src_lcao/DM_gamma.cpp b/ABACUS.develop/source/src_lcao/DM_gamma.cpp
index f98f018642..81a89fe62d 100644
--- a/ABACUS.develop/source/src_lcao/DM_gamma.cpp
+++ b/ABACUS.develop/source/src_lcao/DM_gamma.cpp
@@ -108,7 +108,7 @@ int Local_Orbital_Charge::setAlltoallvParameter(MPI_Comm comm_2D, int blacs_ctxt
         int prow, pcol;
         Cblacs_pcoord(blacs_ctxt, pnum, &prow, &pcol);
         receiver_size_process[pnum]=nRow_in_proc[prow]*nCol_in_proc[pcol];
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         {
             OUT(ofs_running,"pnum",pnum);
             OUT(ofs_running,"prow",prow);
@@ -182,7 +182,7 @@ int Local_Orbital_Charge::setAlltoallvParameter(MPI_Comm comm_2D, int blacs_ctxt
                   sender_2D_index, sender_size_process, sender_displacement_process, MPI_INT, comm_2D);
 
 
-    if(NEW_DM>1)
+    if(INPUT.new_dm>1)
     {
         ofs_running<<"receiver_size is "<<receiver_size<<" ; receiver_size of each process is:\n";
         for(int i=0; i<nprocs; ++i)
@@ -281,7 +281,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
 
     for(int is=0; is<NSPIN; ++is)
     {
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         // outputDM( ParaO.blacs_ctxt, ParaO.nb);
         {
             // int myid;
@@ -319,7 +319,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
                                                                 // so the row and column index should be switched
             if(sender_buffer[i]!=0) ++nNONZERO;
         }
-        if(NEW_DM>1) 
+        if(INPUT.new_dm>1) 
         {
             OUT(ofs_running,"number of non-zero elements in sender_buffer",nNONZERO);
             OUT(ofs_running,"sender_size",sender_size);
@@ -348,7 +348,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
             if(receiver_buffer[i]!=0) ++nNONZERO;
         }
 
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         {
             OUT(ofs_running,"number of non-zero elements in receiver_buffer",nNONZERO);
             OUT(ofs_running,"receiver_size",receiver_size);
diff --git a/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp b/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
index 9a06374dbe..f21b2e23e1 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
@@ -33,7 +33,7 @@ void Force_LCAO_gamma::ftable_gamma (
     // calculate the 'energy density matrix' here.
     this->cal_foverlap(isforce, isstress, foverlap, soverlap);
 
-    if(NEW_DM>0)
+    if(INPUT.new_dm>0)
     {
         this->cal_ftvnl_dphi(LOC.wfc_dm_2d.dm_gamma, isforce, isstress, ftvnl_dphi, stvnl_dphi);
         this->cal_fvnl_dbeta(LOC.wfc_dm_2d.dm_gamma, isforce, isstress, fvnl_dbeta, svnl_dbeta);
diff --git a/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp b/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
index e82c9e9e1c..6b725fe446 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
@@ -655,7 +655,7 @@ void Force_LCAO_gamma::cal_foverlap(
     timer::tick("Force_LCAO_gamma","cal_foverlap",'G');
 
     // set energy density matrix.
-    if(NEW_DM>0)
+    if(INPUT.new_dm>0)
     {
         timer::tick("Force_LCAO_gamma","cal_edm_2d",'H');
 
diff --git a/ABACUS.develop/source/src_lcao/FORCE_k.cpp b/ABACUS.develop/source/src_lcao/FORCE_k.cpp
index 7ba5aa0cc8..3289020223 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_k.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_k.cpp
@@ -864,7 +864,8 @@ void Force_LCAO_k::cal_fvnl_dbeta_k(
 											{
 												for(int ipol=0;ipol<3;ipol++)
 												{
-													svnl_dbeta(jpol, ipol) += dm2d[is][iir] * (nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
+													svnl_dbeta(jpol, ipol) += dm2d[is][iir] * 
+													(nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
 												}
 											}
 										}
@@ -883,7 +884,8 @@ void Force_LCAO_k::cal_fvnl_dbeta_k(
 
 	assert( iir == LNNR.nnr );
 
-	if(isstress){
+	if(isstress)
+	{
 		for(int i=0;i<3;i++)
 		{
 			for(int j=0;j<3;j++)
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
new file mode 100644
index 0000000000..c2ced8db93
--- /dev/null
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
@@ -0,0 +1,412 @@
+//caoyu add 2021-03-29
+#include "LCAO_descriptor.h"
+#include "LCAO_matrix.h"
+#include "../src_global/lapack_connector.h"
+#include "../src_global/intarray.h"
+#include "../src_global/complexmatrix.h"
+#include "global_fp.h"
+#include "../src_pw/global.h"
+#include "../src_io/winput.h"
+
+LCAO_Descriptor::LCAO_Descriptor()
+{
+    S_mu_alpha = new double[1];
+    PDM = new double[1];
+    mu_index = new IntArray[1];
+    d = new double[1];
+}
+LCAO_Descriptor::~LCAO_Descriptor()
+{
+    delete[] S_mu_alpha;
+    delete[] PDM;
+    delete[] mu_index;
+    delete[] d;
+}
+
+void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
+{
+    TITLE("LCAO_Descriptor", "build_S_descriptor");
+
+    // =======init==============
+    // number of descriptors per atom: sum over l <= lmax_d of nchi(l) * (2l + 1) (not total_nchi!)
+	this->des_per_atom=0; // mohan add 2021-04-21
+    for (int l = 0; l <= ORB.get_lmax_d(); l++)
+    {
+        this->des_per_atom += ORB.Alpha[0].getNchi(l) * (2 * l + 1);
+    }
+    this->n_descriptor = ucell.nat * this->des_per_atom;
+    const long Ssize = this->n_descriptor * NLOCAL;
+    delete[] S_mu_alpha;
+    S_mu_alpha = new double[Ssize];
+    ZEROS(S_mu_alpha, Ssize);
+
+    this->init_mu_index();
+    // =======init==============
+
+    //array to store data
+    double olm[3] = {0.0, 0.0, 0.0};
+
+    //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)>	//???
+    Vector3<double> tau1, tau2, dtau;
+    Vector3<double> dtau1, dtau2, tau0;
+    for (int T1 = 0; T1 < ucell.ntype; ++T1)
+    {
+        Atom *atom1 = &ucell.atoms[T1];
+        for (int I1 = 0; I1 < atom1->na; ++I1)
+        {
+            tau1 = atom1->tau[I1];
+            //GridD.Find_atom(tau1);
+            GridD.Find_atom(tau1, T1, I1);
+            int *T2arr = new int[GridD.getAdjacentNum() + 1];
+            int *I2arr = new int[GridD.getAdjacentNum() + 1];
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                T2arr[ad] = GridD.getType(ad);
+                I2arr[ad] = GridD.getNatom(ad);
+            }
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                //const int T2 = GridD.getType(ad);
+                //const int I2 = GridD.getNatom(ad);
+                Atom *atom2 = &ucell.atoms[T2arr[ad]];
+                tau2 = GridD.getAdjacentTau(ad);
+                dtau = tau2 - tau1;
+                double distance = dtau.norm() * ucell.lat0;
+                double rcut = ORB.Phi[T1].getRcut() + ORB.Alpha[0].getRcut(); //Rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
+                if (distance < rcut)
+                {
+                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0); //iw1_all = combined index (it, ia, iw)
+
+                    for (int jj = 0; jj < atom1->nw * NPOL; ++jj)
+                    {
+                        const int jj0 = jj / NPOL;
+                        const int L1 = atom1->iw2l[jj0];
+                        const int N1 = atom1->iw2n[jj0];
+                        const int m1 = atom1->iw2m[jj0];
+
+                        //init iw2_all
+                        int iw2_all = 0;
+                        int iatom = 0;
+                        for (int it = 0; it < T2arr[ad]; it++)
+                        {
+                            for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+                            {
+                                iatom++; // cal how many atoms before ad in ucell
+                            }
+                        }
+                        iatom += I2arr[ad];
+                        iw2_all = iatom * this->des_per_atom;
+
+                        for (int L2 = 0; L2 <= ORB.Alpha[0].getLmax(); ++L2)
+                        {
+                            for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); ++N2)
+                            {
+                                for (int m2 = 0; m2 < 2 * L2 + 1; ++m2)
+                                {
+                                    olm[0] = olm[1] = olm[2] = 0.0;
+                                    if (!calc_deri)
+                                    {
+                                        UOT.snap_psialpha(olm, 0, tau1,
+                                                          T1, L1, m1, N1, GridD.getAdjacentTau(ad),
+                                                          T2arr[ad], L2, m2, N2);
+                                        if (GAMMA_ONLY_LOCAL)
+                                        {
+                                            this->set_S_mu_alpha(iw1_all, iw2_all, olm[0]);
+                                        }
+                                    }
+                                    ++iw2_all;
+                                } //m2
+                            }     //N2
+                        }         //nw2(L2)
+                        ++iw1_all;
+                    } // nw1
+                }     // distance
+            }         // ad
+            delete[] T2arr;
+            delete[] I2arr;
+        } // I1
+    }     // T1
+    if (!GAMMA_ONLY_LOCAL)
+    {
+        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! ");
+    }
+    return;
+}
+
+void LCAO_Descriptor::set_S_mu_alpha(const int &iw1_all, const int &iw2_all, const double &v)
+{
+    //const int ir = ParaO.trace_loc_row[iw1_all];
+    //const int ic = ParaO.trace_loc_col[iw2_all];
+    //not parallelized yet
+    const int ir = iw1_all;
+    const int ic = iw2_all;
+    //const int index = ir * ParaO.ncol + ic;
+    long index;
+    if (KS_SOLVER == "genelpa" || KS_SOLVER == "scalapack_gvx") // save the matrix as column major format
+    {
+        index = ic * NLOCAL + ir;
+    }
+    else
+    {
+        index = ir * this->n_descriptor + ic; //row: lcao orbitals; col: descriptor basis
+    }
+    this->S_mu_alpha[index] += v;
+    return;
+}
+
+void LCAO_Descriptor::cal_projected_DM()
+{
+    //step 1: get dm: the coefficient of wfc, not charge density
+    double *dm = new double[NLOCAL * NLOCAL];
+    ZEROS(dm, NLOCAL * NLOCAL);
+    for (int i = 0; i < LOC.wfc_dm_2d.dm_gamma[0].nr; i++)
+    {
+        for (int j = 0; j < LOC.wfc_dm_2d.dm_gamma[0].nc; j++)
+        {
+            dm[i * NLOCAL + j] = LOC.wfc_dm_2d.dm_gamma[0](i, j); //only consider default NSPIN = 1
+        }
+    }
+/*
+    //===============test==============
+    cout << "test: out wfc_dm_2d.dm_gamma[0](i, j)" << endl;
+    for (int ir = 0; ir < NLOCAL; ir++)
+    {
+        for (int ic = 0; ic < NLOCAL; ic++)
+        {
+            cout << dm[ir * NLOCAL + ic] << " ";
+        }
+        cout << endl;
+    }
+    //===============\test==============
+*/
+    //step 2: get SS_alpha_mu and SS_nu_beta
+    double *ss = this->S_mu_alpha; //SS_nu_beta
+/*
+    //===============test==============
+    cout << "test: out S_nu_beta" << endl;
+    for (int ir = 0; ir < NLOCAL; ir++)
+    {
+        for (int ic = 0; ic < this->n_descriptor; ic++)
+        {
+            cout << ss[ir * this->n_descriptor + ic] << " ";
+        }
+        cout << endl;
+    }
+    //===============\test==============
+*/
+    //step 3 : multiply
+    //cal ssT*DM*ss
+
+    const long tmp_PDM_size = NLOCAL * this->n_descriptor;
+    double *tmp_PDM = new double[tmp_PDM_size];
+    ZEROS(tmp_PDM, tmp_PDM_size);
+    const long PDM_size = this->n_descriptor * this->n_descriptor;
+    delete[] this->PDM;
+    this->PDM = new double[PDM_size];
+    ZEROS(this->PDM, PDM_size);
+
+    const char t = 'T';  //transpose
+    const char nt = 'N'; //non transpose
+    const double alpha = 1;
+    const double beta = 0;
+    double *a = dm;
+    double *b = ss;
+    double *c = tmp_PDM;
+    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_nu_nu*SS_nu_beta
+    a = ss;
+    b = c;
+    c = this->PDM;
+    dgemm_(&t, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &n_descriptor); //SS_alpha_mu*S_mu_mu*DM*S_nu_nu*SS_nu_beta
+/*
+    //===============test==============
+    cout << "test: out PDM" << endl;
+    for (int ir = 0; ir < n_descriptor; ir++)
+    {
+        for (int ic = 0; ic < n_descriptor; ic++)
+        {
+            cout << this->PDM[ir * n_descriptor + ic] << " ";
+        }
+        cout << endl;
+    }
+    //===============\test==============
+*/
+    delete[] tmp_PDM;
+    delete[] dm;
+    return;
+}
+
+void LCAO_Descriptor::cal_descriptor() // fill d[] from the (2l+1) x (2l+1) blocks of PDM, one block per (atom, l, n)
+{   // every block is also written to projected_DM.dat; print_descriptor() is called at the end
+    delete[] d; // drop any previous descriptor array before reallocating
+    d = new double[this->n_descriptor];
+    //==========print preparation=============
+    ofs_running << " print out each DM_Inl" << endl;
+    ofstream ofs;
+    stringstream ss;
+    ss << winput::spillage_outdir << "/"
+       << "projected_DM.dat";
+    if (MY_RANK == 0) // NOTE(review): ofs is opened on rank 0 only, but it is written to below without a rank check
+    {
+        ofs.open(ss.str().c_str());
+    }
+    //==========end of print preparation=============
+    const int lmax = ORB.get_lmax_d();
+    int id = 0; // running write position in d[]
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            for (int l = 0; l <= lmax; l++)
+            {
+                int nmax = ORB.Alpha[0].getNchi(l); // NOTE(review): Alpha[0] is used for every atom type it
+                for (int n = 0; n < nmax; n++)
+                {
+                    const int dim = 2 * l + 1;
+                    // (2l+1) x (2l+1) block of PDM for atom (it, ia) and shell (l, n)
+                    ComplexMatrix des(dim, dim);
+                    for (int m = 0; m < 2 * l + 1; m++)
+                    {
+                        const int ii = mu_index[it](ia, l, n, m);
+                        for (int m2 = 0; m2 < 2 * l + 1; m2++)
+                        {
+                            const int jj = mu_index[it](ia, l, n, m2);
+
+                            long index = ii * this->n_descriptor + jj; // row-major offset into PDM
+                            assert(index >= 0);
+                            assert(index < this->n_descriptor * this->n_descriptor);
+                            complex<double> tmp(this->PDM[index], 0); // PDM is real; imaginary part set to 0
+                            des(m, m2) += tmp;
+                        }
+                    }
+
+                    this->print_projected_DM(ofs, des, it, ia, l, n);
+
+                    //ofs_running << "dimension of des is " << 2 * l + 1 << endl;
+                    if (l == 0)
+                    {
+                        this->d[id] = des(0, 0).real(); // 1 x 1 block: the single element is stored directly
+                        ++id;
+                    }
+                    else
+                    {
+                        // diagonalize
+                        // assume des matrix is Hermitian
+                        char jobz = 'N'; // eigenvalues only
+                        char uplo = 'U'; // upper matrix is stored
+                        int ndim = des.nr;
+                        double *tmpd = new double[ndim](); // receives the eigenvalues
+                        const int lwork = 2 * ndim;
+                        complex<double> *work = new complex<double>[lwork]();
+                        double *rwork = new double[3 * ndim - 2]();
+                        int infor = 0;
+                        // diag by calling zheev
+                        LapackConnector::zheev(jobz, uplo, ndim, des, ndim, tmpd, work, lwork, rwork, &infor);
+                        // put the eigenvalues into d (descriptor); NOTE(review): infor is never checked
+                        for (int idim = 0; idim < ndim; ++idim)
+                        {
+                            this->d[id] = tmpd[idim];
+                            ++id;
+                        }
+                        delete[] tmpd;
+                        delete[] rwork;
+                        delete[] work;
+                    }
+                } //n
+            }     //l
+        }         //ia
+    }             //it
+    this->print_descriptor(); // write the finished d[] to descriptor.dat
+    return;
+}
+
+void LCAO_Descriptor::init_mu_index(void) // give every (it, ia, l, n, m) a consecutive global index mu
+{
+    ofs_running << " Initialize the mu index for deepks (lcao line)" << endl;
+    const int lmax = ORB.get_lmax_d();
+    const int nmax = ORB.get_nchimax_d();
+    assert(lmax >= 0);
+    assert(nmax >= 0);
+    ofs_running << " lmax = " << lmax << endl;
+    ofs_running << " nmax = " << nmax << endl;
+
+    delete[] this->mu_index; // release any previous table before reallocating
+    this->mu_index = new IntArray[ucell.ntype]; // one IntArray per atom type
+
+    int mu = 0; // running global index, shared across all types and atoms
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        this->mu_index[it].create(
+            ucell.atoms[it].na,
+            lmax + 1, // l starts from 0
+            nmax,
+            2 * lmax + 1); // m ==> 2*l+1
+
+        ofs_running << "Type " << it + 1
+                    << " number_of_atoms " << ucell.atoms[it].na << endl;
+
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            for (int l = 0; l < lmax + 1; l++)
+            {
+                for (int n = 0; n < ORB.Alpha[0].getNchi(l); n++) // NOTE(review): Alpha[0] is used for every atom type
+                {
+                    for (int m = 0; m < 2 * l + 1; m++)
+                    {
+                        this->mu_index[it](ia, l, n, m) = mu;
+                        mu++;
+                    }
+                }
+            }
+        }
+    }
+    assert(this->n_descriptor == mu); // the number of indices handed out must equal n_descriptor
+    ofs_running << "descriptors_per_atom " << this->des_per_atom << endl;
+    ofs_running << "total_descriptors " << this->n_descriptor << endl;
+
+    return;
+}
+
+void LCAO_Descriptor::print_projected_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n)
+{   // Write the (2l+1) x (2l+1) matrix des to ofs, real parts only; it and ia are not used in this body.
+    ofs << "L=" << l << "   N=" << n << endl; // header line identifying the block
+    for (int i = 0; i < 2 * l + 1; i++)
+    {
+        for (int j = 0; j < 2 * l + 1; j++)
+        {
+            ofs << des(i, j).real() << " "; // imaginary part is not printed
+        }
+        ofs << endl; // one matrix row per output line
+    }
+    return;
+}
+void LCAO_Descriptor::print_descriptor() // write d[] to descriptor.dat, one section per atom
+{
+    TITLE("LCAO_Descriptor", "print_descriptor");
+    ofstream ofs;
+    stringstream ss;
+    // the parameter 'winput::spillage_outdir' is read from INPUTw.
+    ss << winput::spillage_outdir << "/"
+       << "descriptor.dat";
+    if (MY_RANK == 0) // NOTE(review): ofs is opened on rank 0 only, but it is written to below without a rank check
+    {
+        ofs.open(ss.str().c_str());
+    }
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            int id0 = this->mu_index[it](ia, 0, 0, 0); // index of this atom's first entry (l = 0, n = 0, m = 0)
+            for (int id = id0; id < id0 + this->des_per_atom; ++id)
+            {
+                if ((id - id0) > 0 && (id - id0) % 8 == 0) // start a new line after every 8 values
+                    ofs << endl;
+                ofs << d[id] << " ";
+            }
+            ofs << endl << endl; // blank line between atoms
+        }
+    }
+    ofs_running << "descriptors are printed" << endl;
+    return;
+}
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
new file mode 100644
index 0000000000..c92c1b67d1
--- /dev/null
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
@@ -0,0 +1,59 @@
+#ifndef LCAO_DESCRIPTOR_H
+#define LCAO_DESCRIPTOR_H
+
+#include "../src_global/intarray.h"
+#include "../src_global/complexmatrix.h"
+
+//caoyu add 2021-03-29
+class LCAO_Descriptor // builds the projected density matrix and the descriptors for deepks (LCAO line)
+{
+public:
+	// NOTE(review): holds raw arrays (S_mu_alpha, PDM, d, mu_index) and declares no copy operations here
+    LCAO_Descriptor();
+    ~LCAO_Descriptor();
+
+	// cal S_alpha_mu: overlap between the LCAO basis (Phi) and the descriptor basis (Alpha)
+    void build_S_descriptor(const bool &calc_deri); 
+
+	// cal PDM: S_alpha_mu * inv(Sloc) * DM * inv(Sloc) * S_nu_beta
+    void cal_projected_DM(void);
+
+	// cal d: eigenvalues of each (2l+1) x (2l+1) block of PDM, one block per atom I, n and l
+    void cal_descriptor(void); // also writes projected_DM.dat and calls print_descriptor()
+    void print_descriptor(void); // writes d to descriptor.dat
+
+private:
+
+	// overlap between lcao and descriptor basis
+    double *S_mu_alpha;
+
+	// projected density matrix, n_descriptor x n_descriptor, stored row-major
+    double *PDM;
+
+	// descriptors, array of length n_descriptor
+    double *d;
+	// total number of descriptors (length of d, dimension of PDM)
+    int n_descriptor;
+
+	// \sum_L{Nchi(L)*(2L+1)}
+    int des_per_atom;
+	// mu_index[it](ia, l, n, m): global descriptor index; one IntArray per atom type it
+    IntArray *mu_index;
+	// fill mu_index with consecutive indices over (it, ia, l, n, m)
+    void init_mu_index(void);
+	// presumably stores v in S_mu_alpha for the pair (iw1_all, iw2_all) - TODO confirm against the definition
+	void set_S_mu_alpha(
+		const int &iw1_all, 
+		const int &iw2_all, 
+		const double &v);
+	// write the real parts of des to ofs under an "L=.. N=.." header
+    void print_projected_DM(
+		ofstream &ofs, 
+		ComplexMatrix &des, 
+		const int &it, 
+		const int &ia, 
+		const int &l, 
+		const int &n);
+};
+
+#endif
diff --git a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
index ae2b14d3fe..3378918e5a 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
@@ -123,6 +123,8 @@ void LCAO_gen_fixedH::build_ST_new(const char& dtype, const bool& calc_deri)
 							complex<double> *olm2 = &olm1[0];
 							if(!calc_deri)
 							{
+								// PLEASE use UOT as an input parameter of this subroutine
+								// mohan add 2021-03-30
 								UOT.snap_psipsi( olm, 0, dtype, tau1, 
 										T1, L1, m1, N1, GridD.getAdjacentTau(ad), 
 										T2, L2, m2, N2,
@@ -647,11 +649,21 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
 }
 
 
-void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
+void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri) //update by liuyu 2021-04-07
 {
     TITLE("LCAO_gen_fixedH","build_Nonlocal_beta");
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
 
+	matrix Rcut;
+	Rcut.create(ucell.ntype, ucell.ntype);
+	for(int i=0; i<ucell.ntype; i++)
+	{
+        for(int j=0; j<ucell.ntype; j++)
+        {
+            Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
+        }
+    }
+	
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
 		Atom* atom0 = &ucell.atoms[T0]; 
@@ -666,44 +678,50 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
-				const int iat = ucell.itia2iat(T1, I1);
+				//const int iat = ucell.itia2iat(T1, I1);
                 const int start = ucell.itiaiw2iwt(T1, I1, 0);
                 const Vector3<double> tau1 = GridD.getAdjacentTau(ad);
 				const Atom* atom1 = &ucell.atoms[T1];
+				const int nw1_tot = atom1->nw*NPOL;
 
 				// use to label < mu | H | nu(prime) >
-				int nnr = LNNR.nlocstart[iat];
+				//int nnr = LNNR.nlocstart[iat];
             
 				//(3)
 				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
 				{
+					//if(ad2<ad && !calc_deri) continue; //add by liuyu 20210406
 					const int T2 = GridD.getType(ad2);
 					const int I2 = GridD.getNatom(ad2);
 					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 					const Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
+					const int nw2_tot = atom2->nw*NPOL;
 
 					Vector3<double> dtau = tau2 - tau1;
 					double distance = dtau.norm() * ucell.lat0;
-					double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+					double rcut = Rcut(T1,T2);
+					//double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
 					if(distance < rcut)
 					{
 						// ------------- enter the nnr increaing zone --------------
-						for (int j=0; j<atom1->nw*NPOL; j++)
+						//for (int j=0; j<atom1->nw*NPOL; j++)
+						for (int j=0; j<nw1_tot; j++)
 						{
-							const int j0 = j/NPOL;
 							const int iw1_all = start + j;
 							const int mu = ParaO.trace_loc_row[iw1_all];
-							if(mu < 0)continue; 
+							if(mu < 0)continue;
+							const int j0 = j/NPOL;
 
 							// mohan fix bug 2010-12-20
 							// atom2[T2] -> atom2.
-							for (int k=0; k<atom2->nw*NPOL; k++)
+							//for (int k=0; k<atom2->nw*NPOL; k++)
+							for (int k=0; k<nw2_tot; k++)
 							{
-								const int k0 = k/NPOL;
 								const int iw2_all = start2 + k;
 								const int nu = ParaO.trace_loc_col[iw2_all];
 								if(nu < 0)continue;
+								const int k0 = k/NPOL;
 
 								double nlm[3];
 								nlm[0] = nlm[1] = nlm[2] = 0.0;
@@ -722,17 +740,18 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
-									{
+									//if(GAMMA_ONLY_LOCAL)
+									//{
 										LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
-									}
-									else
-									{
-										WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
+										//if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); //add by liuyu 20210406
+									//}
+								//	else
+								//	{
+								//		WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
 //										assert( nnr < LNNR.nnr );
 //										LM.Hloc_fixedR[ nnr ] += nlm[0];
 //										++nnr;
-									}
+								//	}
 								}
 								else  // calculate force
 								{
@@ -748,19 +767,19 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
-									{
+									//if(GAMMA_ONLY_LOCAL)
+									//{
 										//add part of nonlocal ps derivatives to T matrix
 										LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
-									}
-									else
-									{
-										WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
+									//}
+									//else
+									//{
+										//WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
 										//LM.DHloc_fixedR_x[ nnr ] += nlm[0];
 										//LM.DHloc_fixedR_y[ nnr ] += nlm[1];
 										//LM.DHloc_fixedR_z[ nnr ] += nlm[2];
-										++nnr;
-									}
+										//++nnr;
+									//}
 								}
 							}// end k
 						}// j 
@@ -768,7 +787,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
                 }// ad2
 				// mohan add 2011-06-16
 
-				if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
+				/*if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
 				{
 					if( iat < ucell.nat-1 )
 					{
@@ -780,7 +799,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 							WARNING_QUIT("build_Nonlocal_beta","nnr");
 						}
 					}
-				}
+				}*/
             }// ad
         }// end I0
     }// end T0
diff --git a/ABACUS.develop/source/src_lcao/LOOP_cell.cpp b/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
index 003b9eb7a3..785bd935c4 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
@@ -24,6 +24,7 @@ void LOOP_cell::opt_cell(void)
     UFFT.allocate();
 
     // output is ppcell.vloc 3D local pseudopotentials
+	// without structure factors
     // this function belongs to cell LOOP
     ppcell.init_vloc(pw.nggm, ppcell.vloc);
 
@@ -34,6 +35,8 @@ void LOOP_cell::opt_cell(void)
     pot.init_pot(ion_step, pw.strucFac);
 
 
+	// PLEASE simplify the Exx_Global interface
+	// mohan add 2021-03-25
 	// Peize Lin 2016-12-03
 	if (CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")
 	{
@@ -52,6 +55,8 @@ void LOOP_cell::opt_cell(void)
 		}
 	}	
 
+	// PLEASE do not use INPUT global variable
+	// mohan add 2021-03-25
 	// Quxin added for DFT+U
 	if(INPUT.dft_plus_u) 
 	{
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 294b5d4519..779652d5cf 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -13,6 +13,7 @@
 #include "ELEC_scf.h"
 #include "src_global/sltk_atom_arrange.h"
 #include "src_pw/vdwd2.h"
+#include "LCAO_descriptor.h"
 
 LOOP_ions::LOOP_ions()
 {}
@@ -108,32 +109,16 @@ void LOOP_ions::opt_ions(void)
 		
 		time_t eend = time(NULL);
 
-        //xiaohui add 2014-07-07, for second-order extrapolation
-        int iat=0;
+		//for second-order extrapolation
         if(CALCULATION=="relax" || CALCULATION=="cell-relax")
         {
-            for(int it = 0;it < ucell.ntype;it++)
-            {
-                Atom* atom = &ucell.atoms[it];
-                for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                {
-                    CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-                    CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-                    CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-                    CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-                    CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-                    CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-                    CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                    CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                    CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                    iat++;
-                }
-            }
+            CE.update_all_pos(ucell);
         }
 
+		// PLEASE design a proper interface to output potentials,
+		// not only electrostatic potential but also others
+		// mohan add 2021-03-25
+		// i.e. we need a proper, general interface for potential output
         if(pot.out_potential == 2)
         {
             stringstream ssp;
@@ -147,6 +132,14 @@ void LOOP_ions::opt_ions(void)
 		{
 			this->output_HS_R(); //LiuXh add 2019-07-15
 		}
+        //caoyu add 2021-03-31
+        if (INPUT.out_descriptor)
+        {
+            LCAO_Descriptor ld;
+            ld.build_S_descriptor(0);  //derivation not needed yet
+            ld.cal_projected_DM();
+            ld.cal_descriptor();
+        }
 
         time_t fstart = time(NULL);
         if (CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")
@@ -155,23 +148,12 @@ void LOOP_ions::opt_ions(void)
         }            
         time_t fend = time(NULL);
 
-
+		// PLEASE move the details of CE to other places
+		// mohan add 2021-03-25
         //xiaohui add 2014-07-07, for second-order extrapolation
-        iat=0;
         if(FORCE)
         {
-            for(int it = 0;it < ucell.ntype;it++)
-            {
-                Atom* atom = &ucell.atoms[it];
-                for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                {
-                    CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                    CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                    CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                    iat++;
-                }
-            }
+            CE.save_pos_next(ucell);
         }
 		
         if(OUT_LEVEL=="i")
@@ -218,7 +200,7 @@ void LOOP_ions::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions();
+    hm.orb_con.clear_after_ions(UOT, ORB);
 
     timer::tick("LOOP_ions","opt_ions",'B'); 
     return;
@@ -266,7 +248,7 @@ bool LOOP_ions::force_stress(
             }
             else // ions are not converged
             {
-                CE.istep = istep;
+                CE.update_istep(istep); 
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
@@ -380,7 +362,7 @@ xiaohui modify 2014-08-09*/
             }
             else
             {
-                CE.istep = force_step;
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
index 9d68cbfb44..2cd425c7f9 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
@@ -1,7 +1,3 @@
-//=========================================================
-//AUTHOR : liaochen
-//DATE : 2008-11-12
-//=========================================================
 #include "ORB_atomic.h"
 
 Vector3<double> Numerical_Orbital::R1;
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic.h b/ABACUS.develop/source/src_lcao/ORB_atomic.h
index 6f4e2defea..48000ccbf8 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic.h
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic.h
@@ -6,7 +6,13 @@
 #ifndef NUMERICAL_ORBITAL_H
 #define NUMERICAL_ORBITAL_H
 
-#include "../src_pw/tools.h"
+#include <string>
+using namespace std;
+
+#include "../src_global/intarray.h"
+#include "../src_global/vector3.h"
+
+//#include "../src_pw/tools.h"
 #include "ORB_atomic_lm.h"
 
 //=========================================================
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
index 7a6d724c19..665ea33ba5 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
@@ -1,10 +1,8 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2008-11-12
-//=========================================================
 #include "ORB_atomic_lm.h"
-#include "src_global/sph_bessel_recursive.h"
-#include "src_global/lapack_connector.h"
+#include "../src_global/sph_bessel_recursive.h"
+#include "../src_global/lapack_connector.h"
+#include "../src_global/timer.h"
+#include "../src_global/math_integral.h"
 #include <omp.h>
 
 Numerical_Orbital_Lm::Numerical_Orbital_Lm()
@@ -27,12 +25,7 @@ Numerical_Orbital_Lm::Numerical_Orbital_Lm()
 }
 
 Numerical_Orbital_Lm::~Numerical_Orbital_Lm()
-{
-	if(test_deconstructor)
-	{
-		cout << " ~Numerical_Orbital_Lm()" << endl;
-	}
-}
+{}
 
 void Numerical_Orbital_Lm::set_orbital_info
 (
@@ -220,7 +213,8 @@ void Numerical_Orbital_Lm::extra_uniform(const double &dr_uniform_in)
 	#pragma omp parallel for schedule(static)
 	for (int ir = 0; ir < this->nr_uniform; ir++)
 	{
-		const double psi_uniform_tmp  = Mathzone_Add1::Uni_RadialF(VECTOR_TO_PTR(this->psi), this->nr, this->rab[0], ir * dr_uniform); 
+		const double psi_uniform_tmp  = 
+		Mathzone_Add1::Uni_RadialF(VECTOR_TO_PTR(this->psi), this->nr, this->rab[0], ir * dr_uniform); 
 		this->psi_uniform[ir] = psi_uniform_tmp;
 //    	this->psi_uniform[ir] = Mathzone::Polynomial_Interpolation(this->psi, this->nr, this->rab[0], ir * dr_uniform); 
     }
@@ -396,7 +390,7 @@ void Numerical_Orbital_Lm::cal_kradial(void)
 			integrated_func[ir] = this->psir[ir] * this->r_radial[ir] * jl[ir];
 		}
 
-		Mathzone::Simpson_Integral(
+		Integral::Simpson_Integral(
 				this->nr,
 				integrated_func,
 				VECTOR_TO_PTR(this->rab),
@@ -450,7 +444,7 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 		const vector<double> &jlk = jl[ik];
 		for (int ir = 0; ir < nr; ir++)
 			integrated_func[ir] = psir2[ir] * jlk[ir];
-		Mathzone::Simpson_Integral(
+		Integral::Simpson_Integral(
 				this->nr,
 				VECTOR_TO_PTR(integrated_func),
 				dr,
@@ -468,16 +462,22 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 
 	// dr must be all the same for Sph_Bessel_Recursive_Pool
 	const double dr = this->rab[0];
+	
 	for( int ir=1; ir<this->nr; ++ir )
+	{
 		assert( dr == this->rab[ir] );
+	}
 
 	Sph_Bessel_Recursive::D2* pSB = nullptr;
 	for( auto & sb : Sph_Bessel_Recursive_Pool::D2::sb_pool )
+	{
 		if( this->dk * dr == sb.get_dx() )
 		{
 			pSB = &sb;
 			break;
 		}
+	}
+
 	if(!pSB)
 	{
 		Sph_Bessel_Recursive_Pool::D2::sb_pool.push_back({});
@@ -491,20 +491,42 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 
 	vector<double> r_tmp(nr);
 	for( int ir=0; ir!=nr; ++ir )
+	{
 		r_tmp[ir] = this->psir[ir] * this->r_radial[ir] * this->rab[ir];
+	}
+
 	constexpr double one_three=1.0/3.0, two_three=2.0/3.0, four_three=4.0/3.0;
-	r_tmp[0]*=one_three;	r_tmp[nr-1]*=one_three;
+	r_tmp[0]*=one_three;	
+	r_tmp[nr-1]*=one_three;
+
 	for( int ir=1; ir!=nr-1; ++ir )
+	{
 		r_tmp[ir] *= (ir&1) ? four_three : two_three;
+	}
 
+#ifdef __NORMAL
+	// FIXME: needs checking (avoids Lapack); psi_f_tmp is computed below but never stored in psif/psik/psik2
+	for(int ik=0; ik<nk; ++ik)
+	{
+		double psi_f_tmp = 0.0; 
+		for(int ir=0; ir<nr; ++ir)
+		{
+			psi_f_tmp += r_tmp[ir]*jl[ik][ir];
+		}
+		psi_f_tmp *= pref;
+	}
+#else
 	#pragma omp parallel for schedule(static)
 	for (int ik = 0; ik < nk; ik++)
 	{
-		const double psi_f_tmp = pref * LapackConnector::dot( this->nr, VECTOR_TO_PTR(r_tmp), 1, VECTOR_TO_PTR(jl[ik]), 1 ) ;
+		const double psi_f_tmp = 
+		pref * LapackConnector::dot( this->nr, VECTOR_TO_PTR(r_tmp), 1, VECTOR_TO_PTR(jl[ik]), 1 ) ;
 		this->psif[ik] = psi_f_tmp;
 		this->psik[ik] = psi_f_tmp * k_radial[ik];
 		this->psik2[ik] = this->psik[ik] * k_radial[ik];
 	}
+#endif
+	return;
 }
 
 // Peize Lin add 2017-12-11
@@ -568,7 +590,7 @@ void Numerical_Orbital_Lm::norm_test(void)const
 	double sumr = 0.0;
 	//double sumk = 0.0;
 
-	Mathzone::Simpson_Integral(this->nr, f, VECTOR_TO_PTR(this->rab), sumr);
+	Integral::Simpson_Integral(this->nr, f, VECTOR_TO_PTR(this->rab), sumr);
 
 	delete[] f;
 	f = new double[nk];
@@ -577,7 +599,7 @@ void Numerical_Orbital_Lm::norm_test(void)const
 		f[ik] = this->psik[ik] * this->psik[ik];
 	}
 
-//	Mathzone::Simpson_Integral(this->nk, f, this->k_radial, sumk);
+//	Integral::Simpson_Integral(this->nk, f, this->k_radial, sumk);
 	
 	//means nothing.
 	//ofs_running << setw(12) << sumk << endl;
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index 302e93083f..57661b7108 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -4,19 +4,23 @@
 #include "src_global/sltk_atom_arrange.h"
 #include "ORB_gen_tables.h"
 #include "build_st_pw.h"
+#include "../src_pdiag/pdiag_double.h"
 
 ORB_control::ORB_control()
 {}
 
 ORB_control::~ORB_control()
-{
-	if(test_deconstructor)
-	{
-		cout << " ~ORB_control()" << endl;
-	}
-}
+{}
 
-void ORB_control::set_orb_tables(void)
+void ORB_control::set_orb_tables(
+	ORB_gen_tables &OGT, 
+	LCAO_Orbitals &orb,
+	const double &lcao_ecut_in, // mohan add 2021-04-16
+	const double &lcao_dk_in, // mohan add 2021-04-16
+	const double &lcao_dr_in, // mohan add 2021-04-16
+	const double &lcao_rmax_in, // mohan add 2021-04-16
+	const double &lat0,
+	const int &Lmax_exx)
 {
     TITLE("ORB_control","set_orb_tables");
 	timer::tick("ORB_control","set_orb_tables",'B');
@@ -25,7 +29,20 @@ void ORB_control::set_orb_tables(void)
     // (1) FUNCTION : use 'info' to generate 'Numerical Orbital'
     // (1) RESULT : We have 'Numerical Orbital' for calculate S-table and T-table.
 	//=============================================================================
-    ORB.Read_Orbitals();
+
+	// mohan add 2021-04-16
+	assert(lcao_ecut_in>0.0);
+	assert(lcao_dk_in>0.0);
+	assert(lcao_dr_in>0.0);
+	assert(lcao_rmax_in>0.0);
+
+	// mohan add 2021-04-16
+	orb.ecutwfc = lcao_ecut_in;
+	orb.dk = lcao_dk_in;
+	orb.dR = lcao_dr_in;
+	orb.Rmax = lcao_rmax_in;
+	
+    orb.Read_Orbitals(ucell.ntype, ucell.lmax);
 
 	if(CALCULATION=="test")
 	{
@@ -34,7 +51,7 @@ void ORB_control::set_orb_tables(void)
 	}
 
     //=============================================================================
-    // (2) FUNCTION : Generate Gaunt_Coefficients and S-table using UOT.init
+    // (2) FUNCTION : Generate Gaunt_Coefficients and S-table using OGT.init
     // 	   Must have 'Numerical Orbital' infomation
     // (2) RESULT : we have tabulated S table for use.
     //=============================================================================
@@ -43,23 +60,27 @@ void ORB_control::set_orb_tables(void)
     // 1: generate overlap table
     // 2: generate kinetic table
     // 3: generate overlap & kinetic table
-    UOT.gen_tables(job0);
+    OGT.gen_tables(job0, orb, Lmax_exx);
     // init lat0, in order to interpolated value from this table.
-    UOT.set_unit(ucell.lat0);
+
+	assert(lat0>0.0);
+    OGT.set_unit(lat0);
 
 
 	timer::tick("ORB_control","set_orb_tables",'B');
     return;
 }
 
-void ORB_control::clear_after_ions(void)
+void ORB_control::clear_after_ions(ORB_gen_tables &OGT, LCAO_Orbitals &orb)
 {
     TITLE("ORB_control","clear_after_ions");
-    UOT.MOT.Destroy_Table();
-    UOT.tbeta.Destroy_Table_Beta();
-    //caoyu add 2021-03-18
-    if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-        UOT.talpha.Destroy_Table_Alpha();
+    OGT.MOT.Destroy_Table(orb);
+    OGT.tbeta.Destroy_Table_Beta(orb);
+    
+	//caoyu add 2021-03-18
+    if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
+	{
+        OGT.talpha.Destroy_Table_Alpha();
     }
     return;
 }
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index c69397bc1d..40536b0d05 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -1,14 +1,8 @@
-//==========================================================
-// AUTHOR : mohan, ywcui
-// Last Update: 2021-02-10
-//==========================================================
 #ifndef ORB_CONTROL_H 
 #define ORB_CONTROL_H 
 
-#include "../src_pw/tools.h"
-
 #include "ORB_gen_tables.h"
-#include "../src_pdiag/pdiag_double.h"
+#include "ORB_read.h"
 
 class ORB_control 
 {
@@ -19,8 +13,19 @@ class ORB_control
     ~ORB_control();
 
     // Generate the S(overlap),T,NL matrix.
-    void set_orb_tables();
-    void clear_after_ions();
+    void set_orb_tables(
+		ORB_gen_tables &OGT, 
+		LCAO_Orbitals &orb,
+		const double &lcao_ecut_in, // mohan add 2021-04-16
+		const double &lcao_dk_in, // mohan add 2021-04-16
+		const double &lcao_dr_in, // mohan add 2021-04-16
+		const double &lcao_rmax_in, // mohan add 2021-04-16
+		const double &lat0,
+		const int &Lmax_exx);
+
+    void clear_after_ions(
+		ORB_gen_tables &OGT, 
+		LCAO_Orbitals &orb);
 
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
index b36b373ff0..18c0449dec 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
@@ -1,4 +1,13 @@
 #include "ORB_gaunt_table.h"
+#include <math.h>
+#include <cassert>
+#include "../src_global/timer.h"
+#include "../src_global/memory.h"
+#include "../src_global/mathzone.h"
+#include "../src_global/global_function.h"
+//#include "../src_global/matrix3.h"
+#include "../src_global/vector3.h"
+#include "../src_global/constants.h"
 
 ORB_gaunt_table::ORB_gaunt_table(){}
 ORB_gaunt_table::~ORB_gaunt_table(){}
@@ -37,64 +46,30 @@ void ORB_gaunt_table::init_Gaunt(const int &lmax)
 					
 							Gaunt_Coefficients(dim1, dim2, dim) = 
 								this->Get_Gaunt_SH (L1, m1, L2, m2, L, m);	
-
-							/*
-							if (dim1 == 1 && dim2 == 1 && dim == 4)
-							{
-								cout << "\nGaunt_Coef = " << Gaunt_Coefficients (dim1, dim2, dim) << endl;
-								cout << "\nGet_Gaunt_SH = " << Get_Gaunt_SH (L1, m1, L2, m2, L, m) << endl;
-							}
-							*/
-
-						//	Gaunt_Coefficients(dim1, dim2, dim) = 
-						//		this->Cal_Gaunt_single(L1, m1, L2, m2, L, m, 0.0, PI, 0.0, TWO_PI);
-						
-							//test
-//							double G_revers = this->Cal_Gaunt_single(L2, m2, L1, m1, L, m, 0.0, pi, 0.0, tpi);
-
-//							cout <<  Gaunt_Coefficients(dim1, dim2, dim3) << setw(20) << G_revers << endl;
-							
-							/*
-							//test
-							int M1, M2, M;
-							if(m1 % 2 == 0) M1 = - m1 / 2;
-							else	M1 = (m1+1) / 2;
-							if(m2 % 2 == 0) M2 = - m2 / 2;
-							else	M2 = (m2+1) / 2;
-							if(m % 2 == 0) M = - m / 2;
-							else	M = (m+1) / 2;
-							*/
-							
-							/*
-							double G1 = Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-
-							if( fabs(Gaunt_Coefficients(dim1, dim2, dim) - G1) > 1e-5)
-							{
-								cout << "\nl1 = " << L1 << " m1 = " << m1 << " l2 = " << L2 << " m2 = " << m2 
-												<< " L = " << L << " M = " << m << endl;
-							
-								cout << Gaunt_Coefficients(dim1, dim2, dim) << setw(20) << G1 << setw(20) 
-											<< Gaunt_Coefficients(dim1, dim2, dim) - G1 << endl;
-							}
-							*/
-						}
-                    }
-                }
-            }
-        }
-    }
+						}// m2
+                    }// L2
+                }// m1
+            }// L1
+        }// m
+    }// L
 
     timer::tick("ORB_gaunt_table", "init_Gaunt",'D');
     return;
 }
 
+
 double ORB_gaunt_table::Cal_Gaunt_single
 (
-    const int &L1, const int &m1,
-    const int &L2, const int &m2,
-    const int &L, const int &m,
-    const double &s1, const double &e1,
-    const double &s2, const double &e2
+    const int &L1, 
+	const int &m1,
+    const int &L2, 
+	const int &m2,
+    const int &L, 
+	const int &m,
+    const double &s1, 
+	const double &e1,
+    const double &s2, 
+	const double &e2
 )
 {
 	timer::tick("ORB_gaunt_table", "Cal_Gaunt_single");
@@ -121,7 +96,6 @@ double ORB_gaunt_table::Cal_Gaunt_single
 		for (int j = 0;j < 16;j++)
 		{
 			double theta = ((s1 + e1) + (e1 - s1) * absc[i]) / 2;
-			//double phi = ((s2 + e2) + (e2 - s2) * absc[j]) / 2;
 
 			result += weight[i] * weight[j] * sin(theta) *
 			          this->Ylm_Gaunt( this->get_lm_index(L1, m1), 16 * i + j) *
@@ -135,11 +109,14 @@ double ORB_gaunt_table::Cal_Gaunt_single
 	return result;
 }
 
+
 void ORB_gaunt_table::init_Ylm_Gaunt
 (
  	const int &lmax,
-    const double &s1, const double &e1,
-    const double &s2, const double &e2
+    const double &s1, 
+	const double &e1,
+    const double &s2, 
+	const double &e2
 )
 {
 	TITLE("ORB_gaunt_table", "init_Ylm_Gaunt");
@@ -153,12 +130,6 @@ void ORB_gaunt_table::init_Ylm_Gaunt
 		0.09501250983763744, 0.2816035507792589, 0.4580167776572274, 0.6178762444026438, 
 		0.755404408355003, 0.8656312023878318, 0.9445750230732326, 0.9894009349916499 };
 
-	//static double weight[16] = {
-	//	0.02715245941175406, 0.06225352393864778, 0.0951585116824929, 0.1246289712555339, 
-	//	0.1495959888165768, 0.1691565193950026, 0.1826034150449236, 0.1894506104550685, 
-	//	0.1894506104550685, 0.1826034150449236, 0.1691565193950026, 0.1495959888165768, 
-	//	0.1246289712555339, 0.0951585116824929, 0.06225352393864778, 0.02715245941175406 };
-
 	//initialization of ylm_map
 
 	Vector3<double> g_gaunt[256];
@@ -182,21 +153,25 @@ void ORB_gaunt_table::init_Ylm_Gaunt
 	return;
 }
 
-int ORB_gaunt_table::get_lm_index(const int l, const int m)
+
+int ORB_gaunt_table::get_lm_index(
+	const int l, 
+	const int m)
 {
 	return l*l+m;
 }
 
+
 /**********************
 //Rasch and Yu's Method
 ***********************/
-
 //total pointers
 int ORB_gaunt_table::P_EL(const int& L)
 {
 	return (L+1) * (L+2) * (L+3) * (L+4) / 24;
 }
 
+
 //effective pointers
 int ORB_gaunt_table::EP_EL(const int& L)
 {
@@ -204,6 +179,7 @@ int ORB_gaunt_table::EP_EL(const int& L)
 	else return (L+1) * (L+3) * (L+5) * (3*L+5) / 192;
 }
 
+
 int ORB_gaunt_table::index_func
 (
  	const int& l1,
@@ -219,6 +195,7 @@ int ORB_gaunt_table::index_func
 	return aux1 + aux2 + aux3 + m3;
 }
 
+
 void ORB_gaunt_table::init_Gaunt_CH(const int& Lmax)
 {
 	TITLE("ORB_gaunt_table","init_Gaunt_CH");
@@ -272,15 +249,16 @@ void ORB_gaunt_table::init_Gaunt_CH(const int& Lmax)
 					}
 
 					ic1++;
-				}
-			}
-		}
-	}
+				}// m3
+			}// l3
+		}// l2
+	} // l1
 
 	timer::tick("ORB_gaunt_table","init_Gaunt_CH",'D');
 	return;
 }
 
+
 //using wigner 3j expression
 double ORB_gaunt_table::Calc_Gaunt_CH
 (
@@ -324,6 +302,7 @@ double ORB_gaunt_table::Calc_Gaunt_CH
 	timer::tick("ORB_gaunt_table","Calc_Gaunt_CH");
 }
 	
+
 double ORB_gaunt_table::Get_Gaunt_CH
 (
  	const int& l1,
@@ -382,6 +361,7 @@ double ORB_gaunt_table::Get_Gaunt_CH
 	catch( out_of_range ){ return 0; }
 }
 	
+
 //Input value
 //m1, m2, m3 are restricted within 0 to 2l+1
 //and should be transformed first
@@ -466,19 +446,6 @@ double ORB_gaunt_table::Get_Gaunt_SH
 	timer::tick("ORB_gaunt_table","Get_Gaunt_SH");
 }
 
-/*	// Peize Lin delete 2016-08-26
-void ORB_gaunt_table::ZEROS()
-{
-	for(int ir = 0; ir < 5000; ir++)
-	{
-		for(int ic = 0; ic < 30; ic++)
-		{
-			Gaunt_CH[ir][ic] = 0.0;
-		}
-	}
-	return;
-}
-*/
 
 double ORB_gaunt_table::Fact(const int& n)
 {
@@ -490,7 +457,12 @@ double ORB_gaunt_table::Fact(const int& n)
 	return val;
 }
 
-void ORB_gaunt_table::Swap(int& l1, int& m1, int& l2, int & m2)
+
+void ORB_gaunt_table::Swap(
+	int& l1, 
+	int& m1, 
+	int& l2, 
+	int & m2)
 {
 	int tmp1, tmp2;
 	if(l1 >= l2) return;
@@ -508,6 +480,7 @@ void ORB_gaunt_table::Swap(int& l1, int& m1, int& l2, int & m2)
 	return;
 }
 
+
 int ORB_gaunt_table::Index_M(const int& m)
 {
 	if(m % 2 == 0) return (- m / 2);
diff --git a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
index aa09f9c51e..0f0b265eb3 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
@@ -1,12 +1,9 @@
-//=========================================================
-//AUTHOR : Mohan 
-//DATE : 2009-04-23
-//=========================================================
 #ifndef ORB_GAUNT_TABLE_H
 #define ORB_GAUNT_TABLE_H
 
-#include "../src_pw/tools.h"
 #include <map>
+#include "../src_global/realarray.h"
+#include "../src_global/matrix.h"
 
 class ORB_gaunt_table
 {
@@ -59,52 +56,64 @@ class ORB_gaunt_table
 	//============================================================
 	// (1) Make Ylm_Gaunt Table.
 	//============================================================
-	void init_Ylm_Gaunt(const int &lmax, const double &s1,const double &e1,
-		const double &s2,const double &e2);
+	void init_Ylm_Gaunt(
+		const int &lmax, 
+		const double &s1,
+		const double &e1,
+		const double &s2,
+		const double &e2);
 
 	//============================================================
 	// (2) Use Ylm_Gaunt to calculate Gaunt Coefficinets element
 	//============================================================
 	double Cal_Gaunt_single(
-	   	const int &l1, const int &m1, 
-	   	const int &l2, const int &m2, 
-	   	const int &l, const int &m, 
-	   	const double &s1, const double &e1,    
-	   	const double &s2, const double &e2);
+	   	const int &l1, 
+		const int &m1, 
+	   	const int &l2, 
+		const int &m2, 
+	   	const int &l, 
+		const int &m, 
+	   	const double &s1, 
+		const double &e1,    
+	   	const double &s2, 
+		const double &e2);
 
 	//============================================================
 	// (3) Make the whole Gaunt Coefficients table
 	//============================================================
 	void init_Gaunt(const int &lmax);
 
-
 	//========================================================
 	// Small function
 	//========================================================
 	static int get_lm_index(const int l, const int m);
+
 	static int Index_M(const int& m);
 
 	private:
 	
-	//Index Function
-	//Yu's mehtod
+	// Index Function
+	// Yu's method
 	// Peize Lin delete void ZEROS(); 2016-08-26
 	
 	int P_EL(const int& L);
+
 	int EP_EL(const int& L);
+
 	int index_func(
-					const int& l1,
-					const int& l2,
-					const int& l3,
-					const int& m3	);
+			const int& l1,
+			const int& l2,
+			const int& l3,
+			const int& m3	);
 	
 	double Fact(const int& n);
+
 	void Swap(
-				int& l1,
-				int& m1,
-				int& l2,
-				int& m2	);
-	
+			int& l1,
+			int& m1,
+			int& l2,
+			int& m2	);
+
 	//2*Lmax+1
 	std::map<int,std::map<int,double>> Gaunt_CH;		// Peize Lin update 2016-08-26
 	
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index c9b2ff6a32..51e3e0ed45 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -1,1003 +1,1019 @@
-#include "src_pw/global.h"
-#include "ORB_read.h"
-#include "ORB_gen_tables.h"
-#include "src_global/ylm.h"
-
-// here is a member of ORB_gen_tables class
-ORB_gen_tables UOT;
-
-ORB_gen_tables::ORB_gen_tables(){}
-ORB_gen_tables::~ORB_gen_tables(){}
-
-// call in hamilt_linear::init_before_ions.
-void ORB_gen_tables::gen_tables( const int &job0 )
-{
-	TITLE("ORB_gen_tables","gen_tables");
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-
-	ofs_running << "\n SETUP THE TWO-CENTER INTEGRATION TABLES" << endl;
-	
-	//=========================================
-	// (1) MOT: make overlap table.
-	//=========================================
-	MOT.allocate(
-		ORB.get_ntype(),// number of atom types
-        ORB.get_lmax(),// max L used to calculate overlap
-        ORB.get_kmesh(), // kpoints, for integration in k space
-        ORB.get_Rmax(),// max value of radial table
-        ORB.get_dR(),// delta R, for making radial table
-        ORB.get_dk() ); // delta k, for integration in k space
-
-	tbeta.allocate(
-		ORB.get_ntype(),// number of atom types
-        ORB.get_lmax(),// max L used to calculate overlap
-        ORB.get_kmesh(), // kpoints, for integration in k space
-        ORB.get_Rmax(),// max value of radial table
-        ORB.get_dR(),// delta R, for making radial table
-        ORB.get_dk() ); // delta k, for integration in k space
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.allocate(
-			ORB.get_ntype(),// number of atom types
-			ORB.get_lmax(),// max L used to calculate overlap
-			ORB.get_kmesh(), // kpoints, for integration in k space
-			ORB.get_Rmax(),// max value of radial table
-			ORB.get_dR(),// delta R, for making radial table
-			ORB.get_dk()); // delta k, for integration in k space
-	}
-
-	// OV: overlap
-	MOT.init_OV_Tpair();
-	MOT.init_OV_Opair();
-
-	// NL: nonlocal
-	tbeta.init_NL_Tpair();
-	tbeta.init_NL_Opair(); // add 2009-5-8
-
-	//caoyu add 2021-03-18
-	// DS: Descriptor
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.init_DS_Opair();
-		talpha.init_DS_2Lplus1();
-	}
-
-	//=========================================
-	// (2) init Ylm Coef
-	//=========================================
-	//liaochen add 2010/4/29
-	Ylm::set_coefficients ();
-
-	// Peize Lin update 2016-01-26
-	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (2,1, Lmax_used, Lmax);
-	
-	//calculate S(R) for interpolation
-	MOT.init_Table(job0);
-	tbeta.init_Table_Beta( MOT.pSB );// add 2009-5-8
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.init_Table_Alpha(MOT.pSB);
-		talpha.print_Table_DSR();	
-	}
-
-	//=========================================
-	// (3) make Gaunt coefficients table
-	//=========================================
-
-	const int lmax = (Lmax_used-1) / 2 ;
-	//MGT.init_Ylm_Gaunt(ORB.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
-	MGT.init_Gaunt_CH( lmax );
-	//MGT.init_Gaunt(ORB.get_lmax()+1);
-	MGT.init_Gaunt( lmax );
-
-
-
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-	return;
-}
-
-void ORB_gen_tables::snap_psibeta(
-	double nlm[],
-	const int& job,
-	const Vector3<double> &R1,
-	const int &T1,
-	const int &L1,
-	const int &m1,
-	const int &N1,
-	const Vector3<double> &R2,
-	const int &T2,
-	const int &L2,
-	const int &m2,
-	const int &N2,
-	const Vector3<double> &R0,// The projector.
-	const int &T0,
-	complex<double> *nlm1,
-	const int is) const
-{
-	//TITLE ("ORB_gen_tables","snap_psibeta");
-
-	//optimized by zhengdy-soc
-	if(NSPIN==4 && ORB.Beta[T0].get_count_soc(is)==0) 
-	{
-		return;
-	}
-
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-
-	bool has_so = 0;
-	if(ORB.Beta[T0].get_count_soc(0)>0 ) has_so = 1;
-
-	const int nproj = ORB.nproj[T0];
-	bool *calproj = new bool[nproj];
-	int* rmesh1 = new int[nproj];
-	int* rmesh2 = new int[nproj];
-
-	//rcut of orbtials and projectors
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-	
-	//in our calculation, we always put orbital phi at the left side of <phi|beta>
-	//because <phi|beta> = <beta|phi>
-	const Vector3<double> dRa = (R0-R1)*this->lat0 ; 
-	const Vector3<double> dRb = (R0-R2)*this->lat0 ;
-	
-	double distance10 = dRa.norm();
-	double distance20 = dRb.norm();
-
-	// mohan add 2011-03-10
-	// because the table length is different accordint to each length
-	// of projector, so sometimes some shorter projectors need not be 
-	// calculated.
-	bool all_out = true;
-	for(int ip=0; ip<nproj; ip++)
-	{
-		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
-		if( distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0) )  
-		{
-			calproj[ip] = false;
-		}
-		else
-		{
-			all_out = false;
-			calproj[ip] = true;
-			//length of table for interpolation
-			rmesh1[ip] = tbeta.get_rmesh(Rcut1, Rcut0);
-			rmesh2[ip] = tbeta.get_rmesh(Rcut2, Rcut0);
-		}
-	}
-
-	if(all_out)
-	{
-		delete[] calproj;
-		delete[] rmesh1;
-		delete[] rmesh2;
-		timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-		return;
-	}
-
-
-	//FOR INTERPOLATION
-	double* curr; //current pointer
-	int iqa, iqb;
-	double psa, psb;
-	double x0a,x1a,x2a,x3a,x123a,x120a,x032a,x031a;
-	double x0b,x1b,x2b,x3b,x123b,x120b,x032b,x031b;
-	
-	psa = distance10 / tbeta.dr;
-	iqa = static_cast<int>(psa);
-   	x0a = psa - static_cast<double>(iqa);
-  	x1a = 1.0 - x0a;
-   	x2a = 2.0 - x0a;
-    x3a = 3.0 - x0a;
-	x123a = x1a*x2a*x3a/6.0;
-	x120a = x1a*x2a*x0a/6.0;
-	x032a = x0a*x3a*x2a/2.0;
-	x031a = x0a*x3a*x1a/2.0;
-	
-	psb = distance20 / tbeta.dr;
-	iqb = (int) psb;
-   	x0b = psb - (double)iqb ;
-  	x1b = 1.0 - x0b;
-   	x2b = 2.0 - x0b;
-    x3b = 3.0 - x0b;
-	x123b = x1b*x2b*x3b/6.0;
-	x120b = x1b*x2b*x0b/6.0;
-	x032b = x0b*x3b*x2b/2.0;
-	x031b = x0b*x3b*x1b/2.0;
-	
-	//UNIT VECTOR
-			
-	//double unit_vec_dRa[3];
-	//unit_vec_dRa[0] = dRa.x;
-	//unit_vec_dRa[1] = dRa.y;
-	//unit_vec_dRa[2] = dRa.z;
-	
-	double unit_vec_dRb[3];
-	unit_vec_dRb[0] = dRb.x;
-	unit_vec_dRb[1] = dRb.y;
-	unit_vec_dRb[2] = dRb.z;
-	
-	//special case for R = 0;
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-
-	if(distance10 < tiny1) distance10 += tiny1;
-	if(distance20 < tiny1) distance20 += tiny1;
-	
-
-	// Find three dimension of 'Table_NR' '
-	// Notice!!! T1 must be orbital, 
-	// T0 must be nonlocal orbital
-	// usage : pairs_nonlocal_type(T1 : orbital, T0 : projector);
-	const int Tpair1 = tbeta.NL_Tpair(T1, T0);
-	const int Tpair2 = tbeta.NL_Tpair(T2, T0);
-	const int T1_2Lplus1 = tbeta.NL_L2plus1(T1, T0);
-	const int T2_2Lplus1 = tbeta.NL_L2plus1(T2, T0);
-
-	//gaunt index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	// Peize Lin change rlya, rlyb, grlyb 2016-08-26
-	vector<double> rlya;
-	vector<double> rlyb;
-	vector<vector<double>> grlyb;
-	
-	Ylm::rl_sph_harm (T1_2Lplus1-1, dRa.x, dRa.y, dRa.z, rlya);
-	if (job == 0) 
-	{
-		Ylm::rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb);
-	}
-	else 
-	{
-		Ylm::grad_rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb, grlyb);
-	}
-	//==============================================================================
-	// Formula :                         T1       T0          T0        T2
-	// sum_{L0}sum_{m0}
-	// 			D_{L0,L0} <psi1_{L1,N1}|Beta_{L0,m0}><Beta_{L0,m0}|psi2_{L2,N2}>
-	//==============================================================================
-	//double v = 0.0;
-
-	// mohan update 2011-03-07
-	int n_projection =1;
-	if(has_so) 
-	{
-		n_projection = ORB.Beta[T0].get_nproj_soc();
-	}
-
-	vector<complex<double>> term_a_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	vector<complex<double>> term_b_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	int ip = -1;
-
-	for(int nb=0; nb<nproj; nb++)
-	{
-		if( !calproj[nb] ) continue;
-
-		const int L0 = ORB.Beta[T0].getL_Beta(nb);
-		//const int next_ip = 2* L0 +1;
-	
-
-//-------------------------------------------------------------------
-// move iterations for psi1 and psi2 from cal_fvnl_dbeta 
-// to here --- 2021/03/20 mohan chen
-//-------------------------------------------------------------------
-
-
-		// <psi1 | Beta>
-		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb); 
-		// <psi2 | Beta>
-		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb); 
-		
-			
-		for(int m0=0; m0<2*L0+1; m0++)
-		{
-			++ip;
-			int gindex0 = L0*L0+m0;
-			
-			//loop of {lmn}
-			double term_a = 0.0;
-			double term_b = 0.0;
-			double term_c[3] = {0,0,0};	
-			
-			//=============
-			// FIRST PART	
-			//=============
-			for(int L=0; L<T1_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L1 + L0;
-				int SL = abs (L1 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-				//prefac = (i)^{lphi - lbeta - l}
-				//R0-R1 ==> <phi|beta>
-				double i_exp = pow(-1.0, (L1-L0-L)/2);
-				double rl1 = pow(distance10, L);			
-				double Interp_Vnla = 0.0;
-				if (distance10 > tiny2)
-				{	
-					curr = tbeta.Table_NR[0][Tpair1][Opair1][L];
-					if( iqa >= rmesh1[nb]-4)
-					{
-						Interp_Vnla = 0.0;
-					}
-					else
-					{
-						Interp_Vnla = i_exp * (x123a*curr[iqa]+x120a*curr[iqa+3]+x032a*curr[iqa+1]-x031a*curr[iqa+2]);
-					}
-					Interp_Vnla /= rl1;
-				}
-				else 
-				{
-					Interp_Vnla = i_exp * tbeta.Table_NR[0][Tpair1][Opair1][L][0];
-				}
-	
-				//------------------------------------------
-				//  Overlap value = S_from_table * G * Ylm				
-				//------------------------------------------
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexa = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); 
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex1, gindex0, gindexa);
-					term_a += tmpGaunt * Interp_Vnla * rlya[ MGT.get_lm_index(L, m) ];
-				}
-			} //end L
-
-			//=============
-			// SECOND PART	
-			//=============
-			for(int L=0; L<T2_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L2 + L0;
-				int SL = abs (L2 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-				double Interp_Vnlb = 0.0;
-				double Interp_Vnlc = 0.0;
-				
-				//prefac
-				double i_exp = pow(-1.0, (L2-L0-L)/2);
-				double rl2 = pow (distance20, L);	
-				if (distance20 > tiny2)
-				{
-					curr = tbeta.Table_NR[0][Tpair2][Opair2][L];
-   					
-					if( iqb >= rmesh2[nb]-4) 
-					{
-						Interp_Vnlb = 0.0;
-					}
-					else 
-					{
-						Interp_Vnlb = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-					}
-					
-					Interp_Vnlb /= rl2;
-				}
-				else 
-				{
-					Interp_Vnlb = i_exp * tbeta.Table_NR[0][Tpair2][Opair2][L][0];
-				}
-
-				
-				if (job == 1) // 1 means calculate the derivative part.
-				{
-					if (distance20 > tiny2)
-					{
-						curr = tbeta.Table_NR[1][Tpair2][Opair2][L];
-   					
-						if( iqb >= rmesh2[nb]-4) 
-						{
-							Interp_Vnlc = 0.0;
-						}
-						else 
-						{
-							Interp_Vnlc = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-						}
-						Interp_Vnlc = Interp_Vnlc / pow(distance20, L) - Interp_Vnlb * L / distance20;
-					}
-					else 
-					{
-						Interp_Vnlc = 0.0;
-					}
-				}
-				
-				// sum up the second part.	
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexb = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L0, m0, L2, m2, L, m);
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex0, gindex2, gindexb);
-					const int lm = MGT.get_lm_index(L, m);
-					
-					switch (job)
-					{
-						case 0:// calculate the overlap part.
-						{
-							term_b += tmpGaunt * Interp_Vnlb * rlyb[lm];
-							break;
-						}
-						case 1: // calculate the derivative part.
-						{
-							double tt1 = tmpGaunt * Interp_Vnlc * rlyb[lm] / distance20;
-							double tt2 = tmpGaunt * Interp_Vnlb;
-										
-							for(int ir = 0; ir < 3; ir++)
-							{
-								term_c[ir] += tt1 * unit_vec_dRb[ir] 
-											+ tt2 * grlyb[lm][ir];
-							}
-
-							break;
-						}
-						default: break;
-					}
-				}// end m of SECOND PART
-			}// end L of SECOND PART
-		
-		
-			//added by zhengdy-soc, store them for soc case
-			if(has_so)
-			{
-				term_a_nc[ip] = term_a;
-				term_b_nc[ip] = term_b;
-			}
-		
-			//===============================================
-			// THIRD PART: SUM THE VALUE FROM ALL PROJECTS.
-			//===============================================
-			switch (job)
-			{
-				case 0://calculate the overlap part.
-				{
-					//nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-					if(!has_so) 
-					{
-						nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-					}
-					break;
-				}
-				case 1: //calculate the derivative part.
-				{
-					for(int jr = 0; jr < 3; jr++) 
-					{
-						//nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-						if(!has_so) 
-						{
-							nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-						}
-					}
-					break;
-				}
-				default: break;
-			}
-		}//!m0
-	}//!L0
-
-	//zhengdy-soc, calculate non-local term
-	if(has_so)
-	{
-		switch (job)
-		{
-			case 0://overlap part
-				for(int no=0;no<ORB.Beta[T0].get_count_soc(is);no++)
-				{
-					const int p1 = ORB.Beta[T0].get_index1_soc(is, no);
-					const int p2 = ORB.Beta[T0].get_index2_soc(is, no);
-					if(NSPIN==4 && nlm1!=NULL)
-					{
-						nlm1[is] += term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(is, p2, p1);
-					}
-					else if(NSPIN!=4)
-					{
-						nlm[0] += (term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(0, p2, p1)).real();
-					}
-					else
-					{
-						WARNING_QUIT("ORB_gen_tables::snap_psibeta","Conflict! Didn't count non-local part");
-					}
-				}
-				break;
-			case 1://need to be added later
-			{break;}
-			default: break;
-		}
-	}
-
-	delete[] calproj;
-	delete[] rmesh1;
-	delete[] rmesh2;
-
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-	return;
-}
-
-void ORB_gen_tables::snap_psipsi(
-	double olm[],
-	const int &job, //0, 1
-	const char &dtype, // derivative type: S or T
-	const Vector3<double> &R1,
-    const int &T1,
-    const int &L1,
-    const int &m1,
-    const int &N1,
-    const Vector3<double> &R2,
-    const int &T2,
-    const int &L2,
-    const int &m2,
-    const int &N2,
-	complex<double> *olm1)const
-{
-	//TITLE("ORB_gen_tables","snap_psipsi");
-	//timer::tick ("ORB_gen_tables", "snap_psipsi");
-	if(job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psipsi","job must be equal to 0 or 1!");
-	}
-	
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0>0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance()*this->lat0;
-	
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-
-	if(job == 0) ZEROS(olm, 1);
-	else if(job == 1) ZEROS(olm, 3);
-	
-	if( distance > (Rcut1 + Rcut2) ) return;
-	
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if(distance < tiny1) distance += tiny1;
-	
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->MOT.get_rmesh(Rcut1, Rcut2);
-	
-	// (3) Find three dimension of 'Table_S' or 'Table_T'
-	// dim1 : type pairs,
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = this->MOT.OV_Tpair(T1, T2);
-	const int dim3 = this->MOT.OV_L2plus1(T1, T2); //2*lmax+1
-	
-	int dim2;
-	if (T1 <= T2) dim2 = this->MOT.OV_Opair(dim1, L1, L2, N1, N2); 
-	else dim2 = this->MOT.OV_Opair(dim1, L2, L1, N2, N1);
-		
-	// Find the needed Ylm(dR) dimension 
-	const int nlm = dim3 * dim3; //(2lmax+1)*(2lmax+!)
-
-	//Gaunt Index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	assert(nlm < 400);
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;			
-	vector<vector<double>> grly;
-	
-//	double *ylm = new double[nlm];
-//	dR = R1 - R2;
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-	
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-	
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0) 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-//		Ylm::sph_harm (dim3-1, xdr, ydr, zdr, rly);
-		Ylm::rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-		Ylm::grad_rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	switch( dtype )
-	{
-		case 'S':
-		for (int L = 0; L < dim3; L++) //maxL = dim3-1
-		{
-			//===========================================================
-			// triangle rule for L and sum of L, L1, L2 should be even
-			//===========================================================
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-			double Interp_Slm = 0.0;
-			double Interp_dSlm = 0.0;
-			double tmpOlm0 = 0.0;
-			double tmpOlm1 = 0.0;
-			
-			// prefactor
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-			double rl = pow (distance, L);
-
-			if (distance > tiny2)
-			{
-				Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );
-				Interp_Slm /= rl;
-			}
-			else // distance = 0.0; 
-			{
-				Interp_Slm = i_exp * MOT.Table_SR[0][dim1][dim2][L][0];
-			}
-				
-			if (job == 1)//calculate the derivative.
-			{
-				if (distance > tiny2)
-				{
-					Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dSlm = Interp_dSlm / pow (distance, L) - Interp_Slm * L / distance;
-				}
-				else 
-				{
-					Interp_dSlm = 0.0;
-				}
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-	//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);	
-							
-				tmpOlm0 = Interp_Slm * tmpGaunt;
-	
-				if (job == 1) 
-				{
-					tmpOlm1 = Interp_dSlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0: // calculate overlap.
-					{	
-						if(NSPIN!=4) olm[0] += tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ;
-						else if(olm1!= NULL)
-						{
-							olm1[0] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							olm1[1] += 0;//tmpOlm0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpOlm0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong!");
-							
-						}
-					
-						/*		
-						if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-						{
-						cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0 
-						<< " rly=" << rly[ MGT.get_lm_index(L, m) ] 
-						<< " r=" << olm[0]
-						<< endl;
-						}
-						*/
-						break;
-					}
-					case 1: // calculate gradient.
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpOlm0 * grly[ MGT.get_lm_index(L, m) ][ir]
-									 + tmpOlm1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}//!m
-		}
-		break;
-
-		case 'T':
-		for (int L = 0; L < dim3; L++)
-		{
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-			double Interp_Tlm, Interp_dTlm, tmpKem0, tmpKem1;
-			Interp_Tlm = Interp_dTlm = tmpKem0 = tmpKem1 = 0.0;
-			
-			//pre-fac
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-
-			double rl = pow (distance, L);
-			if (distance > tiny2)
-			{
-				Interp_Tlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );	
-				Interp_Tlm /= rl;
-			}
-			else Interp_Tlm = i_exp * MOT.Table_TR[0][dim1][dim2][L][0];
-				
-			
-			if (job == 1)
-			{
-				if (distance > tiny2)
-				{
-					Interp_dTlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dTlm = Interp_dTlm / rl - Interp_Tlm * L / distance;
-				}
-				else Interp_dTlm = 0.0;
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-			//	double tmpGaunt = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);
-					
-				tmpKem0 = Interp_Tlm * tmpGaunt;
-				if (job == 1) 
-				{
-					tmpKem1 = Interp_dTlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0:
-					{
-						if(NSPIN!=4) olm[0] += tmpKem0 * rly[ MGT.get_lm_index(L, m) ];
-						else if(olm1 != NULL)
-						{
-							olm1[0] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-							olm1[1] += 0;//tmpKem0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpKem0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong in T.");
-						}
-						break;
-					}
-					case 1: 
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpKem0 * grly[ MGT.get_lm_index(L, m) ][ir]
-								    + tmpKem1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}// end T: m
-		}// end T: :
-		break;
-	}
-//	timer::tick ("ORB_gen_tables", "snap_psipsi");
-	return;
-}
-
-double ORB_gen_tables::get_distance( const Vector3<double> &R1, const Vector3<double> &R2)const
-{
-	assert( this->lat0 > 0.0);
-	Vector3<double> dR = R1 - R2;
-	return dR.norm() * this->lat0;	
-}
-
-//caoyu add 2021-03-17
-void ORB_gen_tables::snap_psialpha(
-	double olm[],
-	const int& job,
-	const Vector3<double>& R1,
-	const int& T1,
-	const int& L1,
-	const int& m1,
-	const int& N1,
-	const Vector3<double>& R2,
-	const int& T2,
-	const int& L2,
-	const int& m2,
-	const int& N2,
-	complex<double>* olm1,
-	const int is) const
-{
-
-	if (job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psialpha", "job must be equal to 0 or 1!");
-	}
-
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0 > 0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance() * this->lat0;
-
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Alpha[0].getRcut();
-
-	if (job == 0) ZEROS(olm, 1);
-	else if (job == 1) ZEROS(olm, 3);
-
-	if (distance > (Rcut1 + Rcut2)) return;
-
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if (distance < tiny1) distance += tiny1;
-
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->talpha.get_rmesh(Rcut1, Rcut2);
-
-	// (3) Find three dimension of 'Table_DS'
-	// dim1 : type pairs, equal to T1 here 
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = T1;
-	int dim2 = this->talpha.DS_Opair(dim1, L1, L2, N1, N2);
-	int dim3 = this->talpha.DS_2Lplus1[T1];
-
-	//Gaunt Index
-		const int gindex1 = L1 * L1 + m1;
-	const int gindex2 = L2 * L2 + m2;
-
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;
-	vector<vector<double>> grly;
-
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0)
-	{
-		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else
-	{
-		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	for (int L = 0; L < dim3; L++) //maxL = dim3-1
-	{
-		//===========================================================
-		// triangle rule for L and sum of L, L1, L2 should be even
-		//===========================================================
-		int AL = L1 + L2;
-		int SL = abs(L1 - L2);
-
-		if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1)) continue;
-
-		double Interp_Slm = 0.0;
-		double Interp_dSlm = 0.0;
-		double tmpOlm0 = 0.0;
-		double tmpOlm1 = 0.0;
-
-		// prefactor
-		double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-		double rl = pow(distance, L);
-
-		if (distance > tiny2)
-		{
-			Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-				talpha.Table_DSR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
-			Interp_Slm /= rl;
-		}
-		else // distance = 0.0; 
-		{
-			Interp_Slm = i_exp * talpha.Table_DSR[0][dim1][dim2][L][0];
-		}
-
-		if (job == 1)//calculate the derivative.
-		{
-			if (distance > tiny2)
-			{
-				Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-					talpha.Table_DSR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
-				Interp_dSlm = Interp_dSlm / pow(distance, L) - Interp_Slm * L / distance;
-			}
-			else
-			{
-				Interp_dSlm = 0.0;
-			}
-		}
-
-		for (int m = 0; m < 2 * L + 1; m++)
-		{
-			int gindex = L * L + m;
-			//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-			double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
-
-			tmpOlm0 = Interp_Slm * tmpGaunt;
-
-			if (job == 1)
-			{
-				tmpOlm1 = Interp_dSlm * tmpGaunt;
-			}
-
-			switch (job)
-			{
-			case 0: // calculate overlap.
-			{
-				if (NSPIN != 4) olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-				else if (olm1 != NULL)
-				{
-					olm1[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-					olm1[1] += 0;//tmpOlm0 * (tmp(0,0)+tmp(0,1));
-					olm1[2] += 0;//tmpOlm0 * (tmp(1,0)+tmp(1,1));
-					olm1[3] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-
-				}
-				else
-				{
-					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "something wrong!");
-
-				}
-
-				/*
-				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-				{
-				cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
-				<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
-				<< " r=" << olm[0]
-				<< endl;
-				}
-				*/
-				break;
-			}
-			case 1: // calculate gradient.
-			{
-				for (int ir = 0; ir < 3; ir++)
-				{
-					olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir]
-						+ tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
-				}
-				break;
-			}
-			default: break;
-			}
-		}//!m
-	}
-
-	return;
-}
+#include "src_pw/global.h"
+#include "ORB_read.h"
+#include "ORB_gen_tables.h"
+#include "src_global/ylm.h"
+
+// Global ORB_gen_tables instance (declared 'extern' in ORB_gen_tables.h).
+ORB_gen_tables UOT;
+
+ORB_gen_tables::ORB_gen_tables() {} // empty: nothing is set up at construction
+ORB_gen_tables::~ORB_gen_tables() {} // empty: nothing is released at destruction
+
+// Sets up MOT, tbeta, (optionally) talpha and the Gaunt tables MGT; call in hamilt_linear::init_before_ions.
+void ORB_gen_tables::gen_tables(
+	const int &job0, // forwarded unchanged to MOT.init_Table
+	LCAO_Orbitals &orb, // supplies ntype/lmax/kmesh/Rmax/dR/dk used to size every table below
+	const int &Lmax_exx) // forwarded unchanged to MOT.init_Table_Spherical_Bessel
+{
+	TITLE("ORB_gen_tables", "gen_tables");
+	timer::tick("ORB_gen_tables", "gen_tables", 'C');
+
+	ofs_running << "\n SETUP THE TWO-CENTER INTEGRATION TABLES" << endl;
+
+	//=========================================
+	// (1) MOT: make overlap table.
+	//=========================================
+	MOT.allocate(
+		orb.get_ntype(), // number of atom types
+		orb.get_lmax(),	 // max L used to calculate overlap
+		orb.get_kmesh(), // kpoints, for integration in k space
+		orb.get_Rmax(),	 // max value of radial table
+		orb.get_dR(),	 // delta R, for making radial table
+		orb.get_dk());	 // delta k, for integration in k space
+
+	tbeta.allocate(
+		orb.get_ntype(), // number of atom types
+		orb.get_lmax(),	 // max L used to calculate overlap
+		orb.get_kmesh(), // kpoints, for integration in k space
+		orb.get_Rmax(),	 // max value of radial table
+		orb.get_dR(),	 // delta R, for making radial table
+		orb.get_dk());	 // delta k, for integration in k space
+
+	//caoyu add 2021-03-18: the descriptor table is only touched when out_descriptor is set with an lcao basis
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.allocate(
+			orb.get_ntype(), // number of atom types
+			orb.get_lmax(),	 // max L used to calculate overlap
+			orb.get_kmesh(), // kpoints, for integration in k space
+			orb.get_Rmax(),	 // max value of radial table
+			orb.get_dR(),	 // delta R, for making radial table
+			orb.get_dk());	 // delta k, for integration in k space
+	}
+
+	// OV: overlap
+	MOT.init_OV_Tpair(orb);
+	MOT.init_OV_Opair(orb);
+
+	// NL: nonlocal
+	tbeta.init_NL_Tpair();
+	tbeta.init_NL_Opair(orb); // add 2009-5-8
+
+	//caoyu add 2021-03-18
+	// DS: Descriptor
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.init_DS_Opair();
+		talpha.init_DS_2Lplus1();
+	}
+
+	//=========================================
+	// (2) init Ylm Coef
+	//=========================================
+	//liaochen add 2010/4/29
+	Ylm::set_coefficients();
+
+	// PLEASE add explanations for all options of 'orb_num' and 'mode'
+	// mohan add 2021-04-03
+	// Peize Lin update 2016-01-26
+	int orb_num = 2; // meaning of this option is not documented here (see the request above)
+	int mode = 1;	 // 1: <phi|phi> and <phi|beta>
+	int Lmax_used = 0; // NOTE(review): presumably filled in by init_Table_Spherical_Bessel; it is read in step (3) - confirm
+	int Lmax = 0; // NOTE(review): only passed to init_Table_Spherical_Bessel, not read afterwards in this function
+
+	MOT.init_Table_Spherical_Bessel(orb_num, mode, Lmax_used, Lmax, Lmax_exx);
+
+	//calculate S(R) for interpolation
+	MOT.init_Table(job0, orb);
+	tbeta.init_Table_Beta(MOT.pSB); // add 2009-5-8
+
+	//caoyu add 2021-03-18: the descriptor table is built from the same MOT.pSB as tbeta
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.init_Table_Alpha(MOT.pSB);
+		//talpha.print_Table_DSR();
+	}
+
+	//=========================================
+	// (3) make Gaunt coefficients table
+	//=========================================
+
+	const int lmax = (Lmax_used - 1) / 2; // integer division
+	//MGT.init_Ylm_Gaunt(orb.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
+	MGT.init_Gaunt_CH(lmax);
+	//MGT.init_Gaunt(orb.get_lmax()+1);
+	MGT.init_Gaunt(lmax);
+
+	timer::tick("ORB_gen_tables", "gen_tables", 'C');
+	return;
+}
+
+void ORB_gen_tables::snap_psibeta( // adds sum over projectors of <psi1|beta> D <beta|psi2> (job 0) or its derivative part (job 1)
+	double nlm[], // output, accumulated with += and never zeroed here: nlm[0] for job 0, nlm[0..2] for job 1
+	const int &job, // 0: overlap part, 1: derivative part; other values add nothing
+	const Vector3<double> &R1, // position of orbital 1 (multiplied by lat0 below)
+	const int &T1, // type index of orbital 1 (indexes ORB.Phi)
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2, // position of orbital 2 (multiplied by lat0 below)
+	const int &T2, // type index of orbital 2 (indexes ORB.Phi)
+	const int &L2,
+	const int &m2,
+	const int &N2,
+	const Vector3<double> &R0, // The projector position.
+	const int &T0, // type index of the projector (indexes ORB.Beta and ORB.nproj)
+	complex<double> *nlm1, // output for NSPIN == 4 with spin-orbit data: nlm1[is] is accumulated instead of nlm
+	const int is) const // index used for the spin-orbit bookkeeping (get_count_soc(is), nlm1[is])
+{
+	//TITLE ("ORB_gen_tables","snap_psibeta");
+
+	//optimized by zhengdy-soc: nothing to add for this 'is' when it has no spin-orbit entries
+	if (NSPIN == 4 && ORB.Beta[T0].get_count_soc(is) == 0)
+	{
+		return;
+	}
+
+	timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+
+	bool has_so = 0;
+	if (ORB.Beta[T0].get_count_soc(0) > 0)
+		has_so = 1;
+
+	const int nproj = ORB.nproj[T0];
+	bool *calproj = new bool[nproj]; // per projector: true when both orbitals are within range of it
+	int *rmesh1 = new int[nproj]; // table length for <psi1|beta>, only set where calproj is true
+	int *rmesh2 = new int[nproj]; // table length for <psi2|beta>, only set where calproj is true
+
+	//rcut of orbitals and projectors
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Phi[T2].getRcut();
+
+	//in our calculation, we always put orbital phi at the left side of <phi|beta>
+	//because <phi|beta> = <beta|phi>
+	const Vector3<double> dRa = (R0 - R1) * this->lat0;
+	const Vector3<double> dRb = (R0 - R2) * this->lat0;
+
+	double distance10 = dRa.norm();
+	double distance20 = dRb.norm();
+
+	// mohan add 2011-03-10
+	// because the table length is different according to each length
+	// of projector, so sometimes some shorter projectors need not be
+	// calculated.
+	bool all_out = true;
+	for (int ip = 0; ip < nproj; ip++)
+	{
+		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
+		if (distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0))
+		{
+			calproj[ip] = false;
+		}
+		else
+		{
+			all_out = false;
+			calproj[ip] = true;
+			//length of table for interpolation
+			rmesh1[ip] = tbeta.get_rmesh(Rcut1, Rcut0);
+			rmesh2[ip] = tbeta.get_rmesh(Rcut2, Rcut0);
+		}
+	}
+
+	if (all_out) // no projector is in range of both orbitals: release the scratch arrays and leave nlm untouched
+	{
+		delete[] calproj;
+		delete[] rmesh1;
+		delete[] rmesh2;
+		timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+		return;
+	}
+
+	//FOR INTERPOLATION: weights of a 4-point (cubic Lagrange) formula on the uniform grid with step tbeta.dr
+	double *curr; //current pointer
+	int iqa, iqb;
+	double psa, psb;
+	double x0a, x1a, x2a, x3a, x123a, x120a, x032a, x031a;
+	double x0b, x1b, x2b, x3b, x123b, x120b, x032b, x031b;
+
+	psa = distance10 / tbeta.dr;
+	iqa = static_cast<int>(psa);
+	x0a = psa - static_cast<double>(iqa);
+	x1a = 1.0 - x0a;
+	x2a = 2.0 - x0a;
+	x3a = 3.0 - x0a;
+	x123a = x1a * x2a * x3a / 6.0;
+	x120a = x1a * x2a * x0a / 6.0;
+	x032a = x0a * x3a * x2a / 2.0;
+	x031a = x0a * x3a * x1a / 2.0;
+
+	psb = distance20 / tbeta.dr;
+	iqb = (int)psb;
+	x0b = psb - (double)iqb;
+	x1b = 1.0 - x0b;
+	x2b = 2.0 - x0b;
+	x3b = 3.0 - x0b;
+	x123b = x1b * x2b * x3b / 6.0;
+	x120b = x1b * x2b * x0b / 6.0;
+	x032b = x0b * x3b * x2b / 2.0;
+	x031b = x0b * x3b * x1b / 2.0;
+
+	//UNIT VECTOR (stored unnormalized; the division by distance20 is done where it is used)
+
+	//double unit_vec_dRa[3];
+	//unit_vec_dRa[0] = dRa.x;
+	//unit_vec_dRa[1] = dRa.y;
+	//unit_vec_dRa[2] = dRa.z;
+
+	double unit_vec_dRb[3];
+	unit_vec_dRb[0] = dRb.x;
+	unit_vec_dRb[1] = dRb.y;
+	unit_vec_dRb[2] = dRb.z;
+
+	//special case for R = 0;
+	const double tiny1 = 1e-12; // distances below this are nudged up by tiny1
+	const double tiny2 = 1e-10; // distances at or below this use table entry [0] instead of interpolating
+
+	if (distance10 < tiny1)
+		distance10 += tiny1;
+	if (distance20 < tiny1)
+		distance20 += tiny1;
+
+	// Find three dimension of 'Table_NR'
+	// Notice!!! T1 must be orbital,
+	// T0 must be nonlocal orbital
+	// usage : pairs_nonlocal_type(T1 : orbital, T0 : projector);
+	const int Tpair1 = tbeta.NL_Tpair(T1, T0);
+	const int Tpair2 = tbeta.NL_Tpair(T2, T0);
+	const int T1_2Lplus1 = tbeta.NL_L2plus1(T1, T0);
+	const int T2_2Lplus1 = tbeta.NL_L2plus1(T2, T0);
+
+	//gaunt index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	// Peize Lin change rlya, rlyb, grlyb 2016-08-26
+	vector<double> rlya;
+	vector<double> rlyb;
+	vector<vector<double>> grlyb; // only filled for job != 0
+
+	Ylm::rl_sph_harm(T1_2Lplus1 - 1, dRa.x, dRa.y, dRa.z, rlya);
+	if (job == 0)
+	{
+		Ylm::rl_sph_harm(T2_2Lplus1 - 1, dRb.x, dRb.y, dRb.z, rlyb);
+	}
+	else
+	{
+		Ylm::grad_rl_sph_harm(T2_2Lplus1 - 1, dRb.x, dRb.y, dRb.z, rlyb, grlyb);
+	}
+	//==============================================================================
+	// Formula :                         T1       T0          T0        T2
+	// sum_{L0}sum_{m0}
+	// 			D_{L0,L0} <psi1_{L1,N1}|Beta_{L0,m0}><Beta_{L0,m0}|psi2_{L2,N2}>
+	//==============================================================================
+	//double v = 0.0;
+
+	// mohan update 2011-03-07
+	int n_projection = 1;
+	if (has_so)
+	{
+		n_projection = ORB.Beta[T0].get_nproj_soc();
+	}
+
+	vector<complex<double>> term_a_nc(n_projection, {0, 0}); // Peize Lin change ptr to vector at 2020.01.31
+	vector<complex<double>> term_b_nc(n_projection, {0, 0}); // Peize Lin change ptr to vector at 2020.01.31
+	int ip = -1; // running index over (projector, m0) pairs; only advanced for projectors with calproj == true
+
+	for (int nb = 0; nb < nproj; nb++)
+	{
+		if (!calproj[nb])
+			continue;
+
+		const int L0 = ORB.Beta[T0].getL_Beta(nb);
+		//const int next_ip = 2* L0 +1;
+
+		//-------------------------------------------------------------------
+		// move iterations for psi1 and psi2 from cal_fvnl_dbeta
+		// to here --- 2021/03/20 mohan chen
+		//-------------------------------------------------------------------
+
+		// <psi1 | Beta>
+		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb);
+		// <psi2 | Beta>
+		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb);
+
+		for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
+		{
+			++ip;
+			int gindex0 = L0 * L0 + m0;
+
+			//loop of {lmn}
+			double term_a = 0.0; // <psi1|beta> for this (nb, m0)
+			double term_b = 0.0; // <beta|psi2> for this (nb, m0), job 0 only
+			double term_c[3] = {0, 0, 0}; // derivative counterpart of term_b, job 1 only
+
+			//=============
+			// FIRST PART
+			//=============
+			for (int L = 0; L < T1_2Lplus1; L++)
+			{
+				//triangle rule for gaunt coefficients
+				int AL = L1 + L0;
+				int SL = abs(L1 - L0);
+				if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+					continue;
+
+				//prefac = (i)^{lphi - lbeta - l}
+				//R0-R1 ==> <phi|beta>
+				double i_exp = pow(-1.0, (L1 - L0 - L) / 2);
+				double rl1 = pow(distance10, L);
+				double Interp_Vnla = 0.0;
+				if (distance10 > tiny2)
+				{
+					curr = tbeta.Table_NR[0][Tpair1][Opair1][L];
+					if (iqa >= rmesh1[nb] - 4) // the formula below reads curr[iqa] .. curr[iqa + 3]; use zero near the table end
+					{
+						Interp_Vnla = 0.0;
+					}
+					else
+					{
+						Interp_Vnla = i_exp * (x123a * curr[iqa] + x120a * curr[iqa + 3] + x032a * curr[iqa + 1] - x031a * curr[iqa + 2]);
+					}
+					Interp_Vnla /= rl1;
+				}
+				else
+				{
+					Interp_Vnla = i_exp * tbeta.Table_NR[0][Tpair1][Opair1][L][0];
+				}
+
+				//------------------------------------------
+				//  Overlap value = S_from_table * G * Ylm
+				//------------------------------------------
+				for (int m = 0; m < 2 * L + 1; m++)
+				{
+					int gindexa = L * L + m;
+					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m);
+					double tmpGaunt = this->MGT.Gaunt_Coefficients(gindex1, gindex0, gindexa);
+					term_a += tmpGaunt * Interp_Vnla * rlya[MGT.get_lm_index(L, m)];
+				}
+			} //end L
+
+			//=============
+			// SECOND PART
+			//=============
+			for (int L = 0; L < T2_2Lplus1; L++)
+			{
+				//triangle rule for gaunt coefficients
+				int AL = L2 + L0;
+				int SL = abs(L2 - L0);
+				if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+					continue;
+
+				double Interp_Vnlb = 0.0;
+				double Interp_Vnlc = 0.0;
+
+				//prefac
+				double i_exp = pow(-1.0, (L2 - L0 - L) / 2);
+				double rl2 = pow(distance20, L);
+				if (distance20 > tiny2)
+				{
+					curr = tbeta.Table_NR[0][Tpair2][Opair2][L];
+
+					if (iqb >= rmesh2[nb] - 4) // same end-of-table guard as in the first part
+					{
+						Interp_Vnlb = 0.0;
+					}
+					else
+					{
+						Interp_Vnlb = i_exp * (x123b * curr[iqb] + x120b * curr[iqb + 3] + x032b * curr[iqb + 1] - curr[iqb + 2] * x031b);
+					}
+
+					Interp_Vnlb /= rl2;
+				}
+				else
+				{
+					Interp_Vnlb = i_exp * tbeta.Table_NR[0][Tpair2][Opair2][L][0];
+				}
+
+				if (job == 1) // 1 means calculate the derivative part.
+				{
+					if (distance20 > tiny2)
+					{
+						curr = tbeta.Table_NR[1][Tpair2][Opair2][L]; // first index 1 instead of 0: table used for the derivative
+
+						if (iqb >= rmesh2[nb] - 4)
+						{
+							Interp_Vnlc = 0.0;
+						}
+						else
+						{
+							Interp_Vnlc = i_exp * (x123b * curr[iqb] + x120b * curr[iqb + 3] + x032b * curr[iqb + 1] - curr[iqb + 2] * x031b);
+						}
+						Interp_Vnlc = Interp_Vnlc / pow(distance20, L) - Interp_Vnlb * L / distance20;
+					}
+					else
+					{
+						Interp_Vnlc = 0.0;
+					}
+				}
+
+				// sum up the second part.
+				for (int m = 0; m < 2 * L + 1; m++)
+				{
+					int gindexb = L * L + m;
+					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L0, m0, L2, m2, L, m);
+					double tmpGaunt = this->MGT.Gaunt_Coefficients(gindex0, gindex2, gindexb);
+					const int lm = MGT.get_lm_index(L, m);
+
+					switch (job)
+					{
+					case 0: // calculate the overlap part.
+					{
+						term_b += tmpGaunt * Interp_Vnlb * rlyb[lm];
+						break;
+					}
+					case 1: // calculate the derivative part.
+					{
+						double tt1 = tmpGaunt * Interp_Vnlc * rlyb[lm] / distance20; // division that normalizes unit_vec_dRb below
+						double tt2 = tmpGaunt * Interp_Vnlb;
+
+						for (int ir = 0; ir < 3; ir++)
+						{
+							term_c[ir] += tt1 * unit_vec_dRb[ir] + tt2 * grlyb[lm][ir];
+						}
+
+						break;
+					}
+					default:
+						break;
+					}
+				} // end m of SECOND PART
+			}	  // end L of SECOND PART
+
+			//added by zhengdy-soc, store them for soc case
+			if (has_so)
+			{
+				term_a_nc[ip] = term_a;
+				term_b_nc[ip] = term_b;
+			}
+
+			//===============================================
+			// THIRD PART: SUM THE VALUE FROM ALL PROJECTS.
+			//===============================================
+			switch (job)
+			{
+			case 0: //calculate the overlap part.
+			{
+				//nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
+				if (!has_so)
+				{
+					nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(nb, nb); //LiuXh 2016-01-14
+				}
+				break;
+			}
+			case 1: //calculate the derivative part.
+			{
+				for (int jr = 0; jr < 3; jr++)
+				{
+					//nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
+					if (!has_so)
+					{
+						nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(nb, nb); //LiuXh 2016-01-14
+					}
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		} //!m0
+	}	  //!nb
+
+	//zhengdy-soc, calculate non-local term
+	if (has_so)
+	{
+		switch (job)
+		{
+		case 0: //overlap part
+			for (int no = 0; no < ORB.Beta[T0].get_count_soc(is); no++)
+			{
+				const int p1 = ORB.Beta[T0].get_index1_soc(is, no);
+				const int p2 = ORB.Beta[T0].get_index2_soc(is, no);
+				if (NSPIN == 4 && nlm1 != NULL)
+				{
+					nlm1[is] += term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(is, p2, p1);
+				}
+				else if (NSPIN != 4)
+				{
+					nlm[0] += (term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(0, p2, p1)).real();
+				}
+				else
+				{
+					WARNING_QUIT("ORB_gen_tables::snap_psibeta", "Conflict! Didn't count non-local part");
+				}
+			}
+			break;
+		case 1: //need to be added later: with has_so set, job 1 currently adds nothing to nlm
+		{
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+	delete[] calproj;
+	delete[] rmesh1;
+	delete[] rmesh2;
+
+	timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+	return;
+}
+
+void ORB_gen_tables::snap_psipsi( // two-center integral between psi1 and psi2 from MOT.Table_SR ('S') or MOT.Table_TR ('T'), or its gradient
+	double olm[], // output: reset via ZEROS first, then olm[0] (job 0) or olm[0..2] (job 1) is accumulated
+	const int &job,	   //0: value, 1: gradient; anything else ends in WARNING_QUIT
+	const char &dtype, // derivative type: S or T
+	const Vector3<double> &R1,
+	const int &T1,
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2,
+	const int &T2,
+	const int &L2,
+	const int &m2,
+	const int &N2,
+	complex<double> *olm1) const // job 0 with NSPIN == 4: olm1[0] and olm1[3] are accumulated instead of olm[0]
+{
+	//TITLE("ORB_gen_tables","snap_psipsi");
+	//timer::tick ("ORB_gen_tables", "snap_psipsi");
+	if (job != 0 && job != 1)
+	{
+		WARNING_QUIT("ORB_gen_tables::snap_psipsi", "job must be equal to 0 or 1!");
+	}
+
+	Numerical_Orbital::set_position(R1, R2);
+	assert(this->lat0 > 0.0);
+
+	// (1) get distance between R1 and R2 (a.u.)
+	// judge if there exist overlap
+	double distance = Numerical_Orbital::get_distance() * this->lat0;
+
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Phi[T2].getRcut();
+
+	if (job == 0)
+		ZEROS(olm, 1);
+	else if (job == 1)
+		ZEROS(olm, 3);
+
+	if (distance > (Rcut1 + Rcut2)) // beyond the sum of both cutoffs: return with olm as reset above
+		return;
+
+	//if distance == 0
+	//\int psi(r) psi(r-R) dr independent of R if R == 0
+	//distance += tiny1 avoid overflow during calculation
+	const double tiny1 = 1e-12;
+	const double tiny2 = 1e-10; // distances at or below this use table entry [0] instead of interpolating
+	if (distance < tiny1)
+		distance += tiny1;
+
+	// (2) if there exist overlap, calculate the mesh number
+	// between two atoms
+	const int rmesh = this->MOT.get_rmesh(Rcut1, Rcut2);
+
+	// (3) Find three dimension of 'Table_S' or 'Table_T'
+	// dim1 : type pairs,
+	// dim2 : radial orbital pairs,
+	// dim3 : find lmax between T1 and T2, and get lmax*2+1
+	const int dim1 = this->MOT.OV_Tpair(T1, T2);
+	const int dim3 = this->MOT.OV_L2plus1(T1, T2); //2*lmax+1
+
+	int dim2;
+	if (T1 <= T2) // the (L, N) arguments are swapped when T1 > T2
+		dim2 = this->MOT.OV_Opair(dim1, L1, L2, N1, N2);
+	else
+		dim2 = this->MOT.OV_Opair(dim1, L2, L1, N2, N1);
+
+	// Find the needed Ylm(dR) dimension
+	const int nlm = dim3 * dim3; //(2lmax+1)*(2lmax+1)
+
+	//Gaunt Index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	assert(nlm < 400); // nlm is only used for this sanity check
+	// Peize Lin change rly, grly 2016-08-26
+	vector<double> rly;
+	vector<vector<double>> grly; // only filled for job 1
+
+	//	double *ylm = new double[nlm];
+	//	dR = R1 - R2;
+	double arr_dR[3];
+	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
+	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
+	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
+
+	//double xdr = arr_dR[0] / distance;
+	//double ydr = arr_dR[1] / distance;
+	//double zdr = arr_dR[2] / distance;
+
+	//=======================
+	// *r**l*Ylm_real
+	// include its derivations
+	//=======================
+	if (job == 0)
+	{
+		//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+		//		Ylm::sph_harm (dim3-1, xdr, ydr, zdr, rly);
+		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+	}
+	else
+	{
+		//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+	}
+
+	switch (dtype) // no default case: any dtype other than 'S' or 'T' leaves olm as reset above
+	{
+	case 'S':
+		for (int L = 0; L < dim3; L++) //maxL = dim3-1
+		{
+			//===========================================================
+			// triangle rule for L and sum of L, L1, L2 should be even
+			//===========================================================
+			int AL = L1 + L2;
+			int SL = abs(L1 - L2);
+
+			if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+				continue;
+
+			double Interp_Slm = 0.0;
+			double Interp_dSlm = 0.0;
+			double tmpOlm0 = 0.0;
+			double tmpOlm1 = 0.0;
+
+			// prefactor
+			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+			double rl = pow(distance, L);
+
+			if (distance > tiny2)
+			{
+				Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
+										 MOT.Table_SR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
+				Interp_Slm /= rl;
+			}
+			else // distance = 0.0;
+			{
+				Interp_Slm = i_exp * MOT.Table_SR[0][dim1][dim2][L][0];
+			}
+
+			if (job == 1) //calculate the derivative.
+			{
+				if (distance > tiny2)
+				{
+					Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
+											  MOT.Table_SR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
+					Interp_dSlm = Interp_dSlm / pow(distance, L) - Interp_Slm * L / distance;
+				}
+				else
+				{
+					Interp_dSlm = 0.0;
+				}
+			}
+
+			for (int m = 0; m < 2 * L + 1; m++)
+			{
+				int gindex = L * L + m;
+				//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+				double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+				tmpOlm0 = Interp_Slm * tmpGaunt;
+
+				if (job == 1)
+				{
+					tmpOlm1 = Interp_dSlm * tmpGaunt;
+				}
+
+				switch (job)
+				{
+				case 0: // calculate overlap.
+				{
+					if (NSPIN != 4)
+						olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+					else if (olm1 != NULL)
+					{
+						olm1[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+						olm1[1] += 0; //tmpOlm0 * (tmp(0,0)+tmp(0,1));
+						olm1[2] += 0; //tmpOlm0 * (tmp(1,0)+tmp(1,1));
+						olm1[3] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+					}
+					else
+					{
+						WARNING_QUIT("ORB_gen_tables::snap_psipsi", "something wrong!");
+					}
+
+					/*
+						if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
+						{
+						cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
+						<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
+						<< " r=" << olm[0]
+						<< endl;
+						}
+						*/
+					break;
+				}
+				case 1: // calculate gradient.
+				{
+					for (int ir = 0; ir < 3; ir++)
+					{
+						olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir] + tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+					}
+					break;
+				}
+				default:
+					break;
+				}
+			} //!m
+		}
+		break;
+
+	case 'T': // same scheme as case 'S', reading MOT.Table_TR instead of MOT.Table_SR
+		for (int L = 0; L < dim3; L++)
+		{
+			int AL = L1 + L2;
+			int SL = abs(L1 - L2);
+
+			if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+				continue;
+
+			double Interp_Tlm, Interp_dTlm, tmpKem0, tmpKem1;
+			Interp_Tlm = Interp_dTlm = tmpKem0 = tmpKem1 = 0.0;
+
+			//pre-fac
+			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+
+			double rl = pow(distance, L);
+			if (distance > tiny2)
+			{
+				Interp_Tlm = i_exp * Mathzone::Polynomial_Interpolation(
+										 MOT.Table_TR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
+				Interp_Tlm /= rl;
+			}
+			else
+				Interp_Tlm = i_exp * MOT.Table_TR[0][dim1][dim2][L][0];
+
+			if (job == 1)
+			{
+				if (distance > tiny2)
+				{
+					Interp_dTlm = i_exp * Mathzone::Polynomial_Interpolation(
+											  MOT.Table_TR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
+					Interp_dTlm = Interp_dTlm / rl - Interp_Tlm * L / distance;
+				}
+				else
+					Interp_dTlm = 0.0;
+			}
+
+			for (int m = 0; m < 2 * L + 1; m++)
+			{
+				int gindex = L * L + m;
+				//	double tmpGaunt = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+				double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+				tmpKem0 = Interp_Tlm * tmpGaunt;
+				if (job == 1)
+				{
+					tmpKem1 = Interp_dTlm * tmpGaunt;
+				}
+
+				switch (job)
+				{
+				case 0:
+				{
+					if (NSPIN != 4)
+						olm[0] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+					else if (olm1 != NULL)
+					{
+						olm1[0] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+						olm1[1] += 0; //tmpKem0 * (tmp(0,0)+tmp(0,1));
+						olm1[2] += 0; //tmpKem0 * (tmp(1,0)+tmp(1,1));
+						olm1[3] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+					}
+					else
+					{
+						WARNING_QUIT("ORB_gen_tables::snap_psipsi", "something wrong in T.");
+					}
+					break;
+				}
+				case 1:
+				{
+					for (int ir = 0; ir < 3; ir++)
+					{
+						olm[ir] += tmpKem0 * grly[MGT.get_lm_index(L, m)][ir] + tmpKem1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+					}
+					break;
+				}
+				default:
+					break;
+				}
+			} // end T: m
+		}	  // end T: L
+		break;
+	}
+	//	timer::tick ("ORB_gen_tables", "snap_psipsi");
+	return;
+}
+
+double ORB_gen_tables::get_distance(const Vector3<double> &R1, const Vector3<double> &R2) const
+{ // length of (R1 - R2) multiplied by lat0; lat0 must already be positive
+	assert(this->lat0 > 0.0);
+	const Vector3<double> separation = R1 - R2;
+	return this->lat0 * separation.norm();
+}
+
+//caoyu add 2021-03-17: same scheme as the 'S' branch of snap_psipsi, reading talpha.Table_DSR
+void ORB_gen_tables::snap_psialpha( // overlap (job 0) or gradient (job 1) between orbital psi1 and a descriptor function
+	double olm[], // output: reset via ZEROS first, then olm[0] (job 0) or olm[0..2] (job 1) is accumulated
+	const int &job, // 0: value, 1: gradient; anything else ends in WARNING_QUIT
+	const Vector3<double> &R1,
+	const int &T1, // type index of orbital 1; also used directly as the first table index (dim1)
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2,
+	const int &T2, // not read anywhere in this function
+	const int &L2,
+	const int &m2,
+	const int &N2) const
+{
+
+	if (job != 0 && job != 1)
+	{
+		WARNING_QUIT("ORB_gen_tables::snap_psialpha", "job must be equal to 0 or 1!");
+	}
+
+	Numerical_Orbital::set_position(R1, R2);
+	assert(this->lat0 > 0.0);
+
+	// (1) get distance between R1 and R2 (a.u.)
+	// judge if there exist overlap
+	double distance = Numerical_Orbital::get_distance() * this->lat0;
+
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Alpha[0].getRcut(); // always entry 0 of ORB.Alpha, independent of T2
+
+	if (job == 0)
+		ZEROS(olm, 1);
+	else if (job == 1)
+		ZEROS(olm, 3);
+
+	if (distance > (Rcut1 + Rcut2)) // beyond the sum of both cutoffs: return with olm as reset above
+		return;
+
+	//if distance == 0
+	//\int psi(r) psi(r-R) dr independent of R if R == 0
+	//distance += tiny1 avoid overflow during calculation
+	const double tiny1 = 1e-12;
+	const double tiny2 = 1e-10; // distances at or below this use table entry [0] instead of interpolating
+	if (distance < tiny1)
+		distance += tiny1;
+
+	// (2) if there exist overlap, calculate the mesh number
+	// between two atoms
+	const int rmesh = this->talpha.get_rmesh(Rcut1, Rcut2);
+
+	// (3) Find three dimension of 'Table_DS'
+	// dim1 : type pairs, equal to T1 here
+	// dim2 : radial orbital pairs,
+	// dim3 : talpha.DS_2Lplus1[T1], used as the exclusive upper bound of the L loop
+	const int dim1 = T1;
+	int dim2 = this->talpha.DS_Opair(dim1, L1, L2, N1, N2);
+	int dim3 = this->talpha.DS_2Lplus1[T1];
+
+	//Gaunt Index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	// Peize Lin change rly, grly 2016-08-26
+	vector<double> rly;
+	vector<vector<double>> grly; // only filled for job 1
+
+	double arr_dR[3];
+	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
+	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
+	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
+
+	//double xdr = arr_dR[0] / distance;
+	//double ydr = arr_dR[1] / distance;
+	//double zdr = arr_dR[2] / distance;
+
+	//=======================
+	// *r**l*Ylm_real
+	// include its derivations
+	//=======================
+	if (job == 0)
+	{
+		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+	}
+	else
+	{
+		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+	}
+
+	for (int L = 0; L < dim3; L++) //maxL = dim3-1
+	{
+		//===========================================================
+		// triangle rule for L and sum of L, L1, L2 should be even
+		//===========================================================
+		int AL = L1 + L2;
+		int SL = abs(L1 - L2);
+
+		if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+			continue;
+
+		double Interp_Slm = 0.0;
+		double Interp_dSlm = 0.0;
+		double tmpOlm0 = 0.0;
+		double tmpOlm1 = 0.0;
+
+		// prefactor
+		double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+		double rl = pow(distance, L);
+
+		if (distance > tiny2)
+		{
+			Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
+									 talpha.Table_DSR[0][dim1][dim2][L], rmesh, MOT.dr, distance); // NOTE(review): step is MOT.dr although the table is talpha's; gen_tables allocates both with orb.get_dR()
+			Interp_Slm /= rl;
+		}
+		else // distance = 0.0;
+		{
+			Interp_Slm = i_exp * talpha.Table_DSR[0][dim1][dim2][L][0];
+		}
+
+		if (job == 1) //calculate the derivative.
+		{
+			if (distance > tiny2)
+			{
+				Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
+										  talpha.Table_DSR[1][dim1][dim2][L], rmesh, MOT.dr, distance); // first index 1 instead of 0: table used for the derivative
+				Interp_dSlm = Interp_dSlm / pow(distance, L) - Interp_Slm * L / distance;
+			}
+			else
+			{
+				Interp_dSlm = 0.0;
+			}
+		}
+
+		for (int m = 0; m < 2 * L + 1; m++)
+		{
+			int gindex = L * L + m;
+			//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+			double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+			tmpOlm0 = Interp_Slm * tmpGaunt;
+
+			if (job == 1)
+			{
+				tmpOlm1 = Interp_dSlm * tmpGaunt;
+			}
+
+			switch (job)
+			{
+			case 0: // calculate overlap.
+			{
+				if (NSPIN != 4)
+					olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+				else // NSPIN == 4 is the only case rejected here
+				{
+					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "deepks with NSPIN>1 has not implemented yet!");
+				}
+				/*
+				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
+				{
+				cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
+				<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
+				<< " r=" << olm[0]
+				<< endl;
+				}
+				*/
+				break;
+			}
+			case 1: // calculate gradient (no NSPIN check on this path).
+			{
+				for (int ir = 0; ir < 3; ir++)
+				{
+					olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir] + tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		} //!m
+	}
+
+	return;
+}
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 487cf48ead..0e025875fc 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -1,16 +1,15 @@
-//=========================================================
-//AUTHOR : Mohan 
-//DATE : 2009-04-22
-//=========================================================
-#ifndef USE_OVERLAP_TABLE_H
-#define USE_OVERLAP_TABLE_H
-
-#include "src_pw/tools.h"
+#ifndef ORB_GEN_TABLES_H
+#define ORB_GEN_TABLES_H
+
+//#include "../src_pw/tools.h"
+//#include "../src_global/ylm.h"
+
 #include "ORB_gaunt_table.h"
-#include "src_global/ylm.h"
 #include "ORB_table_beta.h"
 #include "ORB_table_phi.h"
 #include "ORB_table_alpha.h"		//caoyu add 2020-3-18
+#include "ORB_read.h"
+#include "../src_global/vector3.h"
 
 //------------------------------------
 // used to be 'Use_Overlap_Table',
@@ -25,7 +24,11 @@ class ORB_gen_tables
 	ORB_gen_tables();
 	~ORB_gen_tables();
 
-	void gen_tables( const int &job0 );
+	void gen_tables( 
+		const int &job0, 
+		LCAO_Orbitals &orb,
+		const int &Lmax_exx);
+
 	void set_unit( const double &v ){lat0=v;}
 	
 	void snap_psipsi(
@@ -79,9 +82,7 @@ class ORB_gen_tables
 		const int& I2,
 		const int& l2,
 		const int& m2,
-		const int& n2,
-		complex<double>* olm1 = NULL,
-		const int is = 0)const;
+		const int& n2)const;
 
 	// set as public because in hamilt_linear, 
 	// we need to destroy the tables: SR,TR,NR
@@ -103,6 +104,8 @@ class ORB_gen_tables
 
 };
 
+// PLEASE try to get rid of UOT, which is a global variable
+// mohan add 2021-03-30
 extern ORB_gen_tables UOT;
 
 #endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
index bf60401f0e..ec3a798f41 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
@@ -3,6 +3,8 @@
 //DATE : 2008-03-04
 //=========================================================
 #include "ORB_nonlocal.h"
+#include "../src_global/global_function.h"
+#include "../src_global/constants.h"
 
 Numerical_Nonlocal::Numerical_Nonlocal()
 {
@@ -41,6 +43,7 @@ void Numerical_Nonlocal::set_type_info
 	const bool has_so
 )
 {
+	// PLEASE take care of this warning
 	if (type_in < 0 || type_in > 2)
 	{
 		WARNING("Numerical_Nonlocal", "bad input of type_in: not ready yet for type >2");
@@ -56,47 +59,35 @@ void Numerical_Nonlocal::set_type_info
 	}
 
 	this->lmax = lmax_in;
-//----------------------------------------------------------
-//EXPLAIN : Coefficient D used in calculate elements of NLps
-//----------------------------------------------------------
-/*2016-07-19, LiuXh
-	this->Coefficient_D.create( lmax_in+1, lmax_in+1);
-	for (int L1 = 0; L1 < lmax + 1; L1++)
-	{
-		for (int L2 = 0; L2 < lmax + 1; L2++)
-		{
-			this->Coefficient_D(L1, L2) = Coefficient_D_in(L1, L2);
-		}
-	}
-2016-07-19, LiuXh*/
 
 //----------------------------------------------------------
 //EXPLAIN : LfromBeta
 //----------------------------------------------------------
 	this->nproj = nproj_in;
-	if(has_so){ 
+
+	if(has_so)
+	{ 
 		this->nproj_soc = nproj_in_so;
 	}
-	//assert(nproj <= lmax_in+1); //LiuXh 2016-01-13, 2016-05-16
+
 	assert(nproj <= nproj_in+1); //LiuXh 2016-01-13, 2016-05-16
 	assert(nproj >= 0);
 
-//2016-07-19 begin, LiuXh
-	if(!has_so){
+	//2016-07-19 begin, LiuXh
+	if(!has_so)
+	{
 		this->Coefficient_D.create( nproj_in+1, nproj_in+1);
 		ZEROS(this->non_zero_count_soc, 4);
 		if(lmax_in > -1) //LiuXh add 20180328, fix bug of Hydrogen element with single projector pseudopot
-		{ //LiuXh add 20180328
-//			for (int L1 = 0; L1 < nproj + 1; L1++)
+		{
 			for (int L1 = 0; L1 < min(this->Coefficient_D.nr, Coefficient_D_in.nr); L1++)
 			{
-//				for (int L2 = 0; L2 < nproj + 1; L2++)
 				for (int L2 = 0; L2 < min(this->Coefficient_D.nc, Coefficient_D_in.nc); L2++)
 				{
 					this->Coefficient_D(L1, L2) = Coefficient_D_in(L1, L2);
 				}
 			}
-		} //LiuXh add 20180328
+		}
 	}
 	else//zhengdy-soc
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
index 308fa52ee8..9b01388415 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
@@ -1,11 +1,10 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2009-03-04
-//=========================================================
 #ifndef NUMERICAL_NONLOCAL_H
 #define NUMERICAL_NONLOCAL_H
 
-#include "../src_pw/tools.h"
+//#include "../src_pw/tools.h"
+
+#include "../src_global/complexarray.h"
+#include "../src_global/complexmatrix.h"
 #include "ORB_nonlocal_lm.h"
 //=========================================================
 //CLASS  Numerical_Nonlocal
@@ -87,9 +86,15 @@ class Numerical_Nonlocal
 	// each Beta may have different L.
 	int nproj;
 	int *LfromBeta;
+
+	// PLEASE consider moving the following parameters to the 'pseudopotential' module
+	// mohan note 2021-03-28
 	int nproj_soc;//demention of D_ij^so
+
 	ComplexArray Coefficient_D_so;   //(:,:,:),  spin-orbit case,  added by zhengdy-soc
+
 	int non_zero_count_soc[4];
+
 	int *index1_soc[4], *index2_soc[4];
 };
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
index d693f7a6da..0c134e9873 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
@@ -4,6 +4,7 @@
 //=========================================================
 #include "ORB_nonlocal_lm.h"
 #include "../src_pw/global.h"
+#include "../src_global/math_integral.h"
 
 Numerical_Nonlocal_Lm::Numerical_Nonlocal_Lm()
 {
@@ -245,7 +246,7 @@ void Numerical_Nonlocal_Lm::get_kradial(void)
             integrated_func[ir] = this->beta_r[ir] * this->r_radial[ir] * jl[ir];
         }
 
-        Mathzone::Simpson_Integral(
+        Integral::Simpson_Integral(
                 this->nr,
                 integrated_func,
                 this->rab,
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
index 4f3eaf42f5..a6f10e6560 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
@@ -1,11 +1,10 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2008-03-04
-//=========================================================
 #ifndef NUMERICAL_NONLOCAL_LM
 #define NUMERICAL_NONLOCAL_LM
 
-#include "../src_pw/tools.h"
+#include <string>
+using namespace std;
+
+//#include "../src_pw/tools.h"
 //=========================================================
 //CLASS Numerical_Nonlocal_Lm
 //Note : contain information about each projector
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index bd85939faa..6377f76957 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -1,6 +1,7 @@
 #include "ORB_read.h"
 #include "../src_pw/global.h" // only use ucell.atoms[it]
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
+#include "../src_global/math_integral.h"
 
 //==============================
 // Define an object here! 
@@ -89,15 +90,17 @@ void LCAO_Orbitals::bcast_files(void)
 //			nonlocal_file.push_back ( nfile );
 		}
 
-		ofs_running << " " << ucell.atoms[it].label << " orbital file: " << orbital_file[it] << endl;
-//		ofs_running << " " << ucell.atoms[it].label << " nonlocal file: " << nonlocal_file[it] << endl;
+		ofs_running << " orbital file: " << orbital_file[it] << endl;
+//		ofs_running << " nonlocal file: " << nonlocal_file[it] << endl;
 	}
 	return;
 }
 #endif
 
 
-void LCAO_Orbitals::Read_Orbitals(void)
+void LCAO_Orbitals::Read_Orbitals(
+	const int &ntype_in, 
+	const int &lmax_in)
 {
 	TITLE("LCAO_Orbitals", "Read_Orbitals");
 	timer::tick("LCAO_Orbitals","Read_Orbitals",'C');
@@ -140,12 +143,17 @@ void LCAO_Orbitals::Read_Orbitals(void)
     assert(dR > 0.0);
     assert(Rmax > 0.0);
 
-	this->ntype = ucell.ntype;
-	this->lmax = ucell.lmax;
-	for(int i=0; i<ucell.ntype; i++)
-	{
-		OUT(ofs_running,"atom label",ucell.atoms[i].label);
-	}
+	this->ntype = ntype_in; 
+	assert(ntype>0);
+
+	assert(lmax_in>=0); // mohan add 2021-04-16
+	this->lmax = lmax_in;
+
+// mohan comment out 2021-04-16
+//	for(int i=0; i<ntype; i++)
+//	{
+//		OUT(ofs_running,"atom label",ucell.atoms[i].label);
+//	}
 
 	//-------------------------------------------------
 	//(2) set the kmesh according to ecutwfc and dk. 
@@ -181,12 +189,9 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// Read in numerical atomic orbitals for each atom type.
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 	delete[] this->Phi;
-// PLEASE avoid using 'ucell' as global variable 
-// if 'ntype' is really needed, the variable should be initialized
-// as a parameter of this class
-// mohan note 2021-03-23
-	this->Phi = new Numerical_Orbital[ucell.ntype];
-	for(int it=0; it<ucell.ntype; it++)
+
+	this->Phi = new Numerical_Orbital[ntype];
+	for(int it=0; it<ntype; it++)
 	{
 		this->Read_PAO(it);	
 	}
@@ -201,11 +206,11 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// mohan note 2011-03-04
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 	delete[] this->Beta;
-	this->Beta = new Numerical_Nonlocal[ucell.ntype];
+	this->Beta = new Numerical_Nonlocal[ntype];
 
 	delete[] nproj;
-	this->nproj = new int[ucell.ntype];
-	ZEROS(nproj, ucell.ntype);
+	this->nproj = new int[ntype];
+	ZEROS(nproj, ntype);
 	
 	this->nprojmax = 0;
 	
@@ -214,7 +219,7 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// if false: get nonlocal information from .upf or .vwr directly
 	bool readin_nonlocal = false;
 
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		if(readin_nonlocal)
 		{
@@ -268,20 +273,15 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 	// get the number of non-local projectors
 	n_projectors = atom->nbeta;
 
-
 // PLEASE avoid using capital letters for local variables
 // mohan note 2021-03-23 
-	const int N_PROJECTORS = atom->nh;//zhengdy-soc
-//cout << " number of projectros " << N_PROJECTORS << endl;
-//	cout << " number of projectros " << n_projectors << endl;
+	const int nh = atom->nh;//zhengdy-soc
 
 	// set the nonlocal projector objects
 	Numerical_Nonlocal_Lm* tmpBeta_lm = new Numerical_Nonlocal_Lm[n_projectors];
 
-	//const int nproj_allowed = atom->lmax + 1;	
-	//matrix Coefficient_D_in(nproj_allowed, nproj_allowed); //LiuXh 2016-01-14
 	matrix Coefficient_D_in(n_projectors, n_projectors); //LiuXh 2016-01-14
-	ComplexMatrix Coefficient_D_in_so(N_PROJECTORS*2, N_PROJECTORS*2);//zhengdy-soc
+	ComplexMatrix Coefficient_D_in_so(nh*2, nh*2);//zhengdy-soc
 
 	if(!atom->has_so)
 	{
@@ -289,11 +289,11 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 		{
 			const int lnow = atom->lll[p1];
 
-		// this will be wrong if dion is non-diagoal
-		//Coefficient_D_in(lnow,lnow)=atom->dion(p1,p1);//LiuXh 2016-01-14
+			// this will be wrong if dion is non-diagonal
+			//Coefficient_D_in(lnow,lnow)=atom->dion(p1,p1);//LiuXh 2016-01-14
 			Coefficient_D_in(p1,p1)=atom->dion(p1,p1);//LiuXh 2016-01-14
 
-		// only keep the nonzero part.
+			// only keep the nonzero part.
 			int cut_mesh = atom->mesh; 
 			for(int ir=atom->mesh-1; ir>=0; --ir)
 			{
@@ -385,7 +385,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 											conj(soc.rotylm(m2,mj))*soc.spinor(l2,j2,m,is2);
 									}
 									soc.fcoef(it,is1,is2,ip1,ip2) = coeff;
-									Coefficient_D_in_so(ip1 + N_PROJECTORS*is1, ip2 + N_PROJECTORS*is2) = atom->dion(p1,p2) * soc.fcoef(it, is1, is2, ip1, ip2);
+									Coefficient_D_in_so(ip1 + nh*is1, ip2 + nh*is2) = atom->dion(p1,p2) * soc.fcoef(it, is1, is2, ip1, ip2);
 									if(p1 != p2) soc.fcoef(it, is1, is2, ip1, ip2) = complex<double>(0.0,0.0);
 								}
 							}
@@ -393,7 +393,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 						ip2++;
 					}
 				}
-				assert(ip2==N_PROJECTORS);
+				assert(ip2==nh);
 				ip1++;
 			}
 		// only keep the nonzero part.
@@ -431,7 +431,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 			delete[] beta_r;
 		}
 
-		assert(ip1==N_PROJECTORS);
+		assert(ip1==nh);
 
 		this->Beta[it].set_type_info(
 			it, 
@@ -441,7 +441,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 			Coefficient_D_in, 
 			Coefficient_D_in_so, 
 			n_projectors, 
-			N_PROJECTORS, 
+			nh, 
 			atom->lll, 
 			tmpBeta_lm, 
 			1);//zhengdy-soc 2018-09-10
@@ -684,9 +684,21 @@ void LCAO_Orbitals::Read_NonLocal(const int &it, int &n_projectors)
 		}
 	}// end projectors.
 	
-	this->Beta[it].set_type_info(it, label, ps_type, nlmax, Coefficient_D_in, Coefficient_D_in_so, n_projectors, 0, LfromBeta, tmpBeta_lm, ucell.atoms[it].has_so);
+	this->Beta[it].set_type_info(
+		it, 
+		label, 
+		ps_type, 
+		nlmax, 
+		Coefficient_D_in, 
+		Coefficient_D_in_so, 
+		n_projectors, 
+		0, 
+		LfromBeta, 
+		tmpBeta_lm, 
+		ucell.atoms[it].has_so);
 		
 	ifs.close();
+
 	delete[] LfromBeta;
 	delete[] tmpBeta_lm;
 
@@ -885,7 +897,7 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 				inner[ir] = psir[ir] * psir[ir];
 			}
 			double unit = 0.0;
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 
 			// check unit: \sum ( psi[r] * r )^2 = 1
 			ofs_running << setprecision(3) << setw(12) << unit;
@@ -900,7 +912,7 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 			{
 				inner[ir] = psir[ir] * psir[ir];
 			}
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 			delete[] inner;
 			ofs_running << setw(12) << unit << endl;
 			
@@ -944,58 +956,17 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 void LCAO_Orbitals::set_nl_index(void)
 {
 	TITLE("LCAO_Orbitals","set_nl_index");
-	int ntype = ucell.ntype;
-
-	this->nkb=0;
-	for(int it=0; it<ntype; it++)
-	{
-		nkb += ucell.atoms[it].na * ucell.atoms[it].nh;
-//		cout << " projectors for " << ucell.atoms[it].label << " is " << ucell.atoms[it].nh << endl;
-	}
-
-	// mohan update 2011-05-01
-	if(nkb==0)
-	{
-		WARNING("LCAO_Orbitals","No non-local projectos, it must all be H atoms.");
-		return;
-	}
-	
-
-	this->itiaib2ib_all.create(ntype, ucell.namax, this->nkb);
-
-	int ib_all = 0;
-	for(int it=0; it<ucell.ntype; it++)
-	{
-		for(int ia=0; ia<ucell.atoms[it].na; ia++)
-		{
-			for(int ib=0; ib<ucell.atoms[it].nh; ib++)
-			{
-				itiaib2ib_all(it,ia,ib) = ib_all;
-				++ib_all;
-			}
-			/*
-			for(int ib=0; ib< this->nproj[it]; ib++)
-			{
-				for(int m=0; m< 2*Beta[it].Proj[ib].getL()+1; m++)
-				{
-					itiaib2ib_all(it,ia,ib) = ib_all;
-					++ib_all;
-				}
-			}
-			*/	
-		}
-	}
-	assert(ib_all==nkb);
 
+	assert(this->ntype>0);
 
 	int nh_max = 0;
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		nh_max = max(nh_max, ucell.atoms[it].nh);
 	}
 
 	this->ib2_ylm.create(ntype, nh_max);
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		int index = 0;
 		for(int ib=0; ib< this->nproj[it]; ib++)
@@ -1068,10 +1039,11 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 	}
 
 #ifdef __MPI
-		Parallel_Common::bcast_int(lmax);
-		Parallel_Common::bcast_int(nchimax);
-		Parallel_Common::bcast_int(nchi, lmax + 1);
+	Parallel_Common::bcast_int(lmax);
+	Parallel_Common::bcast_int(nchimax);
+	Parallel_Common::bcast_int(nchi, lmax + 1);
 #endif		
+
 	this->lmax_d = lmax;
 	this->nchimax_d = nchimax;
 	// calculate total number of chi
@@ -1214,9 +1186,7 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 			}
 			double unit = 0.0;
 
-// PLEASE make Simpson_Integral as input parameters?
-// mohan note 2021-03-23
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 
 			// check unit: \sum ( psi[r] * r )^2 = 1
 			ofs_running << setprecision(3) << setw(12) << unit;
@@ -1231,7 +1201,7 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 			{
 				inner[ir] = psir[ir] * psir[ir];
 			}
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 			delete[] inner;
 			ofs_running << setw(12) << unit << endl;
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index 3010dc1efc..2a3aea4e04 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -1,12 +1,7 @@
-//=========================================================
-//AUTHOR : mohan
-//DATE : 2009-04-23
-//Last Update : 2021-02-11
-//=========================================================
 #ifndef LCAO_ORBITALS_H
 #define LCAO_ORBITALS_H
 
-#include "../src_pw/tools.h"
+//#include "../src_pw/tools.h"
 #include "ORB_atomic.h"
 #include "ORB_atomic_lm.h"
 #include "ORB_nonlocal.h"
@@ -25,7 +20,9 @@ class LCAO_Orbitals
 	LCAO_Orbitals();
 	~LCAO_Orbitals();
 
-	void Read_Orbitals(void);
+	void Read_Orbitals(
+		const int &ntype_in,
+		const int &lmax_in);
 
 	void Read_PAO(const int& it);
 
@@ -35,7 +32,6 @@ class LCAO_Orbitals
 	// read in the NONLOCAL projector from file.
 	void Read_NonLocal(const int& it, int &n_projectors);
 
-	void set_nl_index(void);
 
 	void Read_Descriptor(void);		//caoyu add 2020-3-16
 
@@ -72,8 +68,6 @@ class LCAO_Orbitals
 	double Rmax;
 	int *nproj; //mohan add 2010-12-19
 	int nprojmax; // mohan add 2010-03-07
-	int nkb; // total number of projectors.
-	IntArray itiaib2ib_all;
 	IntArray ib2_ylm;
 	
 	double dr_uniform;
@@ -92,7 +86,9 @@ class LCAO_Orbitals
 	int nchimax;
 	int lmax_d;	//caoyu add 2021-03-17
 	int nchimax_d;	//caoyu add 2021-03-17
-	int ntype;
+	int ntype; // number of elements
+
+	void set_nl_index(void);
 
 };
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
index d0f639d28d..a160816d29 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
@@ -1,468 +1,462 @@
-//caoyu add 2021-03-17
-
-#include "ORB_table_alpha.h"
-#include "ORB_read.h"
-#include "../src_pw/global.h"
-#include <stdexcept>
-#include "../src_ri/exx_abfs.h"
-#include "../src_io/winput.h"
-
-double ORB_table_alpha::dr = -1.0;
-
-ORB_table_alpha::ORB_table_alpha()
-{
-	destroy_nr = false;
-
-	ntype = 0;
-	lmax = 0;
-	kmesh = 0;
-	Rmax = 0.0;
-	dr = 0.0;
-	dk = 0.0;
-
-	nlm = 0;
-	Rmesh = 0;
-
-	kpoint = new double[1];
-	r = new double[1];
-	rab = new double[1];
-	kab = new double[1];
-	DS_2Lplus1 = new int[1];
-}
-
-ORB_table_alpha::~ORB_table_alpha()
-{
-	delete[] kpoint;
-	delete[] r;
-	delete[] rab;
-	delete[] kab;
-	delete[] DS_2Lplus1;
-}
-
-void ORB_table_alpha::allocate
-(
-	const int& ntype_in,
-	const int& lmax_in,
-	const int& kmesh_in,
-	const double& Rmax_in,
-	const double& dr_in,
-	const double& dk_in
-)
-{
-	TITLE("ORB_table_alpha", "allocate");
-
-	this->ntype = ntype_in;// type of elements.
-	this->lmax = lmax_in;
-	this->kmesh = kmesh_in;
-	this->Rmax = Rmax_in;
-	this->dr = dr_in;
-	this->dk = dk_in;
-
-	assert(ntype > 0);
-	assert(lmax >= 0);
-	assert(kmesh > 0.0);
-	assert(Rmax >= 0.0);
-	assert(dr > 0.0);
-	assert(dk > 0.0);
-
-	// calculated from input parameters
-	this->nlm = (2 * lmax + 1) * (2 * lmax + 1);
-	this->Rmesh = static_cast<int>(Rmax / dr) + 4;
-	if (Rmesh % 2 == 0)
-	{
-		++Rmesh;
-	}
-
-	//	OUT(ofs_running,"lmax",lmax);
-	//	OUT(ofs_running,"Rmax (Bohr)",Rmax);
-	//	OUT(ofs_running,"dr (Bohr)",dr);
-	//	OUT(ofs_running,"dk",dk);
-	//	OUT(ofs_running,"nlm",nlm);
-	//	OUT(ofs_running,"kmesh",kmesh);
-
-	delete[] kpoint;
-	delete[] r;
-	kpoint = new double[kmesh];
-	r = new double[Rmesh];
-
-	delete[] rab;
-	delete[] kab;
-	kab = new double[kmesh];
-	rab = new double[Rmesh];
-
-	for (int ik = 0; ik < kmesh; ik++)
-	{
-		kpoint[ik] = ik * dk_in;
-		kab[ik] = dk_in;
-	}
-
-	for (int ir = 0; ir < Rmesh; ir++)
-	{
-		r[ir] = ir * dr;
-		rab[ir] = dr;
-	}
-
-	//	OUT(ofs_running,"allocate kpoint, r, rab, kab","Done");
-	return;
-}
-
-
-int ORB_table_alpha::get_rmesh(const double& R1, const double& R2)
-{
-	int rmesh = static_cast<int>((R1 + R2) / ORB_table_alpha::dr) + 5;
-	//mohan update 2009-09-08 +1 ==> +5
-	//considering interpolation or so on...
-	if (rmesh % 2 == 0) rmesh++;
-
-	if (rmesh <= 0)
-	{
-		ofs_warning << "\n R1 = " << R1 << " R2 = " << R2;
-		ofs_warning << "\n rmesh = " << rmesh;
-		WARNING_QUIT("ORB_table_alpha::get_rmesh", "rmesh <= 0");
-	}
-	return rmesh;
-}
-
-
-
-void ORB_table_alpha::cal_S_PhiAlpha_R(
-	Sph_Bessel_Recursive::D2* pSB, // mohan add 2021-03-06
-	const int& l,
-	const Numerical_Orbital_Lm& n1,
-	const Numerical_Orbital_Lm& n2,
-	const int& rmesh,
-	double* rs,
-	double* drs)
-{
-	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
-
-	assert(kmesh > 0);
-
-	//start calc	
-	double* k1_dot_k2 = new double[kmesh];
-
-	for (int ik = 0; ik < kmesh; ik++)
-	{
-		k1_dot_k2[ik] = n1.getPsi_k(ik) * n2.getPsi_k(ik);
-	}
-
-	//previous version
-	double* integrated_func = new double[kmesh];
-
-	const vector<vector<double>>& jlm1 = pSB->get_jlx()[l - 1];
-	const vector<vector<double>>& jl = pSB->get_jlx()[l];
-	const vector<vector<double>>& jlp1 = pSB->get_jlx()[l + 1];
-	for (int ir = 0; ir < rmesh; ir++)
-	{
-		ZEROS(integrated_func, kmesh);
-		double temp = 0.0;
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
-		}
-		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp);
-		rs[ir] = temp * FOUR_PI;
-
-		//drs
-		double temp1, temp2;
-
-		if (l > 0)
-		{
-			for (int ik = 0; ik < kmesh; ik++)
-			{
-				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
-			}
-
-			Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp1);
-		}
-
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
-		}
-
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp2);
-
-		if (l == 0)
-		{
-			drs[ir] = -FOUR_PI * temp2;
-		}
-		else
-		{
-			drs[ir] = FOUR_PI * (temp1 * l - (l + 1) * temp2) / (2.0 * l + 1);
-		}
-	}
-
-	//liaochen modify on 2010/4/22
-	//special case for R=0
-	//we store Slm(R) / R**l at the fisrt point, rather than Slm(R)
-	if (l > 0)
-	{
-		ZEROS(integrated_func, kmesh);
-		double temp = 0.0;
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = k1_dot_k2[ik] * pow(kpoint[ik], l);
-		}
-
-		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp);
-		rs[0] = FOUR_PI / Mathzone_Add1::dualfac(2 * l + 1) * temp;
-	}
-
-	delete[] integrated_func;
-
-
-	delete[] k1_dot_k2;
-	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
-	return;
-}
-
-
-void ORB_table_alpha::init_Table_Alpha(Sph_Bessel_Recursive::D2* pSB)
-{
-	TITLE("ORB_table_alpha", "init_Table_Alpha");
-	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
-
-	// (1) allocate 1st dimension ( overlap, derivative)
-	this->Table_DSR = new double**** [2];
-	// (2) allocate 2nd dimension ( overlap, derivative)
-	this->Table_DSR[0] = new double*** [this->ntype];
-	this->Table_DSR[1] = new double*** [this->ntype];
-
-	// <1Phi|2Alpha> 
-	for (int T1 = 0; T1 < ntype; T1++) // type 1 is orbital
-	{
-		const int Lmax1 = ORB.Phi[T1].getLmax();
-		const int Lmax2 = ORB.Alpha[0].getLmax();
-		const int lmax_now = std::max(Lmax1, Lmax2);
-		int L2plus1 = 2 * lmax_now + 1;
-		//-------------------------------------------------------------
-		// how many <psi|alpha_l>
-		// here we count all possible psi with (L,N) index for type T1.
-		//-------------------------------------------------------------
-		const int pairs_chi = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
-
-		if (pairs_chi == 0)continue;
-
-		// init 2nd dimension
-		this->Table_DSR[0][T1] = new double** [pairs_chi];
-		this->Table_DSR[1][T1] = new double** [pairs_chi];
-
-		const double Rcut1 = ORB.Phi[T1].getRcut();
-		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						// get the second index.
-						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);
-
-						// init 3rd dimension
-						this->Table_DSR[0][T1][Opair] = new double* [L2plus1];
-						this->Table_DSR[1][T1][Opair] = new double* [L2plus1];
-
-						const double Rcut1 = ORB.Phi[T1].getRcut();
-						const double Rcut2 = ORB.Alpha[0].getRcut();
-						assert(Rcut1 > 0.0 && Rcut1 < 100);
-						assert(Rcut2 > 0.0 && Rcut2 < 100);
-
-						const int rmesh = this->get_rmesh(Rcut1, Rcut2);
-						assert(rmesh < this->Rmesh);
-
-						//L=|L1-L2|,|L1-L2|+2,...,L1+L2
-						const int SL = abs(L1 - L2);
-						const int AL = L1 + L2;
-
-						for (int L = 0; L < L2plus1; L++)
-						{
-							//Allocation
-							this->Table_DSR[0][T1][Opair][L] = new double[rmesh];
-							this->Table_DSR[1][T1][Opair][L] = new double[rmesh];
-
-							Memory::record("ORB_table_alpha", "Table_DSR",
-								2 * this->ntype * pairs_chi * rmesh, "double");
-
-							//for those L whose Gaunt Coefficients = 0, we
-							//assign every element in Table_DSR as zero
-							if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
-							{
-								ZEROS(Table_DSR[0][T1][Opair][L], rmesh);
-								ZEROS(Table_DSR[1][T1][Opair][L], rmesh);
-
-								continue;
-							}
-
-							this->cal_S_PhiAlpha_R(
-								pSB, // mohan add 2021-03-06
-								L,
-								ORB.Phi[T1].PhiLN(L1, N1),
-								ORB.Alpha[0].PhiLN(L2, N2), // mohan update 2011-03-07
-								rmesh,
-								this->Table_DSR[0][T1][Opair][L],
-								this->Table_DSR[1][T1][Opair][L]);
-						}// end L2plus1
-					}// end N2
-				}// end L2
-			}// end N1
-		}// end L1
-	}// end T1
-	destroy_nr = true;
-
-
-	//	OUT(ofs_running,"allocate non-local potential matrix","Done");
-	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
-	return;
-}
-
-
-void ORB_table_alpha::Destroy_Table_Alpha(void)
-{
-	if (!destroy_nr) return;
-
-	const int ntype = ORB.get_ntype();
-	for (int ir = 0; ir < 2; ir++)
-	{
-		for (int T1 = 0; T1 < ntype; T1++)
-		{
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			const int Lmax2 = ORB.Alpha[0].getLmax();
-			const int lmax_now = std::max(Lmax1, Lmax2);
-			const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
-
-				// mohan fix bug 2011-03-30
-				if (pairs == 0) continue;
-			for (int dim2 = 0; dim2 < pairs; dim2++)
-			{
-				for (int L = 0; L < 2*lmax_now + 1; L++)
-				{
-					delete[] Table_DSR[ir][T1][dim2][L];
-				}
-				delete[] Table_DSR[ir][T1][dim2];
-			}
-			delete[] Table_DSR[ir][T1];
-		}
-		delete[] Table_DSR[ir];
-	}
-	delete[] Table_DSR;
-	return;
-}
-
-void ORB_table_alpha::init_DS_2Lplus1(void)
-{
-	TITLE("Make_Overlap_Table", "init_DS_2Lplus1");
-	assert(this->ntype > 0);
-	delete[] DS_2Lplus1;
-	DS_2Lplus1=new int[ntype]; // 2Lmax+1 for each T1
-
-	int index = 0;
-	for (int T1 = 0; T1 < ntype; T1++)
-	{
-			this->DS_2Lplus1[T1] = max(ORB.Phi[T1].getLmax(), ORB.Alpha[0].getLmax()) * 2 + 1;
-	}
-	return;
-}
-
-void ORB_table_alpha::init_DS_Opair(void)
-{
-	const int lmax = ORB.get_lmax();
-	const int nchimax = ORB.get_nchimax();
-	const int lmax_d = ORB.get_lmax_d();
-	const int nchimax_d = ORB.get_nchimax_d();
-	assert(lmax + 1 > 0);
-	assert(lmax_d + 1 > 0);
-	assert(nchimax > 0);
-	assert(nchimax_d > 0);
-
-	this->DS_Opair.create(this->ntype, lmax+1, lmax_d+1, nchimax, nchimax_d);
-
-	// <1psi|2beta>
-	// 1. orbital
-	for (int T1 = 0; T1 < ntype; T1++)	//alpha is not related to atom type !
-	{
-		int index = 0;
-		for (int L1 = 0; L1 < ORB.Phi[T1].getLmax() + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < ORB.Alpha[0].getLmax() + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						this->DS_Opair(T1, L1, L2, N1, N2) = index;
-						++index;
-					}
-				}
-			}
-		}
-	}
-	return;
-}
-
-//caoyu add 2021-03-20
-void ORB_table_alpha::print_Table_DSR(void)
-{
-	TITLE("ORB_table_alpha", "print_Table_DSR");
-	NEW_PART("Overlap table S between lcao orbital and descriptor basis : S_{I_mu_alpha}");
-
-	ofstream ofs;
-	stringstream ss;
-	// the parameter 'winput::spillage_outdir' is read from INPUTw.
-	ss << winput::spillage_outdir << "/" << "S_I_mu_alpha.dat";
-	if (MY_RANK == 0)
-	{
-		ofs.open(ss.str().c_str());
-	}
-
-	for (int T1 = 0; T1 < this->ntype; T1++)	//T1
-	{
-		const int Lmax1 = ORB.Phi[T1].getLmax();
-		const int Lmax2 = ORB.Alpha[0].getLmax();
-		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);	//Opair
-						ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
-						ofs <<setw(20)<< "lcao basis: " << "L1=" << L1 << ", N1=" << N1 << endl;
-						ofs <<setw(20)<< "descriptor basis: " << "L2=" << L2 << ", N2=" << N2 << endl;
-						for (int il = 0; il < this-> DS_2Lplus1[T1]; il++)
-						{
-							ofs << "L=" << il << endl;
-							const double Rcut1 = ORB.Phi[T1].getRcut();
-							const double Rcut2 = ORB.Alpha[0].getRcut();
-							const int rmesh = this->get_rmesh(Rcut1, Rcut2);
-							
-							if (Table_DSR[0][T1][Opair][il][1]==0)	//remain to be discussed
-							{
-								ofs << "S(R)=0"<<endl<<endl;
-								continue;
-							}
-							ofs << "Rcut1="<<Rcut1<<", Rcut2="<<Rcut2<<", rmesh="<<rmesh<<", dr="<<this->dr<<";"<<endl;
-							for (int ir = 0; ir < rmesh; ir++)
-							{
-								ofs << Table_DSR[0][T1][Opair][il][ir] << " ";
-								if ( (ir+1) % 8 == 0) ofs << endl;
-							}
-							ofs << endl <<endl;
-						}
-					}
-				}
-			}
-		}
-
-
-	}
-
-}
\ No newline at end of file
+//caoyu add 2021-03-17
+#include "ORB_table_alpha.h"
+#include "ORB_read.h"
+#include "../src_global/math_integral.h"
+#include <stdexcept>
+
+double ORB_table_alpha::dr = -1.0;
+
+ORB_table_alpha::ORB_table_alpha() // default state: zero sizes/steps and 1-element placeholder arrays
+{
+	destroy_nr = false;
+
+	ntype = 0;
+	lmax = 0;
+	kmesh = 0;
+	Rmax = 0.0;
+	dr = 0.0; // dr is a static member (defined just above this constructor), so this resets it for the whole class
+	dk = 0.0;
+
+	nlm = 0;
+	Rmesh = 0;
+
+	kpoint = new double[1]; // 1-element placeholders: allocate() and the destructor delete[] these pointers unconditionally
+	r = new double[1];
+	rab = new double[1];
+	kab = new double[1];
+	DS_2Lplus1 = new int[1];
+}
+
+ORB_table_alpha::~ORB_table_alpha() // releases the five raw arrays below; Table_DSR is not touched here
+{
+	delete[] kpoint;
+	delete[] r;
+	delete[] rab;
+	delete[] kab;
+	delete[] DS_2Lplus1;
+}
+
+void ORB_table_alpha::allocate( // stores the mesh parameters and (re)builds the uniform k and r meshes
+	const int &ntype_in, // must be > 0 (asserted below)
+	const int &lmax_in, // must be >= 0 (asserted below)
+	const int &kmesh_in, // number of k points; must be > 0
+	const double &Rmax_in, // must be >= 0; together with dr_in it fixes Rmesh
+	const double &dr_in, // uniform radial step; must be > 0
+	const double &dk_in) // uniform k step; must be > 0
+{
+	TITLE("ORB_table_alpha", "allocate");
+
+	this->ntype = ntype_in; // type of elements.
+	this->lmax = lmax_in;
+	this->kmesh = kmesh_in;
+	this->Rmax = Rmax_in;
+	this->dr = dr_in; // dr is a static member: this value is also what the static get_rmesh() divides by
+	this->dk = dk_in;
+
+	assert(ntype > 0);
+	assert(lmax >= 0);
+	assert(kmesh > 0.0); // kmesh is an int here; the comparison against 0.0 still checks kmesh > 0
+	assert(Rmax >= 0.0);
+	assert(dr > 0.0);
+	assert(dk > 0.0);
+
+	// calculated from input parameters
+	this->nlm = (2 * lmax + 1) * (2 * lmax + 1);
+	this->Rmesh = static_cast<int>(Rmax / dr) + 4;
+	if (Rmesh % 2 == 0) // force an odd number of radial points
+	{
+		++Rmesh;
+	}
+
+	//	OUT(ofs_running,"lmax",lmax);
+	//	OUT(ofs_running,"Rmax (Bohr)",Rmax);
+	//	OUT(ofs_running,"dr (Bohr)",dr);
+	//	OUT(ofs_running,"dk",dk);
+	//	OUT(ofs_running,"nlm",nlm);
+	//	OUT(ofs_running,"kmesh",kmesh);
+
+	delete[] kpoint; // drops the previous arrays (the constructor's placeholders on the first call)
+	delete[] r;
+	kpoint = new double[kmesh];
+	r = new double[Rmesh];
+
+	delete[] rab;
+	delete[] kab;
+	kab = new double[kmesh];
+	rab = new double[Rmesh];
+
+	for (int ik = 0; ik < kmesh; ik++)
+	{
+		kpoint[ik] = ik * dk_in; // uniform k mesh starting at 0
+		kab[ik] = dk_in; // constant step, passed as the weight array to Simpson_Integral elsewhere in this file
+	}
+
+	for (int ir = 0; ir < Rmesh; ir++)
+	{
+		r[ir] = ir * dr; // uniform radial mesh starting at 0
+		rab[ir] = dr;
+	}
+
+	//	OUT(ofs_running,"allocate kpoint, r, rab, kab","Done");
+	return;
+}
+
+int ORB_table_alpha::get_rmesh(const double &R1, const double &R2) // odd mesh size covering R1 + R2 in steps of the static dr
+{
+	int rmesh = static_cast<int>((R1 + R2) / ORB_table_alpha::dr) + 5; // NOTE(review): dr is 0.0 after construction until allocate() sets it - confirm callers run allocate() first
+	//mohan update 2009-09-08 +1 ==> +5
+	//considering interpolation or so on...
+	if (rmesh % 2 == 0) // bump even sizes to the next odd value
+		rmesh++;
+
+	if (rmesh <= 0)
+	{
+		ofs_warning << "\n R1 = " << R1 << " R2 = " << R2;
+		ofs_warning << "\n rmesh = " << rmesh;
+		WARNING_QUIT("ORB_table_alpha::get_rmesh", "rmesh <= 0");
+	}
+	return rmesh;
+}
+
+void ORB_table_alpha::cal_S_PhiAlpha_R(
+	Sph_Bessel_Recursive::D2 *pSB, // Bessel table, read below as get_jlx()[order][ir][ik] (mohan add 2021-03-06)
+	const int &l, // Bessel order used for rs; orders l-1 and l+1 are used for drs
+	const Numerical_Orbital_Lm &n1, // first radial function; only getPsi_k(ik) is read
+	const Numerical_Orbital_Lm &n2, // second radial function; only getPsi_k(ik) is read
+	const int &rmesh, // number of radial points written to rs and drs
+	double *rs, // out: FOUR_PI * Simpson integral over k of j_l * psi1_k * psi2_k, one value per ir
+	double *drs) // out: companion table built from j_{l-1} and j_{l+1} (the derivative table of the caller)
+{
+	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
+
+	assert(kmesh > 0);
+
+	// product of the two k-space radial functions, reused by every integrand below
+	double *k1_dot_k2 = new double[kmesh];
+
+	for (int ik = 0; ik < kmesh; ik++)
+	{
+		k1_dot_k2[ik] = n1.getPsi_k(ik) * n2.getPsi_k(ik);
+	}
+
+	// scratch buffer holding the integrand on the k mesh
+	double *integrated_func = new double[kmesh];
+	// For l == 0 there is no order l-1: use index 0 so the subscript stays in range (l - 1 would be -1); jlm1 is never read then.
+	const vector<vector<double>> &jlm1 = pSB->get_jlx()[l > 0 ? l - 1 : 0];
+	const vector<vector<double>> &jl = pSB->get_jlx()[l];
+	const vector<vector<double>> &jlp1 = pSB->get_jlx()[l + 1];
+
+	for (int ir = 0; ir < rmesh; ir++)
+	{
+		ZEROS(integrated_func, kmesh);
+		double temp = 0.0;
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
+		}
+		// Call simpson integration
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		rs[ir] = temp * FOUR_PI;
+
+		// drs: temp1 integrates j_{l-1} * k (only for l > 0), temp2 integrates j_{l+1} * k
+		double temp1 = 0.0, temp2 = 0.0;
+
+		if (l > 0)
+		{
+			for (int ik = 0; ik < kmesh; ik++)
+			{
+				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
+			}
+
+			Integral::Simpson_Integral(kmesh, integrated_func, kab, temp1);
+		}
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
+		}
+
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp2);
+
+		if (l == 0)
+		{
+			drs[ir] = -FOUR_PI * temp2;
+		}
+		else
+		{
+			drs[ir] = FOUR_PI * (temp1 * l - (l + 1) * temp2) / (2.0 * l + 1);
+		}
+	}
+
+	//liaochen modify on 2010/4/22
+	//special case for R=0
+	//we store Slm(R) / R**l at the first point, rather than Slm(R)
+	if (l > 0)
+	{
+		ZEROS(integrated_func, kmesh);
+		double temp = 0.0;
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = k1_dot_k2[ik] * pow(kpoint[ik], l);
+		}
+
+		// Call simpson integration
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		rs[0] = FOUR_PI / Mathzone_Add1::dualfac(2 * l + 1) * temp; // dualfac is presumably the double factorial (2l+1)!! - TODO confirm
+	}
+
+	delete[] integrated_func;
+	delete[] k1_dot_k2;
+
+	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
+	return;
+}
+
+// Allocates and fills Table_DSR[0] (overlap) and Table_DSR[1] (derivative),
+// indexed as Table_DSR[0|1][T1][Opair][L][ir], for every pair formed by an
+// orbital of type T1 (ORB.Phi[T1]) and a descriptor orbital (ORB.Alpha[0]).
+// DS_Opair is read here, so init_DS_Opair must have run before this call;
+// pSB is forwarded unchanged to cal_S_PhiAlpha_R.
+void ORB_table_alpha::init_Table_Alpha(Sph_Bessel_Recursive::D2 *pSB)
+{
+	TITLE("ORB_table_alpha", "init_Table_Alpha");
+	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
+
+	assert(ntype > 0);
+
+	// (1) allocate 1st dimension ( overlap, derivative)
+	this->Table_DSR = new double ****[2];
+	// (2) allocate 2nd dimension ( orbital type T1 )
+	this->Table_DSR[0] = new double ***[this->ntype];
+	this->Table_DSR[1] = new double ***[this->ntype];
+
+	// <1Phi|2Alpha>
+	for (int T1 = 0; T1 < ntype; T1++) // type 1 is orbital
+	{
+		const int Lmax1 = ORB.Phi[T1].getLmax();
+		const int Lmax2 = ORB.Alpha[0].getLmax();
+		const int L2plus1 = 2 * std::max(Lmax1, Lmax2) + 1;
+		// how many <psi|alpha_l>: every (L,N) of type T1 times every (L,N) of alpha
+		const int pairs_chi = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
+		if (pairs_chi == 0)
+		{
+			// nothing to tabulate: store null instead of leaving the slots indeterminate
+			this->Table_DSR[0][T1] = this->Table_DSR[1][T1] = nullptr;
+			continue;
+		}
+
+		// (3) allocate 3rd dimension ( orbital pair )
+		this->Table_DSR[0][T1] = new double **[pairs_chi];
+		this->Table_DSR[1][T1] = new double **[pairs_chi];
+
+		// cutoffs and mesh size depend only on T1, so evaluate them once per type
+		const double Rcut1 = ORB.Phi[T1].getRcut();
+		const double Rcut2 = ORB.Alpha[0].getRcut();
+		assert(Rcut1 > 0.0 && Rcut1 < 100);
+		assert(Rcut2 > 0.0 && Rcut2 < 100);
+		const int rmesh = this->get_rmesh(Rcut1, Rcut2);
+		assert(rmesh < this->Rmesh);
+
+		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+			{
+				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
+					{
+						// get the second index.
+						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);
+
+						// (4) allocate 4th dimension ( L )
+						this->Table_DSR[0][T1][Opair] = new double *[L2plus1];
+						this->Table_DSR[1][T1][Opair] = new double *[L2plus1];
+
+						//L=|L1-L2|,|L1-L2|+2,...,L1+L2
+						const int SL = abs(L1 - L2);
+						const int AL = L1 + L2;
+
+						for (int L = 0; L < L2plus1; L++)
+						{
+							this->Table_DSR[0][T1][Opair][L] = new double[rmesh];
+							this->Table_DSR[1][T1][Opair][L] = new double[rmesh];
+							Memory::record("ORB_table_alpha", "Table_DSR",
+										   2 * this->ntype * pairs_chi * rmesh, "double");
+
+							//for those L whose Gaunt Coefficients = 0, we
+							//assign every element in Table_DSR as zero
+							if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+							{
+								ZEROS(Table_DSR[0][T1][Opair][L], rmesh);
+								ZEROS(Table_DSR[1][T1][Opair][L], rmesh);
+								continue;
+							}
+
+							this->cal_S_PhiAlpha_R(
+								pSB, // mohan add 2021-03-06
+								L,
+								ORB.Phi[T1].PhiLN(L1, N1),
+								ORB.Alpha[0].PhiLN(L2, N2), // mohan update 2011-03-07
+								rmesh,
+								this->Table_DSR[0][T1][Opair][L],
+								this->Table_DSR[1][T1][Opair][L]);
+						} // end L2plus1
+					}	  // end N2
+				}		  // end L2
+			}			  // end N1
+		}				  // end L1
+	}					  // end T1
+	destroy_nr = true;
+
+	//	OUT(ofs_running,"allocate non-local potential matrix","Done");
+	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
+	return;
+}
+
+// Frees every array allocated by init_Table_Alpha. Does nothing unless
+// destroy_nr is set; the flag is cleared afterwards so a repeated call is safe.
+void ORB_table_alpha::Destroy_Table_Alpha(void)
+{
+	if (!destroy_nr)
+		return;
+
+	const int ntype = ORB.get_ntype();
+	for (int ir = 0; ir < 2; ir++)
+	{
+		for (int T1 = 0; T1 < ntype; T1++)
+		{
+			const int L2plus1 = 2 * std::max(ORB.Phi[T1].getLmax(), ORB.Alpha[0].getLmax()) + 1;
+			const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
+			// mohan fix bug 2011-03-30: no per-pair tables were allocated for this type
+			if (pairs == 0)
+				continue;
+			for (int dim2 = 0; dim2 < pairs; dim2++)
+			{
+				for (int L = 0; L < L2plus1; L++)
+				{
+					delete[] Table_DSR[ir][T1][dim2][L];
+				}
+				delete[] Table_DSR[ir][T1][dim2];
+			}
+			delete[] Table_DSR[ir][T1];
+		}
+		delete[] Table_DSR[ir];
+	}
+	delete[] Table_DSR;
+	Table_DSR = nullptr; // guard against a second call freeing the table again
+	destroy_nr = false;
+}
+
+// Stores, for each orbital type T1, 2*max(Lmax of Phi[T1], Lmax of Alpha[0])+1,
+// the same L extent init_Table_Alpha uses when sizing that type's tables.
+void ORB_table_alpha::init_DS_2Lplus1(void)
+{
+	TITLE("Make_Overlap_Table", "init_DS_2Lplus1");
+	assert(this->ntype > 0);
+	delete[] DS_2Lplus1; // release the previous array before allocating a new one
+	DS_2Lplus1 = new int[ntype]; // 2Lmax+1 for each T1
+
+	for (int T1 = 0; T1 < ntype; T1++)
+	{
+		this->DS_2Lplus1[T1] = std::max(ORB.Phi[T1].getLmax(), ORB.Alpha[0].getLmax()) * 2 + 1;
+	}
+}
+
+// Builds DS_Opair(T1, L1, L2, N1, N2): a running index over every pair made of
+// an orbital (L1,N1) of type T1 and a descriptor orbital (L2,N2) of Alpha[0].
+// The index restarts at 0 for each orbital type T1.
+void ORB_table_alpha::init_DS_Opair(void)
+{
+	const int lmax_orb = ORB.get_lmax();
+	const int nchi_orb = ORB.get_nchimax();
+	const int lmax_des = ORB.get_lmax_d();
+	const int nchi_des = ORB.get_nchimax_d();
+	assert(lmax_orb + 1 > 0);
+	assert(lmax_des + 1 > 0);
+	assert(nchi_orb > 0);
+	assert(nchi_des > 0);
+
+	this->DS_Opair.create(this->ntype, lmax_orb + 1, lmax_des + 1, nchi_orb, nchi_des);
+
+	// <phi|alpha>: alpha is not related to atom type, so Alpha[0] serves every T1
+	for (int T1 = 0; T1 < ntype; ++T1)
+	{
+		int next_pair = 0;
+		for (int L1 = 0; L1 <= ORB.Phi[T1].getLmax(); ++L1)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); ++N1)
+			{
+				for (int L2 = 0; L2 <= ORB.Alpha[0].getLmax(); ++L2)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); ++N2)
+					{
+						this->DS_Opair(T1, L1, L2, N1, N2) = next_pair++;
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+//caoyu add 2021-03-20
+void ORB_table_alpha::print_Table_DSR(void)
+{
+	TITLE("ORB_table_alpha", "print_Table_DSR");
+	NEW_PART("Overlap table S between lcao orbital and descriptor basis : S_{I_mu_alpha}");
+
+	ofstream ofs;
+	stringstream ss;
+	// the parameter 'winput::spillage_outdir' is read from INPUTw.
+	ss << "./S_I_mu_alpha.dat";
+	if (MY_RANK == 0)
+	{
+		ofs.open(ss.str().c_str());
+	}
+
+	for (int T1 = 0; T1 < this->ntype; T1++)	//T1
+	{
+		const int Lmax1 = ORB.Phi[T1].getLmax();
+		const int Lmax2 = ORB.Alpha[0].getLmax();
+		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+			{
+				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
+					{
+						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);	//Opair
+						//ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
+						ofs <<setw(20)<< "lcao basis: " << "L1=" << L1 << ", N1=" << N1 << endl;
+						ofs <<setw(20)<< "descriptor basis: " << "L2=" << L2 << ", N2=" << N2 << endl;
+						for (int il = 0; il < this-> DS_2Lplus1[T1]; il++)
+						{
+							ofs << "L=" << il << endl;
+							const double Rcut1 = ORB.Phi[T1].getRcut();
+							const double Rcut2 = ORB.Alpha[0].getRcut();
+							const int rmesh = this->get_rmesh(Rcut1, Rcut2);
+							
+							if (Table_DSR[0][T1][Opair][il][1]==0)	//remain to be discussed
+							{
+								ofs << "S(R)=0"<<endl<<endl;
+								continue;
+							}
+							ofs << "Rcut1="<<Rcut1<<", Rcut2="<<Rcut2<<", rmesh="<<rmesh<<", dr="<<this->dr<<";"<<endl;
+							for (int ir = 0; ir < rmesh; ir++)
+							{
+								ofs << Table_DSR[0][T1][Opair][il][ir] << " ";
+								if ( (ir+1) % 8 == 0) ofs << endl;
+							}
+							ofs << endl <<endl;
+						}// il
+					}// N2
+				}// L2
+			}// N1
+		}// L1
+	}// T1
+	return;
+}
+*/
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
index 3c56c28d77..eca622e497 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
@@ -1,73 +1,68 @@
-//caoyu add 2021-03-17
-
-#ifndef ORB_TABLE_ALPHA_H 
-#define ORB_TABLE_ALPHA_H 
-
-#include "src_pw/tools.h"
-#include "ORB_atomic.h"
-#include "ORB_atomic_lm.h"
-#include "ORB_gaunt_table.h"
-#include "src_global/sph_bessel_recursive.h"
-
-
-class ORB_table_alpha
-{
-	public:
-
-	ORB_table_alpha();
-	~ORB_table_alpha();
-
-	void allocate(
-		const int& ntype,
-		const int& lmax_in,
-		const int& kmesh_in,
-		const double& Rmax_in,
-		const double& dR_in,
-		const double& dk_in);
-
-	double***** Table_DSR;//overlap between lcao basis phi and descriptor basis alpha
-	bool destroy_nr;
-
-	//-------------------------
-	// O stands for orbitals.
-	//-------------------------
-	void init_DS_Opair(void);
-	void init_DS_2Lplus1(void);
-	IntArray DS_Opair;
-	int* DS_2Lplus1;
-
-	void init_Table_Alpha(Sph_Bessel_Recursive::D2* pSB);
-
-	void Destroy_Table_Alpha(void);
-
-	static int get_rmesh(const double& R1, const double& R2);
-
-	static double dr;
-	int Rmesh;
-	int ntype;
-	int lmax;
-
-	void print_Table_DSR(void);		//caoyu add 2021-03-20
-
-	private:
-
-	void cal_S_PhiAlpha_R(
-		Sph_Bessel_Recursive::D2* pSB, // mohan add 2021-03-06
-		const int& l,	
-		const Numerical_Orbital_Lm& n1,
-		const Numerical_Orbital_Lm& n2,
-		const int& rmesh,
-		double* rs,
-		double* drs);
-
-	// variables
-	double Rmax;
-	double dk;
-	int nlm;
-	int kmesh;
-	double* kpoint;
-	double* r;
-	double* rab;
-	double* kab;
-};
-#endif
+#ifndef ORB_TABLE_ALPHA_H
+#define ORB_TABLE_ALPHA_H
+
+#include "ORB_atomic_lm.h"
+#include "../src_global/sph_bessel_recursive.h"
+
+//caoyu add 2021-03-17
+// Radial overlap tables between LCAO orbitals (phi) and descriptor orbitals (alpha).
+class ORB_table_alpha
+{
+public:
+	ORB_table_alpha();
+	~ORB_table_alpha();
+
+	void allocate(
+		const int &ntype,
+		const int &lmax_in,
+		const int &kmesh_in,
+		const double &Rmax_in,
+		const double &dR_in,
+		const double &dk_in);
+	// Table_DSR is indexed as [0: overlap | 1: derivative][T1][Opair][L][ir]
+	double *****Table_DSR; //overlap between lcao basis phi and descriptor basis alpha
+	bool destroy_nr; // set by init_Table_Alpha; Destroy_Table_Alpha returns early when false
+
+	//-------------------------
+	// O stands for orbitals.
+	//-------------------------
+	// DS_Opair(T1,L1,L2,N1,N2): orbital-pair index; DS_2Lplus1[T1]: 2*max(Lmax)+1
+	void init_DS_Opair(void);
+	void init_DS_2Lplus1(void);
+	IntArray DS_Opair;
+	int *DS_2Lplus1;
+	// fills Table_DSR; reads DS_Opair, so init_DS_Opair has to run first
+	void init_Table_Alpha(Sph_Bessel_Recursive::D2 *pSB);
+	// frees Table_DSR (does nothing unless destroy_nr is set)
+	void Destroy_Table_Alpha(void);
+	// radial mesh size used for the cutoff pair (R1, R2); an even count is bumped to odd
+	static int get_rmesh(const double &R1, const double &R2);
+
+	static double dr;
+	int Rmesh; // init_Table_Alpha asserts that each per-type rmesh is below this
+	int ntype; // number of orbital types looped over as T1
+	int lmax;
+
+	//void print_Table_DSR(void);		//caoyu add 2021-03-20
+
+private:
+	void cal_S_PhiAlpha_R(
+		Sph_Bessel_Recursive::D2 *pSB, // mohan add 2021-03-06
+		const int &l, // angular momentum of the table being filled
+		const Numerical_Orbital_Lm &n1,
+		const Numerical_Orbital_Lm &n2,
+		const int &rmesh, // number of entries written to rs and drs
+		double *rs, // out: overlap values
+		double *drs); // out: derivative values
+
+	// variables
+	double Rmax;
+	double dk;
+	int nlm;
+	int kmesh; // number of k points in the Simpson integrals
+	double *kpoint; // k values; multiplied into the derivative integrands
+	double *r;
+	double *rab;
+	double *kab; // passed to Integral::Simpson_Integral together with the integrand
+};
+#endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
index b20574c327..5eff28e964 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
@@ -1,7 +1,7 @@
+#include <stdexcept>
 #include "ORB_table_beta.h"
 #include "ORB_read.h"
-#include <stdexcept>
-#include "../src_ri/exx_abfs.h"
+#include "../src_global/math_integral.h"
 
 double ORB_table_beta::dr = -1.0;
 
@@ -121,11 +121,11 @@ int ORB_table_beta::get_rmesh(const double &R1, const double &R2)
 
 void ORB_table_beta::cal_VNL_PhiBeta_R(
 		Sph_Bessel_Recursive::D2 *pSB, // mohan add 2021-03-06
-        const int &l,
-        const Numerical_Orbital_Lm &n1,
-        const Numerical_Nonlocal_Lm &n2,
-        const int &rmesh,
-        double *rs,
+		const int &l,
+		const Numerical_Orbital_Lm &n1,
+		const Numerical_Nonlocal_Lm &n2,
+		const int &rmesh,
+		double *rs,
 		double *drs)
 {
 	timer::tick ("ORB_table_beta", "VNL_PhiBeta_R");
@@ -156,7 +156,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
 		}
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[ir] = temp * FOUR_PI;
 		
 		//drs
@@ -169,7 +169,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 			}
 
-			Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp1);
+			Integral::Simpson_Integral(kmesh,integrated_func,kab,temp1);
 		}
 		
 				
@@ -178,7 +178,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 		}
 		
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp2);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp2);
 		
 		if (l == 0)
 		{
@@ -204,14 +204,13 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 		}
 		
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
 	}
 	
 	delete [] integrated_func;
-	
-
 	delete[] k1_dot_k2;
+
 	timer::tick ("ORB_table_beta", "VNL_PhiBeta_R");
 	return;
 }
@@ -227,7 +226,6 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 	// (2) allocate 2nd dimension ( overlap, derivative)
 	this->Table_NR[0] = new double*** [this->NL_nTpairs];
 	this->Table_NR[1] = new double*** [this->NL_nTpairs];
-
 	
 	// <1Phi|2Beta> 
 	for (int T1 = 0;  T1 < ntype ; T1++) // type 1 is orbital
@@ -236,8 +234,7 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 		{
 			// Tpair: type pair.
 			const int Tpair=this->NL_Tpair(T1,T2);
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			
+			const int Lmax1 = ORB.Phi[T1].getLmax();			
 			const int NBeta = ORB.nproj[T2];
 			
 			//-------------------------------------------------------------
@@ -262,7 +259,6 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
             {
                 for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
 				{
-					
 					// number of projectors.
 					for (int nb = 0; nb < NBeta; nb ++)
 					{
@@ -304,6 +300,7 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 							}
 
 							assert(nb < ORB.nproj[T2]);	
+
 							this->cal_VNL_PhiBeta_R(
 								pSB, // mohan add 2021-03-06
 								L,
@@ -327,11 +324,11 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 }
 
 
-void ORB_table_beta::Destroy_Table_Beta(void)
+void ORB_table_beta::Destroy_Table_Beta(LCAO_Orbitals &orb)
 {
 	if(!destroy_nr) return;
 
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	for(int ir = 0; ir < 2; ir ++)
 	{
 		for(int T1=0; T1<ntype; T1++)
@@ -340,7 +337,7 @@ void ORB_table_beta::Destroy_Table_Beta(void)
 			{
 				const int Tpair = this->NL_Tpair(T1,T2); 
 				const int L2plus1 = this->NL_L2plus1(T1,T2);
-				const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.nproj[T2]; 
+				const int pairs = orb.Phi[T1].getTotal_nchi() * orb.nproj[T2]; 
 
 				// mohan fix bug 2011-03-30
 				if(pairs ==0) continue;
@@ -402,11 +399,11 @@ void ORB_table_beta::init_NL_Tpair(void)
 
 
 
-void ORB_table_beta::init_NL_Opair(void)
+void ORB_table_beta::init_NL_Opair(LCAO_Orbitals &orb)
 {
-	const int lmax = ORB.get_lmax();
-	const int nchimax = ORB.get_nchimax();
-	const int nprojmax = ORB.nprojmax;
+	const int lmax = orb.get_lmax();
+	const int nchimax = orb.get_nchimax();
+	const int nprojmax = orb.nprojmax;
 	
 	// may have bug if we use all H!
 	if( nprojmax == 0)
@@ -427,13 +424,13 @@ void ORB_table_beta::init_NL_Opair(void)
 		{
 			const int nlpair = this->NL_Tpair(T1, T0);
 			int index = 0;
-			for(int L1=0; L1<ORB.Phi[T1].getLmax()+1; L1++)
+			for(int L1=0; L1<orb.Phi[T1].getLmax()+1; L1++)
 			{
-				for(int N1=0; N1<ORB.Phi[T1].getNchi(L1); N1++)
+				for(int N1=0; N1<orb.Phi[T1].getNchi(L1); N1++)
 				{
 					// notice !! T0 must be Beta( Nonlocal projector)
 					// mohan update 2011-03-07
-					for(int ip=0; ip<ORB.nproj[T0]; ip++)
+					for(int ip=0; ip<orb.nproj[T0]; ip++)
 					{
 						assert( nlpair < NL_nTpairs );
 						assert( L1 < lmax+1 );
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.h b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
index 17488fe18e..fe14ce1360 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
@@ -1,13 +1,9 @@
 #ifndef ORB_TABLE_BETA_H 
 #define ORB_TABLE_BETA_H 
 
-#include "src_pw/tools.h"
-#include "ORB_atomic.h"
-#include "ORB_atomic_lm.h"
-#include "ORB_nonlocal.h"
-#include "ORB_nonlocal_lm.h"
-#include "ORB_gaunt_table.h"
-#include "src_global/sph_bessel_recursive.h"
+#include "ORB_read.h" // use LCAO_Orbitals
+#include "ORB_atomic_lm.h" // use Numerical_Orbital_Lm
+#include "../src_global/sph_bessel_recursive.h" // use Sph_Bessel_Recursive
 
 class ORB_table_beta
 {
@@ -33,7 +29,9 @@ class ORB_table_beta
 	// O stands for orbitals.
 	//-------------------------
 	void init_NL_Tpair(void);
-    void init_NL_Opair(void);
+
+    void init_NL_Opair(LCAO_Orbitals &orb);
+
 	int NL_nTpairs;
 	IntArray NL_Tpair;
 	IntArray NL_Opair;
@@ -41,11 +39,12 @@ class ORB_table_beta
 
 	void init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB);
 
-	void Destroy_Table_Beta(void);
+	void Destroy_Table_Beta(LCAO_Orbitals &orb);
 
 	static int get_rmesh( const double &R1, const double &R2);
 
 	static double dr;
+
 	int Rmesh;
 
 	private:
@@ -66,6 +65,7 @@ class ORB_table_beta
 	double dk;
 	int nlm;
 	int kmesh;
+
 	double *kpoint;
 	double *r;
 	double *rab;
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
index a55f78d433..c79d52f091 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
@@ -1,7 +1,6 @@
-#include "ORB_table_phi.h"
-#include "ORB_read.h"
 #include <stdexcept>
-#include "../src_ri/exx_abfs.h"
+#include "ORB_table_phi.h"
+#include "../src_global/math_integral.h"
 
 double ORB_table_phi::dr = -1.0;
 
@@ -194,7 +193,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 		}
 		// Call simpson integration
 		double temp = 0.0;
-		Mathzone::Simpson_Integral(kmesh,integrated_func,dk,temp);
+
+		Integral::Simpson_Integral(kmesh,integrated_func,dk,temp);
 		rs[ir] = temp * FOUR_PI ;
 		
 		// Peize Lin accelerate 2017-10-02
@@ -215,9 +215,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = (jlp1_r[ik]-fac*jlm1_r[ik]) * k1_dot_k2_dot_kpoint[ik];
 			}
 		}
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,integrated_func,dk,temp);
+
+		Integral::Simpson_Integral(kmesh,integrated_func,dk,temp);
 		drs[ir] = -FOUR_PI*(l+1)/(2.0*l+1) * temp;
 	}
 
@@ -235,9 +234,7 @@ void ORB_table_phi::cal_ST_Phi12_R
 			integrated_func[ik] = k1_dot_k2[ik] * pow (kpoint[ik], l);
 		}
 		
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
 	}
 
@@ -326,10 +323,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 			integrated_func[ik] = jl_r[ik] * k1_dot_k2[ik];
 		}
 		double temp = 0.0;
-//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+		Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
 		rs[ir] = temp * FOUR_PI ;
 		
 		const vector<double> &jlm1_r = jlm1[ir];
@@ -349,10 +344,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = (jlp1_r[ik]-fac*jlm1_r[ik]) * k1_dot_k2_dot_kpoint[ik];
 			}
 		}
-//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+		Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
 		drs[ir] = -FOUR_PI*(l+1)/(2.0*l+1) * temp;
 	}
 
@@ -366,10 +359,9 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = k1_dot_k2[ik] * pow (kpoint[ik], l);
 			}
 			double temp = 0.0;
-	//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-			// PLEASE try to make Simpson_Integral as input parameters
-			// mohan note 2021-03-23
-			Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+			Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
 			// PLEASE try to make dualfac function as input parameters
 			// mohan note 2021-03-23
 			rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
@@ -383,11 +375,13 @@ void ORB_table_phi::cal_ST_Phi12_R
 
 
 
-void ORB_table_phi::init_Table( const int &job0 )
+void ORB_table_phi::init_Table(
+	const int &job0, 
+	LCAO_Orbitals &orb)
 {
 	TITLE("ORB_table_phi", "init_Table");
 	timer::tick("ORB_table_phi", "init_Table",'D');
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	assert( ORB_table_phi::dr > 0.0);
 	assert( OV_nTpairs>0);
 
@@ -430,8 +424,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 		{
 			// get the bigger lmax between two types
 			const int Tpair=this->OV_Tpair(T1,T2);
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			const int Lmax2 = ORB.Phi[T2].getLmax();
+			const int Lmax1 = orb.Phi[T1].getLmax();
+			const int Lmax2 = orb.Phi[T2].getLmax();
 
 			//L2plus1 could be reduced by considering Gaunt Coefficient
 			//remain to be modified
@@ -448,8 +442,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 			
 			const int L2plus1 =  2*lmax_now + 1;
 
-			const int nchi1 = ORB.Phi[T1].getTotal_nchi();
-			const int nchi2 = ORB.Phi[T2].getTotal_nchi();
+			const int nchi1 = orb.Phi[T1].getTotal_nchi();
+			const int nchi2 = orb.Phi[T2].getTotal_nchi();
 			const int pairs_chi = nchi1 * nchi2;
 
 			// init 2nd dimension
@@ -474,8 +468,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 				break;
 			}
 
-			const double Rcut1 = ORB.Phi[T1].getRcut();
-			const double Rcut2 = ORB.Phi[T2].getRcut();
+			const double Rcut1 = orb.Phi[T1].getRcut();
+			const double Rcut2 = orb.Phi[T2].getRcut();
 			assert(Rcut1>0.0 && Rcut1<100);
 			assert(Rcut2>0.0 && Rcut2<100);
 
@@ -484,11 +478,11 @@ void ORB_table_phi::init_Table( const int &job0 )
 			
 			for (int L1 = 0; L1 < Lmax1 + 1; L1++)
 			{
-				for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+				for (int N1 = 0; N1 < orb.Phi[T1].getNchi(L1); N1++)
 				{
 					for (int L2 = 0; L2 < Lmax2 + 1; L2 ++)
 					{
-						for (int N2 = 0; N2 < ORB.Phi[T2].getNchi(L2); N2++)
+						for (int N2 = 0; N2 < orb.Phi[T2].getNchi(L2); N2++)
 						{		
 							// get the second index.
 							const int Opair = this->OV_Opair(Tpair,L1,L2,N1,N2);
@@ -582,8 +576,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 									case 1:
 									{
 										this->cal_ST_Phi12_R(1,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_SR[0][Tpair][Opair][L],
 												Table_SR[1][Tpair][Opair][L]);
@@ -593,8 +587,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 									{
 
 										this->cal_ST_Phi12_R(2,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_TR[0][Tpair][Opair][L],
 												Table_TR[1][Tpair][Opair][L]);
@@ -603,15 +597,15 @@ void ORB_table_phi::init_Table( const int &job0 )
 									case 3:
 									{	
 										this->cal_ST_Phi12_R(1,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_SR[0][Tpair][Opair][L],
 												Table_SR[1][Tpair][Opair][L]);
 
 										this->cal_ST_Phi12_R(2,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_TR[0][Tpair][Opair][L],
 												Table_TR[1][Tpair][Opair][L]);
@@ -647,11 +641,11 @@ void ORB_table_phi::init_Table( const int &job0 )
 }
 
 
-void ORB_table_phi::Destroy_Table(void)
+void ORB_table_phi::Destroy_Table(LCAO_Orbitals &orb)
 {
 	if(!destroy_sr && !destroy_tr) return;
 	
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	int dim1 = 0;
 	for (int ir = 0; ir < 2; ir++)
 	{
@@ -661,10 +655,10 @@ void ORB_table_phi::Destroy_Table(void)
 			// means that T2 >= T1
     	    for (int T2 = T1; T2 < ntype; T2++)
         	{
-				const int Lmax1 = ORB.Phi[T1].getLmax();
-				const int Lmax2 = ORB.Phi[T2].getLmax();
+				const int Lmax1 = orb.Phi[T1].getLmax();
+				const int Lmax2 = orb.Phi[T2].getLmax();
 				const int lmax_now = std::max(Lmax1, Lmax2);
-				const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Phi[T2].getTotal_nchi();
+				const int pairs = orb.Phi[T1].getTotal_nchi() * orb.Phi[T2].getTotal_nchi();
 				
 				for (int dim2 = 0; dim2 < pairs; dim2++)
 				{
@@ -696,7 +690,7 @@ void ORB_table_phi::Destroy_Table(void)
 
 
 
-void ORB_table_phi::init_OV_Tpair(void)
+void ORB_table_phi::init_OV_Tpair(LCAO_Orbitals &orb)
 {
 	TITLE("ORB_table_phi","init_OV_Tpair");
     assert(ntype>0);
@@ -720,7 +714,7 @@ void ORB_table_phi::init_OV_Tpair(void)
             
 			++index;
 			// (2) pairs about lmax
-			this->OV_L2plus1(T1,T2) = max(ORB.Phi[T1].getLmax(), ORB.Phi[T2].getLmax() )*2+1;
+			this->OV_L2plus1(T1,T2) = max(orb.Phi[T1].getLmax(), orb.Phi[T2].getLmax() )*2+1;
 			this->OV_L2plus1(T2,T1) = this->OV_L2plus1(T1,T2);
         }
     }
@@ -729,10 +723,10 @@ void ORB_table_phi::init_OV_Tpair(void)
 
 
 
-void ORB_table_phi::init_OV_Opair(void)
+void ORB_table_phi::init_OV_Opair(LCAO_Orbitals &orb)
 {
-    const int lmax = ORB.get_lmax(); 
-    const int nchimax = ORB.get_nchimax();
+    const int lmax = orb.get_lmax(); 
+    const int nchimax = orb.get_nchimax();
 	assert(lmax+1 > 0);
 	assert(nchimax > 0);
 	assert(OV_nTpairs > 0);
@@ -747,27 +741,32 @@ void ORB_table_phi::init_OV_Opair(void)
         {
 			const int dim1 = this->OV_Tpair(T1,T2);
 			int index=0;
-            for(int L1=0; L1<ORB.Phi[T1].getLmax()+1; L1++)
+            for(int L1=0; L1<orb.Phi[T1].getLmax()+1; L1++)
             {
-                for(int N1=0; N1<ORB.Phi[T1].getNchi(L1); N1++)
+                for(int N1=0; N1<orb.Phi[T1].getNchi(L1); N1++)
                 {
-                    for(int L2=0; L2<ORB.Phi[T2].getLmax()+1; L2++)
+                    for(int L2=0; L2<orb.Phi[T2].getLmax()+1; L2++)
                     {
-                        for(int N2=0; N2<ORB.Phi[T2].getNchi(L2); N2++)
+                        for(int N2=0; N2<orb.Phi[T2].getNchi(L2); N2++)
                         {
                             this->OV_Opair(dim1, L1, L2, N1, N2) = index;
                             ++index;
-                        }
-                    }
-                }
-            }
-        }
-    }
+                        }// N2
+                    }// L2
+                }// N1
+            }// L1
+        }// T2
+    }// T1
     return;
 }
 
 // Peize Lin update 2016-01-26
-void ORB_table_phi::init_Lmax (const int orb_num, const int mode, int &Lmax_used, int &Lmax) const
+void ORB_table_phi::init_Lmax (
+	const int orb_num, 
+	const int mode, 
+	int &Lmax_used, 
+	int &Lmax,
+	const int &Lmax_exx) const
 {
 	auto cal_Lmax_Phi = [](int &Lmax)
 	{
@@ -805,7 +804,7 @@ void ORB_table_phi::init_Lmax (const int orb_num, const int mode, int &Lmax_used
 					Lmax_used = 2*Lmax + 1;
 					break;
 				case 2:			// used in <jY|jY> or <Abfs|Abfs>
-					Lmax = max(Lmax, Exx_Abfs::Lmax);
+					Lmax = max(Lmax, Lmax_exx);
 					Lmax_used = 2*Lmax + 1;
 					break;
 				case 3:                // used in berryphase by jingan
@@ -824,8 +823,8 @@ void ORB_table_phi::init_Lmax (const int orb_num, const int mode, int &Lmax_used
 				case 1:			// used in <jY|PhiPhi> or <Abfs|PhiPhi>
 					cal_Lmax_Phi(Lmax);
 					Lmax_used = 2*Lmax + 1;
-					Lmax = max(Lmax, Exx_Abfs::Lmax);
-					Lmax_used += Exx_Abfs::Lmax;
+					Lmax = max(Lmax, Lmax_exx);
+					Lmax_used += Lmax_exx;
 					break;
 				default:
 					throw invalid_argument("ORB_table_phi::init_Lmax orb_num=3, mode error");
@@ -853,11 +852,16 @@ void ORB_table_phi::init_Lmax (const int orb_num, const int mode, int &Lmax_used
 }
 
 // Peize Lin update 2016-01-26
-void ORB_table_phi::init_Table_Spherical_Bessel (const int orb_num, const int mode, int &Lmax_used, int &Lmax)
+void ORB_table_phi::init_Table_Spherical_Bessel (
+	const int orb_num, 
+	const int mode, 
+	int &Lmax_used, 
+	int &Lmax,
+	const int &Lmax_exx)
 {
 	TITLE("ORB_table_phi", "init_Table_Spherical_Bessel");
 
-	this->init_Lmax (orb_num,mode,Lmax_used,Lmax);		// Peize Lin add 2016-01-26
+	this->init_Lmax (orb_num,mode,Lmax_used,Lmax,Lmax_exx);		// Peize Lin add 2016-01-26
 
 	for( auto & sb : Sph_Bessel_Recursive_Pool::D2::sb_pool )
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index 416a51075a..594d1ed260 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -1,12 +1,9 @@
 #ifndef ORB_TABLE_PHI_H 
 #define ORB_TABLE_PHI_H 
 
-#include "src_pw/tools.h"
-#include "ORB_atomic.h"
+#include "ORB_read.h"
 #include "ORB_atomic_lm.h"
-#include "ORB_gaunt_table.h"
-#include "center2_orb.h"
-#include "src_global/sph_bessel_recursive.h"
+#include "../src_global/sph_bessel_recursive.h"
 #include <set>
 
 class ORB_table_phi
@@ -24,8 +21,11 @@ class ORB_table_phi
 		const double &dR_in,
 		const double &dk_in);
 
-	void init_Table(const int &job);
-	void Destroy_Table(void);
+	void init_Table(
+		const int &job,
+		LCAO_Orbitals &orb);
+
+	void Destroy_Table(LCAO_Orbitals &orb);
 
 	// Five dimension:
 	// (1) 0: normal (S(R)) ; 1: derivative( dS/dR )
@@ -40,13 +40,24 @@ class ORB_table_phi
 	bool destroy_tr;
 	
 	//=================================================
-	//make table of Spherical bessel
-	//Sph_Bes : jlx[kmesh][Rmesh][L]
-	//L should be 2*Lmax, which is max L of all type
+	// make table of Spherical bessel
+	// Sph_Bes : jlx[kmesh][Rmesh][L]
+	// L should be 2*Lmax, which is max L of all type
 	//=================================================
 	// Peize Lin update 2016-01-26
-	void init_Lmax (const int orb_num, const int mode, int &Lmax_used, int &Lmax) const;
-	void init_Table_Spherical_Bessel (const int orb_num, const int mode, int &Lmax_used, int &Lmax);
+	void init_Lmax(
+		const int orb_num, 
+		const int mode, 
+		int &Lmax_used, 
+		int &Lmax,
+		const int &Lmax_exx) const;
+
+	void init_Table_Spherical_Bessel(
+		const int orb_num, 
+		const int mode, 
+		int &Lmax_used, 
+		int &Lmax,
+		const int &Lmax_exx);
 
 	// Peize Lin add 2017-04-24, and change all jlx in this class
 	Sph_Bessel_Recursive::D2* pSB = nullptr;
@@ -61,8 +72,10 @@ class ORB_table_phi
 	// T stands for atom type.
 	// O stands for orbitals.
 	//-------------------------
-    void init_OV_Tpair(void);
-    void init_OV_Opair(void);
+
+    void init_OV_Tpair(LCAO_Orbitals &orb);
+    void init_OV_Opair(LCAO_Orbitals &orb);
+
 	int OV_nTpairs;
     IntArray OV_Tpair;
     IntArray OV_Opair;
@@ -76,7 +89,6 @@ class ORB_table_phi
 	static double dr;
 	int Rmesh;
 
-	private:
 
 	void cal_ST_Phi12_R(
 		const int &job,
@@ -97,6 +109,8 @@ class ORB_table_phi
 		double *rs,
 		double *drs) const;
 
+	private:
+
 	// variables
     int ntype;
 	int lmax;
@@ -108,7 +122,5 @@ class ORB_table_phi
 	double *r;
 	double *rab;
 	double *kab;	
-
-	friend class Center2_Orb::Orb11;			// Peize Lin add 2016-01-24
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.cpp b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
index 9051fd32a1..31424f7d6d 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
@@ -16,7 +16,7 @@
 #include "../src_pw/global.h"
 #include "global_fp.h"
 #include "../src_global/global_function.h"
-#include "../src_global/scalapack_connector.h"
+//#include "../src_global/scalapack_connector.h"
 #include "../src_global/lapack_connector.h"
 #include "../src_global/inverse_matrix.h"
 #include "LOOP_ions.h"
@@ -25,6 +25,26 @@
 #include "ORB_gen_tables.h"
 #include "../src_pw/charge.h"
 
+extern "C" // local prototypes for pzgemm_/pdgemm_; the scalapack_connector.h include in this file is commented out
+{
+  void pzgemm_( // std::complex<double> variant; every argument is passed by pointer
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const std::complex<double> *alpha,
+		const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
+		const std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
+		const std::complex<double> *beta,
+		std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);
+  // NOTE(review): presumably the PBLAS distributed matrix-multiply routines from the linked ScaLAPACK library - confirm
+  void pdgemm_( // double variant with the same argument layout as pzgemm_
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const double *alpha,
+		const double *A, const int *IA, const int *JA, const int *DESCA,
+		const double *B, const int *IB, const int *JB, const int *DESCB,
+		const double *beta,
+		double *C, const int *IC, const int *JC, const int *DESCC);
+}
 
 DFTU_RELAX::DFTU_RELAX(){}
 
@@ -144,7 +164,7 @@ void DFTU_RELAX::force_stress()
 				}
 				else
 				{
-					if(NSPIN==1 || NSPIN==4)
+					if(NSPIN==1)
 					{
 						double val = get_onebody_eff_pot(T1, iat1, L1, n1, 0, m1, m2, cal_type, false);
 						VU_k.at(0).at(irc) = complex<double>(val, 0.0);
@@ -191,13 +211,15 @@ void DFTU_RELAX::force_stress()
 }
 
 
-void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
+	// const double alpha = 1.0, beta = 0.0;
+  const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
 	
 	vector<vector<complex<double>>> ftmp(ucell.nat);
 	for(int ia=0; ia<ucell.nat; ia++)
@@ -208,7 +230,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 	vector<vector<complex<double>>> dm_VU_dSm(3);
 	for(int dim=0; dim<3; dim++)
 	{
-		dm_VU_dSm.at(dim).resize(ParaO.nloc, complex<double>(0.0, 0.0));
+		dm_VU_dSm.at(dim).resize(ParaO.nloc, zero);
 	}
 	
 	for(int ik=0; ik<kv.nks; ik++)	
@@ -217,8 +239,8 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 
 		for(int dim=0; dim<3; dim++)
 		{
-			vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
-			vector<complex<double>> force_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> mat_tmp(ParaO.nloc);
+			vector<complex<double>> force_tmp(ParaO.nloc);
 
 			if(dim==0) //dim=1,2 are same as dim=0
 			{
@@ -251,7 +273,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			//=========================================
 			ZEROS(VECTOR_TO_PTR(force_tmp), ParaO.nloc);
 
-			pzgemm_(&transN, &transT,
+			pzgemm_(&transN, &transC,
 				&NLOCAL, &NLOCAL, &NLOCAL,
 				&alpha, 
 				this->dSm_k[ik][dim], &one_int, &one_int, ParaO.desc, 
@@ -263,7 +285,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			{
 				dm_VU_dSm.at(dim).at(irc) -= force_tmp.at(irc);
 			}
-		}//end dim				
+		}//end dim
 	}//end ik
 
 	for(int dim=0; dim<3; dim++)
@@ -296,35 +318,36 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 		}
 	}
 
-
 	return;
 }
 
 
-void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
-	
+	//const double alpha = 1.0, beta = 0.0;
+	const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
+  
 	int count = 0;
 	for(int dim1=0; dim1<3; dim1++)
 	{
 		for(int dim2=dim1; dim2<3; dim2++)
 		{
-			vector<complex<double>> dm_VU_sover(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> dm_VU_sover(ParaO.nloc, zero);
 
 			for(int ik=0; ik<kv.nks; ik++)
 			{
 				const int spin = kv.isk[ik];
 				
 				// The first term
-				vector<complex<double>> stress_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> stress_tmp(ParaO.nloc);
 
 				//Calculate mat_tmp=dm*VU
-				vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> mat_tmp(ParaO.nloc);
 
 				pzgemm_(&transT, &transN,
 					&NLOCAL, &NLOCAL, &NLOCAL,
@@ -344,13 +367,14 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 				// The second term
 				ZEROS(VECTOR_TO_PTR(stress_tmp), ParaO.nloc);
 
-				pzgemm_(&transN, &transT,
+				pzgemm_(&transN, &transC,
 					&NLOCAL, &NLOCAL, &NLOCAL,
 					&alpha, 
 					this->soverlap_k[ik][count], &one_int, &one_int, ParaO.desc, 
@@ -360,7 +384,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -385,8 +410,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 			double val = stmp.real();
 			MPI_Allreduce(&val, &stress_dftu.at(dim1).at(dim2), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
-			complex<double> tmp;
-			MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+			// complex<double> tmp;
+			// MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 						
 			count++;
 		}//end dim2
@@ -413,7 +438,7 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 }
 
 
-void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_force_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_gamma");
 
@@ -578,7 +603,7 @@ void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
 }
 
 
-void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_stress_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_gamma");
 
@@ -637,7 +662,7 @@ void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -727,145 +752,179 @@ void DFTU_RELAX::folding_dSm_soverlap()
 	}
 	
 
-	Vector3<double> tau1, tau2, dtau;
-	Vector3<double> dtau1, dtau2, tau0;
+	  Vector3<double> tau1, tau2, dtau;
+	  Vector3<double> dtau1, dtau2, tau0;
     for(int T1=0; T1<ucell.ntype; ++T1)
     {
-		Atom* atom1 = &ucell.atoms[T1];
-        for(int I1=0; I1<atom1->na; ++I1)
+		  Atom* atom1 = &ucell.atoms[T1];
+      for(int I1=0; I1<atom1->na; ++I1)
+      {
+			  tau1 = atom1->tau[I1];
+        const int start1 = ucell.itiaiw2iwt(T1,I1,0);    
+
+        GridD.Find_atom(tau1, T1, I1);
+        for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
         {
-			tau1 = atom1->tau[I1];
-            
-            GridD.Find_atom(tau1, T1, I1);
-            for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
-            {
-                const int T2 = GridD.getType(ad);
-				const int I2 = GridD.getNatom(ad);
-
-				Atom* atom2 = &ucell.atoms[T2];
-
-				tau2 = GridD.getAdjacentTau(ad);
-				dtau = tau2 - tau1;
-
-				double distance = dtau.norm() * ucell.lat0;
-				double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();				
-
-				if(distance < rcut)
-				{
-					int iw1_all = ucell.itiaiw2iwt( T1, I1, 0) ; //iw1_all = combined index (it, ia, iw)
-
-					for(int jj=0; jj<atom1->nw*NPOL; ++jj)
-					{
-						const int jj0 = jj/NPOL;
-						const int L1 = atom1->iw2l[jj0];
-						const int N1 = atom1->iw2n[jj0];
-						const int m1 = atom1->iw2m[jj0];
-						int iw2_all = ucell.itiaiw2iwt( T2, I2, 0);
-
-						for(int kk=0; kk<atom2->nw*NPOL; ++kk)
-						{
-							const int kk0 = kk/NPOL;
-							const int L2 = atom2->iw2l[kk0];
-							const int N2 = atom2->iw2n[kk0];
-							const int m2 = atom2->iw2m[kk0];
-							
-							if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
-							{
-								++iw2_all;
-								continue;
-							}
-
-							int mu = ParaO.trace_loc_row[iw1_all];
-							int nu = ParaO.trace_loc_col[iw2_all];
-							int irc = nu*ParaO.nrow + mu;
-														
-							if(GAMMA_ONLY_LOCAL)
-							{
-								if(STRESS)
-								{
-									this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
-									this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
-								}
-							}
-							else
-							{
-								Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
-							
-								for(int ik=0; ik<kv.nks; ik++)
-								{								
-									const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
-									const complex<double> kphase = complex <double> ( cos(arg),  sin(arg) );
-
-									this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
-									this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
-									this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
-
-									if(STRESS)
-									{																												
-										this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
-										this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
-									}
-								}	
-							}
-																																																																				
-							++nnr;													
-							++iw2_all;
-						}// nw2 
-
-						++iw1_all;
-						
-					}// nw1
-				}// distance
-				else if(distance>=rcut)
-				{
-					int start1 = ucell.itiaiw2iwt( T1, I1, 0);
-					int start2 = ucell.itiaiw2iwt( T2, I2, 0);
-					bool is_adj = false;
-					for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
-					{
-						const int T0 = GridD.getType(ad0);
-						
-						tau0 = GridD.getAdjacentTau(ad0);
-						dtau1 = tau0 - tau1;
-						double distance1 = dtau1.norm() * ucell.lat0;
-						double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-						dtau2 = tau0 - tau2;
-						double distance2 = dtau2.norm() * ucell.lat0;
-						double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-						if(distance1<rcut1 && distance2<rcut2)
-						{
-							is_adj = true;
-							break;
-						}
-					}//ad0
-					if( is_adj )
-					{
-						for(int jj=0; jj<atom1->nw * NPOL; ++jj)
-						{
-							const int mu = ParaO.trace_loc_row[start1+jj];
-							if(mu<0)continue; 
-
-							for(int kk=0; kk<atom2->nw * NPOL; ++kk)
-							{
-								const int nu = ParaO.trace_loc_col[start2+kk];
-								if(nu<0)continue;
-
-								++nnr;
-							}//kk
-						}//jj
-					}
-				}//distance
-			}// ad
-		}// I1
-	}// T1
+          const int T2 = GridD.getType(ad);
+				  const int I2 = GridD.getNatom(ad);
+          const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+
+				  Atom* atom2 = &ucell.atoms[T2];
+
+				  tau2 = GridD.getAdjacentTau(ad);
+				  dtau = tau2 - tau1;
+
+				  double distance = dtau.norm() * ucell.lat0;
+				  double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+
+          bool adj = false;
+				  if(distance < rcut) adj = true;
+				  else if(distance >= rcut)
+				  {
+				  	for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
+				  	{
+				  		const int T0 = GridD.getType(ad0); 
+				  		const int I0 = GridD.getNatom(ad0); 
+				  		const int iat0 = ucell.itia2iat(T0, I0);
+				  		const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
+
+				  		tau0 = GridD.getAdjacentTau(ad0);
+				  		dtau1 = tau0 - tau1;
+				  		dtau2 = tau0 - tau2;
+
+				  		double distance1 = dtau1.norm() * ucell.lat0;
+				  		double distance2 = dtau2.norm() * ucell.lat0;
+
+				  		double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  		double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+
+				  		if( distance1 < rcut1 && distance2 < rcut2 )
+				  		{
+				  			adj = true;
+				  			break;
+				  		}
+				  	}
+				  }				
+
+				  if(adj)
+				  {
+				  	for(int jj=0; jj<atom1->nw*NPOL; ++jj)
+				  	{
+              const int jj0 = jj/NPOL;
+
+              const int iw1_all = start1 + jj0; 
+              const int mu = ParaO.trace_loc_row[iw1_all];
+					    if(mu<0)continue;
+
+				  		const int L1 = atom1->iw2l[jj0];
+				  		const int N1 = atom1->iw2n[jj0];
+				  		const int m1 = atom1->iw2m[jj0];
+
+
+				  		for(int kk=0; kk<atom2->nw*NPOL; ++kk)
+				  		{
+                const int kk0 = kk/NPOL;
+
+                const int iw2_all = start2 + kk0;
+						    const int nu = ParaO.trace_loc_col[iw2_all];
+						    if(nu<0)continue;
+
+				  			const int L2 = atom2->iw2l[kk0];
+				  			const int N2 = atom2->iw2n[kk0];
+				  			const int m2 = atom2->iw2m[kk0];
+  
+				  			// if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
+				  			// {
+				  				// ++iw2_all;
+				  				// continue;
+				  			// }
+
+				  			// int mu = ParaO.trace_loc_row[iw1_all];
+				  			// int nu = ParaO.trace_loc_col[iw2_all];
+				  			int irc = nu*ParaO.nrow + mu;
+  
+				  			if(GAMMA_ONLY_LOCAL)
+							  {
+							  	if(STRESS)
+							  	{
+							  		this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
+							  		this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
+							  	}
+							  }
+				  			else
+				  			{
+				  				Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
+  
+				  				for(int ik=0; ik<kv.nks; ik++)
+				  				{
+				  					const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
+				  					const complex<double> kphase( cos(arg),  sin(arg) );
+
+				  					this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
+				  					this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
+				  					this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
+
+				  					if(STRESS)
+				  					{		
+				  						this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
+				  						this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
+				  					}
+				  				}
+				  			}
+				  			++nnr;
+				  		}// kk
+				    }// jj
+				  }// adj
+				  // else if(distance>=rcut)
+				  // {
+				  	// int start1 = ucell.itiaiw2iwt( T1, I1, 0);
+				  	// int start2 = ucell.itiaiw2iwt( T2, I2, 0);
+				  	// bool is_adj = false;
+				  	// for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
+				  	// {
+				  	// 	const int T0 = GridD.getType(ad0);
+				  		
+				  	// 	tau0 = GridD.getAdjacentTau(ad0);
+				  	// 	dtau1 = tau0 - tau1;
+				  	// 	double distance1 = dtau1.norm() * ucell.lat0;
+				  	// 	double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	dtau2 = tau0 - tau2;
+				  	// 	double distance2 = dtau2.norm() * ucell.lat0;
+				  	// 	double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	if(distance1<rcut1 && distance2<rcut2)
+				  	// 	{
+				  	// 		is_adj = true;
+				  	// 		break;
+				  	// 	}
+				  	// }//ad0
+				  	// if( is_adj )
+				  	// {
+				  // 		for(int jj=0; jj<atom1->nw * NPOL; ++jj)
+				  // 		{
+				  // 			const int mu = ParaO.trace_loc_row[start1+jj];
+				  // 			if(mu<0) continue; 
+
+				  // 			for(int kk=0; kk<atom2->nw * NPOL; ++kk)
+				  // 			{
+				  // 				const int nu = ParaO.trace_loc_col[start2+kk];
+				  // 				if(nu<0) continue;
+
+				  // 				++nnr;
+				  // 			}//kk
+				  // 		}//jj
+				  // 	// }
+				  // }//distance
+			  }// ad
+		  }// I1
+	  }// T1
 
 	return;
 }
@@ -944,7 +1003,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_gamma[i];
 			}
 			delete [] soverlap_gamma;
-
+      soverlap_gamma=nullptr;
 		}
 	}
 	else
@@ -962,6 +1021,7 @@ void DFTU_RELAX::erase_force_stress()
 			delete [] dSm_k[ik];
 		}
 		delete [] dSm_k;
+    dSm_k = nullptr;
 
 		if(STRESS)
 		{
@@ -978,6 +1038,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_k[ik];
 			}
 			delete [] soverlap_k;
+      soverlap_k = nullptr;
 		}
 	}
 			
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.h b/ABACUS.develop/source/src_lcao/dftu_relax.h
index 9908c7478d..9550359ac9 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.h
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.h
@@ -27,10 +27,10 @@ class DFTU_RELAX : public DFTU_Yukawa
     void folding_dSm_soverlap();
     void allocate_force_stress();
     void erase_force_stress();
-    void cal_force_k(vector<vector<complex<double>>> &VU);
-    void cal_force_gamma(vector<vector<double>> &VU);
-    void cal_stress_k(vector<vector<complex<double>>> &VU);
-    void cal_stress_gamma(vector<vector<double>> &VU);
+    void cal_force_k(const vector<vector<complex<double>>> &VU);
+    void cal_force_gamma(const vector<vector<double>> &VU);
+    void cal_stress_k(const vector<vector<complex<double>>> &VU);
+    void cal_stress_gamma(const vector<vector<double>> &VU);
 
     double get_onebody_eff_pot
     (
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
index f789958888..9a502bc730 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
@@ -20,13 +20,6 @@
 #include "LOOP_ions.h"
 #include "LCAO_matrix.h"
 
-
-extern "C"
-{
-	void sphbsl_(int *n, double *r, double *A, double *val);
-	void sphhnk_(int *n, double *r, double *A, double *val);
-}
-
 DFTU_Yukawa::DFTU_Yukawa(){}
 
 DFTU_Yukawa::~DFTU_Yukawa(){}
@@ -95,14 +88,14 @@ void DFTU_Yukawa::cal_slater_Fk(const int L, const int T)
 						int l = 2*k;
 						if(ir0<ir1)  //less than
 						{
-						 	sphbsl_(&l, &r0, &lambda, &bslval);
-							sphhnk_(&l, &r1, &lambda, &hnkval);
+						 	bslval=this->spherical_Bessel(l, r0, lambda);
+							hnkval=this->spherical_Hankel(l, r1, lambda);
 						}
 						else //greater than
 						{
-						 	sphbsl_(&l, &r1, &lambda, &bslval);
-							sphhnk_(&l, &r0, &lambda, &hnkval);
-						}					
+						 	bslval=this->spherical_Bessel(l, r1, lambda);
+							hnkval=this->spherical_Hankel(l, r0, lambda);
+						}				
 						this->Fk.at(T).at(L).at(chi).at(k) -= (4*k+1)*lambda*pow(R_L0,2)*bslval*hnkval*pow(R_L1,2)*pow(r0,2)*pow(r1,2)*rab0*rab1;					
 					}
 				}
@@ -219,6 +212,69 @@ void DFTU_Yukawa::cal_slater_UJ(const int istep, const int iter)
 	return;
 }
 
+// Returns the order-k (k = 0, 2, 4, 6) sinh/cosh "Bessel" factor evaluated at x = r*lambda; any other k yields 0.0.
+double DFTU_Yukawa::spherical_Bessel(const int k, const double r, const double lambda)
+{
+  TITLE("DFTU_Yukawa", "spherical_Bessel");
+
+  double val = 0.0; // defined fallback: previously an unsupported order returned an uninitialised value
+  double x=r*lambda; // the only quantity the formulas below depend on
+  if(k==0)
+  {
+    if(x < 1.0e-3) val=1+pow(x,2)/6.0; // small x: truncated power series instead of the closed form
+    else val = sinh(x)/x;
+  }
+  else if(k==2)
+  {
+    if(x < 1.0e-2) val=-pow(x,2)/15.0 -pow(x,4)/210.0 - pow(x,6)/7560.0;
+    else val = 3*cosh(x)/pow(x,2) + (-3-pow(x,2))*sinh(x)/pow(x,3);
+  }
+  else if(k==4)
+  {
+    if(x < 5.0e-1) val=pow(x,4)/945.0 + pow(x,6)/20790.0 + pow(x,8)/1081080.0 + pow(x,10)/97297200.0;
+    else val = -5*(21+2*pow(x,2))*cosh(x)/pow(x,4)+(105+45*pow(x,2)+pow(x,4))*sinh(x)/pow(x,5);
+  }
+  else if(k==6)
+  {
+    if(x < 9.0e-1) val=-pow(x,6)/135135.0-pow(x,8)/4054050.0-pow(x,10)/275675400.0;
+    else val = 21*(495+60*pow(x,2)+pow(x,4))*cosh(x)/pow(x,6) + 
+              (-10395-4725*pow(x,2)-210*pow(x,4)-pow(x,6))*sinh(x)/pow(x,7);
+  }
+  return val; // 0.0 when k is not one of the implemented orders
+}
+
+// Returns the order-k (k = 0, 2, 4, 6) exp(-x) "Hankel" factor evaluated at x = r*lambda; any other k yields 0.0.
+double DFTU_Yukawa::spherical_Hankel(const int k, const double r, const double lambda)
+{
+  TITLE("DFTU_Yukawa", "spherical_Hankel"); // was mislabelled "spherical_Bessel" (copy-paste slip)
+
+  double val = 0.0; // defined fallback: previously an unsupported order returned an uninitialised value
+  double x=r*lambda; // every branch below divides by a power of x, so x must be non-zero
+  if(k==0)
+  {
+    if(x < 1.0e-3) val=-1/x + 1 -x/2.0 + pow(x,2)/6.0; // small x: truncated series instead of the closed form
+    else val = -exp(-x)/x;
+  }
+  else if(k==2)
+  {
+    if(x < 1.0e-2) val=3/pow(x,3)-1/(2*x)+x/8-pow(x,2)/15.0+pow(x,3)/48.0;
+    else val = exp(-x)*(3+3*x+pow(x,2))/pow(x,3);
+  }
+  else if(k==4)
+  {
+    if(x < 5.0e-1) val=-105/pow(x,5) + 15/(2*pow(x,3)) - 3/(8*x) + x/48 - pow(x,3)/384.0+pow(x,4)/945.0;
+    else val = -exp(-x)*(105+105*x+45*pow(x,2)+10*pow(x,3)+pow(x,4))/pow(x,5);
+  }
+  else if(k==6)
+  {
+    if(x < 9.0e-1) val=10395/pow(x,7) - 945/(2*pow(x,5)) + 105/(8*pow(x,3)) -5/(16*x)
+                        +x/128.0-pow(x,3)/3840.0 + pow(x,5)/46080.0 - pow(x,6)/135135.0;
+    else val = exp(-x)*(10395+10395*x+4725*pow(x,2)+1260*pow(x,3)+210*pow(x,4) + 
+                21*pow(x,5)+pow(x,6))/pow(x,7);
+  }
+  return val; // 0.0 when k is not one of the implemented orders
+}
+
 /*
 void DFTU::cal_unscreened_slater_Fk(const int L, const int T)
 {
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.h b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
index 8ed5791cd2..49c7813a1f 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.h
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
@@ -23,6 +23,9 @@ class DFTU_Yukawa
     void cal_yukawa_lambda();
     void cal_slater_UJ(const int istep, const int iter);
 
+    double spherical_Bessel(const int k, const double r, const double lambda);
+    double spherical_Hankel(const int k, const double r, const double lambda);
+
     //void cal_unscreened_slater_Fk(const int L, const int T); //L:angular momnet, T:atom type
     //void cal_slater_Vsc(const int T, const int L);
 
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
index b0b883d2c5..b9e14831f1 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
@@ -2,6 +2,7 @@
 #include "grid_technique.h"
 #include "ORB_read.h"
 #include "../src_pw/global.h"
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_env(const double* wfc, double* rho)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
index 782830c817..21cc803710 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
@@ -5,6 +5,7 @@
 #include "src_global/blas_connector.h"
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_force(const double* vlocal_in)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
index 3f88752f27..e7b094ed80 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
@@ -4,6 +4,7 @@
 #include "../src_pw/global.h"
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_mulliken(double** mulliken)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
index e594d00c35..3a042204b5 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
@@ -6,6 +6,7 @@
 #include <mkl_service.h>
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::setVindex(const int ncyz, const int ibx, const int jby, const int kbz, int* vindex) const
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
index 207aa636bb..16bc32dd86 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
@@ -6,6 +6,7 @@
 #include <mkl_service.h>
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 //#include <vector>
 
 extern "C"
diff --git a/ABACUS.develop/source/src_lcao/gint_k.cpp b/ABACUS.develop/source/src_lcao/gint_k.cpp
index 4249aff9a7..55b6c3426d 100644
--- a/ABACUS.develop/source/src_lcao/gint_k.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_k.cpp
@@ -39,7 +39,7 @@ void Gint_k::allocate_pvpR(void)
 		WARNING_QUIT("Gint_k::allocate_pvpR","pvpR has been allocated!");
 	}
 
-//	reduced = NURSE; 
+	//	reduced = NURSE; 
 	//xiaohui modify 2015-05-30
 	//cout << " reduced algorithm for grid integration = " << reduced << endl;
 
@@ -139,8 +139,11 @@ void Gint_k::destroy_pvpR(void)
 
 // fold the <phi | vl |dphi(R)> * DM(R) to 
 // calculate the force.
-void Gint_k::folding_force(matrix& fvl_dphi,
-	double* pvdpx, double* pvdpy, double* pvdpz)
+void Gint_k::folding_force(
+	matrix& fvl_dphi,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz)
 {
 	TITLE("Gint_k","folding_force");
 	timer::tick("Gint_k","folding_force");
@@ -311,9 +314,18 @@ void Gint_k::folding_force(matrix& fvl_dphi,
 
 // fold the <phi | vl * R_beta|dphi(R_alpha)> * DM(R) to 
 // calculate the stress.
-void Gint_k::folding_stress(matrix& fvl_dphi, matrix& svl_dphi,
-	double* pvdpx, double* pvdpy, double* pvdpz,
-	double* pvdp11, double* pvdp22, double* pvdp33,double* pvdp12, double* pvdp13, double* pvdp23)
+void Gint_k::folding_stress(
+	matrix& fvl_dphi, 
+	matrix& svl_dphi,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz,
+	double* pvdp11, 
+	double* pvdp22, 
+	double* pvdp33,
+	double* pvdp12, 
+	double* pvdp13, 
+	double* pvdp23)
 {
 	TITLE("Gint_k","folding_stress");
 	timer::tick("Gint_k","folding_stress");
@@ -1132,9 +1144,15 @@ void Gint_k::folding_vl_k_nc(const int &ik)
 	return;
 }
 
-void Gint_k::set_ijk_atom(const int &grid_index, const int &size,
-	double*** psir_ylm, double*** dr, bool** cal_flag, 
-	double** distance, double* ylma, const double &delta_r)
+void Gint_k::set_ijk_atom(
+	const int &grid_index, 
+	const int &size,
+	double*** psir_ylm, 
+	double*** dr, 
+	bool** cal_flag, 
+	double** distance, 
+	double* ylma, 
+	const double &delta_r)
 {
 	const Numerical_Orbital_Lm* pointer;
 	double mt[3];
@@ -1350,72 +1368,7 @@ void Gint_k::destroy_pvpR_tr(void)
 void Gint_k::distribute_pvpR_tr(void)
 {
     TITLE("Gint_k","distribute_pvpR_tr");
-/*
-    int lgd = 0;
-    double R_minX = GridD.getD_minX();
-    double R_minY = GridD.getD_minY();
-    double R_minZ = GridD.getD_minZ();
-
-    int R_x;
-    int R_y;
-    int R_z;
-
-    Vector3<double> tau1, dtau, dR;
-    for(int T1=0; T1<ucell.ntype; ++T1)
-    {
-        for(int I1=0; I1<ucell.atoms[T1].na; ++I1)
-        {
-            const int iat = ucell.itia2iat(T1,I1);
-            // atom in this grid piece.
-            if(GridT.in_this_processor[iat])
-            {
-                Atom* atom1 = &ucell.atoms[T1];
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
 
-                // get the start positions of elements.
-                const int DM_start = LNNR.nlocstartg[iat];
-
-                // get the coordinates of adjacent atoms.
-                tau1 = ucell.atoms[T1].tau[I1];
-                //GridD.Find_atom(tau1);	
-                GridD.Find_atom(tau1, T1, I1);
-                // search for the adjacent atoms.
-                int nad = 0;
-
-int adj_number = 0;
-                for(int ad = 0; ad < GridD.getAdjacentNum()+1; ad++)
-                {
-                    // get iat2
-                    const int T2 = GridD.getType(ad);
-                    const int I2 = GridD.getNatom(ad);
-                    const int iat2 = ucell.itia2iat(T2, I2);
-
-                    // adjacent atom is also on the grid.
-                    if(GridT.in_this_processor[iat2])
-                    {
-int index = 0;
-                        Atom* atom2 = &ucell.atoms[T2];
-                        dtau = GridD.getAdjacentTau(ad) - tau1;
-                        double distance = dtau.norm() * ucell.lat0;
-                        double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-
-                        // for the local part, only need to calculate <phi_i | phi_j> within range
-                        // mohan note 2012-07-06
-                        if(distance < rcut)
-                        {
-adj_number++;
-                            const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-
-                            // calculate the distance between iat1 and iat2.
-                            // Vector3<double> dR = GridD.getAdjacentTau(ad) - tau1;
-                            dR.x = GridD.getBox(ad).x;
-                            dR.y = GridD.getBox(ad).y;
-                            dR.z = GridD.getBox(ad).z;
-
-                            R_x = (int) (dR.x -R_minX);
-                            R_y = (int) (dR.y -R_minY);
-                            R_z = (int) (dR.z -R_minZ);
-*/
     int R_x = GridD.getCellX();
     int R_y = GridD.getCellY();
     int R_z = GridD.getCellZ();
@@ -1497,6 +1450,7 @@ adj_number++;
     return;
 }
 
+
 void Gint_k::cal_vlocal_R(const int current_spin)
 {
     TITLE("Gint_k","cal_vlocal_R");
diff --git a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
index 110f8da139..bfa8f85e68 100644
--- a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
@@ -1,9 +1,10 @@
 #include "gint_k.h"
 #include "../src_pw/global.h"
 #include "LCAO_nnr.h"
-
 #include "global_fp.h" // mohan add 2021-01-30
 
+#include "../src_global/ylm.h"
+
 void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 {
 	TITLE("Gint_k","cal_force");
@@ -24,8 +25,6 @@ void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 	{
 		nnrg = 1;
 	}
-	
-		
 
 	// to store < phi | vlocal | dphi>
 	double* pvdpx = new double[nnrg];
@@ -199,226 +198,247 @@ void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 	return;
 }
 
-void Gint_k::svl_k_RealSpace(matrix& fvl_dphi, matrix& svl_dphi, const double *vl)
+void Gint_k::svl_k_RealSpace(
+	matrix& fvl_dphi, 
+	matrix& svl_dphi, 
+	const double *vl)
 {
-        TITLE("Gint_k","cal_stress");
-        timer::tick("Gint_k","cal_stress");
+	TITLE("Gint_k","cal_stress");
+	timer::tick("Gint_k","cal_stress");
 
-        if(!this->reduced)
-        {
-                WARNING_QUIT("Gint_k::cal_stress_k","The stress with k can only with reduced H.");
-        }
+	if(!this->reduced)
+	{
+		WARNING_QUIT("Gint_k::cal_stress_k","The stress with k can only with reduced H.");
+	}
 
-        int nnrg = LNNR.nnrg;
+	int nnrg = LNNR.nnrg;
 
-        if(OUT_LEVEL != "m") ofs_running << " LNNR.nnrg in cal_force_k = " << LNNR.nnrg << endl;
-        assert(nnrg>=0);
+	if(OUT_LEVEL != "m") ofs_running << " LNNR.nnrg in cal_force_k = " << LNNR.nnrg << endl;
+	assert(nnrg>=0);
 
-        // just because to make thea arrys meaningful.
-        if(LNNR.nnrg == 0)
-        {
-                nnrg = 1;
-        }
+	// force nnrg to at least 1 so that the arrays allocated below are never zero-length.
+	if(LNNR.nnrg == 0)
+	{
+		nnrg = 1;
+	}
 
-        // to store < phi | vlocal | dphi>
-        double* pvdpx = new double[nnrg];
-        double* pvdpy = new double[nnrg];
-        double* pvdpz = new double[nnrg];
-        double* pvdp11 = new double[nnrg];
-        double* pvdp22 = new double[nnrg];
-        double* pvdp33 = new double[nnrg];
-        double* pvdp12 = new double[nnrg];
-        double* pvdp13 = new double[nnrg];
-        double* pvdp23 = new double[nnrg];
-        ZEROS(pvdpx, nnrg);
-        ZEROS(pvdpy, nnrg);
-        ZEROS(pvdpz, nnrg);
-        ZEROS(pvdp11, nnrg);
-        ZEROS(pvdp22, nnrg);
-        ZEROS(pvdp33, nnrg);
-        ZEROS(pvdp12, nnrg);
-        ZEROS(pvdp13, nnrg);
-        ZEROS(pvdp23, nnrg);
+	// to store < phi | vlocal | dphi>
+	double* pvdpx = new double[nnrg];
+	double* pvdpy = new double[nnrg];
+	double* pvdpz = new double[nnrg];
+	double* pvdp11 = new double[nnrg];
+	double* pvdp22 = new double[nnrg];
+	double* pvdp33 = new double[nnrg];
+	double* pvdp12 = new double[nnrg];
+	double* pvdp13 = new double[nnrg];
+	double* pvdp23 = new double[nnrg];
+	ZEROS(pvdpx, nnrg);
+	ZEROS(pvdpy, nnrg);
+	ZEROS(pvdpz, nnrg);
+	ZEROS(pvdp11, nnrg);
+	ZEROS(pvdp22, nnrg);
+	ZEROS(pvdp33, nnrg);
+	ZEROS(pvdp12, nnrg);
+	ZEROS(pvdp13, nnrg);
+	ZEROS(pvdp23, nnrg);
 
 
-    const double delta_r = ORB.dr_uniform;
-    // it's a uniform grid to save orbital values, so the delta_r is a constant.
-    const int max_size = GridT.max_atom;
-    // how many meshcells in bigcell.
-    const int bxyz = GridT.bxyz;
-
-        double*** dr;// vectors between atom and grid: [bxyz, maxsize, 3]
-        double** distance; // distance between atom and grid: [bxyz, maxsize]
-        double*** psir_ylm;
-        bool** cal_flag;
-        double* ylma;
-        double*** dphi_x;
-        double*** dphi_y;
-        double*** dphi_z;
-    if(max_size!=0)
-    {
-        dr = new double**[bxyz];
-        distance = new double*[bxyz];
-        psir_ylm = new double**[bxyz];
-        cal_flag = new bool*[bxyz];
-                dphi_x = new double**[bxyz];
-                dphi_y = new double**[bxyz];
-                dphi_z = new double**[bxyz];
+	const double delta_r = ORB.dr_uniform;
+	// it's a uniform grid to save orbital values, so the delta_r is a constant.
+	const int max_size = GridT.max_atom;
+	// how many meshcells in bigcell.
+	const int bxyz = GridT.bxyz;
 
-        // mohan fix bug 2011-05-02
-        int nn = 0;
-        for(int it=0; it<ucell.ntype; it++)
-        {
-            nn = max(nn, (ucell.atoms[it].nwl+1)*(ucell.atoms[it].nwl+1));
-        }
-        ylma = new double[nn];
-        ZEROS(ylma, nn);
+	double*** dr;// vectors between atom and grid: [bxyz, maxsize, 3]
+	double** distance; // distance between atom and grid: [bxyz, maxsize]
+	double*** psir_ylm;
+	bool** cal_flag;
+	double* ylma;
+	double*** dphi_x;
+	double*** dphi_y;
+	double*** dphi_z;
 
-        for(int i=0; i<bxyz; i++)
-        {
-            dr[i] = new double*[max_size];
-            psir_ylm[i] = new double*[max_size];
-            distance[i] = new double[max_size];
-            cal_flag[i] = new bool[max_size];
-                        dphi_x[i] = new double*[max_size];
-                        dphi_y[i] = new double*[max_size];
-                        dphi_z[i] = new double*[max_size];
+	if(max_size!=0)
+	{
+		dr = new double**[bxyz];
+		distance = new double*[bxyz];
+		psir_ylm = new double**[bxyz];
+		cal_flag = new bool*[bxyz];
+		dphi_x = new double**[bxyz];
+		dphi_y = new double**[bxyz];
+		dphi_z = new double**[bxyz];
 
-            ZEROS(distance[i], max_size);
-            ZEROS(cal_flag[i], max_size);
+		// mohan fix bug 2011-05-02
+		int nn = 0;
+		for(int it=0; it<ucell.ntype; it++)
+		{
+			nn = max(nn, (ucell.atoms[it].nwl+1)*(ucell.atoms[it].nwl+1));
+		}
+		ylma = new double[nn];
+		ZEROS(ylma, nn);
 
-            for(int j=0; j<max_size; j++)
-            {
-                dr[i][j] = new double[3];
-                psir_ylm[i][j] = new double[ucell.nwmax];
-                                dphi_x[i][j] = new double[ucell.nwmax];
-                                dphi_y[i][j] = new double[ucell.nwmax];
-                                dphi_z[i][j] = new double[ucell.nwmax];
-                ZEROS(dr[i][j],3);
-                ZEROS(psir_ylm[i][j],ucell.nwmax);
-                ZEROS(dphi_x[i][j],ucell.nwmax);
-                ZEROS(dphi_y[i][j],ucell.nwmax);
-                ZEROS(dphi_z[i][j],ucell.nwmax);
-            }
-        }
-    }
+		for(int i=0; i<bxyz; i++)
+		{
+			dr[i] = new double*[max_size];
+			psir_ylm[i] = new double*[max_size];
+			distance[i] = new double[max_size];
+			cal_flag[i] = new bool[max_size];
+			dphi_x[i] = new double*[max_size];
+			dphi_y[i] = new double*[max_size];
+			dphi_z[i] = new double*[max_size];
 
-    assert(this->ncxyz!=0);
-    const double dv = ucell.omega/this->ncxyz;
-    int vl_index=0;
-    double* vldr3 = new double[bxyz];
-    ZEROS(vldr3, bxyz);
+			ZEROS(distance[i], max_size);
+			ZEROS(cal_flag[i], max_size);
 
-        for(int i=0; i<nbx; i++)
-        {
-                for(int j=0; j<nby; j++)
-                {
-                        for(int k=nbz_start; k<nbz_start+nbz; k++)
-                        {
-                                const int grid_index = (k-nbz_start) + j * nbz + i * nby * nbz;
-                                const int size = GridT.how_many_atoms[ grid_index ];
-                                if(size==0) continue;
-
-                                //---------------------------------
-                                // get the wave functions in this
-                                // grid.
-                                //---------------------------------
-                                this->set_ijk_atom_force(grid_index, size,
-                                psir_ylm, dr, cal_flag,
-                                distance, ylma, delta_r,
-                                dphi_x, dphi_y, dphi_z);
-
-                                int bindex = 0;
-                                // z is the fastest,
-                                for(int ii=0; ii<pw.bx; ii++)
-                                {
-                                        for(int jj=0; jj<pw.by; jj++)
-                                        {
-                                                for(int kk=0; kk<pw.bz; kk++)
-                                                {
-                                                        const int iii = i*pw.bx + ii;
-                                                        const int jjj = j*pw.by + jj;
-                                                        const int kkk = k*pw.bz + kk;
-                                                        vl_index = (kkk-pw.nczp_start) + jjj*pw.nczp + iii*pw.ncy*pw.nczp;
-                                                        vldr3[bindex] = vl[ vl_index ] * dv;
-                                                //        vldr3[bindex] = dv; // for overlap test
-
-                                                        ++bindex;
-                                                }
-                                        }
-                                }
-//cout<<"loop  "<<i<<" "<<j<<" "<<k<<endl;//test
-
-                                this->evaluate_vl_stress(grid_index, size,i,j,k,
-                                        psir_ylm, cal_flag, vldr3, distance,
-                                        dphi_x, dphi_y, dphi_z,
-                                        pvdpx, pvdpy, pvdpz,
-                                        pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23, dr, GridT);
-                        }// int k
-                }// int j
-        } // int i
-
-
-        //---------------------------------------
-        // Folding R here
-        //---------------------------------------
-
-
-        //LM.DHloc_fixedR_x
-        this->folding_stress(fvl_dphi, svl_dphi, pvdpx, pvdpy, pvdpz,
-                             pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23);
-    
-        delete[] pvdpx;
-        delete[] pvdpy;
-        delete[] pvdpz;
-        delete[] pvdp11;
-        delete[] pvdp22;
-        delete[] pvdp33;
-        delete[] pvdp12;
-        delete[] pvdp13;
-        delete[] pvdp23;
+			for(int j=0; j<max_size; j++)
+			{
+				dr[i][j] = new double[3];
+				psir_ylm[i][j] = new double[ucell.nwmax];
+				dphi_x[i][j] = new double[ucell.nwmax];
+				dphi_y[i][j] = new double[ucell.nwmax];
+				dphi_z[i][j] = new double[ucell.nwmax];
+				ZEROS(dr[i][j],3);
+				ZEROS(psir_ylm[i][j],ucell.nwmax);
+				ZEROS(dphi_x[i][j],ucell.nwmax);
+				ZEROS(dphi_y[i][j],ucell.nwmax);
+				ZEROS(dphi_z[i][j],ucell.nwmax);
+			}
+		}
+	}
 
-    delete[] vldr3;
-    if(max_size!=0)
-    {
-        for(int i=0; i<pw.bxyz; i++)
-        {
-            for(int j=0; j<max_size; j++)
-            {
-                delete[] dr[i][j];
-                delete[] psir_ylm[i][j];
-                                delete[] dphi_x[i][j];
-                                delete[] dphi_y[i][j];
-                                delete[] dphi_z[i][j];
-            }
-            delete[] dr[i];
-            delete[] distance[i];
-            delete[] psir_ylm[i];
-            delete[] cal_flag[i];
-                        delete[] dphi_x[i];
-                        delete[] dphi_y[i];
-                        delete[] dphi_z[i];
-        }
-        delete[] dr;
-        delete[] distance;
-        delete[] psir_ylm;
-                delete[] dphi_x;
-                delete[] dphi_y;
-                delete[] dphi_z;
-        delete[] cal_flag;
+	assert(this->ncxyz!=0);
+	const double dv = ucell.omega/this->ncxyz;
+	int vl_index=0;
+	double* vldr3 = new double[bxyz];
+	ZEROS(vldr3, bxyz);
 
-        delete[] ylma;
-    }
-        timer::tick("Gint_k","cal_stress");
-        return;
+	for(int i=0; i<nbx; i++)
+	{
+		for(int j=0; j<nby; j++)
+		{
+			for(int k=nbz_start; k<nbz_start+nbz; k++)
+			{
+				const int grid_index = (k-nbz_start) + j * nbz + i * nby * nbz;
+				const int size = GridT.how_many_atoms[ grid_index ];
+				if(size==0) continue;
+
+				//---------------------------------
+				// get the wave functions in this
+				// grid.
+				//---------------------------------
+				this->set_ijk_atom_force(grid_index, size,
+						psir_ylm, dr, cal_flag,
+						distance, ylma, delta_r,
+						dphi_x, dphi_y, dphi_z);
+
+				int bindex = 0;
+				// z is the fastest-varying index (kk is the innermost loop).
+				for(int ii=0; ii<pw.bx; ii++)
+				{
+					for(int jj=0; jj<pw.by; jj++)
+					{
+						for(int kk=0; kk<pw.bz; kk++)
+						{
+							const int iii = i*pw.bx + ii;
+							const int jjj = j*pw.by + jj;
+							const int kkk = k*pw.bz + kk;
+							vl_index = (kkk-pw.nczp_start) + jjj*pw.nczp + iii*pw.ncy*pw.nczp;
+							vldr3[bindex] = vl[ vl_index ] * dv;
+							//        vldr3[bindex] = dv; // for overlap test
+
+							++bindex;
+						}
+					}
+				}
+				//cout<<"loop  "<<i<<" "<<j<<" "<<k<<endl;//test
+
+				this->evaluate_vl_stress(grid_index, size,i,j,k,
+						psir_ylm, cal_flag, vldr3, distance,
+						dphi_x, dphi_y, dphi_z,
+						pvdpx, pvdpy, pvdpz,
+						pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23, dr, GridT);
+			}// int k
+		}// int j
+	} // int i
+
+
+	//---------------------------------------
+	// Folding R here
+	//---------------------------------------
+
+	//LM.DHloc_fixedR_x
+	this->folding_stress(fvl_dphi, svl_dphi, pvdpx, pvdpy, pvdpz,
+			pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23);
+
+	delete[] pvdpx;
+	delete[] pvdpy;
+	delete[] pvdpz;
+	delete[] pvdp11;
+	delete[] pvdp22;
+	delete[] pvdp33;
+	delete[] pvdp12;
+	delete[] pvdp13;
+	delete[] pvdp23;
+
+	delete[] vldr3;
+	if(max_size!=0)
+	{
+		for(int i=0; i<pw.bxyz; i++)
+		{
+			for(int j=0; j<max_size; j++)
+			{
+				delete[] dr[i][j];
+				delete[] psir_ylm[i][j];
+				delete[] dphi_x[i][j];
+				delete[] dphi_y[i][j];
+				delete[] dphi_z[i][j];
+			}
+			delete[] dr[i];
+			delete[] distance[i];
+			delete[] psir_ylm[i];
+			delete[] cal_flag[i];
+			delete[] dphi_x[i];
+			delete[] dphi_y[i];
+			delete[] dphi_z[i];
+		}
+		delete[] dr;
+		delete[] distance;
+		delete[] psir_ylm;
+		delete[] dphi_x;
+		delete[] dphi_y;
+		delete[] dphi_z;
+		delete[] cal_flag;
+
+		delete[] ylma;
+	}
+	timer::tick("Gint_k","cal_stress");
+	return;
 }
 
 
-void Gint_k::evaluate_vl_stress(const int &grid_index, const int &size, const int &i, const int &j, const int &k,
-	double*** psir_ylm, bool** cal_flag, double* vldr3, double** distance,
-	double*** dphi_x, double*** dphi_y, double*** dphi_z,
-	double* pvdpx, double* pvdpy, double* pvdpz, 
-        double* pvdp11, double* pvdp22, double* pvdp33, double* pvdp12, double* pvdp13, double* pvdp23, double*** dr,
+void Gint_k::evaluate_vl_stress(
+	const int &grid_index, 
+	const int &size, 
+	const int &i, 
+	const int &j, 
+	const int &k,
+	double*** psir_ylm, 
+	bool** cal_flag, 
+	double* vldr3, 
+	double** distance,
+	double*** dphi_x, 
+	double*** dphi_y, 
+	double*** dphi_z,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz, 
+	double* pvdp11, 
+	double* pvdp22, 
+	double* pvdp33, 
+	double* pvdp12, 
+	double* pvdp13, 
+	double* pvdp23, 
+	double*** dr,
 	const Grid_Technique &gt)
 {
 
@@ -950,10 +970,22 @@ void Gint_k::evaluate_vl_force(const int &grid_index, const int &size, const int
         return;
 }
 
-void Gint_k::set_ijk_atom_force(const int &grid_index, const int &size,
-	double*** psir_ylm, double*** dr, bool** cal_flag, 
-	double** distance, double* ylma, const double &delta_r,
-	double*** dphi_x, double*** dphi_y, double*** dphi_z)
+
+// NOTE: the 'set_ijk' subroutines should be refactored,
+// since they are used every time a grid integral is needed.
+// mohan add 2021-03-28
+void Gint_k::set_ijk_atom_force(
+	const int &grid_index, 
+	const int &size,
+	double*** psir_ylm, 
+	double*** dr, 
+	bool** cal_flag, 
+	double** distance, 
+	double* ylma, 
+	const double &delta_r,
+	double*** dphi_x, 
+	double*** dphi_y, 
+	double*** dphi_z)
 {
 	const Numerical_Orbital_Lm* pointer;
 	double mt[3];
@@ -1008,12 +1040,12 @@ void Gint_k::set_ijk_atom_force(const int &grid_index, const int &size,
             //-------------------------------------------------
             // Here we can not deal with the situation on
             // r = 0, so if r = 0,  r-->1e-9
-            //-------------------------------------------------
+			//-------------------------------------------------
 
-                        if (distance[ib][id] < 1e-9)    // pengfei Li add 2016-3-3
-                        {
-                            distance[ib][id] = 1e-9;
-                        }
+			if (distance[ib][id] < 1e-9)    // pengfei Li add 2016-3-3
+			{
+				distance[ib][id] = 1e-9;
+			}
 
 			// these parameters are about interpolation
 			// because once we know the distance from atom to grid point,
diff --git a/ABACUS.develop/source/src_lcao/grid_base.cpp b/ABACUS.develop/source/src_lcao/grid_base.cpp
index 12c1264e2f..e8382fc421 100644
--- a/ABACUS.develop/source/src_lcao/grid_base.cpp
+++ b/ABACUS.develop/source/src_lcao/grid_base.cpp
@@ -143,6 +143,7 @@ void Grid_Base::init(
 	return;
 }
 
+
 void Grid_Base::get_rcut_max(void)
 {
 	assert( ORB.get_ntype() > 0 );
@@ -200,7 +201,9 @@ void Grid_Base::get_small_box(
 	    tau_dir.x, tau_dir.y, tau_dir.z
 	);
 
-	if (tau_dir.x < 0.0 || tau_dir.x > 1.0 || tau_dir.y < 0.0 || tau_dir.y > 1.0 || tau_dir.z < 0.0 || tau_dir.z > 1.0)
+	if (tau_dir.x < 0.0 || tau_dir.x > 1.0 
+	|| tau_dir.y < 0.0 || tau_dir.y > 1.0 
+	|| tau_dir.z < 0.0 || tau_dir.z > 1.0)
 	{
 		cout << "\n tau.x = " << tau.x;
 		cout << "\n tau.y = " << tau.y;
@@ -209,7 +212,8 @@ void Grid_Base::get_small_box(
 		cout << "\n tau_dir.x = " << tau_dir.x;
 		cout << "\n tau_dir.y = " << tau_dir.y;
 		cout << "\n tau_dir.z = " << tau_dir.z;
-		WARNING_QUIT("Grid_Base::get_small_box","Positions(x,y,z) Of tau and R2 in Direct Coordinates should be between 0 and 1!");
+		WARNING_QUIT("Grid_Base::get_small_box",
+		"Positions(x,y,z) Of tau and R2 in Direct Coordinates should be between 0 and 1!");
 	}
 
 	tau_max_direct = tau_dir + this->Rcut_max_direct[T];
diff --git a/ABACUS.develop/source/src_lcao/grid_base.h b/ABACUS.develop/source/src_lcao/grid_base.h
index dd81eb1266..2514a1597a 100644
--- a/ABACUS.develop/source/src_lcao/grid_base.h
+++ b/ABACUS.develop/source/src_lcao/grid_base.h
@@ -46,40 +46,72 @@ class Grid_Base
 
 	int* ijk_index;
 
-	Matrix3 latvec,latvec0;
-	Vector3<double> a1, a2, a3;
-	double a1_len, a2_len, a3_len;
+	Matrix3 latvec;
+	Matrix3 latvec0;
+
+	Vector3<double> a1;
+	Vector3<double> a2;
+	Vector3<double> a3;
+
+	double a1_len;
+	double a2_len;
+	double a3_len;
+
 	Vector3<double> da_d;
-	double da1, da2, da3;
-	int nx, ny, nz, nxyz;
+
+	double da1;
+	double da2;
+	double da3;
+
+	int nx;
+	int ny;
+	int nz;
+	int nxyz;
+
 	Vector3<double> *cartesian;
+
 	double lat0;
 
 	int test;
+
 	double *Rcut_max;
+
 	Vector3<double> *Rcut_max_direct;
-	int grid_number, grid_number_last;
+
+	int grid_number;
+	int grid_number_last;
 	
-	double *norm1, *norm2;
+	double *norm1;
+	double *norm2;
 	
-	double Rcut1, Rcut2;
+	double Rcut1;
+	double Rcut2;
 	
-	Vector3<double> *dR1, *dR2;
+	Vector3<double> *dR1;
+	Vector3<double> *dR2;
 
 	const Numerical_Orbital_Lm* pointer1;
 	const Numerical_Orbital_Lm* pointer2;
 
-	int iw1_all,iw2_all;
-	int index1, index2;
-	Vector3<int> edge_min, edge_max;
+	int iw1_all;
+	int iw2_all;
+	int index1;
+	int index2;
+
+	Vector3<int> edge_min;
+	Vector3<int> edge_max;
 
 	enum cal_type{ cal_charge, cal_local } job;
 	
 	double** yy1;
 	double** yy2;
-	int n1,n2; // (lmax+1)^2
-	int n1_last, n2_last;
-	int lmax1, lmax2;
+
+	int n1; // (lmax+1)^2
+	int n2;
+	int n1_last;
+	int n2_last;
+	int lmax1;
+	int lmax2;
 
 };
 
diff --git a/ABACUS.develop/source/src_lcao/grid_base_beta.h b/ABACUS.develop/source/src_lcao/grid_base_beta.h
index 3e5c8cc2c5..b4091dec95 100644
--- a/ABACUS.develop/source/src_lcao/grid_base_beta.h
+++ b/ABACUS.develop/source/src_lcao/grid_base_beta.h
@@ -1,13 +1,11 @@
-//=========================================================
-//AUTHOR : mohan
-//DATE : 2008-09-16
-//=========================================================
 #ifndef GRID_BASE_BETA_H
 #define GRID_BASE_BETA_H
 
 #include "../src_pw/tools.h"
 #include "ORB_atomic_lm.h"
 
+//AUTHOR : mohan
+//DATE : 2008-09-16
 // this class is inherited by Grid_Integral_Beta.h
 // this class provides basic Grid operation and the 
 // corresponding information.
@@ -34,7 +32,8 @@ class Grid_Base_Beta
 	double* rho1; // about charge
 	double **density_kernel;
 	double vfactor;
-	Matrix3 latvec,latvec0;
+	Matrix3 latvec;
+	Matrix3 latvec0;
 	int* nnn;
 	double lat0;
 	enum cal_type{ cal_charge, cal_local, cal_vnlb } job;
diff --git a/ABACUS.develop/source/src_lcao/grid_bigcell.h b/ABACUS.develop/source/src_lcao/grid_bigcell.h
index fa951fc136..37e7acea29 100644
--- a/ABACUS.develop/source/src_lcao/grid_bigcell.h
+++ b/ABACUS.develop/source/src_lcao/grid_bigcell.h
@@ -1,5 +1,6 @@
 #ifndef GRID_BIGCELL_H
 #define GRID_BIGCELL_H
+
 #include "../src_pw/tools.h"
 #include "grid_meshcell.h"
 
diff --git a/ABACUS.develop/source/src_lcao/grid_meshball.h b/ABACUS.develop/source/src_lcao/grid_meshball.h
index 67dd4b3cb4..72ccdcc1e1 100644
--- a/ABACUS.develop/source/src_lcao/grid_meshball.h
+++ b/ABACUS.develop/source/src_lcao/grid_meshball.h
@@ -30,6 +30,7 @@ class Grid_MeshBall : public Grid_BigCell
 	// init the meshball radius,
 	// search each meshcell of this meshball.
 	void init_meshball(void);
+
 	void delete_meshball_positions(void); //LiuXh add 2018-12-14
 
 	private:
diff --git a/ABACUS.develop/source/src_lcao/grid_technique.cpp b/ABACUS.develop/source/src_lcao/grid_technique.cpp
index 6fddc38f0d..3d75ad84af 100644
--- a/ABACUS.develop/source/src_lcao/grid_technique.cpp
+++ b/ABACUS.develop/source/src_lcao/grid_technique.cpp
@@ -93,6 +93,10 @@ void Grid_Technique::set_pbc_grid(
 	return;
 }
 
+
+// TODO: update 'init_atoms_on_grid' so that it is
+// also adapted to a 'cuboid'-shaped grid.
+// mohan add 2021-04-06
 void Grid_Technique::init_atoms_on_grid(void)
 {
 	TITLE("Grid_Technique","init_atoms_on_grid");
@@ -352,7 +356,11 @@ void Grid_Technique::cal_trace_beta(void)
 {
 	// save the atom information in trace_beta//
 	delete[] trace_beta;
-	int nkb=ORB.nkb;
+
+	// mohan modify 2021-04-06
+	//int nkb=ORB.nkb;
+	int nkb=ppcell.nkb;
+
 	this->trace_beta = new int[nkb];
 	for(int i=0; i<nkb; i++)
 	{
@@ -394,6 +402,7 @@ void Grid_Technique::cal_trace_beta(void)
 }
 
 
+// set 'lgd' variable
 void Grid_Technique::cal_trace_lo(void)
 {	
 	TITLE("Grid_Technique","cal_trace_lo");
@@ -425,10 +434,12 @@ void Grid_Technique::cal_trace_lo(void)
 				if(NSPIN==4)
 				{//added by zhengdy-soc, need to be double in soc
 					nw0 *= 2;
-					lgd += nw0;
+					this->lgd += nw0;
 				}
 				else
-					lgd += ucell.atoms[it].nw;
+				{
+					this->lgd += ucell.atoms[it].nw;
+				}
 				
 				for(int iw=0; iw<nw0; iw++)
 				{
@@ -439,6 +450,7 @@ void Grid_Technique::cal_trace_lo(void)
 			}
 			else
 			{
+				// global index of atomic orbitals
 				iw_all += ucell.atoms[it].nw;
 				if(NSPIN==4) iw_all += ucell.atoms[it].nw;
 			}
@@ -464,5 +476,3 @@ void Grid_Technique::cal_trace_lo(void)
 	assert(iw_all == NLOCAL);
 	return;
 }
-
-
diff --git a/ABACUS.develop/source/src_lcao/grid_technique.h b/ABACUS.develop/source/src_lcao/grid_technique.h
index a5dba02873..db183b1b02 100644
--- a/ABACUS.develop/source/src_lcao/grid_technique.h
+++ b/ABACUS.develop/source/src_lcao/grid_technique.h
@@ -1,10 +1,10 @@
-// Author: mohan
-// Date: 2009-10-17
 #ifndef GRID_TECHNIQUE_H
 #define GRID_TECHNIQUE_H
 
 #include "grid_meshball.h"
 
+// Author: mohan
+// Date: 2009-10-17
 class Grid_Technique : public Grid_MeshBall
 {
 	// public variables.
diff --git a/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp b/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
index 860915b6a1..92d35c2a7f 100644
--- a/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
+++ b/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
@@ -119,7 +119,7 @@ void Local_Orbital_Charge::sum_bands(void)
         }
         else if(KS_SOLVER=="genelpa" || KS_SOLVER=="scalapack_gvx")
         {
-            if(NEW_DM>0)
+            if(INPUT.new_dm>0)
             {
                 //density matrix has already been calcualted.
                 timer::tick("LCAO_Charge","cal_dm_2d",'F');
diff --git a/ABACUS.develop/source/src_lcao/lscc.f b/ABACUS.develop/source/src_lcao/lscc.f
deleted file mode 100644
index 492c941ca9..0000000000
--- a/ABACUS.develop/source/src_lcao/lscc.f
+++ /dev/null
@@ -1,87 +0,0 @@
-      subroutine sphbsl(n,r,A,val) 
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = 1 + x**2/6
-          else
-            val = dsinh(x)/x
-          end if
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2 ) then
-            val = -x**2/15 -x**4/210 - x**6/7560
-          else
-            val = 3*dcosh(x)/x**2 + (-3-x**2)*dsinh(x)/x**3
-          end if
-        
-        else if (n .eq. 4) then
-
-          if( x .lt. 5.d-1)then
-            val = x**4/945 + x**6/20790 + x**8/1081080 + x**10/97297200
-          else
-            val = -5*(21+2*x**2)*dcosh(x)/x**4+(105+45*x**2+x**4)*
-     &       dsinh(x)/x**5
-          end if
-        
-        else if (n .eq. 6) then
-        
-          if ( x .lt. 9.d-1) then
-            val = -x**6/135135-x**8/4054050-x**10/275675400
-          else
-            val = 21*(495+60*x**2+x**4)*dcosh(x)/x**6 +
-     &       (-10395-4725*x**2-210*x**4-x**6)*dsinh(x)/x**7
-          end if
-        
-        else
-        end if
-      END subroutine sphbsl
-
-      subroutine sphhnk(n,r,A,val)
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = -1/x + 1 -x/2 + x**2/6
-          else
-            val = -dexp(-x)/x
-          endif
-        
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2) then
-            val = 3/x**3-1/(2*x)+x/8-x**2/15+x**3/48
-          else
-            val = dexp(-x)*(3+3*x+x**2)/x**3
-          endif
-        
-        else if (n .eq. 4) then
-        
-          if (x .lt. 5.d-1) then
-            val = -105/x**5 + 15/(2*x**3) - 3/(8*x) + x/48 - x**3/384 
-     &        +x**4/945
-          else
-            val = -dexp(-x)*(105+105*x+45*x**2+10*x**3+x**4)/x**5
-          endif
-        
-        else if (n .eq. 6) then
-
-          if (x .lt. 9.d-1) then
-            val = 10395/x**7 - 945/(2*x**5) + 105/(8*x**3) -5/(16*x) + 
-     &            x/128-x**3/3840 + x**5/46080 - x**6/135135
-          else
-            val = dexp(-x)*(10395+10395*x+4725*x**2+1260*x**3+210*x**
-     &       4+21*x**5+x**6)/x**7
-          endif
-        
-        else
-        endif
-      END SUBROUTINE sphhnk
-
-
diff --git a/ABACUS.develop/source/src_lcao/run_md.cpp b/ABACUS.develop/source/src_lcao/run_md.cpp
index e85f99ac2a..92f0f20d52 100644
--- a/ABACUS.develop/source/src_lcao/run_md.cpp
+++ b/ABACUS.develop/source/src_lcao/run_md.cpp
@@ -128,28 +128,7 @@ void Run_MD::opt_ions(void)
 		time_t eend = time(NULL);
 
         //xiaohui add 2014-07-07, for second-order extrapolation
-		int iat=0;
-
-		for(int it = 0;it < ucell.ntype;it++)
-		{
-			Atom* atom = &ucell.atoms[it];
-			for(int ia =0;ia< ucell.atoms[it].na;ia++)
-			{
-				CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-				CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-				CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-				CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-				CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-				CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-				CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-				CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-				CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-				iat++;
-			}
-		}
+		CE.update_all_pos(ucell);
 
 		if(mdtype==1||mdtype==2)   
 		{
@@ -181,22 +160,10 @@ void Run_MD::opt_ions(void)
         time_t fend = time(NULL);
 
         //xiaohui add 2014-07-07, for second-order extrapolation
-		iat=0;
-		for(int it = 0;it < ucell.ntype;it++)
-		{
-			Atom* atom = &ucell.atoms[it];
-			for(int ia =0;ia< ucell.atoms[it].na;ia++)
-			{
-				CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-				CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-				CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-				iat++;
-			}
-		}
+		CE.save_pos_next(ucell);
 
 		//xiaohui add CE.istep = istep 2014-07-07
-		CE.istep = istep;
+		CE.update_istep(istep);
 
 		// charge extrapolation if istep>0.
 		CE.extrapolate_charge();
@@ -236,7 +203,7 @@ void Run_MD::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions();
+    hm.orb_con.clear_after_ions(UOT, ORB);
 
     timer::tick("Run_MD","opt_ions",'B'); 
     return;
@@ -276,7 +243,7 @@ bool Run_MD::force_stress(const int &istep, int &force_step, int &stress_step)
             }
             else // ions are not converged
             {
-                CE.istep = istep;
+                CE.update_istep(istep);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
@@ -400,7 +367,7 @@ xiaohui modify 2014-08-09*/
             //atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
                 //CE.istep = istep;
-                CE.istep = force_step;
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")//xiaohui modify 2015-02-01
diff --git a/ABACUS.develop/source/src_pdiag/pdiag_double.cpp b/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
index 6a3b6a3c7b..df36a8d0ec 100644
--- a/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
+++ b/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
@@ -549,7 +549,7 @@ void Pdiag_Double::diago_double_begin(
         delete[] eigen;
 	    OUT(ofs_running,"eigenvalues were copied to ekb");
 
-        if(NEW_DM==0)
+        if(INPUT.new_dm==0)
         {
             // convert wave function to band distribution 
 			// and calculate the density matrix in the tranditional way
@@ -655,9 +655,9 @@ void Pdiag_Double::diago_double_begin(
 		}
 		memcpy( ekb, ekb_tmp.data(), sizeof(double)*NBANDS ); 
 		
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}
 	else if(KS_SOLVER=="lapack_gvx")
@@ -700,9 +700,9 @@ void Pdiag_Double::diago_double_begin(
 			throw runtime_error("M="+TO_STRING(M)+". NBANDS="+TO_STRING(NBANDS)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 		
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}
 	else if(KS_SOLVER=="scalapack_gvx")
@@ -755,9 +755,9 @@ void Pdiag_Double::diago_double_begin(
 		{
 			throw runtime_error("M="+TO_STRING(M)+". NZ="+TO_STRING(NZ)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}	
     //delete[] Stmp; //LiuXh 20171109
@@ -1037,8 +1037,8 @@ void Pdiag_Double::diago_complex_begin(const int &ik, complex<double> **wfc, Com
 		if(M!=NZ)
 			throw runtime_error("M="+TO_STRING(M)+". NZ="+TO_STRING(NZ)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		
-//		if(NEW_DM==0)
-//			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+//		if(INPUT.new_dm==0)
+//			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		// the follow will be deleted after finish newdm
 		{
 			//change eigenvector matrix from block-cycle distribute matrix to column-divided distribute matrix
diff --git a/ABACUS.develop/source/src_pw/atom_pseudo.h b/ABACUS.develop/source/src_pw/atom_pseudo.h
index 54af751714..f66cc4f323 100644
--- a/ABACUS.develop/source/src_pw/atom_pseudo.h
+++ b/ABACUS.develop/source/src_pw/atom_pseudo.h
@@ -23,6 +23,7 @@ class Atom_pseudo : public pseudo_us
 	Vector3<int> *mbl; //If this atom can move
 	string pseudo_fn;// File name of pseudopotentia
 	double mass; // the mass of atom
+	bool flag_empty_element = false;	// whether this is an empty element (used for BSSE).	Peize Lin add 2021.04.07
 
 protected:
 
diff --git a/ABACUS.develop/source/src_pw/charge.cpp b/ABACUS.develop/source/src_pw/charge.cpp
index 36981de51b..6494728ae1 100644
--- a/ABACUS.develop/source/src_pw/charge.cpp
+++ b/ABACUS.develop/source/src_pw/charge.cpp
@@ -21,6 +21,8 @@
 #include "charge.h"
 #include "magnetism.h"
 #include "../src_parallel/parallel_grid.h"
+#include "../src_global/math_integral.h"
+#include <vector>
 
 Charge::Charge()
 {
@@ -168,268 +170,257 @@ void Charge::renormalize_rho(void)
 // rho_at (read from pseudopotential files)
 // allocate work space (psic must already be allocated)
 //-------------------------------------------------------
-void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
+void Charge::atomic_rho(const int spin_number_need, double** rho_in)const		// Peize Lin refactor 2021.04.08
 {
     TITLE("Charge","atomic_rho");
     timer::tick("Charge","atomic_rho");
 
-	assert(ucell.meshx>0);
-    double *rho1d = new double[ucell.meshx];
-    
-	// one dimension of charge in G space.
-	double *rho_lgl= new double[ pw.nggm ];
-    ZEROS(rho1d, ucell.meshx);
-    ZEROS(rho_lgl, pw.nggm);
-
-	// use interpolation to get three dimension charge density.
-    ComplexMatrix rho_g3d( spin_number_need, pw.ngmc);
-
-
-	// check the start magnetization
-	int startmag_type = 1;
-	for(int it=0; it<ucell.ntype; it++)
+	const ComplexMatrix rho_g3d = [&]()->ComplexMatrix
 	{
-		for(int ia=0; ia<ucell.atoms[it].na; ia++)
+		// use interpolation to get three dimension charge density.
+		ComplexMatrix rho_g3d( spin_number_need, pw.ngmc);
+		
+		// check the start magnetization
+		const int startmag_type = [&]()->int
 		{
-			if(ucell.atoms[it].mag[ia]!=0.0)
-			{
-				startmag_type = 2;
-				break;
-			}
-		}
-	}
-
-	if(NSPIN==4) 
-	{
-		startmag_type = 1;//zhengdy-soc, type 2 is still wrong.
-	}
-	OUT(ofs_warning,"startmag_type",startmag_type);
-
-
-    for (int it = 0;it < ucell.ntype;it++)
-    {
-		Atom* atom = &ucell.atoms[it];
-
-		// mesh point of this element.
-        const int mesh = atom->msh;
-
-        //----------------------------------------------------------
-        // Here we check the electron number 
-        //----------------------------------------------------------
-		double* rhoatm = new double[mesh];
-		for(int ir=0; ir<mesh; ++ir)
+			if(NSPIN==4)		//zhengdy-soc, type 2 is still wrong.
+				return 1;
+			for(int it=0; it<ucell.ntype; it++)
+				for(int ia=0; ia<ucell.atoms[it].na; ia++)
+					if(ucell.atoms[it].mag[ia]!=0.0)
+						return 2;
+			return 1;
+		}();
+		OUT(ofs_warning,"startmag_type",startmag_type);
+
+		for (int it = 0;it < ucell.ntype;it++)
 		{
-			double r2=atom->r[ir]*atom->r[ir];
-			rhoatm[ir]=atom->rho_at[ir]/FOUR_PI/r2;
-		}
-		rhoatm[0] = pow( (rhoatm[2]/rhoatm[1]), 1./(atom->r[2]-atom->r[1]) );//zws add
-		rhoatm[0] = pow(rhoatm[0], atom->r[1]);
-		rhoatm[0] = rhoatm[1] / rhoatm[0];  
-
-		double charge = 0.0;
-		Mathzone::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
+			const Atom* const atom = &ucell.atoms[it];
 
-		OUT(ofs_warning,"charge from rho_at",charge);
-		assert(charge!=0.0);
-		double scale=1.0;
+			if(!atom->flag_empty_element)		// Peize Lin add for bsse 2021.04.07
+			{		
+				const std::vector<double> rho_lgl = [&]()->std::vector<double>
+				{
+					// one dimension of charge in G space.
+					std::vector<double> rho_lgl(pw.nggm,0);
 
-		if(charge!=atom->zv)
-		{
-			OUT(ofs_warning,"charge should be",atom->zv);
-			scale = atom->zv/charge;
-		}
+					// mesh point of this element.
+					const int mesh = atom->msh;
 
-		for(int ir=0; ir<mesh; ++ir)
-		{
-			rhoatm[ir] *= scale;
-			rhoatm[ir] *= (FOUR_PI*atom->r[ir]*atom->r[ir]);
-		}
+					//----------------------------------------------------------
+					// Here we check the electron number 
+					//----------------------------------------------------------
+					const std::vector<double> rhoatm = [&]()->std::vector<double>
+					{
+						std::vector<double> rhoatm(mesh);		
+						for(int ir=0; ir<mesh; ++ir)
+						{
+							double r2=atom->r[ir]*atom->r[ir];
+							rhoatm[ir]=atom->rho_at[ir]/FOUR_PI/r2;
+						}
+						rhoatm[0] = pow( (rhoatm[2]/rhoatm[1]), 1./(atom->r[2]-atom->r[1]) );//zws add
+						rhoatm[0] = pow(rhoatm[0], atom->r[1]);
+						rhoatm[0] = rhoatm[1] / rhoatm[0];
 
-        //----------------------------------------------------------
-        // Here we compute the G=0 term
-        //----------------------------------------------------------
-        if (pw.gstart == 1)
-        {
-            for (int ir = 0;ir < mesh;ir++)
-            {
-//              rho1d [ir] = atom->rho_at[ir];
-				rho1d[ir] = rhoatm[ir];
-            }
-            Mathzone::Simpson_Integral(mesh, rho1d, atom->rab , rho_lgl[0]);
-        }
+						double charge = 0.0;
+						Integral::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
+						OUT(ofs_warning,"charge from rho_at",charge);
+						assert(charge!=0.0 || charge==atom->zv);		// Peize Lin add charge==atom->zv for bsse 2021.04.07
 
+						double scale=1.0;
+						if(charge!=atom->zv)
+						{
+							OUT(ofs_warning,"charge should be",atom->zv);
+							scale = atom->zv/charge;
+						}
 
-        if (test_charge>0) cout<<"\n |G|=0 term done." <<endl;
-        //----------------------------------------------------------
-        // Here we compute the G<>0 term
-        // But if in parallel case
-        // G=0 term only belong to 1 cpu.
-        // Other processors start from '0'
-        //----------------------------------------------------------
-        for (int ig = pw.gstart; ig < pw.nggm ;ig++)
-        {
-            const double gx = sqrt(pw.ggs [ig]) * ucell.tpiba;
-            for (int ir = 0; ir < mesh;ir++)
-            {
-                if ( atom->r[ir] < 1.0e-8 )
-                {
-                    rho1d[ir] = rhoatm[ir];
-                    //rho1d[ir] = atom->rho_at[ir];
-                }
-                else
-                {
-                    const double gxx = gx * atom->r[ir];
-                    rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
-                    rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
-                }
-            }
-            Mathzone::Simpson_Integral(mesh , rho1d, atom->rab , rho_lgl [ig]);
-        }
-		delete[] rhoatm;
-        
-		
-		if (test_charge>0) cout<<" |G|>0 term done." <<endl;
-        //----------------------------------------------------------
-        // EXPLAIN : Complete the transfer of rho from real space to
-        // reciprocal space
-        //----------------------------------------------------------
-        for (int ig=0; ig< pw.nggm ; ig++)
-        {
-            rho_lgl[ig] /= ucell.omega;
-        }
-        //----------------------------------------------------------
-        // EXPLAIN : compute the 3D atomic charge in reciprocal space
-        //----------------------------------------------------------
-        if(spin_number_need==1)
-        {
-            for (int ig=0; ig< pw.ngmc ;ig++)
-            {
-                rho_g3d(0, ig) += pw.strucFac(it, ig) * rho_lgl[ pw.ig2ngg[ig] ];
-            }
-		}
-		// mohan add 2011-06-14, initialize the charge density according to each atom 
-		else if(spin_number_need==2)
-		{
-			if(startmag_type==1)
-			{
-				for (int ig = 0; ig < pw.ngmc ; ig++)
-				{
-					const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
-					//rho_g3d(0, ig) += swap * mag.nelup_percent(it);
-					//rho_g3d(1, ig) += swap * mag.neldw_percent(it);
-					const double up = 0.5 * ( 1 + mag.start_magnetization[it] / atom->zv );
-					const double dw = 0.5 * ( 1 - mag.start_magnetization[it] / atom->zv );
-					rho_g3d(0, ig) += swap * up;
-					rho_g3d(1, ig) += swap * dw;
-				}
-			}
-			// mohan add 2011-06-14
-			else if(startmag_type==2)
-			{
-				complex<double> swap = ZERO;
-				complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-				for (int ia = 0; ia < atom->na; ia++)
+						for(int ir=0; ir<mesh; ++ir)
+						{
+							rhoatm[ir] *= scale;
+							rhoatm[ir] *= (FOUR_PI*atom->r[ir]*atom->r[ir]);
+						}
+						return rhoatm;
+					}();
+
+					assert(ucell.meshx>0);
+					vector<double> rho1d(ucell.meshx);
+					//----------------------------------------------------------
+					// Here we compute the G=0 term
+					//----------------------------------------------------------
+					if (pw.gstart == 1)
+					{
+						for (int ir = 0;ir < mesh;ir++)
+						{
+			//              rho1d [ir] = atom->rho_at[ir];
+							rho1d[ir] = rhoatm[ir];
+						}
+						Integral::Simpson_Integral(mesh, rho1d.data(), atom->rab, rho_lgl[0]);
+					}
+					if (test_charge>0) cout<<"\n |G|=0 term done." <<endl;
+					//----------------------------------------------------------
+					// Here we compute the G<>0 term
+					// But if in parallel case
+					// G=0 term only belong to 1 cpu.
+					// Other processors start from '0'
+					//----------------------------------------------------------
+					for (int ig = pw.gstart; ig < pw.nggm ;ig++)
+					{
+						const double gx = sqrt(pw.ggs [ig]) * ucell.tpiba;
+						for (int ir = 0; ir < mesh;ir++)
+						{
+							if ( atom->r[ir] < 1.0e-8 )
+							{
+								rho1d[ir] = rhoatm[ir];
+								//rho1d[ir] = atom->rho_at[ir];
+							}
+							else
+							{
+								const double gxx = gx * atom->r[ir];
+								rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
+								rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
+							}
+						}
+						Integral::Simpson_Integral(mesh, rho1d.data(), atom->rab, rho_lgl[ig]);
+					}
+					
+					if (test_charge>0) cout<<" |G|>0 term done." <<endl;
+					//----------------------------------------------------------
+					// EXPLAIN : Complete the transfer of rho from real space to
+					// reciprocal space
+					//----------------------------------------------------------
+					for (int ig=0; ig< pw.nggm ; ig++)
+						rho_lgl[ig] /= ucell.omega;
+					return rho_lgl;
+				}();
+				//----------------------------------------------------------
+				// EXPLAIN : compute the 3D atomic charge in reciprocal space
+				//----------------------------------------------------------
+				if(spin_number_need==1)
 				{
-					//const double up = 0.5 * ( 1 + atom->mag[ia] );
-					//const double dw = 0.5 * ( 1 - atom->mag[ia] );
-					const double up = 0.5 * ( 1 + atom->mag[ia] / atom->zv );
-					const double dw = 0.5 * ( 1 - atom->mag[ia] / atom->zv );
-					//cout << " atom " << ia << " up=" << up << " dw=" << dw << endl;
-
-					for (int ig = 0; ig < pw.ngmc ; ig++)
+					for (int ig=0; ig< pw.ngmc ;ig++)
 					{
-						const double Gtau = 
-							pw.gcar[ig].x * atom->tau[ia].x
-							+ pw.gcar[ig].y * atom->tau[ia].y
-							+ pw.gcar[ig].z * atom->tau[ia].z; 
-
-						swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
-
-						rho_g3d(0, ig) += swap * up;
-						rho_g3d(1, ig) += swap * dw;
+						rho_g3d(0, ig) += pw.strucFac(it, ig) * rho_lgl[ pw.ig2ngg[ig] ];
 					}
 				}
-			}
-		}
-		else if(spin_number_need==4)
-		{
-			//noncolinear case
-			if(startmag_type == 1)
-			{
-				for (int ig = 0; ig < pw.ngmc ; ig++)
+				// mohan add 2011-06-14, initialize the charge density according to each atom 
+				else if(spin_number_need==2)
 				{
-					const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
-					rho_g3d(0, ig) += swap ;
-					if(DOMAG)
+					if(startmag_type==1)
 					{
-						rho_g3d(1, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* sin(soc.angle1[it]) * cos(soc.angle2[it]);
-						rho_g3d(2, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* sin(soc.angle1[it]) * sin(soc.angle2[it]);
-						rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* cos(soc.angle1[it]);
+						for (int ig = 0; ig < pw.ngmc ; ig++)
+						{
+							const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
+							//rho_g3d(0, ig) += swap * mag.nelup_percent(it);
+							//rho_g3d(1, ig) += swap * mag.neldw_percent(it);
+							const double up = 0.5 * ( 1 + mag.start_magnetization[it] / atom->zv );
+							const double dw = 0.5 * ( 1 - mag.start_magnetization[it] / atom->zv );
+							rho_g3d(0, ig) += swap * up;
+							rho_g3d(1, ig) += swap * dw;
+						}
 					}
-					else if(DOMAG_Z)
+					// mohan add 2011-06-14
+					else if(startmag_type==2)
 					{
-						//rho_g3d(3, ig) += swap * mag.start_magnetization[it];
-						rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv);
+						complex<double> swap = ZERO;
+						complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
+						for (int ia = 0; ia < atom->na; ia++)
+						{
+							//const double up = 0.5 * ( 1 + atom->mag[ia] );
+							//const double dw = 0.5 * ( 1 - atom->mag[ia] );
+							const double up = 0.5 * ( 1 + atom->mag[ia] / atom->zv );
+							const double dw = 0.5 * ( 1 - atom->mag[ia] / atom->zv );
+							//cout << " atom " << ia << " up=" << up << " dw=" << dw << endl;
+
+							for (int ig = 0; ig < pw.ngmc ; ig++)
+							{
+								const double Gtau = 
+									pw.gcar[ig].x * atom->tau[ia].x
+									+ pw.gcar[ig].y * atom->tau[ia].y
+									+ pw.gcar[ig].z * atom->tau[ia].z; 
+
+								swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
+
+								rho_g3d(0, ig) += swap * up;
+								rho_g3d(1, ig) += swap * dw;
+							}
+						}
 					}
 				}
-			}
-			else if(startmag_type == 2)
-			{//zdy-warning-not-available
-				complex<double> swap = ZERO;
-				complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-				for(int ia = 0;ia<atom->na;ia++)
+				else if(spin_number_need==4)
 				{
-					for (int ig = 0; ig < pw.ngmc ; ig++)
+					//noncolinear case
+					if(startmag_type == 1)
 					{
-						const double Gtau =
-							pw.gcar[ig].x * atom->tau[ia].x
-							+ pw.gcar[ig].y * atom->tau[ia].y
-							+ pw.gcar[ig].z * atom->tau[ia].z;
-
-						swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
-
-						rho_g3d(0, ig) += swap;
-						if(DOMAG)
+						for (int ig = 0; ig < pw.ngmc ; ig++)
 						{
-							rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->zv) 
+							const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
+							rho_g3d(0, ig) += swap ;
+							if(DOMAG)
+							{
+								rho_g3d(1, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* sin(soc.angle1[it]) * cos(soc.angle2[it]);
-							rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->zv) 
+								rho_g3d(2, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* sin(soc.angle1[it]) * sin(soc.angle2[it]);
-							rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv) 
+								rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* cos(soc.angle1[it]);
+							}
+							else if(DOMAG_Z)
+							{
+								//rho_g3d(3, ig) += swap * mag.start_magnetization[it];
+								rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv);
+							}
 						}
-						else if(DOMAG_Z)
+					}
+					else if(startmag_type == 2)
+					{//zdy-warning-not-available
+						complex<double> swap = ZERO;
+						complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
+						for(int ia = 0;ia<atom->na;ia++)
 						{
-							rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv);
+							for (int ig = 0; ig < pw.ngmc ; ig++)
+							{
+								const double Gtau =
+									pw.gcar[ig].x * atom->tau[ia].x
+									+ pw.gcar[ig].y * atom->tau[ia].y
+									+ pw.gcar[ig].z * atom->tau[ia].z;
+
+								swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
+
+								rho_g3d(0, ig) += swap;
+								if(DOMAG)
+								{
+									rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* sin(soc.angle1[it]) * cos(soc.angle2[it]);
+									rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* sin(soc.angle1[it]) * sin(soc.angle2[it]);
+									rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* cos(soc.angle1[it]);
+								}
+								else if(DOMAG_Z)
+								{
+									rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv);
+								}
+							}
 						}
 					}
 				}
+				else
+				{
+					WARNING_QUIT("Charge::spin_number_need"," Either 1 or 2 or 4, check SPIN number !");
+				}
 			}
 		}
-		else
-		{
-			WARNING_QUIT("Charge::spin_number_need"," Either 1 or 2 or 4, check SPIN number !");
-		}
-	}
-
-    delete [] rho_lgl;
-    delete [] rho1d;;
-
+		return rho_g3d;
+	}();
 
 	assert( spin_number_need > 0 );
-	double* ne = new double[spin_number_need];
-	ZEROS( ne, spin_number_need);
+	vector<double> ne(spin_number_need);
     for (int is = 0; is < spin_number_need;is++)
     {
         UFFT.ToRealSpace( is, rho_g3d, rho_in[is]);
 
 		for(int ir=0; ir<pw.nrxx; ++ir)
-		{
 			ne[is] += rho_in[is][ir];
-		}
 		ne[is] *= ucell.omega/(double)pw.ncxyz; 
 		Parallel_Reduce::reduce_double_pool( ne[is] );
 
@@ -487,12 +478,8 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 	OUT(ofs_warning,"total electron number from rho",ne_tot);
 	OUT(ofs_warning,"should be",ucell.nelec);
 	for(int is=0; is<spin_number_need; ++is)
-	{
 		for(int ir=0; ir<pw.nrxx; ++ir)
-		{
 			rho_in[is][ir] = rho_in[is][ir] / ne_tot * ucell.nelec;
-		}
-	}
 
 	// if TWO_EFEMI, 
 	// the total magnetism will affect the calculation of
@@ -501,7 +488,6 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 
 	//ofs_running << " Superposition of atomic wave function as First-Charge done." << endl;
 	//2014-06-22
-	delete[] ne;
 
     timer::tick("Charge","atomic_rho");
     return;
@@ -651,7 +637,7 @@ void Charge::non_linear_core_correction
             {
                 aux [ir] = r [ir] * r [ir] * rhoc [ir];
             }
-            Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+            Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
             //rhocg [1] = fpi * rhocg1 / omega;
             rhocg [0] = FOUR_PI * rhocg1 / ucell.omega;//mohan modify 2008-01-19
             igl0 = 1;
@@ -666,7 +652,7 @@ void Charge::non_linear_core_correction
             {
                 aux [ir] = r[ir] * r[ir] * rhoc [ir] * aux [ir];
             } //  enddo
-            Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+            Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
             rhocg [igl] = FOUR_PI * rhocg1 / ucell.omega;
         } //  enddo
         delete [] aux;
diff --git a/ABACUS.develop/source/src_pw/charge_extra.cpp b/ABACUS.develop/source/src_pw/charge_extra.cpp
index 8ef14f105c..13e0cdac34 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.cpp
+++ b/ABACUS.develop/source/src_pw/charge_extra.cpp
@@ -14,6 +14,12 @@ Charge_Extra::Charge_Extra()
 	// for second-order extrapolation
 	this->delta_rho3 = new double*[NSPIN];
 
+	// TODO: move the allocation of the following arrays out of this
+	// constructor. Charge_Extra is a member of LOOP_ions, so this
+	// constructor may run before pw.nrxx has been initialized, and
+	// the arrays below would then be sized from an unset pw.nrxx.
+	// Allocate them somewhere else, once pw.nrxx is known.
+	// mohan add 2021-03-30
 	for(int is=0; is<NSPIN; is++)
 	{
 		delta_rho1[is] = new double[pw.nrxx];
@@ -414,3 +420,27 @@ void Charge_Extra::find_alpha_and_beta(void)
 	}
 	return;
 }
+
+void Charge_Extra::save_pos_next(const UnitCell_pseudo& ucell)
+{
+	ucell.save_cartesian_position(this->pos_next);
+	return;
+}
+
+void Charge_Extra::update_istep(const int &step)
+{
+	this->istep = step;
+	return;
+}
+
+void Charge_Extra::update_all_pos(const UnitCell_pseudo& ucell)
+{
+	const int total_freedom = ucell.nat * 3;
+	for(int i=0;i<total_freedom;i++)
+	{
+		this->pos_old2[i] = this->pos_old1[i];
+		this->pos_old1[i] = this->pos_now[i];
+	}
+	ucell.save_cartesian_position(this->pos_now);
+	return;
+}
diff --git a/ABACUS.develop/source/src_pw/charge_extra.h b/ABACUS.develop/source/src_pw/charge_extra.h
index 67fffaaeb2..cd323f1514 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.h
+++ b/ABACUS.develop/source/src_pw/charge_extra.h
@@ -1,5 +1,6 @@
 #ifndef CHARGE_EXTRA_H
 #define CHARGE_EXTRA_H
+#include "src_pw/unitcell_pseudo.h"
 
 using namespace std;
 
@@ -17,6 +18,11 @@ class Charge_Extra
 	void allocate_ions(void);
 	void extrapolate_charge(void);
 
+	void save_pos_next(const UnitCell_pseudo& ucell);
+	void update_istep(const int &step);
+	void update_all_pos(const UnitCell_pseudo& ucell);
+
+	private:
 	// use "istep = ions.istep"
 	int istep;
 
diff --git a/ABACUS.develop/source/src_pw/diago_cg.cpp b/ABACUS.develop/source/src_pw/diago_cg.cpp
index ed6fb16dcc..6d69f022ec 100644
--- a/ABACUS.develop/source/src_pw/diago_cg.cpp
+++ b/ABACUS.develop/source/src_pw/diago_cg.cpp
@@ -444,7 +444,7 @@ void Diago_CG::schmit_orth
     //qianrui replace 2021-3-15
     char trans2='N';
     zgemv_(&trans2,&dim,&m,&NEG_ONE,psi.c,&dmx,lagrange,&inc,&ONE,psi_m,&inc);
-    psi_norm -= ddot_real(m,lagrange,lagrange);
+    psi_norm -= ddot_real(m,lagrange,lagrange,false);
     //======================================================================
     /*for (int j = 0; j < m; j++)
     {
@@ -486,23 +486,8 @@ double Diago_CG::ddot_real
 (
     const int &dim,
     const complex<double>* psi_L,
-    const complex<double>* psi_R
-)
-{
-    complex<double> result(0,0);
-    for (int i=0;i<dim;i++)
-    {
-        result += conj( psi_L[i] ) * psi_R[i];
-    }
-    Parallel_Reduce::reduce_complex_double_pool( result );
-    return result.real();
-}
-
-complex<double> Diago_CG::ddot
-(
-    const int & dim,
-    const complex<double> * psi_L,
-    const complex<double> * psi_R
+    const complex<double>* psi_R,
+    const bool reduce
 )
 {
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@@ -513,7 +498,7 @@ complex<double> Diago_CG::ddot
     pL=(double *)psi_L;
     pR=(double *)psi_R;
     double result=LapackConnector::dot(dim2,pL,1,pR,1);
-    Parallel_Reduce::reduce_double_pool( result );
+    if(reduce)  Parallel_Reduce::reduce_double_pool( result );
     return result;
     //======================================================================
     /*complex<double> result(0,0);
@@ -524,8 +509,23 @@ complex<double> Diago_CG::ddot
     Parallel_Reduce::reduce_complex_double_pool( result );
     return result.real();*/
     //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-}  // end of ddot
+}
 
+complex<double> Diago_CG::ddot
+(
+    const int & dim,
+    const complex<double> * psi_L,
+    const complex<double> * psi_R
+)
+{
+    complex<double> result(0, 0);
+    for (int i = 0; i < dim ; i++)
+    {
+        result += conj(psi_L[i]) *  psi_R[i] ;
+    }
+    Parallel_Reduce::reduce_complex_double_pool( result );
+    return result;
+}  // end of ddot
 
 // this return <psi(m)|psik>
 complex<double> Diago_CG::ddot
diff --git a/ABACUS.develop/source/src_pw/diago_cg.h b/ABACUS.develop/source/src_pw/diago_cg.h
index d389d90422..7065ade9de 100644
--- a/ABACUS.develop/source/src_pw/diago_cg.h
+++ b/ABACUS.develop/source/src_pw/diago_cg.h
@@ -15,13 +15,15 @@ class Diago_CG
     static double ddot_real(
         const int & dim,
         const complex<double>* psi_L,
-        const complex<double>* psi_R) ;
+        const complex<double>* psi_R,
+        const bool reduce = true) ;
 
     static complex<double> ddot(
         const int & dim,
         const complex<double>* psi_L,
         const complex<double>* psi_R ) ;
 
+
     static complex<double> ddot(
         const int & dim,
         const ComplexMatrix &psi,
diff --git a/ABACUS.develop/source/src_pw/forces.cpp b/ABACUS.develop/source/src_pw/forces.cpp
index 2b944bf137..580e51581b 100644
--- a/ABACUS.develop/source/src_pw/forces.cpp
+++ b/ABACUS.develop/source/src_pw/forces.cpp
@@ -5,6 +5,7 @@
 #include "symmetry.h"
 // new
 #include "H_XC_pw.h"
+#include "../src_global/math_integral.h"
 
 double Forces::output_acc = 1.0e-8; // (Ryd/angstrom).	
 
@@ -837,7 +838,7 @@ void Forces::cal_force_scc(matrix& forcescc)
                     aux[ir] = ucell.atoms[nt].rho_at[ir] * sin(gxx) / gxx;
                 }
             }
-            Mathzone::Simpson_Integral(mesh , aux, ucell.atoms[nt].rab , rhocgnt [ig]);
+            Integral::Simpson_Integral(mesh , aux, ucell.atoms[nt].rab , rhocgnt [ig]);
         }
 
         int iat = 0;
diff --git a/ABACUS.develop/source/src_pw/global.h b/ABACUS.develop/source/src_pw/global.h
index c9039d6d78..b417e75361 100644
--- a/ABACUS.develop/source/src_pw/global.h
+++ b/ABACUS.develop/source/src_pw/global.h
@@ -7,8 +7,8 @@
 #define GLOBAL_H
 
 #include "../run_pw.h"
-#include "src_global/global_variable.h"
-#include "src_global/global_function.h"
+#include "../src_global/global_variable.h"
+#include "../src_global/global_function.h"
 #include "pw_basis.h"
 #include "energy.h"
 #include "pseudopot_cell_vnl.h"
diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.cpp b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
index 702e12382c..2a1181ace7 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.cpp
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
@@ -13,7 +13,6 @@ Hamilt_PW::Hamilt_PW()
     spsi = new complex<double>[1];
     GR_index = new int[1];
     Bec = new complex<double>[1];
-    Ps = new complex<double>[1];
 }
 
 Hamilt_PW::~Hamilt_PW()
@@ -22,7 +21,6 @@ Hamilt_PW::~Hamilt_PW()
     delete[] spsi;
     delete[] GR_index;
     delete[] Bec;
-    delete[] Ps;
 }
 
 
@@ -43,13 +41,11 @@ void Hamilt_PW::allocate(
     delete[] spsi;
     delete[] GR_index;
     delete[] Bec;
-    delete[] Ps;
 
     this->hpsi = new complex<double> [npwx * npol];
     this->spsi = new complex<double> [npwx * npol];
     this->GR_index = new int[nrxx];
     this->Bec = new complex<double> [nkb];
-    this->Ps  = new complex<double> [nkb * npol];
 
     ZEROS(this->hpsi, npwx * npol);
     ZEROS(this->spsi, npwx * npol);
@@ -122,7 +118,7 @@ void Hamilt_PW::cinitcgg(
     ComplexMatrix hvec(nstart,n_band);
 	int dmin,dmax;
 	const int npw = kv.ngk[ik];
-	if(!NONCOLIN)
+	if(NSPIN != 4)
 	{
 		dmin= npw;
 		dmax = wf.npwx;
@@ -135,16 +131,19 @@ void Hamilt_PW::cinitcgg(
 	complex<double> *aux=new complex<double> [dmax*nstart];
 	complex<double> *paux = aux;
 	complex<double> *ppsi = psi.c;
-	for(int m=0;m<nstart;++m)
-	{
-		this->h_psi(ppsi, paux);
-		paux += dmax;
-		ppsi += dmax;
-	}
+	//qianrui: apply h_psi to all nstart bands in one call; it replaces the per-band loop commented out below
+	this->h_psi(psi.c, aux, nstart);
+	//for(int m=0;m<nstart;++m)
+	//{
+	//	this->h_psi(ppsi, paux);
+	//	paux += dmax;
+	//	ppsi += dmax;
+	//}
 	char trans1 = 'C';
 	char trans2 = 'N';
 	zgemm_(&trans1,&trans2,&nstart,&nstart,&dmin,&ONE,psi.c,&dmax,aux,&dmax,&ZERO,hc.c,&nstart);
 	hc=transpose(hc,false);
+
 	zgemm_(&trans1,&trans2,&nstart,&nstart,&dmin,&ONE,psi.c,&dmax,psi.c,&dmax,&ZERO,sc.c,&nstart);
 	sc=transpose(sc,false);
 	//After psis are strictly normalized, we should use this part. 
@@ -462,103 +461,157 @@ void Hamilt_PW::s_1psi
 }
 
 
-void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi)
+void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi, const int m)
 {
     timer::tick("Hamilt_PW","h_psi",'H');
     int i = 0;
     int j = 0;
     int ig= 0;
 
-	if(NSPIN!=4) ZEROS(hpsi, wf.npw);
-	else ZEROS(hpsi, wf.npwx * NPOL);//added by zhengdy-soc
+	//if(NSPIN!=4) ZEROS(hpsi, wf.npw);
+	//else ZEROS(hpsi, wf.npwx * NPOL);//added by zhengdy-soc
+	int dmax = wf.npwx * NPOL;
 
 	//------------------------------------
 	//(1) the kinetical energy.
 	//------------------------------------
+	complex<double> *tmhpsi;
+	const complex<double> *tmpsi_in;
  	if(T_IN_H)
 	{	
-		for (ig = 0;ig < wf.npw;ig++)
+		tmhpsi = hpsi;
+		tmpsi_in = psi_in;
+		for(int ib = 0 ; ib < m; ++ib)
 		{
-			hpsi[ig] = wf.g2kin[ig] * psi_in[ig];
-		}
-		//added by zhengdy-soc
-		if(NSPIN==4)
-		{
-			for (ig = wf.npwx;ig < wf.npw + wf.npwx;ig++)
+			for(ig = 0;ig < wf.npw; ++ig)
 			{
-				hpsi[ig] = wf.g2kin[ig - wf.npwx] * psi_in[ig];
+				tmhpsi[ig] = wf.g2kin[ig] * tmpsi_in[ig];
 			}
+			if(NSPIN==4){
+				for(ig=wf.npw; ig < wf.npwx; ++ig)
+				{
+					tmhpsi[ig] = 0;
+				}
+				tmhpsi +=wf.npwx;
+				tmpsi_in += wf.npwx;
+				for (ig = 0;ig < wf.npw ;++ig)
+				{
+					tmhpsi[ig] = wf.g2kin[ig] * tmpsi_in[ig];
+				}
+				for(ig=wf.npw; ig < wf.npwx; ++ig)
+				{
+					tmhpsi[ig] =0;
+				}
+			}
+			tmhpsi += wf.npwx;
+			tmpsi_in += wf.npwx;
 		}
 	}
 
 	//------------------------------------
 	//(2) the local potential.
-	//------------------------------------
+	//-----------------------------------
+	timer::tick("Hamilt_PW","vloc",'H');
 	if(VL_IN_H)
 	{
-		if(NSPIN!=4)
+		tmhpsi = hpsi;
+		tmpsi_in = psi_in;
+		for(int ib = 0 ; ib < m; ++ib)
 		{
-			ZEROS( UFFT.porter, pw.nrxx);
-			UFFT.RoundTrip( psi_in, pot.vr_eff1, GR_index, UFFT.porter );
-
-			for (j = 0;j < wf.npw;j++)
-			{
-				hpsi[j] += UFFT.porter[ GR_index[j] ];
-			}
-		}
-		else
-		{
-			complex<double>* porter1 = new complex<double>[pw.nrxx];
-			ZEROS( UFFT.porter, pw.nrxx);
-			ZEROS( porter1, pw.nrxx);
-			for (int ig=0; ig< wf.npw; ig++)
-			{
-				UFFT.porter[ GR_index[ig]  ] = psi_in[ig];
-				porter1[ GR_index[ig]  ] = psi_in[ig + wf.npwx];
+			if(NSPIN!=4){
+				ZEROS( UFFT.porter, pw.nrxx);
+				UFFT.RoundTrip( tmpsi_in, pot.vr_eff1, GR_index, UFFT.porter );
+				for (j = 0;j < wf.npw;j++)
+				{
+					tmhpsi[j] += UFFT.porter[ GR_index[j] ];
+				}
 			}
-			// (2) fft to real space and doing things.
-			pw.FFT_wfc.FFT3D( UFFT.porter, 1);
-			pw.FFT_wfc.FFT3D( porter1, 1);
-			complex<double> sup,sdown;
-			for (int ir=0; ir< pw.nrxx; ir++)
+			else
 			{
-				sup = UFFT.porter[ir] * (pot.vr_eff(0,ir) + pot.vr_eff(3,ir)) +
-					porter1[ir] * (pot.vr_eff(1,ir) - complex<double>(0.0,1.0) * pot.vr_eff(2,ir));
-				sdown = porter1[ir] * (pot.vr_eff(0,ir) - pot.vr_eff(3,ir)) +
-				UFFT.porter[ir] * (pot.vr_eff(1,ir) + complex<double>(0.0,1.0) * pot.vr_eff(2,ir));
-				UFFT.porter[ir] = sup;
-				porter1[ir] = sdown;
-			}
-			// (3) fft back to G space.
-			pw.FFT_wfc.FFT3D( UFFT.porter, -1);
-			pw.FFT_wfc.FFT3D( porter1, -1);
+				complex<double>* porter1 = new complex<double>[pw.nrxx];
+				ZEROS( UFFT.porter, pw.nrxx);
+				ZEROS( porter1, pw.nrxx);
+				for (int ig=0; ig< wf.npw; ig++)
+				{
+					UFFT.porter[ GR_index[ig]  ] = tmpsi_in[ig];
+					porter1[ GR_index[ig]  ] = tmpsi_in[ig + wf.npwx];
+				}
+				// (2) fft to real space and doing things.
+				pw.FFT_wfc.FFT3D( UFFT.porter, 1);
+				pw.FFT_wfc.FFT3D( porter1, 1);
+				complex<double> sup,sdown;
+				for (int ir=0; ir< pw.nrxx; ir++)
+				{
+					sup = UFFT.porter[ir] * (pot.vr_eff(0,ir) + pot.vr_eff(3,ir)) +
+						porter1[ir] * (pot.vr_eff(1,ir) - complex<double>(0.0,1.0) * pot.vr_eff(2,ir));
+					sdown = porter1[ir] * (pot.vr_eff(0,ir) - pot.vr_eff(3,ir)) +
+					UFFT.porter[ir] * (pot.vr_eff(1,ir) + complex<double>(0.0,1.0) * pot.vr_eff(2,ir));
+					UFFT.porter[ir] = sup;
+					porter1[ir] = sdown;
+				}
+				// (3) fft back to G space.
+				pw.FFT_wfc.FFT3D( UFFT.porter, -1);
+				pw.FFT_wfc.FFT3D( porter1, -1);
 
-			for (j = 0;j < wf.npw;j++)
-			{
-				hpsi[j] += UFFT.porter[ GR_index[j] ];
-				hpsi[j+wf.npwx] += porter1[ GR_index[j] ];
+				for (j = 0;j < wf.npw;j++)
+				{
+					tmhpsi[j] += UFFT.porter[ GR_index[j] ];
+				}
+				for (j = 0;j < wf.npw;j++ )
+				{
+					tmhpsi[j+wf.npwx] += porter1[ GR_index[j] ];
+				}
+				delete[] porter1;
 			}
-			delete[] porter1;
+			tmhpsi += dmax;
+			tmpsi_in += dmax;
 		}
 	}
+	timer::tick("Hamilt_PW","vloc",'H');
 
 	//------------------------------------
 	// (3) the nonlocal pseudopotential.
 	//------------------------------------
+	timer::tick("Hamilt_PW","vnl",'H');
 	if(VNL_IN_H)
 	{
 		if ( ppcell.nkb > 0)
 		{
 			//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-			//qianrui improve 2021-3-16
+			//qianrui optimize 2021-3-31
 			int nkb=ppcell.nkb;
-			ComplexMatrix becp(NPOL,nkb,false);
+			ComplexMatrix becp(NPOL * m, nkb, false);
 			char transa = 'C';
 			char transb = 'N';
-			zgemm_(&transa,&transb,&nkb,&NPOL,&wf.npw,&ONE,ppcell.vkb.c,&wf.npwx,psi_in,&wf.npwx,&ZERO,becp.c,&nkb);
-			becp=transpose(becp,false);
-			Parallel_Reduce::reduce_complex_double_pool( becp.c, ppcell.nkb * NPOL);
-			this->add_vuspsi(hpsi, becp.c);
+			if(m==1 && NPOL==1)
+			{
+				int inc = 1;
+				zgemv_(&transa, &wf.npw, &nkb, &ONE, ppcell.vkb.c, &wf.npwx, psi_in, &inc, &ZERO, becp.c, &inc);
+			}
+			else
+			{
+				int npm = NPOL * m;
+				zgemm_(&transa,&transb,&nkb,&npm,&wf.npw,&ONE,ppcell.vkb.c,&wf.npwx,psi_in,&wf.npwx,&ZERO,becp.c,&nkb);
+				//add_vuspsi is modified, thus the transpose is not needed here.
+				//if(NONCOLIN)
+				//{
+				//	ComplexMatrix partbecp(NPOL, nkb ,false);
+				//	for(int ib = 0; ib < m; ++ib)
+				//	{
+//
+				//		for ( i = 0;i < NPOL;i++)
+				//			for (j = 0;j < nkb;j++)
+				//				partbecp(i, j) = tmbecp[i*nkb+j];
+				//		for (j = 0; j < nkb; j++)
+				//			for (i = 0;i < NPOL;i++)
+				//				tmbecp[j*NPOL+i] = partbecp(i, j);
+				//		tmbecp += NPOL * nkb;
+				//	}
+				//}
+			}
+
+			Parallel_Reduce::reduce_complex_double_pool( becp.c, nkb * NPOL * m);
+			this->add_vuspsi(hpsi, becp.c, m);
 			//======================================================================
 			/*complex<double> *becp = new complex<double>[ ppcell.nkb * NPOL ];
 			ZEROS(becp,ppcell.nkb * NPOL);
@@ -582,21 +635,25 @@ void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi)
 			//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 		}
 	}
-
+	timer::tick("Hamilt_PW","vnl",'H');
     timer::tick("Hamilt_PW","h_psi",'H');
     return;
 }
 
-void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
+void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp, const int m)
 {
     timer::tick("Hamilt_PW","add_vuspsi",'I');
-    ZEROS( Ps, ppcell.nkb * NPOL );
+	int nkb = ppcell.nkb;
+	complex<double> *Ps  = new complex<double> [nkb * NPOL * m];
+    ZEROS( Ps, NPOL * m * nkb);
 
     int sum = 0;
     int iat = 0;
     // this function sum up each non-local pseudopotential located in each atom,
     // all we need to do is put the right Dij coefficient to each becp, which
     // is calculated before.
+    if(NSPIN!=4)
+	{
     for (int it=0; it<ucell.ntype; it++)
     {
         const int Nprojs = ucell.atoms[it].nh;
@@ -609,29 +666,58 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
             {
                 for (int ip2=0; ip2<Nprojs; ip2++)
                 {
-					if(NSPIN!=4)
-						this->Ps[sum+ip2] += ppcell.deeq(CURRENT_SPIN, iat, ip, ip2) * becp[sum+ip];
-					else
+					for(int ib = 0; ib < m ; ++ib)
 					{
-						this->Ps[sum+ ip2*2] += ppcell.deeq_nc(0, iat, ip2, ip) * becp[sum+ip*2]
-							+ppcell.deeq_nc(1, iat, ip2, ip) * becp[sum+ip*2+1];
-						this->Ps[sum+ ip2*2+1] += ppcell.deeq_nc(2, iat, ip2, ip) * becp[sum+ip*2]
-							+ppcell.deeq_nc(3, iat, ip2, ip) * becp[sum+ip*2+1];
-					}
-				}// end ih
+						Ps[(sum + ip2) * m + ib] += ppcell.deeq(CURRENT_SPIN, iat, ip, ip2) * becp[ib * nkb + sum + ip];
+					}//end ib
+                }// end ih
+            }//end jh 
+			sum += Nprojs;
+			++iat;
+        } //end na
+    } //end nt
+	}
+	else
+	{
+	for (int it=0; it<ucell.ntype; it++)
+    {
+		int psind,becpind;
+		complex<double> becp1,becp2;
+        const int Nprojs = ucell.atoms[it].nh;
+        for (int ia=0; ia<ucell.atoms[it].na; ia++)
+        {
+            // each atom has Nprojs, means this is with structure factor;
+            // each projector (each atom) must multiply coefficient
+            // with all the other projectors.
+            for (int ip=0; ip<Nprojs; ip++)
+            {
+                for (int ip2=0; ip2<Nprojs; ip2++)
+                {
+					for(int ib = 0; ib < m ; ++ib)
+					{
+						psind = (sum+ip2) * 2 * m + ib * 2;
+						becpind = ib*nkb*2 + sum + ip;
+						becp1 =  becp[becpind];
+						becp2 =  becp[becpind + nkb];
+						Ps[psind] += ppcell.deeq_nc(0, iat, ip2, ip) * becp1
+							+ppcell.deeq_nc(1, iat, ip2, ip) * becp2;
+						Ps[psind +1] += ppcell.deeq_nc(2, iat, ip2, ip) * becp1
+							+ppcell.deeq_nc(3, iat, ip2, ip) * becp2;
+					}//end ib
+                }// end ih
             }//end jh
-		if(NSPIN!=4) sum += Nprojs;
-		else sum += 2 * Nprojs;
-		++iat;
+		 	sum += Nprojs;
+			++iat;
         } //end na
     } //end nt
+	}
 
 	/*
     for (int ig=0;ig<wf.npw;ig++)
     {
         for (int i=0;i< ppcell.nkb;i++)
         {
-            hpsi_in[ig]+=this->Ps[i]*ppcell.vkb(i,ig);
+            hpsi_in[ig]+=Ps[i]*ppcell.vkb(i,ig);
         }
     }
 	*/
@@ -639,10 +725,19 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
 
 	// use simple method.
 	//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-	//qianrui improve 2021-3-16
+	//qianrui optimize 2021-3-31
 	char transa = 'N';
 	char transb = 'T';
-	zgemm_(&transa,&transb,&wf.npw,&NPOL,&ppcell.nkb,&ONE,ppcell.vkb.c,&wf.npwx,Ps,&NPOL,&ONE,hpsi_in,&wf.npwx);
+	if(NPOL==1 && m==1)
+	{
+		int inc = 1;
+		zgemv_(&transa, &wf.npw, &ppcell.nkb, &ONE, ppcell.vkb.c, &wf.npwx, Ps, &inc, &ONE, hpsi_in, &inc);
+	}
+	else
+	{
+		int npm = NPOL*m;
+		zgemm_(&transa,&transb,&wf.npw,&npm,&ppcell.nkb,&ONE,ppcell.vkb.c,&wf.npwx,Ps,&npm,&ONE,hpsi_in,&wf.npwx);
+	}
 	//======================================================================
 	/*if(!NONCOLIN)
 	for(int i=0; i<ppcell.nkb; i++)
@@ -671,7 +766,7 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
 		}
 	}*/
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
+	delete []Ps;
     timer::tick("Hamilt_PW","add_vuspsi",'I');
     return;
 }
diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.h b/ABACUS.develop/source/src_pw/hamilt_pw.h
index 6113c5ae32..b79d094e2a 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.h
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.h
@@ -50,7 +50,7 @@ class Hamilt_PW
         complex<double> *hpsi,
         complex<double> *spsi);
 
-    void h_psi( const complex<double> *psi, complex<double> *hpsi);
+    void h_psi( const complex<double> *psi, complex<double> *hpsi, const int m = 1); // qianrui add a default parameter 2021-3-31
 
     void s_1psi(
         const int npw,
@@ -66,11 +66,9 @@ class Hamilt_PW
     // hpsi , spsi
     complex<double> *hpsi;
     complex<double> *spsi;
-
     complex<double> *Bec;
-    complex<double> *Ps;
 
-    void add_vuspsi(complex<double> *hpsi, const complex<double> *becp);
+    void add_vuspsi(complex<double> *hpsi, const complex<double> *becp, const int m);
 
 	private:
 
diff --git a/ABACUS.develop/source/src_pw/ions.cpp b/ABACUS.develop/source/src_pw/ions.cpp
index d3faddfcd1..57f5177d5c 100644
--- a/ABACUS.develop/source/src_pw/ions.cpp
+++ b/ABACUS.develop/source/src_pw/ions.cpp
@@ -156,29 +156,9 @@ void Ions::opt_ions_pw(void)
 		}
 	
 
-		int iat=0; //LiuXh add 20180619
 		if(CALCULATION=="relax"|| CALCULATION=="md" || CALCULATION=="cell-relax")
 		{
-			for(int it = 0;it < ucell.ntype;it++)
-			{
-				Atom* atom = &ucell.atoms[it];
-				for(int ia =0;ia< ucell.atoms[it].na;ia++)
-				{
-					CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-					CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-					CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-					CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-					CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-					CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-					CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-					CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-					CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-					iat++;
-				}
-			}
+			CE.update_all_pos(ucell);
 		}
 
 		if(pot.out_potential == 2)
@@ -224,7 +204,7 @@ void Ions::opt_ions_pw(void)
 
     }
 
-    if(CALCULATION=="scf" || CALCULATION=="relax")
+    if(CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")
     {
         ofs_running << "\n\n --------------------------------------------" << endl;
         ofs_running << setprecision(16);
@@ -404,25 +384,15 @@ bool Ions::force_stress(const int &istep, int &force_step, int &stress_step)  //
             }
             else
             {
-                //stress_step = 1;
+				ofs_running << " Setup the structure factor in plane wave basis." << endl;
                 pw.setup_structure_factor();
-                int iat=0; //LiuXh add 20180619
-                for(int it = 0;it < ucell.ntype;it++)
-                {
-                    Atom* atom = &ucell.atoms[it];
-                    for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                    {
-                        CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                        CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                        CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                        iat++;
-                    }
-                }
-                CE.istep = force_step;
-
+				ofs_running << " Setup the extrapolated charge." << endl;
+                CE.save_pos_next(ucell);
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
+				ofs_running << " Setup the Vl+Vh+Vxc according to new structure factor and new charge." << endl;
                 pot.init_pot( istep, pw.strucFac );
+				ofs_running << " Setup the new wave functions?" << endl;
                 wf.wfcinit();
                 ++force_step;
                 return 0;
diff --git a/ABACUS.develop/source/src_pw/potential.cpp b/ABACUS.develop/source/src_pw/potential.cpp
index c95cffda30..8998db9ffe 100644
--- a/ABACUS.develop/source/src_pw/potential.cpp
+++ b/ABACUS.develop/source/src_pw/potential.cpp
@@ -117,7 +117,7 @@ void Potential::init_pot(
             for(int is=0; is<NSPIN; is++)
             {
                 stringstream ssc;
-                ssc << global_out_dir << "SPIN" << is + 1 << "_CHG";
+                ssc << global_readin_dir << "SPIN" << is + 1 << "_CHG";
                 ofs_running << ssc.str() << endl;
                 // mohan update 2012-02-10
                 if(CHR.read_rho( is, ssc.str(), CHR.rho[is] )) 
diff --git a/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp b/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
index 7c08a04648..8c879bdde9 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
@@ -2,6 +2,7 @@
 #include "tools.h"
 #include "../src_pw/myfunc.h"
 #include "pseudopot_cell_vl.h"
+#include "../src_global/math_integral.h"
 
 pseudopot_cell_vl::pseudopot_cell_vl()
 {
@@ -138,7 +139,7 @@ void pseudopot_cell_vl::vloc_of_g(
 	{
 		aux[ir] = r[ir] * zp_in * e2 / ucell.omega;
 	}
-	Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
+	Integral::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
 	vloc_1d[0] *= 4*3.1415926;
 	cout << "  vloc_1d[0]=" <<  vloc_1d[0]/pw.ngmc << endl;
 	cout << "  vloc_1d[0]=" <<  vloc_1d[0]/pw.ncxyz << endl;
@@ -156,7 +157,7 @@ void pseudopot_cell_vl::vloc_of_g(
 			aux[ir] = r [ir] * (r [ir] * vloc_at [ir] + zp_in * e2);
 			//aux[ir] = r [ir] * (r [ir] * vloc_at [ir] );
 		}
-		Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
+		Integral::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
 		igl0 = 1;	
 	}
 	else
@@ -182,7 +183,7 @@ void pseudopot_cell_vl::vloc_of_g(
 		{
 			aux [ir] = aux1 [ir] * sin(gx * r [ir]) / gx;
 		}
-		Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
+		Integral::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
 		//  here we add the analytic fourier transform of the erf function
 		vloc_1d[ig] -= fac * exp(- gx2 * 0.25)/ gx2;
 	} // enddo
diff --git a/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp b/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
index bc06304a20..fd83eb7201 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
@@ -7,6 +7,7 @@
 #include "tools.h"
 #include "wavefunc.h"
 #include "../src_lcao/ORB_gen_tables.h"
+#include "../src_global/math_integral.h"
 
 pseudopot_cell_vnl::pseudopot_cell_vnl()
 {
@@ -398,7 +399,7 @@ void pseudopot_cell_vnl::init_vnl(UnitCell_pseudo &cell)
 					          jl[ir] * cell.atoms[it].r[ir];
 				} 
 				double vqint;
-				Mathzone::Simpson_Integral(kkbeta, aux, cell.atoms[it].rab, vqint);
+				Integral::Simpson_Integral(kkbeta, aux, cell.atoms[it].rab, vqint);
 				this->tab(it, ib, iq) = vqint * pref;
 			} 
 		} 
@@ -624,7 +625,7 @@ void pseudopot_cell_vnl::init_vnl_alpha(void)          // pengfei Li 2018-3-23
 								  ucell.atoms[it].r[ir] * ucell.atoms[it].r[ir];
 					}
 					double vqint;
-					Mathzone::Simpson_Integral(kkbeta, aux, ucell.atoms[it].rab, vqint);
+					Integral::Simpson_Integral(kkbeta, aux, ucell.atoms[it].rab, vqint);
 					this->tab_alpha(it, ib, L, iq) = vqint * pref;
 				}
 			}
diff --git a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
index 2b6a76f8fa..235e53bbb3 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
@@ -9,7 +9,7 @@
 #include <math.h>
 #include <string>
 #include <sstream>
-#include <cstring>		// Peize Lin fix bug about strcpy 2016-08-02
+#include <cstring> // Peize Lin fix bug about strcpy 2016-08-02
 
 int Number[2];
 
@@ -35,14 +35,14 @@ Pseudopot_upf::Pseudopot_upf()
 	this->jchi = new double[1];
 	this->jjj = new double[1];
 
-        functional_error = 0;//xiaohui add 2015-03-24
+	functional_error = 0;//xiaohui add 2015-03-24
 }
 
 Pseudopot_upf::~Pseudopot_upf()
 {
-	delete [] els;  //header_15
-	delete [] lchi; //header_16
-	delete [] oc;   //header_17
+	delete [] els; 
+	delete [] lchi;
+	delete [] oc;
 
 	delete [] r;    //mesh_1
 	delete [] rab;  //mesh_2
@@ -61,7 +61,7 @@ Pseudopot_upf::~Pseudopot_upf()
 
 int Pseudopot_upf::init_pseudo_reader(const string &fn)
 {
-    if(test_pp) TITLE("Pseudopot_upf","init");
+    TITLE("Pseudopot_upf","init");
     // First check if this pseudo-potential has spin-orbit information
     ifstream ifs(fn.c_str(), ios::in);
 
@@ -72,10 +72,10 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
     }
 
     //cout << "global_pseudo_type =" << global_pseudo_type << endl;
-    if(global_pseudo_type=="auto") //{zws
+    if(global_pseudo_type=="auto") //zws
 	{
 		set_pseudo_type(fn);
-	} //}
+	}
 
 	// read in the .UPF type of pseudopotentials
 	if(global_pseudo_type=="upf")
@@ -98,9 +98,6 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
 		return info;
 	}
 
-
-
-
 	return 0;
 }
 
@@ -108,7 +105,7 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
 //----------------------------------------------------------
 // setting the type of the pseudopotential file
 //----------------------------------------------------------
-int Pseudopot_upf::set_pseudo_type(const string &fn) //{zws add
+int Pseudopot_upf::set_pseudo_type(const string &fn) //zws add
 {
     ifstream pptype_ifs(fn.c_str(), ios::in);
     string dummy, strversion;
@@ -202,7 +199,7 @@ int Pseudopot_upf::read_pseudo_vwr(ifstream &ifs)
 	if(mesh%2==0) 
 	{
 		mesh=mesh-1;
-		ofs_running << " Mesh number - 1, because we need odd number, \n this may affect some polar atomic orbitals." << endl;
+		ofs_running << " Mesh number - 1, we need odd number, \n this may affect some polar atomic orbitals." << endl;
 	}
 	ofs_running << setw(15) << "MESH" << setw(15) << mesh << endl;
 	// (2) read in nlcc: nonlinear core correction
@@ -2296,3 +2293,25 @@ int Pseudopot_upf::average_p()
 	this->has_so = 0;	
 	return error;
 }
+
+// Peize Lin add for bsse 2021.04.07
+void Pseudopot_upf::set_empty_element(void) // resets zp, vloc, dion and rho_at to zero; NOTE(review): presumably makes this species a potential-free "ghost" element for BSSE runs - confirm with caller
+{
+	this->zp = 0; // scalar member, cleared directly
+	for(int ir=0; ir<mesh; ++ir) // vloc: clear the first `mesh` entries
+	{
+		this->vloc[ir] = 0;
+	}
+	for(int i=0; i<nbeta; ++i) // dion: clear the whole nbeta x nbeta block
+	{
+		for(int j=0; j<nbeta; ++j)
+		{
+			this->dion(i,j) = 0;
+		}
+	}
+	for(int ir=0; ir<mesh; ++ir) // rho_at: clear the first `mesh` entries
+	{
+		this->rho_at[ir] = 0;
+	}
+	return;
+}
diff --git a/ABACUS.develop/source/src_pw/pseudopot_upf.h b/ABACUS.develop/source/src_pw/pseudopot_upf.h
index 2d24d4f749..23c41007ef 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_upf.h
+++ b/ABACUS.develop/source/src_pw/pseudopot_upf.h
@@ -89,6 +89,7 @@ class Pseudopot_upf
 
 	bool functional_error;//xiaohui add 2015-03-24
 	int average_p(); //zhengdy add 2020-10-20
+	void set_empty_element();		// Peize Lin add for bsse 2021.04.07
 
 private:
 
diff --git a/ABACUS.develop/source/src_pw/pw_basis.cpp b/ABACUS.develop/source/src_pw/pw_basis.cpp
index 900ee567fc..2af04bfc65 100644
--- a/ABACUS.develop/source/src_pw/pw_basis.cpp
+++ b/ABACUS.develop/source/src_pw/pw_basis.cpp
@@ -6,6 +6,7 @@
 #include "tools.h"
 #include "pw_basis.h"
 #include "../src_pw/pw_complement.h"
+#include <omp.h>
 
 PW_Basis::PW_Basis()
 {
@@ -729,63 +730,35 @@ void PW_Basis::get_nggm(const int ngmc_local)
 
 
 //  Calculate structure factor
-void PW_Basis::setup_structure_factor(void)
+void PW_Basis::setup_structure_factor(void)			// Peize Lin optimize and add OpenMP 2021.04.01
 {
     TITLE("PW_Basis","setup_structure_factor");
     timer::tick("PW_Basis","setup_struc_factor");
-    complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-    complex<double> x;
+    const complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
 
-    this->strucFac.create( Ucell->ntype, this->ngmc);
-    this->strucFac.zero_out();
+    this->strucFac.create(Ucell->ntype, this->ngmc);
     Memory::record("PW_Basis","struc_fac", Ucell->ntype*this->ngmc,"complexmatrix");
 
-#define complex2cal_strufac
 //	string outstr;
 //	outstr = global_out_dir + "strucFac.dat"; 
 //	ofstream ofs( outstr.c_str() ) ;
 
-    for (int it=0; it< Ucell->ntype; it++)
+    for (int it=0; it<Ucell->ntype; it++)
     {
-        const Atom* atom = &Ucell->atoms[it];	
-        for (int ig=0; ig< this->ngmc; ig++)
+		const int na = Ucell->atoms[it].na;
+		const Vector3<double> * const tau = Ucell->atoms[it].tau;
+
+		#pragma omp parallel for schedule(static)
+        for (int ig=0; ig<this->ngmc; ig++)
         {
-#ifdef complex2cal_strufac
+			const Vector3<double> gcar_ig = gcar[ig];
             complex<double> sum_phase = ZERO;
-#else
-            double sum_cos = 0.0;
-            double sum_sin = 0.0;
-#endif
-            for (int ia=0; ia< atom->na; ia++)
+            for (int ia=0; ia<na; ia++)
             {
-                //----------------------------------------------------------
-                // EXPLAIN : Don't use Dot function until we can optimize
-                // it, use the following x*x + y*y + z*z instead!
-                //----------------------------------------------------------
                 // e^{-i G*tau}
-
-#ifdef complex2cal_strufac
-                sum_phase += exp( ci_tpi * (
-                                      gcar[ig].x * atom->tau[ia].x +
-                                      gcar[ig].y * atom->tau[ia].y +
-                                      gcar[ig].z * atom->tau[ia].z ) );
-#else
-                const double theta = TWO_PI * (
-                                         gcar[ig].x * atom->tau[ia].x +
-                                         gcar[ig].y * atom->tau[ia].y +
-                                         gcar[ig].z * atom->tau[ia].z );
-                sum_cos += cos( theta );
-                sum_sin += sin( theta );
-#endif
+                sum_phase += exp( ci_tpi * (gcar_ig * tau[ia]) );
             }
-#ifdef complex2cal_strufac
-            this->strucFac(it, ig) = sum_phase;
-#else
-            this->strucFac(it, ig) = complex<double>( sum_cos, -sum_sin );
-#endif
-
-//			double tmpx = strucFac(it, ig).real() ;
-//			double tmpy = strucFac(it, ig).imag() ;
+            this->strucFac(it,ig) = sum_phase;
         }
     }
 
diff --git a/ABACUS.develop/source/src_pw/stress_func_cc.cpp b/ABACUS.develop/source/src_pw/stress_func_cc.cpp
index 410c0d4bc1..0a09c7f13d 100644
--- a/ABACUS.develop/source/src_pw/stress_func_cc.cpp
+++ b/ABACUS.develop/source/src_pw/stress_func_cc.cpp
@@ -1,5 +1,6 @@
 #include "./stress_func.h"
 #include "./H_XC_pw.h"
+#include "../src_global/math_integral.h"
 
 //NLCC term, need to be tested
 void Stress_Func::stress_cc(matrix& sigma, const bool is_pw)
@@ -182,7 +183,7 @@ void Stress_Func::deriv_drhoc
 		{
 			aux [ir] = r [ir] * rhoc [ir] * (r [ir] * cos (gx * r [ir] ) / gx - sin (gx * r [ir] ) / pow(gx,2));
 		}//ir
-		Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+		Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
 		drhocg [igl] = FOUR_PI / ucell.omega * rhocg1;
 	}//igl
 	
diff --git a/ABACUS.develop/source/src_pw/stress_func_loc.cpp b/ABACUS.develop/source/src_pw/stress_func_loc.cpp
index afd8c77e37..78edceb05b 100644
--- a/ABACUS.develop/source/src_pw/stress_func_loc.cpp
+++ b/ABACUS.develop/source/src_pw/stress_func_loc.cpp
@@ -1,4 +1,5 @@
-#include"stress_func.h"
+#include "stress_func.h"
+#include "../src_global/math_integral.h"
 
 //calculate local pseudopotential stress in PW or VL_dVL stress in LCAO
 void Stress_Func::stress_loc(matrix& sigma, const bool is_pw)
@@ -176,7 +177,7 @@ double*  dvloc
 			aux [i] = aux1 [i] * (r [i] * cos (gx * r [i] ) / gx - sin (gx * r [i] ) / pow(gx,2));
 		}
 		// simpson (msh, aux, rab, vlcp);
-		Mathzone::Simpson_Integral(msh, aux, rab, vlcp );
+		Integral::Simpson_Integral(msh, aux, rab, vlcp );
 		// DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
 		vlcp *= FOUR_PI / ucell.omega / 2.0 / gx;
 		// subtract the long-range term
diff --git a/ABACUS.develop/source/src_pw/stress_pw.cpp b/ABACUS.develop/source/src_pw/stress_pw.cpp
index b66d0eae23..127f913b54 100644
--- a/ABACUS.develop/source/src_pw/stress_pw.cpp
+++ b/ABACUS.develop/source/src_pw/stress_pw.cpp
@@ -2,16 +2,14 @@
 #include "./H_XC_pw.h"
 #include "src_pw/vdwd2.h"
 
-void Stress_PW::cal_stress(matrix& sigma)
+void Stress_PW::cal_stress(matrix& sigmatot)
 {
 	TITLE("Stress_PW","cal_stress");
 	timer::tick("Stress_PW","cal_stress",'E');    
 
-	sigma.create(3,3);
+	sigmatot.create(3,3);
 	matrix sigmaxc;
 	sigmaxc.create(3,3);
-	matrix sigmatot;
-	sigmatot.create(3,3);
 	matrix sigmahar;
 	sigmahar.create(3,3);
 	matrix sigmakin;
diff --git a/ABACUS.develop/source/src_pw/stress_pw.h b/ABACUS.develop/source/src_pw/stress_pw.h
index 48e5ce1234..8a4e8f3174 100644
--- a/ABACUS.develop/source/src_pw/stress_pw.h
+++ b/ABACUS.develop/source/src_pw/stress_pw.h
@@ -11,7 +11,7 @@ class Stress_PW:public Stress_Func
 	~Stress_PW (){};
 
 	//calculate the stress in PW basis
-	void cal_stress(matrix& sigma);
+	void cal_stress(matrix& sigmatot);
 
 	private :
 	//call the vdw stress
diff --git a/ABACUS.develop/source/src_pw/unitcell.cpp b/ABACUS.develop/source/src_pw/unitcell.cpp
index a31cadaa72..5a47691423 100644
--- a/ABACUS.develop/source/src_pw/unitcell.cpp
+++ b/ABACUS.develop/source/src_pw/unitcell.cpp
@@ -266,3 +266,101 @@ void UnitCell::set_iat2it(void)
 	}
 	return;
 }
+
+void UnitCell::update_pos_tau(const double* pos) // copies positions from the flat array `pos` into tau (divided by lat0) and refreshes taud
+{
+    int iat = 0; // running atom index over all types; pos holds 3 consecutive values per atom
+	for(int it = 0;it < this->ntype;it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia =0;ia< atom->na;ia++)
+		{		
+			if(atom->mbl[ia].x!=0) // a zero mbl component leaves that tau component untouched
+			{
+				atom->tau[ia].x = pos[3*iat] / this->lat0;
+			}
+			if(atom->mbl[ia].y!=0)
+			{
+				atom->tau[ia].y = pos[3*iat+1] / this->lat0;
+			}
+			if(atom->mbl[ia].z!=0)
+			{
+				atom->tau[ia].z = pos[3*iat+2] / this->lat0;
+			}
+
+			// the direct coordinates also need to be updated.
+			atom->taud[ia] = atom->tau[ia] * this->GT; // recomputed for every atom, whether or not tau changed
+			iat++;
+		}
+	}
+	assert(iat == this->nat); // the loops must have visited exactly nat atoms
+    return;
+}
+
+void UnitCell::periodic_boundary_adjustment() // shifts each taud component into [0,1) by at most one unit, then rebuilds tau from taud
+{
+    //----------------------------------------------
+	// because of the periodic boundary condition
+	// we need to adjust the atom positions,
+	// first adjust direct coordinates,
+	// then update them into cartesian coordinates,
+	//----------------------------------------------
+	for(int it=0; it<this->ntype; it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia=0; ia<atom->na; ia++)
+		{
+			// mohan update 2011-03-21
+			if(atom->taud[ia].x<0) atom->taud[ia].x += 1.0; // a single +1 / -1 shift per component, not a full modulo
+			if(atom->taud[ia].y<0) atom->taud[ia].y += 1.0;
+			if(atom->taud[ia].z<0) atom->taud[ia].z += 1.0;
+			if(atom->taud[ia].x>=1.0) atom->taud[ia].x -= 1.0;
+			if(atom->taud[ia].y>=1.0) atom->taud[ia].y -= 1.0;
+			if(atom->taud[ia].z>=1.0) atom->taud[ia].z -= 1.0;
+
+			if(atom->taud[ia].x<0 || atom->taud[ia].y<0 // still outside [0,1) after one shift: report and quit
+				|| atom->taud[ia].z<0 ||
+				atom->taud[ia].x>=1.0 ||
+				atom->taud[ia].y>=1.0 ||
+				atom->taud[ia].z>=1.0)
+			{
+				ofs_warning << " it=" << it+1 << " ia=" << ia+1 << endl; // indices are printed 1-based
+				ofs_warning << "d=" << atom->taud[ia].x << " " << 
+				atom->taud[ia].y << " " << atom->taud[ia].z << endl;
+				WARNING_QUIT("Ions_Move_Basic::move_ions","the movement of atom is larger than the length of cell."); // NOTE(review): tag names Ions_Move_Basic::move_ions, not this function - confirm it is intended
+			}
+
+			atom->tau[ia] = atom->taud[ia] * this->latvec; // cartesian coordinates rebuilt from the adjusted direct ones
+		}
+	}
+    return;
+}
+
+void UnitCell::bcast_atoms_tau() // calls bcast_atom() on every atom type of this cell; empty body when built without __MPI
+{
+#ifdef __MPI
+    MPI_Barrier(MPI_COMM_WORLD); // barrier on the world communicator before the per-type broadcasts
+    for (int i=0;i<this->ntype;i++) // use this cell's own members so the method works on any UnitCell instance
+    {
+        this->atoms[i].bcast_atom(); // bcast tau array
+    }
+#endif
+}
+
+void UnitCell::save_cartesian_position(double* pos)const // writes tau*lat0 of every atom into the caller-provided flat array `pos`
+{
+    int iat=0; // running atom index over all types; 3 consecutive entries of pos are written per atom
+	for(int it = 0;it < this->ntype;it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia =0; ia<atom->na; ia++)
+		{	
+			pos[3*iat  ] = atom->tau[ia].x*this->lat0; // x, y, z stored in that order
+			pos[3*iat+1] = atom->tau[ia].y*this->lat0;
+			pos[3*iat+2] = atom->tau[ia].z*this->lat0;
+            iat++;
+        }
+    }
+    assert(iat == this->nat); // the loops must have visited exactly nat atoms, i.e. 3*nat entries of pos were written
+    return;
+}
diff --git a/ABACUS.develop/source/src_pw/unitcell.h b/ABACUS.develop/source/src_pw/unitcell.h
index 02139b858a..d66af0d358 100644
--- a/ABACUS.develop/source/src_pw/unitcell.h
+++ b/ABACUS.develop/source/src_pw/unitcell.h
@@ -70,6 +70,11 @@ class UnitCell
     void print_cell_cif(const string &fn)const;
     const double& getNelec(void)const {return electrons_number;}
 
+    void update_pos_tau(const double* pos);
+    void periodic_boundary_adjustment();
+    void bcast_atoms_tau();
+    void save_cartesian_position(double* pos)const;
+
 protected:
 
     double electrons_number;
diff --git a/ABACUS.develop/source/src_pw/vdwd2.cpp b/ABACUS.develop/source/src_pw/vdwd2.cpp
index 77a355ebaa..b3159576fc 100644
--- a/ABACUS.develop/source/src_pw/vdwd2.cpp
+++ b/ABACUS.develop/source/src_pw/vdwd2.cpp
@@ -103,7 +103,7 @@ void Vdwd2::cal_stress()
     TITLE("Vdwd2","stress");
 	para.initset(ucell);
 
-	stress.Reset();
+	stress.Zero();
 	
 	for( int it1=0; it1!=ucell.ntype; ++it1 )
 	{
diff --git a/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp b/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
index eda6a710f7..5d9c872561 100644
--- a/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
+++ b/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
@@ -1,5 +1,6 @@
 #include "wavefunc_in_pw.h"
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
+#include "../src_global/math_integral.h"
 
 void Wavefunc_in_pw::make_table_q(std::vector<string> &fn, realArray &table_local)
 {
@@ -212,7 +213,7 @@ const double *rab, const int &l, double* table)
 	}
 	
 	double unit = 0.0;
-	Mathzone::Simpson_Integral(meshr, inner_part, rab, unit);
+	Integral::Simpson_Integral(meshr, inner_part, rab, unit);
 	delete[] inner_part;
 	OUT(ofs_running,"normalize unit",unit);
 
@@ -228,7 +229,7 @@ const double *rab, const int &l, double* table)
 		}
 		
 		double vqint = 0.0;
-		Mathzone::Simpson_Integral(meshr, vchi, rab, vqint);
+		Integral::Simpson_Integral(meshr, vchi, rab, vqint);
 
 		table[iq] =  vqint * pref;
 	}
diff --git a/ABACUS.develop/source/src_pw/wf_atomic.cpp b/ABACUS.develop/source/src_pw/wf_atomic.cpp
index d2a64b57bd..c7320262d5 100644
--- a/ABACUS.develop/source/src_pw/wf_atomic.cpp
+++ b/ABACUS.develop/source/src_pw/wf_atomic.cpp
@@ -1,5 +1,6 @@
 #include "wf_atomic.h"
 #include "global.h"
+#include "../src_global/math_integral.h"
 
 WF_atomic::WF_atomic()
 {
@@ -79,7 +80,7 @@ void WF_atomic::init_at_1(void)
                 inner_part[ir] = atom->chi(ic,ir) * atom->chi(ic,ir);
             }
             double unit = 0.0;
-            Mathzone::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
+            Integral::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
             delete[] inner_part;
 
 			ofs_running << " the unit of pseudo atomic orbital is " << unit; 
@@ -101,7 +102,7 @@ void WF_atomic::init_at_1(void)
                 inner_part[ir] = atom->chi(ic,ir) * atom->chi(ic,ir);
             }
             unit = 0.0;
-            Mathzone::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
+            Integral::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
             delete[] inner_part;
 
 			ofs_running << ", renormalize to " << unit << endl;
@@ -119,7 +120,7 @@ void WF_atomic::init_at_1(void)
                     }
 
                     double vqint = 0.0;
-                    Mathzone::Simpson_Integral(atom->msh, vchi, atom->rab, vqint);
+                    Integral::Simpson_Integral(atom->msh, vchi, atom->rab, vqint);
 
                     ppcell.tab_at(it, ic, iq) =  vqint * pref;
                     //				if( it == 0 && ic == 0 )
diff --git a/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp b/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
index ef87663767..54bc284341 100644
--- a/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
+++ b/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
@@ -5,6 +5,7 @@
 #include "src_global/global_function.h"
 
 #include "src_external/src_test/test_function.h"
+#include "src_global/math_integral.h" // mohan add 2021-04-03
 
 Conv_Coulomb_Pot::Conv_Coulomb_Pot(const Numerical_Orbital_Lm &orb_in)
 	:orb(orb_in)
@@ -37,7 +38,7 @@ void Conv_Coulomb_Pot::cal_conv_coulomb_pot()
 	{
 		tmp_func[ir] = orb.getPsi(ir) * pow( orb.getRadial(ir), orb.getL()+2 );
 	}	
-	Mathzone::Simpson_Integral_0toall( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
+	Integral::Simpson_Integral_0toall( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
 	conv_coulomb_pot[0]=0;
 	for( size_t ir=1; ir!=orb.getNr(); ++ir )
 	{
@@ -53,7 +54,7 @@ void Conv_Coulomb_Pot::cal_conv_coulomb_pot()
 	{
 		tmp_func[ir] = orb.getPsi(ir) / pow( orb.getRadial(ir), orb.getL()-1 );
 	}
-	Mathzone::Simpson_Integral_alltoinf( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
+	Integral::Simpson_Integral_alltoinf( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
 	for( size_t ir=0; ir!=orb.getNr(); ++ir )
 	{
 		conv_coulomb_pot[ir] += tmp_integral[ir] * pow( orb.getRadial(ir), orb.getL() );
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-io.cpp b/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
index d390bcb293..5c2f5b1803 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
@@ -7,6 +7,7 @@
 #include "src_pw/global.h"
 #include "src_lcao/ORB_read.h"
 #include "src_global/global_function.h"
+#include "src_global/math_integral.h" // mohan add 2021-04-03
 
 
 vector<vector<vector<Numerical_Orbital_Lm>>> Exx_Abfs::IO::construct_abfs(
@@ -212,7 +213,7 @@ vector<vector<Numerical_Orbital_Lm>> Exx_Abfs::IO::construct_abfs_T(
 				inner[ir] = psir[ir] * psir[ir];
 			}
 			double unit = 0.0;	
-			Mathzone::Simpson_Integral(meshr, VECTOR_TO_PTR(inner), VECTOR_TO_PTR(rab), unit);
+			Integral::Simpson_Integral(meshr, VECTOR_TO_PTR(inner), VECTOR_TO_PTR(rab), unit);
 			for( int ir=0; ir!=meshr; ++ir )
 			{
 				psis[L][N][ir] /= sqrt(unit);
@@ -510,4 +511,4 @@ void Exx_Abfs::IO::print_matrix(
 			}
 		}
 	}
-}
\ No newline at end of file
+}
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
index 763692cab3..7f1626c038 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
@@ -21,7 +21,7 @@ void Exx_Abfs::Matrix_Lcaoslcaos_Lcaoslcaos::init(
 //		ORB.get_dk() / kmesh_times);				// delta k, for integration in k space
 		ORB.get_dk());											// Peize Lin change 2017-04-16
 	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
index b4b11c5e0c..dbb966a44d 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
@@ -36,7 +36,7 @@ void Exx_Abfs::Matrix_Orbs11::init(
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs11::init::MOT.allocate\t"<<time_during(t_start)<<endl;
 	int Lmax_used, Lmax;
 //gettimeofday( &t_start, NULL);
-	MOT.init_Table_Spherical_Bessel (2, mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs11::init::MOT.init_Table_Spherical_Bessel\t"<<time_during(t_start)<<endl;
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
index c270afc72b..37ceea7394 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
@@ -37,7 +37,7 @@ void Exx_Abfs::Matrix_Orbs21::init(
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs21::init::MOT.allocate\t"<<time_during(t_start)<<endl;
 	int Lmax_used, Lmax;
 //gettimeofday( &t_start, NULL);
-	MOT.init_Table_Spherical_Bessel (3,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (3,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs21::init::MOT.init_Table_Spherical_Bessel\t"<<time_during(t_start)<<endl;
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
index ce9da0d84e..af048d02d0 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
@@ -32,7 +32,7 @@ gettimeofday( &t_start, NULL);
 //		ORB.get_dk() / kmesh_times);				// delta k, for integration in k space
 		ORB.get_dk());											// Peize Lin change 2017-04-16
 	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?