From 7e2d5770781a162d77eeb78490a4391720448fd2 Mon Sep 17 00:00:00 2001
From: cguzman <christian.guzman@bsc.es>
Date: Wed, 13 Dec 2023 16:56:19 +0100
Subject: [PATCH] merge gpu branch onto main

---
 CMakeLists.txt                                |  54 +--
 compile/power9/check.sh                       |   1 -
 compile/power9/compile.camp.sh                |   4 +-
 data/CAMP_v1_paper/README.md                  |   7 +-
 data/CAMP_v1_paper/binned/mock_monarch.F90    |   3 +
 data/CAMP_v1_paper/modal/mock_monarch.F90     |   3 +
 .../boot_camp/part_1_code/box_model.F90       |   5 +-
 .../boot_camp/part_3_code/box_model.F90       |   1 +
 .../boot_camp/part_4_code/box_model.F90       |   1 +
 doc/references.bib                            |  36 ++
 src/Jacobian.c                                |  14 +-
 src/Jacobian.h                                |   7 +-
 src/aero_phase_solver.c                       |   6 +-
 src/camp_common.h                             |   1 -
 src/camp_core.F90                             |  50 ++-
 .../camp_debug_2.c => camp_debug.c}           |  56 +--
 src/camp_debug.h                              |  11 +
 src/camp_solver.c                             |  61 +--
 src/camp_solver.h                             |   5 +-
 src/camp_solver_data.F90                      |  39 +-
 src/cuda/cuda_structs.h                       |   2 -
 src/cuda/cvode_cuda.cu                        | 393 ++----------------
 src/cuda/cvode_gpu.cu                         |  15 +-
 src/cuda/cvode_init.cu                        |  46 +-
 src/debug_and_stats/camp_debug_2.h            |  22 -
 src/rxn_solver.h                              |   1 -
 src/rxns.h                                    |   1 -
 src/rxns/rxn_CMAQ_H2O2.c                      |   2 +-
 src/rxns/rxn_CMAQ_OH_HNO3.c                   |   2 +-
 src/rxns/rxn_HL_phase_transfer.c              |  20 +-
 src/rxns/rxn_SIMPOL_phase_transfer.c          |  20 +-
 src/rxns/rxn_aqueous_equilibrium.c            |  28 +-
 src/rxns/rxn_arrhenius.c                      |   4 +-
 src/rxns/rxn_condensed_phase_arrhenius.c      |   4 +-
 src/rxns/rxn_condensed_phase_photolysis.c     |   4 +-
 src/rxns/rxn_emission.c                       |   2 +-
 src/rxns/rxn_first_order_loss.c               |   2 +-
 src/rxns/rxn_photolysis.c                     |   2 +-
 src/rxns/rxn_ternary_chemical_activation.c    |   2 +-
 src/rxns/rxn_troe.c                           |   2 +-
 src/rxns/rxn_wennberg_no_ro2.c                |   6 +-
 src/rxns/rxn_wennberg_tunneling.c             |   2 +-
 src/rxns/rxn_wet_deposition.c                 |   2 +-
 src/solver_stats.F90                          |   8 -
 src/sub_models/sub_model_PDFiTE.c             |  24 +-
 src/sub_models/sub_model_ZSR_aerosol_water.c  |  52 +--
 src/time_derivative.c                         |  12 +-
 src/time_derivative.h                         |   6 +-
 test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 |   7 -
 test/monarch/TestMonarch.py                   |  12 +-
 test/monarch/TestMonarch1.py                  |   8 +-
 test/monarch/TestMonarch2.py                  |   5 +-
 test/monarch/TestMonarch3.py                  |   5 +-
 test/monarch/TestMonarch4.py                  |   5 +-
 test/monarch/TestMonarch5.py                  |  36 ++
 test/monarch/camp_monarch_interface.F90       |  45 +-
 test/monarch/checkGPU.py                      |   4 -
 test/monarch/checkGPU.sh                      |   2 -
 test/monarch/diff_TestMonarch.py              |   4 -
 test/monarch/mainMonarch.py                   |  98 ++---
 test/monarch/mock_monarch.F90                 |   4 +-
 test/monarch/run.sh                           |   4 +-
 test/monarch/sbatch_run.sh                    |   6 +-
 test/monarch/settings/TestMonarch.json        |   8 +-
 test/monarch_output/stats_monarch_netcdf.py   |   7 +-
 test/monarch_output/tmp.py                    |  50 ---
 .../test_aero_rep_single_particle.F90         |   3 -
 .../test_sub_model_ZSR_aerosol_water.c        |  28 +-
 68 files changed, 466 insertions(+), 926 deletions(-)
 rename src/{debug_and_stats/camp_debug_2.c => camp_debug.c} (70%)
 delete mode 100644 src/debug_and_stats/camp_debug_2.h
 create mode 100644 test/monarch/TestMonarch5.py
 delete mode 100644 test/monarch_output/tmp.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69f9605c9..3191d492e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,12 +225,10 @@ endif()
 
 ######################################################################
 # copy dirs
-if(DISABLE_TESTS)
-else ()
-SET(ENABLE_TESTS ON)
-endif ()
 
-if(ENABLE_TESTS)
+SET(USE_TESTS ON)
+
+if(USE_TESTS)
 add_custom_target(copy_data ALL ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/data ${CMAKE_BINARY_DIR}/data_run)
 add_custom_target(copy_test ALL ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/test ${CMAKE_BINARY_DIR}/test_run)
 add_custom_target(copy_mechanism ALL ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/mechanisms ${CMAKE_BINARY_DIR}/mechanisms_run)
@@ -241,23 +239,24 @@ endif()
 ######################################################################
 # Unit test macro
 
-if (ENABLE_TESTS)
-  macro(do_unit_test test_name result)
-    if(ENABLE_MPI)
-      add_test(unit_test_${test_name} mpirun -v -np 1 ${CMAKE_BINARY_DIR}/unit_test_${test_name})
-    else()
-      add_test(unit_test_${test_name} ${CMAKE_BINARY_DIR}/unit_test_${test_name})
-    endif()
-    set_tests_properties(unit_test_${test_name}
-            PROPERTIES PASS_REGULAR_EXPRESSION ${result})
-  endmacro(do_unit_test)
+if (USE_TESTS)
+macro(do_unit_test test_name result)
+if(ENABLE_MPI)
+  add_test(unit_test_${test_name} mpirun -v -np 2 ${CMAKE_BINARY_DIR}/unit_test_${test_name})
+else()
+  add_test(unit_test_${test_name} ${CMAKE_BINARY_DIR}/unit_test_${test_name})
+endif()
+set_tests_properties(unit_test_${test_name}
+        PROPERTIES PASS_REGULAR_EXPRESSION ${result})
+endmacro(do_unit_test)
 endif()
 
 ######################################################################
 # tests
 
 enable_testing()
-if (ENABLE_TESTS)
+
+if (USE_TESTS)
 do_unit_test(property "PASS")
 do_unit_test(chem_spec_data "PASS")
 do_unit_test(aero_phase_data "PASS")
@@ -277,7 +276,7 @@ if (ENABLE_GPU)
   add_test(NAME test_gpu COMMAND checkGPU.sh WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test/monarch)
 endif()
 
-if (ENABLE_TESTS)
+if (USE_TESTS)
     # New unit tests (UNDER DEVELOPMENT)
   add_test(test_rxn_arrhenius_mech ${CMAKE_BINARY_DIR}/test_run/unit_tests/input_files/run_rxn_arrhenius.sh ${MPI_TEST_FLAG})
 
@@ -406,7 +405,7 @@ set(SUB_MODELS_SRC ${SUB_MODELS_F_SRC} ${SUB_MODELS_C_SRC})
 set(CAMP_C_SRC
         src/camp_solver.c src/rxn_solver.c src/aero_phase_solver.c
         src/aero_rep_solver.c src/sub_model_solver.c
-        src/debug_and_stats/camp_debug_2.c
+        src/camp_debug.c
         src/time_derivative.c
         src/Jacobian.c src/debug_diff_check.c)
 
@@ -479,14 +478,6 @@ install(
   )
 endif()
 
-######################################################################
-# camp-chem box model
-
-add_executable(camp_box_model test/camp_box_model_data.F90
-                              test/camp_box_model.F90)
-
-target_link_libraries(camp_box_model camplib)
-
 ######################################################################
 # test_chemistry_cb05cl_ae5
 
@@ -538,7 +529,16 @@ if (ENABLE_GPU)
   target_link_libraries(mock_monarch camplib)
 endif()
 
-if (ENABLE_TESTS)
+if (USE_TESTS)
+
+######################################################################
+# camp-chem box model
+
+add_executable(camp_box_model test/camp_box_model_data.F90
+    test/camp_box_model.F90)
+
+target_link_libraries(camp_box_model camplib)
+
 
 ######################################################################
 # test_chemistry_cb05cl_ae5
diff --git a/compile/power9/check.sh b/compile/power9/check.sh
index 2a3b85736..5446182c2 100644
--- a/compile/power9/check.sh
+++ b/compile/power9/check.sh
@@ -2,7 +2,6 @@ set -e
 cd ../../build
 make -j 4
 ctest --output-on-failure
-#make test
 #./unit_test_aero_rep_single_particle
 cd ../test/monarch
 #./checkGPU.sh
diff --git a/compile/power9/compile.camp.sh b/compile/power9/compile.camp.sh
index f98388fda..2ed62243f 100644
--- a/compile/power9/compile.camp.sh
+++ b/compile/power9/compile.camp.sh
@@ -37,7 +37,6 @@ elif [ LOCAL_MACHINE==CGUZMAN ]; then
       echo "MPI is not installed. Installing..."
       sudo apt update
       sudo apt install -y mpi-default-dev
-      #if run | Invalid MIT-MAGIC-COOKIE-1 key THEN sudo apt-remove openmpi-bin AND sudo apt-get install libcr-dev mpich2 mpich2-doc
   fi
 else
   echo "Unknown architecture"
@@ -65,8 +64,9 @@ cmake -D CMAKE_C_COMPILER=$(which mpicc) \
 -D ENABLE_GPU=ON \
 -D ENABLE_GSL:BOOL=FALSE \
 -D ENABLE_NETCDF=ON \
--D DISABLE_INSTALL_OPTIONS=TRUE \
 ..
 
+ln -sf ../test/monarch/settings
+ln -sf ../test/monarch/out
 make -j 4 VERBOSE=1
 cd $curr_path
diff --git a/data/CAMP_v1_paper/README.md b/data/CAMP_v1_paper/README.md
index 346a4c310..fc287955f 100644
--- a/data/CAMP_v1_paper/README.md
+++ b/data/CAMP_v1_paper/README.md
@@ -2,10 +2,7 @@
 
 Configurations for recreating experiments for:
 
- * M. Dawson, C. Guzman, J. H. Curtis, M. Acosta, S. Zhu, D. Dabdub,
-     A. Conley, M. West, N. Riemer, and O. Jorba (2021),
-     Chemistry Across Multiple Phases (CAMP) version 1.0: An
-     Integrated multi-phase chemistry model, in preparation
+ * Dawson, M. L., Guzman, C., Curtis, J. H., Acosta, M., Zhu, S., Dabdub, D., Conley, A., West, M., Riemer, N., and Jorba, O.: Chemistry Across Multiple Phases (CAMP) version 1.0: an integrated multiphase chemistry model, Geosci. Model Dev., 15, 3663–3689, https://doi.org/10.5194/gmd-15-3663-2022, 2022.
 
 
 The binned and modal box model experiments are run as part of the CAMP testing suite. The results will be in the build folder under:
@@ -13,4 +10,4 @@ The binned and modal box model experiments are run as part of the CAMP testing s
 ```
 data_run/CAMP_v1_paper/binned/out/
 data_run/CAMP_v1_paper/modal/out/
-```
\ No newline at end of file
+```
diff --git a/data/CAMP_v1_paper/binned/mock_monarch.F90 b/data/CAMP_v1_paper/binned/mock_monarch.F90
index 05feb8d5c..92ee9d15c 100644
--- a/data/CAMP_v1_paper/binned/mock_monarch.F90
+++ b/data/CAMP_v1_paper/binned/mock_monarch.F90
@@ -362,6 +362,9 @@ program mock_monarch
   ! finalize mpi
   call camp_mpi_finalize()
 
+  ! Free the interface and the solver
+  deallocate(camp_interface)
+
 contains
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/data/CAMP_v1_paper/modal/mock_monarch.F90 b/data/CAMP_v1_paper/modal/mock_monarch.F90
index a45392a21..2d394d63c 100755
--- a/data/CAMP_v1_paper/modal/mock_monarch.F90
+++ b/data/CAMP_v1_paper/modal/mock_monarch.F90
@@ -344,6 +344,9 @@ program mock_monarch
   ! finalize mpi
   call camp_mpi_finalize()
 
+  ! Free the interface and the solver
+  deallocate(camp_interface)
+
 contains
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
diff --git a/doc/camp_tutorial/boot_camp/part_1_code/box_model.F90 b/doc/camp_tutorial/boot_camp/part_1_code/box_model.F90
index 226f97f9e..0d610f183 100644
--- a/doc/camp_tutorial/boot_camp/part_1_code/box_model.F90
+++ b/doc/camp_tutorial/boot_camp/part_1_code/box_model.F90
@@ -90,11 +90,12 @@ program box_model
                      camp_state%state_var( idx_O2  )
   end do
 
-  deallocate( camp_state )
-
 #ifdef CAMP_USE_MPI
   call camp_mpi_finalize( )
 #endif
 
+  deallocate( camp_core )
+  deallocate( camp_state )
+
 end program box_model
 !! [Solve and output]
diff --git a/doc/camp_tutorial/boot_camp/part_3_code/box_model.F90 b/doc/camp_tutorial/boot_camp/part_3_code/box_model.F90
index 7f938c796..f098dee7d 100644
--- a/doc/camp_tutorial/boot_camp/part_3_code/box_model.F90
+++ b/doc/camp_tutorial/boot_camp/part_3_code/box_model.F90
@@ -108,6 +108,7 @@ program box_model
                      camp_state%state_var( idx_O2  )
   end do
 
+  deallocate( camp_core )
   deallocate( camp_state )
 
 #ifdef CAMP_USE_MPI
diff --git a/doc/camp_tutorial/boot_camp/part_4_code/box_model.F90 b/doc/camp_tutorial/boot_camp/part_4_code/box_model.F90
index 457b6639d..4689625b9 100644
--- a/doc/camp_tutorial/boot_camp/part_4_code/box_model.F90
+++ b/doc/camp_tutorial/boot_camp/part_4_code/box_model.F90
@@ -170,6 +170,7 @@ program box_model
 #endif
   !! [output]
 
+  deallocate( camp_core )
   deallocate( camp_state )
 
 end program box_model
diff --git a/doc/references.bib b/doc/references.bib
index 12cec3614..1c9f9af32 100644
--- a/doc/references.bib
+++ b/doc/references.bib
@@ -1,3 +1,39 @@
+@article{Tie2003,
+author = {Tie, Xuexi and Emmons, Louisa and Horowitz, Larry and Brasseur, Guy and Ridley, Brian and Atlas, Elliot and Stroud, Craig and Hess, Peter and Klonecki, Andrzej and Madronich, Sasha and Talbot, Robert and Dibb, Jack},
+title = {Effect of sulfate aerosol on tropospheric NOx and ozone budgets: Model simulations and TOPSE evidence},
+journal = {Journal of Geophysical Research: Atmospheres},
+volume = {108},
+number = {D4},
+pages = {},
+keywords = {tropospheric aerosol, NOx, ozone},
+doi = {10.1029/2001JD001508},
+url = {https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/2001JD001508},
+eprint = {https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1029/2001JD001508},
+abstract = {The distributions of NOx and O3 are analyzed during TOPSE (Tropospheric Ozone Production about the Spring Equinox). In this study these data are compared with the calculations of a global chemical/transport model (Model for OZone And Related chemical Tracers (MOZART)). Specifically, the effect that hydrolysis of N2O5 on sulfate aerosols has on tropospheric NOx and O3 budgets is studied. The results show that without this heterogeneous reaction, the model significantly overestimates NOx concentrations at high latitudes of the Northern Hemisphere (NH) in winter and spring in comparison to the observations during TOPSE; with this reaction, modeled NOx concentrations are close to the measured values. This comparison provides evidence that the hydrolysis of N2O5 on sulfate aerosol plays an important role in controlling the tropospheric NOx and O3 budgets. The calculated reduction of NOx attributed to this reaction is 80 to 90\% in winter at high latitudes over North America. Because of the reduction of NOx, O3 concentrations are also decreased. The maximum O3 reduction occurs in spring although the maximum NOx reduction occurs in winter when photochemical O3 production is relatively low. The uncertainties related to uptake coefficient and aerosol loading in the model is analyzed. The analysis indicates that the changes in NOx due to these uncertainties are much smaller than the impact of hydrolysis of N2O5 on sulfate aerosol. The effect that hydrolysis of N2O5 on global NOx and O3 budgets are also assessed by the model. The results suggest that in the Northern Hemisphere, the average NOx budget decreases 50\% due to this reaction in winter and 5\% in summer. The average O3 budget is reduced by 8\% in winter and 6\% in summer. In the Southern Hemisphere (SH), the sulfate aerosol loading is significantly smaller than in the Northern Hemisphere. As a result, sulfate aerosol has little impact on NOx and O3 budgets of the Southern Hemisphere.},
+year = {2003}
+}
+@article{Wennberg2018,
+author = {Wennberg, Paul O. and Bates, Kelvin H. and Crounse, John D. and Dodson, Leah G. and McVay, Renee C. and Mertens, Laura A. and Nguyen, Tran B. and Praske, Eric and Schwantes, Rebecca H. and Smarte, Matthew D. and St Clair, Jason M. and Teng, Alexander P. and Zhang, Xuan and Seinfeld, John H.},
+title = {Gas-Phase Reactions of Isoprene and Its Major Oxidation Products},
+journal = {Chemical Reviews},
+volume = {118},
+number = {7},
+pages = {3337-3390},
+year = {2018},
+doi = {10.1021/acs.chemrev.7b00439},
+note ={PMID: 29522327},
+URL = {https://doi.org/10.1021/acs.chemrev.7b00439},
+eprint = {https://doi.org/10.1021/acs.chemrev.7b00439}
+}
+@techreport{JPL15,
+author = {J. B. Burkholder and S. P. Sander and J. Abbatt and J. R. Barker and R. E. Huie and C. E. Kolb and M. J. Kurylo and V. L. Orkin and D. M.
+Wilmouth and P. H. Wine},
+title = {Chemical Kinetics and Photochemical Data for Use in Atmospheric Studies, Evaluation No. 18 JPL Publication 15-10},
+institution = {Jet Propulsion Laboratory},
+location = {Pasadena},
+year = {2015},
+url = {http://jpldataeval.jpl.nasa.gov}
+}
 @article{Ervens2003,
 abstract = {A detailed and extended chemical mechanism describing tropospheric aqueous phase chemistry (147 species and 438 reactions) is presented here as Chemical Aqueous Phase Radical Mechanism (CAPRAM) 2.4 (MODAC mechanism). The mechanism based on the former version 2.3 [ Herrmann et al., 2000] contains extended organic and transition metal chemistry and is formulated more explicitly based on a critical review of the literature. The aqueous chemistry has been coupled to the gas phase mechanism Regional Atmospheric Chemistry Modeling (RACM) [ Stockwell et al., 1997], and phase exchange accounted for using the resistance model of Schwartz [1986]. A method for estimating mass accommodation coefficients ({\&}{\#}945;) is described, which accounts for functional groups contained in a particular compound. A condensed version has also been developed to allow the use of CAPRAM 2.4 (MODAC mechanism) in higher-scale models. Here the reproducibility of the concentration levels of selected target species (i.e., NO x , S(IV), H2O2, NO3, OH, O3, and H+) within the limits of ± 5{\%} was used as a goal for eliminating insignificant reactions from the complete CAPRAM 2.4 (MODAC mechanism). This has been done using a range of initial conditions chosen to represent different atmospheric scenarios, and this produces a robust and concise set of reactions. The most interesting results are obtained using atmospheric conditions typical for an urban scenario, and the effects introduced by updating the aqueous phase chemistry are highlighted, in particular, with regard to radicals, redox cycling of transition metal ions and organic compounds. Finally, the reduced scheme has been incorporated into a one-dimensional (1-D) marine cloud model to demonstrate the applicability of this mechanism.},
 author = {Ervens, B.},
diff --git a/src/Jacobian.c b/src/Jacobian.c
index 4d5dbf06c..d206a72a2 100644
--- a/src/Jacobian.c
+++ b/src/Jacobian.c
@@ -60,13 +60,13 @@ int jacobian_initialize(Jacobian *jac, unsigned int num_spec,
     return 0;
   }
   jac->production_partials =
-      (long double *)malloc(num_elem * sizeof(long double));
+      (double *)malloc(num_elem * sizeof(double));
   if (!jac->production_partials) {
     free(jac->col_ptrs);
     free(jac->row_ids);
     return 0;
   }
-  jac->loss_partials = (long double *)malloc(num_elem * sizeof(long double));
+  jac->loss_partials = (double *)malloc(num_elem * sizeof(double));
   if (!jac->loss_partials) {
     free(jac->col_ptrs);
     free(jac->row_ids);
@@ -171,13 +171,13 @@ unsigned int jacobian_build_matrix(Jacobian *jac) {
     exit(EXIT_FAILURE);
   }
   jac->production_partials =
-      (long double *)malloc(jac->num_elem * sizeof(long double));
+      (double *)malloc(jac->num_elem * sizeof(double));
   if (!jac->production_partials) {
     jacobian_free(jac);
     return 0;
   }
   jac->loss_partials =
-      (long double *)malloc(jac->num_elem * sizeof(long double));
+      (double *)malloc(jac->num_elem * sizeof(double));
   if (!jac->loss_partials) {
     jacobian_free(jac);
     return 0;
@@ -228,8 +228,8 @@ void jacobian_output(Jacobian jac, double *dest_array) {
   for (unsigned int i_col = 0; i_col < jac.num_spec; ++i_col) {
     for (unsigned int i_elem = jac.col_ptrs[i_col];
          i_elem < jac.col_ptrs[i_col + 1]; ++i_elem) {
-      long double drf_dy = jac.production_partials[i_elem];
-      long double drr_dy = jac.loss_partials[i_elem];
+      double drf_dy = jac.production_partials[i_elem];
+      double drr_dy = jac.loss_partials[i_elem];
       dest_array[i_elem] = drf_dy - drr_dy;
     }
   }
@@ -237,7 +237,7 @@ void jacobian_output(Jacobian jac, double *dest_array) {
 
 void jacobian_add_value(Jacobian jac, unsigned int elem_id,
                         unsigned int prod_or_loss,
-                        long double jac_contribution) {
+                        double jac_contribution) {
   if (prod_or_loss == JACOBIAN_PRODUCTION)
     jac.production_partials[elem_id] += jac_contribution;
   if (prod_or_loss == JACOBIAN_LOSS)
diff --git a/src/Jacobian.h b/src/Jacobian.h
index 864f17855..51ed221b0 100644
--- a/src/Jacobian.h
+++ b/src/Jacobian.h
@@ -33,9 +33,8 @@ typedef struct {
   unsigned int num_elem;   // Number of potentially non-zero Jacobian elements
   unsigned int *col_ptrs;  // Index of start/end of each column in data array
   unsigned int *row_ids;   // Row id of each Jacobian element in data array
-  long double
-      *production_partials;    // Data array for productions rate partial derivs
-  long double *loss_partials;  // Data array for loss rate partial derivs
+  double *production_partials;    // Data array for productions rate partial derivs
+  double *loss_partials;  // Data array for loss rate partial derivs
   JacobianColumnElements *elements;  // Jacobian elements flagged for inclusion
 } Jacobian;
 
@@ -138,7 +137,7 @@ void jacobian_output(Jacobian jac, double *dest_array);
  */
 void jacobian_add_value(Jacobian jac, unsigned int elem_id,
                         unsigned int prod_or_loss,
-                        long double jac_contribution);
+                        double jac_contribution);
 
 /** \brief Prints the Jacobian structure
  *
diff --git a/src/aero_phase_solver.c b/src/aero_phase_solver.c
index 90a4f0bbd..8d207895d 100644
--- a/src/aero_phase_solver.c
+++ b/src/aero_phase_solver.c
@@ -101,15 +101,15 @@ void aero_phase_get_mass__kg_m3(ModelData *model_data, int aero_phase_idx,
             [model_data->aero_phase_float_indices[aero_phase_idx]]);
 
   // Sum the mass and MW
-  long double l_mass = MINIMUM_MASS_;
-  long double moles = MINIMUM_MASS_ / MINIMUM_MW_;
+  double l_mass = MINIMUM_MASS_;
+  double moles = MINIMUM_MASS_ / MINIMUM_MW_;
   int i_jac = 0;
   for (int i_spec = 0; i_spec < NUM_STATE_VAR_; i_spec++) {
     if (SPEC_TYPE_(i_spec) == CHEM_SPEC_VARIABLE ||
         SPEC_TYPE_(i_spec) == CHEM_SPEC_CONSTANT ||
         SPEC_TYPE_(i_spec) == CHEM_SPEC_PSSA) {
       l_mass += state_var[i_spec];
-      moles += state_var[i_spec] / (long double)MW_(i_spec);
+      moles += state_var[i_spec] / (double)MW_(i_spec);
       if (jac_elem_mass) jac_elem_mass[i_jac] = 1.0L;
       if (jac_elem_MW) jac_elem_MW[i_jac] = 1.0L / MW_(i_spec);
       i_jac++;
diff --git a/src/camp_common.h b/src/camp_common.h
index 05d990051..dfd2bbae6 100644
--- a/src/camp_common.h
+++ b/src/camp_common.h
@@ -270,7 +270,6 @@ typedef struct {
   float rate_cells_gpu;
 #endif
   int use_cpu;
-  int nGPUs;
 
   void *cvode_mem;       // CVodeMem object
   ModelData model_data;  // Model data (used during initialization and solving)
diff --git a/src/camp_core.F90 b/src/camp_core.F90
index fbd0ff247..b3079d40d 100644
--- a/src/camp_core.F90
+++ b/src/camp_core.F90
@@ -183,8 +183,13 @@ module camp_camp_core
     procedure :: get_rel_tol
     !> Get the absolute tolerance for a species on the state array
     procedure :: get_abs_tol
+    !> Create a file for saving output concentrations
+    procedure :: init_export_solver_state
+    !> Export output concentrations so the checkGPU test can compare the accuracy of the CPU and GPU versions
     procedure :: export_solver_state
+    !> Join the files created by each MPI process in "export_solver_state" into a single file.
     procedure :: join_solver_state
+    !> Export the execution times of the GPU and CPU code, which TestMonarch.py uses to calculate speedups
     procedure :: export_solver_stats
     !> Get a new model state variable
     procedure :: new_state_one_cell
@@ -679,7 +684,7 @@ subroutine initialize(this)
 
     ! Variables for setting initial state values
     class(aero_rep_data_t), pointer :: rep
-    integer(kind=i_kind) :: i, i_state_elem, i_name
+    integer(kind=i_kind) :: i_state_elem, i_name
 
     ! Species name for looking up properties
     character(len=:), allocatable :: spec_name
@@ -1151,23 +1156,18 @@ end function spec_state_id
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
   !> Initialize the solver
-  subroutine solver_initialize(this, use_cpu, nGPUs)
+  subroutine solver_initialize(this, use_cpu)
     class(camp_core_t), intent(inout) :: this
     integer, intent(in), optional :: use_cpu
-    integer, intent(in), optional :: nGPUs
     type(string_t), allocatable :: spec_names(:)
-    integer :: i_spec, n_gas_spec, use_cpu1, nGPUs1
+    integer :: i_spec, n_gas_spec, use_cpu1
     call assert_msg(662920365, .not.this%solver_is_initialized, &
             "Attempting to initialize the solver twice.")
 
     use_cpu1=1
-    nGPUs1=1
     if (present(use_cpu)) then
       use_cpu1=use_cpu
     end if
-    if (present(nGPUs)) then
-      nGPUs1=nGPUs
-    end if
 
     ! Set up either two solvers (gas and aerosol) or one solver (combined)
     if (this%split_gas_aero) then
@@ -1193,8 +1193,7 @@ subroutine solver_initialize(this, use_cpu, nGPUs)
                 GAS_RXN,         & ! Reaction phase
                 this%n_cells,    & ! # of cells computed simultaneosly
                 spec_names,       & ! Species names
-                use_cpu1, &
-                nGPUs1 &
+                use_cpu1&
       )
       call this%solver_data_aero%initialize( &
                 this%var_type,   & ! State array variable types
@@ -1206,8 +1205,7 @@ subroutine solver_initialize(this, use_cpu, nGPUs)
                 AERO_RXN,        & ! Reaction phase
                 this%n_cells,    & ! # of cells computed simultaneosly
                 spec_names,       & ! Species names
-                use_cpu1, &
-                nGPUs1 &
+                use_cpu1&
               )
     else
 
@@ -1230,10 +1228,8 @@ subroutine solver_initialize(this, use_cpu, nGPUs)
                 GAS_AERO_RXN,    & ! Reaction phase
                 this%n_cells,    & ! # of cells computed simultaneosly
                 spec_names,       & ! Species names
-                use_cpu1, &
-                nGPUs1 &
+                use_cpu1&
                 )
-
     end if
 
     this%solver_is_initialized = .true.
@@ -1505,6 +1501,24 @@ subroutine solve(this, camp_state, time_step, rxn_phase, solver_stats)
 
   end subroutine solve
 
+  subroutine init_export_solver_state(this)
+    use camp_rxn_data
+    use iso_c_binding
+    class(camp_core_t), intent(inout) :: this
+    integer(kind=i_kind) :: phase
+    type(camp_solver_data_t), pointer :: solver
+    phase = GAS_AERO_RXN
+    if (phase.eq.GAS_RXN) then
+      solver => this%solver_data_gas
+    else if (phase.eq.AERO_RXN) then
+      solver => this%solver_data_aero
+    else if (phase.eq.GAS_AERO_RXN) then
+      solver => this%solver_data_gas_aero
+    end if
+    call solver%init_export_solver_data_state()
+  end subroutine
+
+  !> Export output concentrations so the checkGPU test can compare the accuracy of the CPU and GPU versions
   subroutine export_solver_state(this)
     use camp_rxn_data
     use iso_c_binding
@@ -1522,6 +1536,7 @@ subroutine export_solver_state(this)
     call solver%export_solver_data_state()
   end subroutine
 
+  !> Join the files created by each MPI process in "export_solver_state" into a single file.
   subroutine join_solver_state(this)
     use camp_rxn_data
     use iso_c_binding
@@ -1539,6 +1554,7 @@ subroutine join_solver_state(this)
     call solver%join_solver_data_state()
   end subroutine
 
+  !> Export the execution times of the GPU and CPU code, which TestMonarch.py uses to calculate speedups
   subroutine export_solver_stats(this)
     use camp_rxn_data
     use iso_c_binding
@@ -1562,7 +1578,7 @@ subroutine export_solver_stats(this)
   integer(kind=i_kind) function pack_size(this, comm)
 
     !> Chemical model
-    class(camp_core_t), intent(inout) :: this
+    class(camp_core_t), intent(in) :: this
     !> MPI communicator
     integer, intent(in), optional :: comm
 
@@ -1621,7 +1637,7 @@ end function pack_size
   subroutine bin_pack(this, buffer, pos, comm)
 
     !> Chemical model
-    class(camp_core_t), intent(inout) :: this
+    class(camp_core_t), intent(in) :: this
     !> Memory buffer
     character, intent(inout) :: buffer(:)
     !> Current buffer position
diff --git a/src/debug_and_stats/camp_debug_2.c b/src/camp_debug.c
similarity index 70%
rename from src/debug_and_stats/camp_debug_2.c
rename to src/camp_debug.c
index 387d74351..271bf1e14 100644
--- a/src/debug_and_stats/camp_debug_2.c
+++ b/src/camp_debug.c
@@ -6,12 +6,7 @@
  * SPDX-License-Identifier: MIT
  */
 
-#include "camp_debug_2.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include "../camp_solver.h"
+#include "camp_solver.h"
 
 #ifdef CAMP_DEBUG_GPU
 #ifdef CAMP_USE_MPI
@@ -95,49 +90,22 @@ void join_export_state(){
 #endif
 }
 
-void init_export_stats(){
-#ifdef CAMP_DEBUG_GPU
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  char file_path[]="out/stats.csv";
-  if(rank==0){
-    printf("export_stats enabled\n");
-    FILE *fptr;
-    fptr = fopen(file_path,"w");
-    fprintf(fptr, "timecvStep,timeCVode\n");
-    fclose(fptr);
-  }
-#endif
-}
-
 void export_stats(SolverData *sd){
 #ifdef CAMP_DEBUG_GPU
   int rank;
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   if (rank == 0) {
     FILE *fptr;
-    fptr = fopen("out/stats.csv", "a");
-    CVodeMem cv_mem = (CVodeMem) sd->cvode_mem;
-    fprintf(fptr, "%.17le,",cv_mem->timecvStep);
-    fprintf(fptr, "%.17le",sd->timeCVode);
-    fprintf(fptr, "\n");
-    fclose(fptr);
-  }
-#endif
-}
-
-void print_double(double *x, int len, const char *s){
-#ifdef USE_PRINT_ARRAYS
-  for (int i=0; i<len; i++){
-    printf("%s[%d]=%.17le\n",s,i,x[i]);
-  }
-#endif
-}
-
-void print_int(int *x, int len, const char *s){
-#ifdef USE_PRINT_ARRAYS
-  for (int i=0; i<len; i++){
-    printf("%s[%d]=%d\n",s,i,x[i]);
+    if ((fptr = fopen("out/stats.csv", "w")) != NULL) {
+      fprintf(fptr, "timecvStep,timeCVode\n");
+      CVodeMem cv_mem = (CVodeMem) sd->cvode_mem;
+      fprintf(fptr, "%.17le,",cv_mem->timecvStep);
+      fprintf(fptr, "%.17le",sd->timeCVode);
+      fprintf(fptr, "\n");
+      fclose(fptr);
+    }else {
+      printf("File '%s' does not exist.\n", "out/stats.csv");
+    }
   }
 #endif
-}
+}
\ No newline at end of file
diff --git a/src/camp_debug.h b/src/camp_debug.h
index 41af364e3..8ce253c25 100644
--- a/src/camp_debug.h
+++ b/src/camp_debug.h
@@ -8,6 +8,17 @@
 #ifndef CAMP_DEBUG_H
 #define CAMP_DEBUG_H
 
+#include "camp_common.h"
+
+// Create a file for saving output concentrations
+void init_export_state();
+// Export output concentrations so the checkGPU test can compare the accuracy of the CPU and GPU versions
+void export_state(SolverData *sd);
+// Join the files created by each MPI process in "export_solver_state" into a single file.
+void join_export_state();
+// Export the execution times of the GPU and CPU code, which TestMonarch.py uses to calculate speedups
+void export_stats(SolverData *sd);
+
 // file name prefix
 int file_name_prefix = 1;
 
diff --git a/src/camp_solver.c b/src/camp_solver.c
index 54931f40d..ecc2a188c 100644
--- a/src/camp_solver.c
+++ b/src/camp_solver.c
@@ -30,7 +30,6 @@
 #include <gsl/gsl_roots.h>
 #endif
 #include "camp_debug.h"
-#include "debug_and_stats/camp_debug_2.h"
 
 #ifdef CAMP_DEBUG_GPU
 #ifdef CAMP_USE_MPI
@@ -96,8 +95,9 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
                  int n_aero_rep_float_param, int n_aero_rep_env_param,
                  int n_sub_model, int n_sub_model_int_param,
                  int n_sub_model_float_param, int n_sub_model_env_param,
-                 int use_cpu, int nGPUs) {
+                 int use_cpu) {
   // Create the SolverData object
+
   SolverData *sd = (SolverData *)malloc(sizeof(SolverData));
   if (sd == NULL) {
     printf("\n\nERROR allocating space for SolverData\n\n");
@@ -124,7 +124,6 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
   sd->model_data.n_per_cell_state_var = n_state_var;
 
   sd->use_cpu = use_cpu;
-  sd->nGPUs = nGPUs;
 #ifdef DEV_CPU_GPU
   sd->rate_cells_gpu=1;
   printf("Set cells to gpu to %lf %\n",sd->rate_cells_gpu*100);
@@ -373,8 +372,6 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
 
 #ifdef CAMP_DEBUG_GPU
   sd->timeCVode = 0.;
-  init_export_stats();
-  init_export_state();
 #endif
 
   // Return a pointer to the new SolverData object
@@ -596,16 +593,6 @@ int solver_run(void *solver_data, double *state, double *env, double t_initial,
     aero_rep_update_env_state(md);
     sub_model_update_env_state(md);
     rxn_update_env_state(md);
-    //if(i_cell==0){
-      //print_double(md->grid_cell_env,CAMP_NUM_ENV_PARAM_,"env689");
-      //print_double(md->grid_cell_state,n_state_var,"state688");
-      //double *yp = N_VGetArrayPointer(sd->y);
-      //print_double(yp,md->n_per_cell_dep_var,"y660");
-    //}
-    //print_double(md->grid_cell_env,CAMP_NUM_ENV_PARAM_,"env689");
-    //double *yp = N_VGetArrayPointer(sd->y)+i_cell*md->n_per_cell_dep_var;
-    //print_double(yp,md->n_per_cell_dep_var,"y660");
-    //print_double(md->grid_cell_state,md->n_per_cell_state_var,"state688");
   }
 
   //Reset jac solving, otherwise values from previous iteration would be carried to current iteration
@@ -670,8 +657,6 @@ int solver_run(void *solver_data, double *state, double *env, double t_initial,
 #endif
     sd->solver_flag = flag;
 #ifdef FAILURE_DETAIL
-    if (flag < 0) {
-#else
     if (check_flag(&flag, "CVode", 1) != CAMP_SOLVER_SUCCESS) {
       if (flag == -6) {
         long int lsflag;
@@ -683,13 +668,14 @@ int solver_run(void *solver_data, double *state, double *env, double t_initial,
       if (flag != 0)
         printf("\nCall to f() at failed state failed with flag %d \n",flag);
       solver_print_stats(sd->cvode_mem);
+#else
+    if (flag < 0) {
 #endif
       return CAMP_SOLVER_FAIL;
     }
   }
   // Update the species concentrations on the state array
   i_dep_var = 0;
-  //printf("NV_Ith_S(sd->y, i_dep_var)\n");
   for (int i_cell = 0; i_cell < n_cells; i_cell++) {
     for (int i_spec = 0; i_spec < n_state_var; i_spec++) {
       if (md->var_type[i_spec] == CHEM_SPEC_VARIABLE) {
@@ -732,10 +718,6 @@ int solver_run(void *solver_data, double *state, double *env, double t_initial,
  * \param last_time_step__s     Pointer to set to the last time step size [s]
  * \param next_time_step__s     Pointer to set to the next time step size [s]
  * \param Jac_eval_fails        Number of Jacobian evaluation failures
- * \param RHS_evals_total       Total calls to `f()`
- * \param Jac_evals_total       Total calls to `Jac()`
- * \param RHS_time__s           Compute time for calls to f() [s]
- * \param Jac_time__s           Compute time for calls to Jac() [s]
  * \param max_loss_precision    Indicators of loss of precision in derivative
  *                              calculation for each species
  */
@@ -745,8 +727,6 @@ void solver_get_statistics(void *solver_data, int *solver_flag, int *num_steps,
                            int *NLS_convergence_fails, int *DLS_Jac_evals,
                            int *DLS_RHS_evals, double *last_time_step__s,
                            double *next_time_step__s, int *Jac_eval_fails,
-                           int *RHS_evals_total, int *Jac_evals_total,
-                           double *RHS_time__s, double *Jac_time__s,
                            double *max_loss_precision
                            ) {
   SolverData *sd = (SolverData *)solver_data;
@@ -798,21 +778,17 @@ void solver_get_statistics(void *solver_data, int *solver_flag, int *num_steps,
   }
 #endif
 #ifdef CAMP_DEBUG
-  *RHS_evals_total = -1;
-  *Jac_evals_total = -1;
-  *RHS_time__s = 0.0;
-  *Jac_time__s = 0.0;
   *max_loss_precision = sd->max_loss_precision;
 #else
-  *RHS_evals_total = -1;
-  *Jac_evals_total = -1;
-  *RHS_time__s = 0.0;
-  *Jac_time__s = 0.0;
   *max_loss_precision = 0.0;
 #endif
 
 }
 
+void init_export_solver_state(){
+  init_export_state();
+}
+
 void export_solver_state(void *solver_data){
   SolverData *sd = (SolverData *)solver_data;
   export_state(sd);
@@ -917,7 +893,6 @@ int f(realtype t, N_Vector y, N_Vector deriv, void *solver_data) {
   SUNMatMatvec(md->J_solver, md->J_tmp, md->J_tmp2);
   N_VLinearSum(1.0, md->J_deriv, 1.0, md->J_tmp2, md->J_tmp);
 
-  //print_double(md->total_state,n_state_var,"state602");
   // Loop through the grid cells and update the derivative array
   for (int i_cell = 0; i_cell < n_cells; ++i_cell) {
     // Set the grid cell state pointers
@@ -951,20 +926,6 @@ int f(realtype t, N_Vector y, N_Vector deriv, void *solver_data) {
       time_derivative_output(sd->time_deriv, deriv_data, NULL,
                              sd->output_precision);
     }
-    if(i_cell==0) {
-      //double *yp = N_VGetArrayPointer(y);
-      //print_double(yp,86,"y646");
-      //double *J_state = N_VGetArrayPointer(md->J_state);
-      //print_double(J_state,86,"J_state644");
-      //print_double(jac_deriv_data,86,"J_tmp643");
-      //double *J_deriv = N_VGetArrayPointer(md->J_deriv);
-      //print_double(J_deriv,86,"J_deriv644");
-      //double *J_tmp2 = N_VGetArrayPointer(md->J_tmp2);
-      //print_double(J_tmp2,86,"J_tmp2645");
-      //print_double(sd->time_deriv.loss_rates,sd->time_deriv.num_spec,"loss_rates");
-      //print_double(sd->time_deriv.production_rates,sd->time_deriv.num_spec,"production_rates");
-      //print_double(deriv_data,86,"deriv_data645");
-    }
 #ifdef CAMP_DEBUG
     sd->max_loss_precision = time_derivative_max_loss_precision(sd->time_deriv);
 #endif
@@ -1066,8 +1027,8 @@ int Jac(realtype t, N_Vector y, N_Vector deriv, SUNMatrix J, void *solver_data,
     JacMap *jac_map = md->jac_map;
     SM_DATA_S(md->J_params)[0] = 1.0;  // dummy value for non-sub model calcs
     for (int i_map = 0; i_map < md->n_mapped_values; ++i_map){
-      long double drf_dy = sd->jac.production_partials[jac_map[i_map].rxn_id];
-      long double drr_dy = sd->jac.loss_partials[jac_map[i_map].rxn_id];
+      double drf_dy = sd->jac.production_partials[jac_map[i_map].rxn_id];
+      double drr_dy = sd->jac.loss_partials[jac_map[i_map].rxn_id];
 
       SM_DATA_S(J)
       [i_cell * md->n_per_cell_solver_jac_elem + jac_map[i_map].solver_id] +=
@@ -1827,7 +1788,9 @@ bool is_anything_going_on_here(SolverData *sd, realtype t_initial,
         }
       }
     }
+#ifdef CAMP_DEBUG
     printf("DEBUG: is_anything_going_on_here is false, returning success without cvode computing\n");
+#endif
     return false;
   }
   return true;
diff --git a/src/camp_solver.h b/src/camp_solver.h
index 6e8fdae8f..e780048aa 100644
--- a/src/camp_solver.h
+++ b/src/camp_solver.h
@@ -21,7 +21,7 @@ void *solver_new(int n_state_var, int n_cells, int *var_type, int n_rxn,
                  int n_aero_rep_float_param, int n_aero_rep_env_param,
                  int n_sub_model, int n_sub_model_int_param,
                  int n_sub_model_float_param, int n_sub_model_env_param,
-                 int use_cpu, int nGPUs);
+                 int use_cpu);
 void solver_set_spec_name(void *solver_data, char *spec_name,
                           int size_spec_name, int i);
 void solver_initialize(void *solver_data, double *abs_tol, double rel_tol,
@@ -38,9 +38,8 @@ void solver_get_statistics(void *solver_data, int *solver_flag, int *num_steps,
                            int *NLS_convergence_fails, int *DLS_Jac_evals,
                            int *DLS_RHS_evals, double *last_time_step__s,
                            double *next_time_step__s, int *Jac_eval_fails,
-                           int *RHS_evals_total, int *Jac_evals_total,
-                           double *RHS_time__s, double *Jac_time__s,
                            double *max_loss_precision);
+void init_export_solver_state();
 void export_solver_state(void *solver_data);
 void join_solver_state(void *solver_data);
 void export_solver_stats(void *solver_data);
diff --git a/src/camp_solver_data.F90 b/src/camp_solver_data.F90
index 759ddbd62..1f8edc00c 100644
--- a/src/camp_solver_data.F90
+++ b/src/camp_solver_data.F90
@@ -50,7 +50,7 @@ type(c_ptr) function solver_new(n_state_var, n_cells, var_type, &
                     n_aero_rep_int_param, n_aero_rep_float_param, &
                     n_aero_rep_env_param, n_sub_model, n_sub_model_int_param,&
                     n_sub_model_float_param, n_sub_model_env_param,&
-                    use_cpu, nGPUs) bind (c)
+                    use_cpu) bind (c)
       use iso_c_binding
       !> Number of variables on the state array per grid cell
       !! (including const, PSSA, etc.)
@@ -92,7 +92,6 @@ type(c_ptr) function solver_new(n_state_var, n_cells, var_type, &
       !> Total number of environment-dependent parameters for all sub models
       integer(kind=c_int), value :: n_sub_model_env_param
       integer(kind=c_int), value :: use_cpu
-      integer(kind=c_int), value :: nGPUs
     end function solver_new
 
     !> Set specie name
@@ -167,8 +166,7 @@ subroutine solver_get_statistics( solver_data, solver_flag, num_steps, &
                     RHS_evals, LS_setups, error_test_fails, NLS_iters, &
                     NLS_convergence_fails, DLS_Jac_evals, DLS_RHS_evals, &
                     last_time_step__s, next_time_step__s, Jac_eval_fails, &
-                    RHS_evals_total, Jac_evals_total, RHS_time__s, &
-                    Jac_time__s, max_loss_precision) bind (c)
+                    max_loss_precision) bind (c)
       use iso_c_binding
       !> Pointer to the solver data
       type(c_ptr), value :: solver_data
@@ -196,18 +194,14 @@ subroutine solver_get_statistics( solver_data, solver_flag, num_steps, &
       type(c_ptr), value :: next_time_step__s
       !> Number of Jacobian evaluation failures
       type(c_ptr), value :: Jac_eval_fails
-      !> Total number of calls to `f()`
-      type(c_ptr), value :: RHS_evals_total
-      !> Total number of calls to `Jac()`
-      type(c_ptr), value :: Jac_evals_total
-      !> Compute time for calls to `f()`
-      type(c_ptr), value :: RHS_time__s
-      !> Compute time for calls to `Jac()`
-      type(c_ptr), value :: Jac_time__s
       !> Maximum loss of precision on last call the f()
       type(c_ptr), value :: max_loss_precision
     end subroutine solver_get_statistics
 
+    subroutine init_export_solver_state() bind (c)
+      use iso_c_binding
+    end subroutine
+
     subroutine export_solver_state( solver_data) bind (c)
       use iso_c_binding
       type(c_ptr), value :: solver_data
@@ -417,9 +411,15 @@ end subroutine solver_free
     procedure :: update_aero_rep_data
     !> Integrate over a given time step
     procedure :: solve
+    !> Get solver statistics after an integration attempt
     procedure:: get_solver_stats
+    !> Create a file for saving output concentrations
+    procedure:: init_export_solver_data_state
+    !> Export output concentrations to calculate the accuracy between the CPU and GPU versions in the checkGPU test
     procedure:: export_solver_data_state
+    !> Join the files created by each MPI process in the "export_solver_state" function into a single file.
     procedure:: join_solver_data_state
+    !> Export the execution times of the GPU and CPU code to calculate speedups in TestMonarch.py
     procedure:: export_solver_data_stats
     !> Checks whether a solver is available
     procedure :: is_solver_available
@@ -454,7 +454,7 @@ end function constructor
   !> Initialize the solver
   subroutine initialize(this, var_type, abs_tol, mechanisms, aero_phases, &
           aero_reps, sub_models, rxn_phase, n_cells,&
-          spec_names, use_cpu, nGPUs)
+          spec_names, use_cpu)
 
     !> Solver data
     class(camp_solver_data_t), intent(inout) :: this
@@ -474,7 +474,6 @@ subroutine initialize(this, var_type, abs_tol, mechanisms, aero_phases, &
     !> Sub models to include
     type(sub_model_data_ptr), pointer, intent(in) :: sub_models(:)
     integer, intent(in) :: use_cpu
-    integer, intent(in) :: nGPUs
     !> Reactions phase to solve -- gas, aerosol, or both (default)
     !! Use parameters in camp_rxn_data to specify phase:
     !! GAS_RXN, AERO_RXN, GAS_AERO_RXN
@@ -658,8 +657,7 @@ subroutine initialize(this, var_type, abs_tol, mechanisms, aero_phases, &
             n_sub_model_int_param,             & ! # of sub model int params
             n_sub_model_float_param,           & ! # of sub model real params
             n_sub_model_env_param,              & ! # of sub model env params
-            use_cpu,&
-            nGPUs&
+            use_cpu&
             )
 
     ! Add all the condensed reaction data to the solver data block for
@@ -964,15 +962,16 @@ subroutine get_solver_stats( this, solver_stats)
             c_loc( solver_stats%last_time_step__s     ),   & ! Last time step [s]
             c_loc( solver_stats%next_time_step__s     ),   & ! Next time step [s]
             c_loc( solver_stats%Jac_eval_fails        ),   & ! Number of Jac eval fails
-            c_loc( solver_stats%RHS_evals_total       ),   & ! total f() calls
-            c_loc( solver_stats%Jac_evals_total       ),   & ! total Jac() calls
-            c_loc( solver_stats%RHS_time__s           ),   & ! Compute time f() [s]
-            c_loc( solver_stats%Jac_time__s           ),   & ! Compute time Jac() [s]
             c_loc( solver_stats%max_loss_precision) & ! Maximum loss of precision
     )
 
   end subroutine
 
+  subroutine init_export_solver_data_state( this)
+    class(camp_solver_data_t), intent(inout) :: this
+    call init_export_solver_state()
+  end subroutine
+
   subroutine export_solver_data_state( this)
     class(camp_solver_data_t), intent(inout) :: this
     call export_solver_state(this%solver_c_ptr)
diff --git a/src/cuda/cuda_structs.h b/src/cuda/cuda_structs.h
index f83d6d9e6..1c918c285 100644
--- a/src/cuda/cuda_structs.h
+++ b/src/cuda/cuda_structs.h
@@ -8,7 +8,6 @@
 
 typedef struct {
     unsigned int num_spec;          // Number of species in the derivative
-    // long double is treated as double in GPU
     double *production_rates;  // Production rates for all species
     double *loss_rates;        // Loss rates for all species
 #ifdef CAMP_DEBUG
@@ -90,7 +89,6 @@ typedef struct{
   int*    jA;
   int*    iA;
   int cells_method;
-  int threads,blocks;
   int nnz;
   int nnz_J_solver;
   size_t deriv_size;
diff --git a/src/cuda/cvode_cuda.cu b/src/cuda/cvode_cuda.cu
index c9fd84232..6a777de09 100644
--- a/src/cuda/cvode_cuda.cu
+++ b/src/cuda/cvode_cuda.cu
@@ -5,32 +5,6 @@
 
 #include "cvode_cuda.h"
 
-__device__
-void print_double(double *x, int len, const char *s){
-#ifdef USE_PRINT_ARRAYS
-  __syncthreads();
-  if(threadIdx.x==0 && blockIdx.x==0){
-    for (int i=0; i<len; i++){
-      printf("%s[%d]=%.17le\n",s,i,x[i]);
-    }
-  }
-  __syncthreads();
-#endif
-}
-
-__device__
-void print_int(int *x, int len, const char *s){
-#ifdef USE_PRINT_ARRAYS
-  __syncthreads();
-  if(threadIdx.x==0 && blockIdx.x==0){
-    for (int i=0; i<len; i++){
-      printf("%s[%d]=%d\n",s,i,x[i]);
-    }
-  }
-  __syncthreads();
-#endif
-}
-
 __device__
 double dSUNRpowerR(double base, double exponent){
   if (base <= ZERO) return(ZERO);
@@ -368,11 +342,10 @@ void rxn_gpu_photolysis_calc_jac_contrib(ModelDataVariable *sc, JacobianGPU jac,
   }
 }
 
-__device__ void cudaDevicemin_2(double *g_odata, double in, volatile double *sdata, int n_shr_empty){
+__device__ void cudaDevicemin(double *g_odata, double in, volatile double *sdata, int n_shr_empty){
   unsigned int tid = threadIdx.x;
   __syncthreads();
   sdata[tid] = in;
-  __syncthreads();
   if(tid<n_shr_empty)
     sdata[tid+blockDim.x]=sdata[tid];
   __syncthreads();
@@ -387,21 +360,6 @@ __device__ void cudaDevicemin_2(double *g_odata, double in, volatile double *sda
   __syncthreads();
 }
 
-#ifdef DEBUG_CVODE_GPU
-__device__
-void printmin(ModelDataGPU *md,double* y, const char *s) {
-  __syncthreads();
-  extern __shared__ double flag_shr2[];
-  int tid= threadIdx.x + blockDim.x*blockIdx.x;
-  __syncthreads();
-  double min;
-  cudaDevicemin_2(&min, y[tid], flag_shr2, md->n_shr_empty);
-  __syncthreads();
-  if(tid==0)printf("%s min %le\n",s,min);
-  __syncthreads();
-}
-#endif
-
 __device__ void cudaDeviceBCGprecond_2(double* dA, int* djA, int* diA, double* ddiag, double alpha){
   int row= threadIdx.x + blockDim.x*blockIdx.x;
   int nnz=diA[blockDim.x];
@@ -429,15 +387,13 @@ __device__ void cudaDeviceSpmv_2CSR(double* dx, double* db, double* dA, int* djA
   }
   __syncthreads();
   dx[row]=sum;
-  __syncthreads();
 }
 
 __device__ void cudaDeviceSpmv_2CSC_block(double* dx, double* db, double* dA, int* djA, int* diA){
   int row = threadIdx.x + blockDim.x*blockIdx.x;
-  __syncthreads();
   dx[row]=0.0;
-  __syncthreads();
   int nnz=diA[blockDim.x];
+  __syncthreads();
   for(int j=diA[threadIdx.x]; j<diA[threadIdx.x+1]; j++){
     double mult = db[row]*dA[j+nnz*blockIdx.x];
     atomicAdd_block(&(dx[djA[j]+blockDim.x*blockIdx.x]),mult);
@@ -470,10 +426,9 @@ __device__ void cudaDevicedotxy_2(double *g_idata1, double *g_idata2,
   __syncthreads();
   if(tid<n_shr_empty)
     sdata[tid+blockDim.x]=0.;
-  __syncthreads();
 #ifdef IS_DEBUG_MODE_cudaDevicedotxy_2
   //used for compare with cpu
-  sdata[0]=0.;
+  sdata[0] = 0.;
   __syncthreads();
   if(tid==0){
     for(int j=0;j<blockDim.x;j++){
@@ -515,7 +470,6 @@ __device__ void cudaDeviceVWRMS_Norm_2(double *g_idata1, double *g_idata2, doubl
   __syncthreads();
   if(tid<n_shr_empty)
     sdata[tid+blockDim.x]=0.;
-  __syncthreads();
   sdata[tid] = g_idata1[i]*g_idata2[i];
   sdata[tid] = sdata[tid]*sdata[tid];
   __syncthreads();
@@ -526,7 +480,7 @@ __device__ void cudaDeviceVWRMS_Norm_2(double *g_idata1, double *g_idata2, doubl
     for(int j=0;j<blockDim.x;j++){
       sum+=sdata[j];
     }
-    sdata[0]=sum;
+    sdata[0] = sum;
   }
   __syncthreads();
 #else
@@ -542,20 +496,18 @@ __device__ void cudaDeviceVWRMS_Norm_2(double *g_idata1, double *g_idata2, doubl
 
 __device__
 void cudaDeviceJacCopy(int* Ap, double* Ax, double* Bx) {
-  __syncthreads();
   int nnz=Ap[blockDim.x];
   for(int j=Ap[threadIdx.x]; j<Ap[threadIdx.x+1]; j++){
     Bx[j+nnz*blockIdx.x]=Ax[j+nnz*blockIdx.x];
   }
-  __syncthreads();
 }
 
 __device__
-int cudaDevicecamp_solver_check_model_state(ModelDataGPU *md, ModelDataVariable *sc, double *y, int *flag)
+int cudaDevicecamp_solver_check_model_state(ModelDataGPU *md, ModelDataVariable *sc, double *y)
 {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  __syncthreads();
   extern __shared__ int flag_shr[];
+  __syncthreads();
   flag_shr[0] = 0;
   __syncthreads();
   if (y[i] < -SMALL) {
@@ -566,9 +518,9 @@ int cudaDevicecamp_solver_check_model_state(ModelDataGPU *md, ModelDataVariable
             TINY : y[i];
   }
   __syncthreads();
-  *flag = flag_shr[0];
+  int flag = flag_shr[0];
   __syncthreads();
-  return *flag;
+  return flag;
 }
 
 __device__ void solveRXN(
@@ -616,6 +568,7 @@ __device__ void cudaDevicecalc_deriv(double time_step, double *y,
   cudaDeviceSpmv_2(md->J_tmp2, md->J_tmp, md->J_solver, md->djA, md->diA);
   md->J_tmp[i]=md->J_deriv[i]+md->J_tmp2[i];
   TimeDerivativeGPU deriv_data;
+  __syncthreads();
   deriv_data.production_rates = md->production_rates;
   deriv_data.loss_rates = md->loss_rates;
   __syncthreads();
@@ -625,12 +578,11 @@ __device__ void cudaDevicecalc_deriv(double time_step, double *y,
   deriv_data.production_rates = &( md->production_rates[blockDim.x*blockIdx.x]);
   deriv_data.loss_rates = &( md->loss_rates[blockDim.x*blockIdx.x]);
   sc->grid_cell_state = &( md->state[md->state_size_cell*blockIdx.x]);
-  int n_rxn = md->n_rxn;
   __syncthreads();
+  int n_rxn = md->n_rxn;
 #ifdef IS_DEBUG_MODE_removeAtomic
   if(threadIdx.x==0){
     for (int j = 0; j < n_rxn; j++){
-      //printf("n_rxn %d i %d j %d \n",n_rxn,i,j);
       solveRXN(j,deriv_data, time_step, md, sc);
     }
   }
@@ -665,22 +617,12 @@ __device__ void cudaDevicecalc_deriv(double time_step, double *y,
   } else {
     yout[i] = 0.0;
   }
-  //print_double(y,86,"y646");
-  //print_double(md->J_state,86,"J_state644");
-  //print_double(md->J_tmp,86,"J_tmp643");
-  //print_double(md->J_deriv,86,"J_deriv644");
-  //print_double(md->J_tmp2,86,"J_tmp2645");
-  //print_double(deriv_data.loss_rates,86,"loss_rates");
-  //print_double(deriv_data.production_rates,86,"production_rates");
-  //print_double(yout,86,"deriv_data645");
-  __syncthreads();
 }
 
 __device__
 int cudaDevicef(double time_step, double *y,
-        double *yout, ModelDataGPU *md, ModelDataVariable *sc, int *flag)
+        double *yout, ModelDataGPU *md, ModelDataVariable *sc)
 {
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
@@ -688,13 +630,8 @@ int cudaDevicef(double time_step, double *y,
 #endif
   time_step = sc->cv_next_h;
   time_step = time_step > 0. ? time_step : md->init_time_step;
-  //print_double(md->state,md->state_size_cell,"state661");
-  int checkflag=cudaDevicecamp_solver_check_model_state(md, sc, y, flag);
-  //print_double(md->state,md->state_size_cell,"state663");
-  __syncthreads();
+  int checkflag=cudaDevicecamp_solver_check_model_state(md, sc, y);
   if(checkflag==CAMP_SOLVER_FAIL){
-    *flag=CAMP_SOLVER_FAIL;
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(threadIdx.x==0) sc->timef += ((double)(int)(clock() - start))/(clock_khz*1000);
@@ -706,46 +643,36 @@ int cudaDevicef(double time_step, double *y,
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(threadIdx.x==0) sc->timef += ((double)(int)(clock() - start))/(clock_khz*1000);
 #endif
-  __syncthreads();
-  *flag=0;
-  __syncthreads();
   return 0;
 }
 
 __device__
 int CudaDeviceguess_helper(double h_n, double* y_n,
    double* y_n1, double* hf, double* atmp1,
-   double* acorr, int *flag, ModelDataGPU *md, ModelDataVariable *sc
+   double* acorr, ModelDataGPU *md, ModelDataVariable *sc
 ) {
   extern __shared__ double sdata[];
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  __syncthreads();
   double min;
-  cudaDevicemin_2(&min, y_n[i], sdata, md->n_shr_empty);
+  cudaDevicemin(&min, y_n[i], sdata, md->n_shr_empty);
   if(min>-SMALL){
     return 0;
   }
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
   start = clock();
 #endif
   atmp1[i]=y_n1[i];
-  __syncthreads();
   if (h_n > 0.) {
     acorr[i]=(1./h_n)*hf[i];
   } else {
     acorr[i]=hf[i];
   }
-  //print_double(acorr,86,"acorr711");
   double t_0 = h_n > 0. ? sc->cv_tn - h_n : sc->cv_tn - 1.;
   double t_j = 0.;
-  __syncthreads();
   for (int iter = 0; iter < GUESS_MAX_ITER && t_0 + t_j < sc->cv_tn; iter++) {
-    __syncthreads();
     double h_j = sc->cv_tn - (t_0 + t_j);
-    //print_double(atmp1,86,"atmp720");
 #ifdef IS_DEBUG_MODE_CudaDeviceguess_helper
     if(threadIdx.x==0){
       int i_fast = -1;
@@ -759,7 +686,7 @@ int CudaDeviceguess_helper(double h_n, double* y_n,
       }
       if (i_fast >= 0 && h_n > 0.)
         h_j *= 0.95 + 0.1 * iter / (double)GUESS_MAX_ITER;
-      sdata[0]=h_j;
+      sdata[0] = h_j;
     }
     __syncthreads();
     h_j=sdata[0];
@@ -769,32 +696,24 @@ int CudaDeviceguess_helper(double h_n, double* y_n,
     if (t_star < 0. || (t_star == 0. && acorr[i] >= 0.)){
       t_star=h_j;
     }
-    cudaDevicemin_2(&min, t_star, sdata, md->n_shr_empty);
+    cudaDevicemin(&min, t_star, sdata, md->n_shr_empty);
     if(min<h_j){
       h_j = min;
       h_j *= 0.95 + 0.1 * iter / (double)GUESS_MAX_ITER;
     }
 #endif
     h_j = sc->cv_tn < t_0 + t_j + h_j ? sc->cv_tn - (t_0 + t_j) : h_j;
-    __syncthreads();
     if (h_n == 0. && sc->cv_tn - (h_j + t_j + t_0) > md->cv_reltol) {
-      __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->timeguess_helper += ((double)(clock() - start))/(clock_khz*1000);
 #endif
     return -1;
     }
     atmp1[i]+=h_j*acorr[i];
-    __syncthreads();
     t_j += h_j;
-    int aux_flag=0;
-    //print_double(atmp1,86,"atmp1766");
-    int fflag=cudaDevicef(t_0 + t_j, atmp1, acorr,md,sc,&aux_flag);
-    //print_double(acorr,86,"acorr721");
-    __syncthreads();
+    int fflag=cudaDevicef(t_0 + t_j, atmp1, acorr,md,sc);
     if (fflag == CAMP_SOLVER_FAIL) {
       acorr[i] = 0.;
-      __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->timeguess_helper += ((double)(clock() - start))/(clock_khz*1000);
 #endif
@@ -802,24 +721,19 @@ int CudaDeviceguess_helper(double h_n, double* y_n,
     }
     if (iter == GUESS_MAX_ITER - 1 && t_0 + t_j < sc->cv_tn) {
       if (h_n == 0.){
-        __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->timeguess_helper += ((double)(clock() - start))/(clock_khz*1000);
 #endif
         return -1;
       }
     }
-    __syncthreads();
   }
-  __syncthreads();
   acorr[i]=atmp1[i]-y_n[i];
   if (h_n > 0.) acorr[i]=acorr[i]*0.999;
   hf[i]=atmp1[i]-y_n1[i];
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0)  sc->timeguess_helper += ((double)(clock() - start))/(clock_khz*1000);
 #endif
-  __syncthreads();
   return 1;
 }
 
@@ -862,12 +776,10 @@ __device__ void solveRXNJac(
 
 __device__ void cudaDevicecalc_Jac(double *y,ModelDataGPU *md, ModelDataVariable *sc
 ){
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
   start = clock();
-  __syncthreads();
 #endif
   JacobianGPU *jac = &md->jac;
   JacobianGPU jacBlock;
@@ -875,7 +787,6 @@ __device__ void cudaDevicecalc_Jac(double *y,ModelDataGPU *md, ModelDataVariable
   jacBlock.num_elem = jac->num_elem;
   jacBlock.production_partials = &( jac->production_partials[jacBlock.num_elem[0]*blockIdx.x]);
   jacBlock.loss_partials = &( jac->loss_partials[jacBlock.num_elem[0]*blockIdx.x]);
-  __syncthreads();
   sc->grid_cell_state = &( md->state[md->state_size_cell*blockIdx.x]);
   __syncthreads();
   int n_rxn = md->n_rxn;
@@ -918,34 +829,28 @@ __device__ void cudaDevicecalc_Jac(double *y,ModelDataGPU *md, ModelDataVariable
     jacBlock.production_partials[jac_map[j].rxn_id] = 0.0;
     jacBlock.loss_partials[jac_map[j].rxn_id] = 0.0;
   }
-    __syncthreads();
+  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->timecalc_Jac += ((double)(clock() - start))/(clock_khz*1000);
 #endif
 }
 
 __device__
-int cudaDeviceJac(int *flag, ModelDataGPU *md, ModelDataVariable *sc)
+int cudaDeviceJac(ModelDataGPU *md, ModelDataVariable *sc)
 {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int retval;
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
   start = clock();
 #endif
   md->use_deriv_est=0;
-  int aux_flag=0;
-  __syncthreads();
-  //print_double(md->dcv_y,86,"dcv_y914");
-  retval=cudaDevicef(sc->cv_next_h, md->dcv_y, md->dftemp,md,sc,&aux_flag);
+  retval=cudaDevicef(sc->cv_next_h, md->dcv_y, md->dftemp,md,sc);
   md->use_deriv_est=1;
-  __syncthreads();
   if(retval==CAMP_SOLVER_FAIL)
     return CAMP_SOLVER_FAIL;
   cudaDevicecalc_Jac(md->dcv_y,md, sc);
-  __syncthreads();
   int nnz = md->n_mapped_values[0];
   int n_iters = nnz / blockDim.x;
   for (int z = 0; z < n_iters; z++) {
@@ -957,16 +862,11 @@ int cudaDeviceJac(int *flag, ModelDataGPU *md, ModelDataVariable *sc)
     int j = threadIdx.x + n_iters*blockDim.x + nnz * blockIdx.x;
     md->J_solver[j]=md->dA[j];
   }
-  __syncthreads();
   md->J_state[i]=md->dcv_y[i];
   md->J_deriv[i]=md->dftemp[i];
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0)  sc->timeJac += ((double)(clock() - start))/(clock_khz*1000);
 #endif
-  __syncthreads();
-  *flag = 0;
-  __syncthreads();
   return 0;
 }
 
@@ -984,30 +884,21 @@ int cudaDevicelinsolsetup(
          (convfail == CV_FAIL_OTHER);
   jok = !jbad;
   if (jok==1) {
-    __syncthreads();
     sc->cv_jcur = 0;
     cudaDeviceJacCopy(md->diA, md->dsavedJ, md->dA);
-    __syncthreads();
   } else {
-    __syncthreads();
     sc->nstlj = sc->cv_nst;
     sc->cv_jcur = 1;
-    __syncthreads();
-    int aux_flag=0;
-    __syncthreads();
-    int guess_flag=cudaDeviceJac(&aux_flag,md,sc);
-    __syncthreads();
+    int guess_flag=cudaDeviceJac(md,sc);
     if (guess_flag < 0) {
       return -1;}
     if (guess_flag > 0) {
       return 1;}
    cudaDeviceJacCopy(md->diA, md->dA, md->dsavedJ);
   }
-  __syncthreads();
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   md->dx[i]=0.;
   cudaDeviceBCGprecond_2(md->dA, md->djA, md->diA, md->ddiag, -sc->cv_gamma);
-  __syncthreads();
   return 0;
 }
 
@@ -1046,10 +937,7 @@ void solveBcgCudaDeviceCVODE(ModelDataGPU *md, ModelDataVariable *sc)
     temp1 = sqrt(temp1);
     rho0 = rho1;
     it++;
-  __syncthreads();
-  //if(i==0)printf("end iter %d BCG GPU\n",it);
   }
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   if(threadIdx.x==0) sc->counterBCGInternal += it;
   if(threadIdx.x==0) sc->counterBCG++;
@@ -1060,45 +948,31 @@ __device__
 int cudaDevicecvNewtonIteration(ModelDataGPU *md, ModelDataVariable *sc){
   extern __shared__ double flag_shr2[];
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int aux_flag=0;
   double del, delp, dcon;
   int m = 0;
   del = delp = 0.0;
   int retval;
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
 #endif
   for(;;) {
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     start = clock();
 #endif
-    //print_double(md->dtempv,86,"dtempvN_VLinearSum1");
     md->dtempv[i]=sc->cv_rl1*md->dzn[i+md->nrows]+md->cv_acor[i];
-    //print_double(md->dtempv,86,"dtempvN_VLinearSum2");
     md->dtempv[i]=sc->cv_gamma*md->dftemp[i]-md->dtempv[i];
-    //print_double(md->dtempv,86,"dtempvcv_lsolve1");
     solveBcgCudaDeviceCVODE(md, sc);
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->dtBCG += ((double)(int)(clock() - start))/(clock_khz*1000);
 #endif
     md->dtempv[i] = md->dx[i];
-    //print_double(md->dtempv,86,"dtempvcv_lsolve2");
-    __syncthreads();
     cudaDeviceVWRMS_Norm_2(md->dx, md->dewt, &del, md->n_shr_empty);
     md->dftemp[i]=md->dcv_y[i]+md->dtempv[i];
-    __syncthreads();
-    //print_double(md->dcv_y,86,"dcv_y2994");
-    //print_double(md->dftemp,86,"cv_ftemplsolve");
     int guessflag=CudaDeviceguess_helper(0., md->dftemp,
-       md->dcv_y, md->dtempv, md->dtempv1,md->dtempv2, &aux_flag, md, sc
-    );
-    __syncthreads();
+       md->dcv_y, md->dtempv, md->dtempv1,md->dtempv2, md, sc);
     if (guessflag < 0) {
-      if (!(sc->cv_jcur)) { //Bool set up during linsolsetup just before Jacobian
+      if (!(sc->cv_jcur)) {
         return TRY_AGAIN;
       } else {
         return RHSFUNC_RECVR;
@@ -1106,29 +980,22 @@ int cudaDevicecvNewtonIteration(ModelDataGPU *md, ModelDataVariable *sc){
     }
     md->dftemp[i]=md->dcv_y[i]+md->dtempv[i];
     double min;
-    cudaDevicemin_2(&min, md->dftemp[i], flag_shr2, md->n_shr_empty);
+    cudaDevicemin(&min, md->dftemp[i], flag_shr2, md->n_shr_empty);
     if (min < -CAMP_TINY) {
       return CONV_FAIL;
     }
-    __syncthreads();
     md->cv_acor[i]+=md->dtempv[i];
     md->dcv_y[i]=md->dzn[i]+md->cv_acor[i];
-    //print_double(md->cv_acor,86,"cv_acor1060");
-    //print_double(md->dcv_y,86,"dcv_y1060");
     if (m > 0) {
       sc->cv_crate = SUNMAX(0.3 * sc->cv_crate, del / delp);
     }
     dcon = del * SUNMIN(1.0, sc->cv_crate) / md->cv_tq[4+blockIdx.x*(NUM_TESTS + 1)];
-    flag_shr2[0]=0;
+    __syncthreads();
+    flag_shr2[0] = 0;
     __syncthreads();
     if (dcon <= 1.) {
-      //print_double(md->cv_acor,86,"cv_acor1505");
-      //print_double(md->dewt,86,"dewt1505");
       cudaDeviceVWRMS_Norm_2(md->cv_acor, md->dewt, &sc->cv_acnrm, md->n_shr_empty);
-      //print_double(&sc->cv_acnrm,1,"cv_acnrm1151");
-      __syncthreads();
       sc->cv_jcur = 0;
-      __syncthreads();
       return CV_SUCCESS;
     }
     m++;
@@ -1140,10 +1007,7 @@ int cudaDevicecvNewtonIteration(ModelDataGPU *md, ModelDataVariable *sc){
       }
     }
     delp = del;
-    __syncthreads();
-    //print_double(md->dcv_y,86,"dcv_y1137");
-    retval=cudaDevicef(sc->cv_next_h, md->dcv_y, md->dftemp, md, sc, &aux_flag);
-    __syncthreads();
+    retval=cudaDevicef(sc->cv_next_h, md->dcv_y, md->dftemp, md, sc);
     md->cv_acor[i]=md->dcv_y[i]+md->dzn[i];
     if (retval < 0) {
       return CV_RHSFUNC_FAIL;
@@ -1155,7 +1019,6 @@ int cudaDevicecvNewtonIteration(ModelDataGPU *md, ModelDataVariable *sc){
         return RHSFUNC_RECVR;
       }
     }
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->dtPostBCG += ((double)(clock() - start))/(clock_khz*1000);
 #endif
@@ -1167,10 +1030,8 @@ int cudaDevicecvNlsNewton(int nflag,
         ModelDataGPU *md, ModelDataVariable *sc
 ) {
   extern __shared__ int flag_shr[];
-  int flagDevice = 0;
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int retval=0;
-  __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   int clock_khz=md->clock_khz;
   clock_t start;
@@ -1182,31 +1043,18 @@ int cudaDevicecvNlsNewton(int nflag,
                   (sc->cv_nst == 0) ||
                   (sc->cv_nst >= sc->cv_nstlp + MSBP) ||
                   (dgamrat > DGMAX);
-  __syncthreads();
-  //print_double(md->cv_last_yn,86,"cv_last_yn1175");
   md->dftemp[i]=md->dzn[i]-md->cv_last_yn[i];
-  //print_double(md->dftemp,86,"cv_ftemppN_VLinearSum2");
   md->cv_acor_init[i]=0.;
-  __syncthreads();
   int guessflag=CudaDeviceguess_helper(sc->cv_h, md->dzn,
        md->cv_last_yn, md->dftemp, md->dtempv1,
-       md->cv_acor_init,  &flagDevice,md, sc
+       md->cv_acor_init, md, sc
   );
-  __syncthreads();
   if(guessflag<0){
     return RHSFUNC_RECVR;
   }
   for(;;) {
-    __syncthreads();
-    //print_double(md->cv_acor_init,86,"cv_acor_init1140");
     md->dcv_y[i] = md->dzn[i]+md->cv_acor_init[i];
-    //print_double(md->dcv_y,86,"dcv_y1139");
-    int aux_flag=0;
-    //print_double(md->dftemp,86,"cv_ftemppcv_f1");
-    //print_double(&sc->cv_tn,1,"cv_tn1216");
-    retval=cudaDevicef(sc->cv_tn, md->dcv_y,md->dftemp,md,sc,&aux_flag);
-    //print_double(md->dftemp,86,"cv_ftemppcv_f2");
-    //print_double(md->dcv_y,86,"dcv_y1144");
+    retval=cudaDevicef(sc->cv_tn, md->dcv_y,md->dftemp,md,sc);
     if (retval < 0) {
       return CV_RHSFUNC_FAIL;
     }
@@ -1214,14 +1062,10 @@ int cudaDevicecvNlsNewton(int nflag,
       return RHSFUNC_RECVR;
     }
     if (callSetup) {
-      __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
       start = clock();
 #endif
-      __syncthreads();
       int linflag=cudaDevicelinsolsetup(md, sc,convfail);
-      //print_double(md->dftemp,86,"cv_ftempp1160");
-      __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
       if(threadIdx.x==0) sc->timelinsolsetup += ((double)(clock() - start))/(clock_khz*1000);
 #endif
@@ -1229,7 +1073,6 @@ int cudaDevicecvNlsNewton(int nflag,
       sc->cv_gamrat = sc->cv_crate = 1.0;
       sc->cv_gammap = sc->cv_gamma;
       sc->cv_nstlp = sc->cv_nst;
-      __syncthreads();
       if (linflag < 0) {
         flag_shr[0] = CV_LSETUP_FAIL;
         break;
@@ -1239,28 +1082,20 @@ int cudaDevicecvNlsNewton(int nflag,
         break;
       }
     }
-    __syncthreads();
     md->cv_acor[i] = md->cv_acor_init[i];
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     start = clock();
 #endif
-    __syncthreads();
     int nItflag=cudaDevicecvNewtonIteration(md, sc);
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0)  sc->timeNewtonIteration += ((double)(clock() - start))/(clock_khz*1000);
 #endif
     if (nItflag != TRY_AGAIN) {
       return nItflag;
     }
-    __syncthreads();
     callSetup = 1;
-    __syncthreads();
     convfail = CV_FAIL_BAD_J;
-    __syncthreads();
   } //for(;;)
-  __syncthreads();
   return nflag;
 }
 
@@ -1268,36 +1103,27 @@ __device__
 void cudaDevicecvRescale(ModelDataGPU *md, ModelDataVariable *sc) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   double factor;
-  __syncthreads();
   factor = sc->cv_eta;
   for (int j=1; j <= sc->cv_q; j++) {
     md->dzn[i+md->nrows*j]*=factor;
-    __syncthreads();
     factor *= sc->cv_eta;
-    __syncthreads();
   }
-  //print_double(&sc->cv_eta,1,"cv_eta_1290");
   sc->cv_h = sc->cv_hscale * sc->cv_eta;
   sc->cv_next_h = sc->cv_h;
   sc->cv_hscale = sc->cv_h;
-  __syncthreads();
 }
 
 __device__
 void cudaDevicecvRestore(ModelDataGPU *md, ModelDataVariable *sc, double saved_t) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int j, k;
-  __syncthreads();
   sc->cv_tn=saved_t;
-  print_double(md->dzn,86,"dzn1299");
   for (k = 1; k <= sc->cv_q; k++){
     for (j = sc->cv_q; j >= k; j--) {
       md->dzn[i+md->nrows*(j-1)]-=md->dzn[i+md->nrows*j];
     }
   }
   md->dzn[i]=md->cv_last_yn[i];
-  print_double(md->dzn,86,"dzn1306");
-  __syncthreads();
 }
 
 __device__
@@ -1313,20 +1139,15 @@ int cudaDevicecvHandleNFlag(ModelDataGPU *md, ModelDataVariable *sc, int *nflagP
   if (*nflagPtr == CV_RHSFUNC_FAIL) return(CV_RHSFUNC_FAIL);
   (*ncfPtr)++;
   sc->cv_etamax = 1.;
-  __syncthreads();
   if ((fabs(sc->cv_h) <= sc->cv_hmin*ONEPSM) ||
       (*ncfPtr == sc->cv_maxncf)) {
     if (*nflagPtr == CONV_FAIL)     return(CV_CONV_FAILURE);
     if (*nflagPtr == RHSFUNC_RECVR) return(CV_REPTD_RHSFUNC_ERR);
   }
-  __syncthreads();
   sc->cv_eta = SUNMAX(ETACF,
           sc->cv_hmin / fabs(sc->cv_h));
-  //print_double(&sc->cv_eta,1,"cv_eta_1337");
-  __syncthreads();
   *nflagPtr = PREV_CONV_FAIL;
   cudaDevicecvRescale(md, sc);
-  __syncthreads();
   return (PREDICT_AGAIN);
 }
 
@@ -1337,7 +1158,6 @@ void cudaDevicecvSetTqBDFt(ModelDataGPU *md, ModelDataVariable *sc,
   extern __shared__ int flag_shr[];
   double A1, A2, A3, A4, A5, A6;
   double C, Cpinv, Cppinv;
-  __syncthreads();
   A1 = 1. - alpha0_hat + alpha0;
   A2 = 1. + sc->cv_q * A1;
   md->cv_tq[2+blockIdx.x*(NUM_TESTS + 1)] = fabs(A1 / (alpha0 * A2));
@@ -1351,14 +1171,12 @@ void cudaDevicecvSetTqBDFt(ModelDataGPU *md, ModelDataVariable *sc,
       md->cv_tq[1+blockIdx.x*(NUM_TESTS + 1)] = fabs(C * Cpinv);
     }
     else md->cv_tq[1+blockIdx.x*(NUM_TESTS + 1)] = 1.;
-    __syncthreads();
     hsum += md->cv_tau[sc->cv_q+blockIdx.x*(L_MAX + 1)];
     xi_inv = sc->cv_h / hsum;
     A5 = alpha0 - (1. / (sc->cv_q+1));
     A6 = alpha0_hat - xi_inv;
     Cppinv = (1. - A6 + A5) / A2;
     md->cv_tq[3+blockIdx.x*(NUM_TESTS + 1)] = fabs(Cppinv / (xi_inv * (sc->cv_q+2) * A5));
-    __syncthreads();
   }
   md->cv_tq[4+blockIdx.x*(NUM_TESTS + 1)] = md->cv_nlscoef / md->cv_tq[2+blockIdx.x*(NUM_TESTS + 1)];
 }
@@ -1368,12 +1186,10 @@ void cudaDevicecvSetBDF(ModelDataGPU *md, ModelDataVariable *sc) {
   extern __shared__ int flag_shr[];
   double alpha0, alpha0_hat, xi_inv, xistar_inv, hsum;
   int z,j;
-  __syncthreads();
   md->cv_l[0+blockIdx.x*L_MAX] = md->cv_l[1+blockIdx.x*L_MAX] = xi_inv = xistar_inv = 1.;
   for (z=2; z <= sc->cv_q; z++) md->cv_l[z+blockIdx.x*L_MAX] = 0.;
   alpha0 = alpha0_hat = -1.;
   hsum = sc->cv_h;
-  __syncthreads();
   if (sc->cv_q > 1) {
     for (j=2; j < sc->cv_q; j++) {
       hsum += md->cv_tau[j-1+blockIdx.x*(L_MAX + 1)];
@@ -1381,7 +1197,6 @@ void cudaDevicecvSetBDF(ModelDataGPU *md, ModelDataVariable *sc) {
       alpha0 -= 1. / j;
       for (z=j; z >= 1; z--) md->cv_l[z+blockIdx.x*L_MAX] += md->cv_l[z-1+blockIdx.x*L_MAX]*xi_inv;
     }
-    __syncthreads();
     alpha0 -= 1. / sc->cv_q;
     xistar_inv = -md->cv_l[1+blockIdx.x*L_MAX] - alpha0;
     hsum += md->cv_tau[sc->cv_q-1+blockIdx.x*(L_MAX + 1)];
@@ -1390,49 +1205,37 @@ void cudaDevicecvSetBDF(ModelDataGPU *md, ModelDataVariable *sc) {
     for (z=sc->cv_q; z >= 1; z--)
       md->cv_l[z+blockIdx.x*L_MAX] += md->cv_l[z-1+blockIdx.x*L_MAX]*xistar_inv;
   }
-  __syncthreads();
   cudaDevicecvSetTqBDFt(md, sc, hsum, alpha0, alpha0_hat, xi_inv, xistar_inv);
 }
 
 __device__
 void cudaDevicecvSet(ModelDataGPU *md, ModelDataVariable *sc) {
   extern __shared__ int flag_shr[];
-  __syncthreads();
   cudaDevicecvSetBDF(md,sc);
-  __syncthreads();
   sc->cv_rl1 = 1.0 / md->cv_l[1+blockIdx.x*L_MAX];
   sc->cv_gamma = sc->cv_h * sc->cv_rl1;
-  __syncthreads();
   if (sc->cv_nst == 0){
     sc->cv_gammap = sc->cv_gamma;
   }
-  __syncthreads();
   sc->cv_gamrat = (sc->cv_nst > 0) ?
                     sc->cv_gamma / sc->cv_gammap : 1.;  // protect x / x != 1.0
-  __syncthreads();
 }
 
 __device__
 void cudaDevicecvPredict(ModelDataGPU *md, ModelDataVariable *sc) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int j, k;
-  __syncthreads();
   sc->cv_tn += sc->cv_h;
-  __syncthreads();
   if (md->cv_tstopset) {
     if ((sc->cv_tn - md->cv_tstop)*sc->cv_h > 0.)
       sc->cv_tn = md->cv_tstop;
   }
   md->cv_last_yn[i]=md->dzn[i];
-  __syncthreads();
-  print_double(md->dzn,86,"dzn1432");
   for (k = 1; k <= sc->cv_q; k++){
     for (j = sc->cv_q; j >= k; j--){
       md->dzn[i+md->nrows*(j-1)]+=md->dzn[i+md->nrows*j];
-      __syncthreads();
     }
   }
-  print_double(md->dzn,86,"dzn1439");
 }
 
 __device__
@@ -1453,7 +1256,6 @@ void cudaDevicecvDecreaseBDF(ModelDataGPU *md, ModelDataVariable *sc) {
     md->dzn[i+md->nrows*j]=-md->cv_l[j+blockIdx.x*L_MAX]*
       md->dzn[i+md->nrows*sc->cv_q]+md->dzn[i+md->nrows*j];
   }
-  print_double(md->dzn,86,"dzn1460");
 }
 
 __device__
@@ -1465,64 +1267,45 @@ int cudaDevicecvDoErrorTest(ModelDataGPU *md, ModelDataVariable *sc,
   double min_val;
   int retval;
   md->dftemp[i]=md->cv_l[blockIdx.x*L_MAX]*md->cv_acor[i]+md->dzn[i];
-  cudaDevicemin_2(&min_val, md->dftemp[i], flag_shr2, md->n_shr_empty);
+  cudaDevicemin(&min_val, md->dftemp[i], flag_shr2, md->n_shr_empty);
   if (min_val < 0. && min_val > -CAMP_TINY) {
     md->dftemp[i]=fabs(md->dftemp[i]);
     md->dzn[i]=md->dftemp[i]-md->cv_l[0+blockIdx.x*L_MAX]*md->cv_acor[i];
     min_val = 0.;
   }
-  print_double(md->dzn,86,"dzn1487");
-  //print_double(&md->cv_tq[2+blockIdx.x*(NUM_TESTS + 1)],1,"cv_tq_21504");
-  //print_double(&sc->cv_acnrm,1,"cv_acnrm1504");
   dsm = sc->cv_acnrm * md->cv_tq[2+blockIdx.x*(NUM_TESTS + 1)];
-  //print_double(&dsm,1,"dsm1504");
   *dsmPtr = dsm;
   if (dsm <= 1. && min_val >= 0.) return(CV_SUCCESS);
   (*nefPtr)++;
   *nflagPtr = PREV_ERR_FAIL;
   cudaDevicecvRestore(md, sc, saved_t);
-  __syncthreads();
   if ((fabs(sc->cv_h) <= sc->cv_hmin*ONEPSM) ||
       (*nefPtr == md->cv_maxnef)) return(CV_ERR_FAILURE);
   sc->cv_etamax = 1.;
-  __syncthreads();
   if (*nefPtr <= MXNEF1) {
     sc->cv_eta = 1. / (dSUNRpowerR(BIAS2*dsm,1./sc->cv_L) + ADDON);
-    __syncthreads();
     sc->cv_eta = SUNMAX(ETAMIN, SUNMAX(sc->cv_eta,
                            sc->cv_hmin / fabs(sc->cv_h)));
-    __syncthreads();
     if (*nefPtr >= SMALL_NEF)
       sc->cv_eta = SUNMIN(sc->cv_eta, ETAMXF);
-    __syncthreads();
-    //print_double(&sc->cv_eta,1,"cv_eta_1510");
     cudaDevicecvRescale(md, sc);
     return(TRY_AGAIN);
   }
-  __syncthreads();
   if (sc->cv_q > 1) {
     sc->cv_eta = SUNMAX(ETAMIN,sc->cv_hmin / fabs(sc->cv_h));
-    //print_double(&sc->cv_eta,1,"cv_eta_1517");
     cudaDevicecvDecreaseBDF(md, sc);
     sc->cv_L = sc->cv_q;
-    //print_int(&sc->cv_L,1,"cv_L1547");
     sc->cv_q--;
     sc->cv_qwait = sc->cv_L;
     cudaDevicecvRescale(md, sc);
-    __syncthreads();
     return(TRY_AGAIN);
   }
-  __syncthreads();
   sc->cv_eta = SUNMAX(ETAMIN, sc->cv_hmin / fabs(sc->cv_h));
-  //print_double(&sc->cv_eta,1,"cv_eta_1529");
-  __syncthreads();
   sc->cv_h *= sc->cv_eta;
   sc->cv_next_h = sc->cv_h;
   sc->cv_hscale = sc->cv_h;
-  __syncthreads();
   sc->cv_qwait = 10;
-  int aux_flag=0;
-  retval=cudaDevicef(sc->cv_tn, md->dzn, md->dtempv,md,sc, &aux_flag);
+  retval=cudaDevicef(sc->cv_tn, md->dzn, md->dtempv,md,sc);
   if (retval < 0)  return(CV_RHSFUNC_FAIL);
   if (retval > 0)  return(CV_UNREC_RHSFUNC_ERR);
   md->dzn[i+md->nrows]=sc->cv_h*md->dtempv[i];
@@ -1533,20 +1316,15 @@ __device__
 void cudaDevicecvCompleteStep(ModelDataGPU *md, ModelDataVariable *sc) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int z, j;
-  __syncthreads();
   sc->cv_nst++;
-  __syncthreads();
   sc->cv_hu = sc->cv_h;
   for (z=sc->cv_q; z >= 2; z--)  md->cv_tau[z+blockIdx.x*(L_MAX + 1)] = md->cv_tau[z-1+blockIdx.x*(L_MAX + 1)];
   if ((sc->cv_q==1) && (sc->cv_nst > 1))
     md->cv_tau[2+blockIdx.x*(L_MAX + 1)] = md->cv_tau[1+blockIdx.x*(L_MAX + 1)];
   md->cv_tau[1+blockIdx.x*(L_MAX + 1)] = sc->cv_h;
-  __syncthreads();
   for (j=0; j <= sc->cv_q; j++){
     md->dzn[i+md->nrows*j]+=md->cv_l[j+blockIdx.x*L_MAX]*md->cv_acor[i];
-    __syncthreads();
   }
-  print_double(md->dzn,86,"dzn1554");
   sc->cv_qwait--;
   if ((sc->cv_qwait == 1) && (sc->cv_q != md->cv_qmax)) {
     md->dzn[i+md->nrows*md->cv_qmax]=md->cv_acor[i];
@@ -1559,143 +1337,76 @@ __device__
 void cudaDevicecvChooseEta(ModelDataGPU *md, ModelDataVariable *sc) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   double etam;
-  //print_double(&sc->cv_etaqm1,1,"cv_etaqm1605");
-  //print_double(&sc->cv_etaq,1,"cv_etaq1605");
-  //print_double(&sc->cv_etaqp1,1,"cv_etaqp1605");
   etam = SUNMAX(sc->cv_etaqm1, SUNMAX(sc->cv_etaq, sc->cv_etaqp1));
-  //print_double(&etam,1,"etam1605");
-  __syncthreads();
   if (etam < THRESH) {
     sc->cv_eta = 1.;
-    //print_double(&sc->cv_eta,1,"cv_eta1609");
     sc->cv_qprime = sc->cv_q;
     return;
   }
-  __syncthreads();
   if (etam == sc->cv_etaq) {
     sc->cv_eta = sc->cv_etaq;
-    //print_double(&sc->cv_eta,1,"cv_eta1616");
     sc->cv_qprime = sc->cv_q;
   } else if (etam == sc->cv_etaqm1) {
     sc->cv_eta = sc->cv_etaqm1;
-    //print_double(&sc->cv_eta,1,"cv_eta1620");
     sc->cv_qprime = sc->cv_q - 1;
   } else {
     sc->cv_eta = sc->cv_etaqp1;
-    //print_double(&sc->cv_eta,1,"cv_eta1624");
     sc->cv_qprime = sc->cv_q + 1;
-    __syncthreads();
     md->dzn[i+md->nrows*md->cv_qmax]=md->cv_acor[i];
   }
-  __syncthreads();
-  print_double(md->dzn,86,"dzn1581");
 }
 
 __device__
 void cudaDevicecvSetEta(ModelDataGPU *md, ModelDataVariable *sc) {
-  __syncthreads();
   if (sc->cv_eta < THRESH) {
     sc->cv_eta = 1.;
     sc->cv_hprime = sc->cv_h;
   } else {
-    __syncthreads();
     sc->cv_eta = SUNMIN(sc->cv_eta, sc->cv_etamax);
-    __syncthreads();
     sc->cv_eta /= SUNMAX(ONE,
             fabs(sc->cv_h)*md->cv_hmax_inv*sc->cv_eta);
-    __syncthreads();
     sc->cv_hprime = sc->cv_h * sc->cv_eta;
-    __syncthreads();
   }
-  //print_double(&sc->cv_eta,1,"cv_eta_1618");
-  __syncthreads();
 }
 
 __device__
 int cudaDevicecvPrepareNextStep(ModelDataGPU *md, ModelDataVariable *sc, double dsm) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  __syncthreads();
   if (sc->cv_etamax == 1.) {
     sc->cv_qwait = SUNMAX(sc->cv_qwait, 2);
     sc->cv_qprime = sc->cv_q;
     sc->cv_hprime = sc->cv_h;
     sc->cv_eta = 1.;
-    //print_double(&sc->cv_eta,1,"cv_eta_1631");
     return 0;
   }
-  __syncthreads();
-  //print_double(&dsm,1,"dsm1639");
-  //print_int(&sc->cv_L,1,"cv_L1639");
-  //double BIAS2dsm=BIAS2*dsm;
-  //print_double(&BIAS2dsm,1,"BIAS2dsm");
-  //double cv_L1=sc->cv_L;
-  //print_double(&cv_L1,1,"1cv_L");
-  //double cv_etaq_power=dSUNRpowerR(BIAS2dsm,cv_L1);
-  //print_double(&cv_etaq_power,1,"cv_etaq_power");
-  //double cv_etaq_sqrt=sqrt(BIAS2dsm);
-  //print_double(&cv_etaq_sqrt,1,"cv_etaq_sqrt");
   sc->cv_etaq=1./(dSUNRpowerR(BIAS2*dsm,1./sc->cv_L) + ADDON);
-  //print_double(&sc->cv_etaq,1,"cv_etaq1639");
-  /*
-  if(sc->cv_L!=2){
-    //print_int(&sc->cv_L,1,"cv_L1674");
-    if(i==0)printf("WARNING: pow is innacurate from CPU"
-    " result for CUDA/10.1.105 "
-    " (which is used during development at CTE-POWER) "
-    " (debug by compare pow(x,0.5) and"
-    " sqrt(x.0.5), double x=3.28586921557249207e-12)\n");
-  }
-   */
-  __syncthreads();
   if (sc->cv_qwait != 0) {
     sc->cv_eta = sc->cv_etaq;
-    //print_double(&sc->cv_eta,1,"cv_eta1639");
     sc->cv_qprime = sc->cv_q;
     cudaDevicecvSetEta(md, sc);
     return 0;
   }
-  __syncthreads();
   sc->cv_qwait = 2;
   double ddn;
   sc->cv_etaqm1 = 0.;
-  __syncthreads();
   if (sc->cv_q > 1) {
     cudaDeviceVWRMS_Norm_2(&md->dzn[md->nrows*sc->cv_q],
                          md->dewt, &ddn, md->n_shr_empty);
-    __syncthreads();
     ddn *= md->cv_tq[1+blockIdx.x*(NUM_TESTS + 1)];
-    __syncthreads();
     sc->cv_etaqm1 = 1./(dSUNRpowerR(BIAS1*ddn, 1./sc->cv_q) + ADDON);
   }
   double dup, cquot;
   sc->cv_etaqp1 = 0.;
-  __syncthreads();
   if (sc->cv_q != md->cv_qmax && sc->cv_saved_tq5 != 0.) {
     cquot = (md->cv_tq[5+blockIdx.x*(NUM_TESTS + 1)] / sc->cv_saved_tq5) *
             dSUNRpowerI(sc->cv_h/md->cv_tau[2+blockIdx.x*(L_MAX + 1)],(double)sc->cv_L);
     md->dtempv[i]=md->cv_acor[i]-cquot*md->dzn[i+md->nrows*md->cv_qmax];
-    //print_double(md->dtempv,86,"dtempv1658");
     cudaDeviceVWRMS_Norm_2(md->dtempv, md->dewt, &dup, md->n_shr_empty);
-    __syncthreads();
     dup *= md->cv_tq[3+blockIdx.x*(NUM_TESTS + 1)];
-    __syncthreads();
-    //print_double(&dup,1,"dup1728");
-    //print_int(&sc->cv_L,1,"cv_L1728");
-    //double BIAS3dup=BIAS3*dup;
-    //print_double(&BIAS3dup,1,"BIAS3dup");
-    //double cv_L1=1./(sc->cv_L+1);
-    //print_double(&cv_L1,1,"1cv_L1732");
-    //double cv_etaq_power=dSUNRpowerR(BIAS3dup,1./cv_L1);
-    //double cv_etaq_power=(double)pow((double)BIAS3dup,(double)cv_L1);
-    //print_double(&cv_etaq_power,1,"cv_etaq_power1734");
     sc->cv_etaqp1 = 1. / (dSUNRpowerR(BIAS3*dup, 1./(sc->cv_L+1)) + ADDON);
-    //print_double(&sc->cv_etaqp1,1,"cv_etaqp1728");
   }
-  __syncthreads();
   cudaDevicecvChooseEta(md, sc);
-  __syncthreads();
   cudaDevicecvSetEta(md, sc);
-  __syncthreads();
   return CV_SUCCESS;
 }
 
@@ -1722,12 +1433,9 @@ void cudaDevicecvIncreaseBDF(ModelDataGPU *md, ModelDataVariable *sc) {
   }
   A1 = (-alpha0 - alpha1) / prod;
   md->dzn[i+md->nrows*sc->cv_L]=A1*md->dzn[i+md->nrows*sc->cv_indx_acor];
-  __syncthreads();
   for (j=2; j <= sc->cv_q; j++){
     md->dzn[i+md->nrows*j]+=md->cv_l[j+blockIdx.x*L_MAX]*md->dzn[i+md->nrows*(sc->cv_L)];
-    __syncthreads();
   }
-  print_double(md->dzn,86,"dzn1687");
 }
 
 __device__
@@ -1744,7 +1452,6 @@ void cudaDevicecvAdjustParams(ModelDataGPU *md, ModelDataVariable *sc) {
     }
     sc->cv_q = sc->cv_qprime;
     sc->cv_L = sc->cv_q+1;
-    //print_int(&sc->cv_L,1,"cv_L1770");
     sc->cv_qwait = sc->cv_L;
   }
   cudaDevicecvRescale(md, sc);
@@ -1758,30 +1465,21 @@ int cudaDevicecvStep(ModelDataGPU *md, ModelDataVariable *sc) {
   int nflag=FIRST_CALL;
   double saved_t=sc->cv_tn;
   double dsm;
-  __syncthreads();
   if ((sc->cv_nst > 0) && (sc->cv_hprime != sc->cv_h)){
     cudaDevicecvAdjustParams(md, sc);
   }
-  __syncthreads();
   for (;;) {
-    __syncthreads();
     cudaDevicecvPredict(md, sc);
-    __syncthreads();
     cudaDevicecvSet(md, sc);
-    __syncthreads();
     nflag = cudaDevicecvNlsNewton(nflag,md, sc);
-    __syncthreads();
     int kflag = cudaDevicecvHandleNFlag(md, sc, &nflag, saved_t, &ncf);
-    __syncthreads();
     if (kflag == PREDICT_AGAIN) {
       continue;
     }
     if (kflag != DO_ERROR_TEST) {
       return (kflag);
     }
-    __syncthreads();
     int eflag=cudaDevicecvDoErrorTest(md,sc,&nflag,saved_t,&nef,&dsm);
-    __syncthreads();
     if (eflag == TRY_AGAIN){
       continue;
     }
@@ -1790,14 +1488,10 @@ int cudaDevicecvStep(ModelDataGPU *md, ModelDataVariable *sc) {
     }
     break;
   }
-  __syncthreads();
   cudaDevicecvCompleteStep(md, sc);
-  __syncthreads();
   cudaDevicecvPrepareNextStep(md, sc, dsm);
-  __syncthreads();
   sc->cv_etamax=10.;
   md->cv_acor[i]*=md->cv_tq[2+blockIdx.x*(NUM_TESTS + 1)];
-  __syncthreads();
   return(CV_SUCCESS);
   }
 
@@ -1808,7 +1502,6 @@ int cudaDeviceCVodeGetDky(ModelDataGPU *md, ModelDataVariable *sc,
   double s, c, r;
   double tfuzz, tp, tn1;
   int z, j;
-  __syncthreads();
    tfuzz = FUZZ_FACTOR * md->cv_uround * (fabs(sc->cv_tn) + fabs(sc->cv_hu));
    if (sc->cv_hu < 0.) tfuzz = -tfuzz;
    tp = sc->cv_tn - sc->cv_hu - tfuzz;
@@ -1816,7 +1509,6 @@ int cudaDeviceCVodeGetDky(ModelDataGPU *md, ModelDataVariable *sc,
    if ((t-tp)*(t-tn1) > 0.) {
      return(CV_BAD_T);
    }
-  __syncthreads();
    s = (t - sc->cv_tn) / sc->cv_h;
    for (j=sc->cv_q; j >= k; j--) {
      c = 1.;
@@ -1827,11 +1519,8 @@ int cudaDeviceCVodeGetDky(ModelDataGPU *md, ModelDataVariable *sc,
         dky[i]=c*md->dzn[i+md->nrows*j]+s*dky[i];
      }
    }
-  __syncthreads();
    if (k == 0) return(CV_SUCCESS);
-  __syncthreads();
    r = dSUNRpowerI(double(sc->cv_h),double(-k));
-  __syncthreads();
    dky[i]=dky[i]*r;
    return(CV_SUCCESS);
 }
@@ -1843,8 +1532,7 @@ int cudaDevicecvEwtSetSV(ModelDataGPU *md, ModelDataVariable *sc,double *weight)
   md->dtempv[i]=fabs(md->dzn[i]);
   double min;
   md->dtempv[i]=md->cv_reltol*md->dtempv[i]+md->cv_Vabstol[i];
-  cudaDevicemin_2(&min, md->dtempv[i], flag_shr2, md->n_shr_empty);
-__syncthreads();
+  cudaDevicemin(&min, md->dtempv[i], flag_shr2, md->n_shr_empty);
   if (min <= 0.) return(-1);
   weight[i]= 1./md->dtempv[i];
   return(0);
@@ -1860,18 +1548,16 @@ int cudaDeviceCVode(ModelDataGPU *md, ModelDataVariable *sc) {
   sc->cv_nst=0;
   sc->cv_nstlp=0;
   for(;;) {
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     if(threadIdx.x==0) sc->countercvStep++;
 #endif
+    __syncthreads();
     flag_shr[0] = 0;
     __syncthreads();
     sc->cv_next_h = sc->cv_h;
     int ewtsetOK = 0;
     if (sc->cv_nst > 0) {
-      //print_double(md->dtempv,86,"dtempvcv_efun0");
       ewtsetOK = cudaDevicecvEwtSetSV(md, sc, md->dewt);
-      //print_double(md->dtempv,86,"dtempvcv_efun1");
       if (ewtsetOK != 0) {
         sc->cv_tretlast = sc->tret = sc->cv_tn;
         md->yout[i] = md->dzn[i];
@@ -1895,8 +1581,6 @@ int cudaDeviceCVode(ModelDataGPU *md, ModelDataVariable *sc) {
       md->yout[i] = md->dzn[i];
       sc->cv_tolsf *= 2.;
       if(i==0) printf("ERROR: cv_tolsf\n");
-      __syncthreads();
-      if(i==0) printf("ERROR: cv_tolsf\n");
       return CV_TOO_MUCH_ACC;
     } else {
       sc->cv_tolsf = 1.;
@@ -1909,9 +1593,7 @@ int cudaDeviceCVode(ModelDataGPU *md, ModelDataVariable *sc) {
         if(i==0)printf("WARNING: h below roundoff level in tn");
     }
 #endif
-    print_double(md->dzn,86,"dzn1858");
     kflag2 = cudaDevicecvStep(md, sc);
-    __syncthreads();
     if (kflag2 != CV_SUCCESS) {
       sc->cv_tretlast = sc->tret = sc->cv_tn;
       md->yout[i] = md->dzn[i];
@@ -1931,7 +1613,6 @@ int cudaDeviceCVode(ModelDataGPU *md, ModelDataVariable *sc) {
         sc->cv_tretlast = sc->tret = md->cv_tstop;
         md->cv_tstopset = SUNFALSE;
         if(i==0) printf("ERROR: cv_tstopset\n");
-        __syncthreads();
         return CV_TSTOP_RETURN;
       }
       if ((sc->cv_tn + sc->cv_hprime - md->cv_tstop) * sc->cv_h > 0.) {
@@ -1948,28 +1629,20 @@ void cudaGlobalCVode(ModelDataGPU md_object) {
   ModelDataGPU *md = &md_object;
   extern __shared__ int flag_shr[];
   int i = blockIdx.x * blockDim.x + threadIdx.x;
-  //TODO CHECK IF USING SC AS LOCAL INSTEAD OF MD->SCELLS HAS BETTER MAPE AND FINE IN MONARCH
-  //IF WANT TO USE SC 1 PER BLOCK, THEN CHECK ALL SC->SOMETHING = SOMETHING AND BLOCKIDX.X CALLS AND ADD IF(THREADIDX.X==0)...SYNCTHREADS() TO AVOID OVERLAPPING
-  //ModelDataVariable *sc = &md->sCells[blockIdx.x];
   ModelDataVariable sc_object = md->sCells[blockIdx.x];
   ModelDataVariable *sc = &sc_object;
-  __syncthreads();
   int istate;
   if(i<md->nrows){
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     int clock_khz=md->clock_khz;
     clock_t start;
     start = clock();
-    __syncthreads();
 #endif
     istate=cudaDeviceCVode(md,sc);
-    __syncthreads();
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   if(threadIdx.x==0) sc->dtcudaDeviceCVode += ((double)(int)(clock() - start))/(clock_khz*1000);
-  __syncthreads();
 #endif
   }
-  __syncthreads();
   if(threadIdx.x==0) md->flagCells[blockIdx.x]=istate;
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
   ModelDataVariable *mdvo = md->mdvo;
diff --git a/src/cuda/cvode_gpu.cu b/src/cuda/cvode_gpu.cu
index 53b683352..071b47c08 100644
--- a/src/cuda/cvode_gpu.cu
+++ b/src/cuda/cvode_gpu.cu
@@ -12,13 +12,6 @@ extern "C" {
 #include <mpi.h>
 #endif
 
-void print_double_cv_gpu(double *x, int len, const char *s){
-#ifdef USE_PRINT_ARRAYS
-  for (int i=0; i<len; i++){
-    printf("%s[%d]=%.17le\n",s,i,x[i]);
-  }
-#endif
-}
 
 int cvHandleFailure_gpu(CVodeMem cv_mem, int flag){
   switch (flag) {
@@ -617,10 +610,6 @@ int cudaCVode(void *cvode_mem, realtype tout, N_Vector yout,
                     cudaMemcpyHostToDevice, stream);
     cudaMemcpyAsync(&mGPU->sCells[i], &mCPU->mdvCPU, sizeof(ModelDataVariable), cudaMemcpyHostToDevice, stream);
   }
-  //double *zn0 = NV_DATA_S(cv_mem->cv_zn[0]);
-  //print_double_cv_gpu(zn0,86,"dzn807");
-  //double *zn1 = NV_DATA_S(cv_mem->cv_zn[1]);
-  //print_double_cv_gpu(zn1,86,"dzn825");
   cvodeRun(mGPU,stream);
   cudaMemcpyAsync(cv_acor_init, mGPU->cv_acor_init, mGPU->nrows * sizeof(double), cudaMemcpyDeviceToHost, stream);
   cudaMemcpyAsync(youtArray, mGPU->yout, mGPU->nrows * sizeof(double), cudaMemcpyDeviceToHost, stream);
@@ -639,7 +628,6 @@ int cudaCVode(void *cvode_mem, realtype tout, N_Vector yout,
     cv_mem->timecvStep+= mscvStep/1000;
 #ifdef CAMP_PROFILE_DEVICE_FUNCTIONS
     cudaMemcpy(&mCPU->mdvCPU, mGPU->mdvo, sizeof(ModelDataVariable), cudaMemcpyDeviceToHost);
-    //printf("mCPU->mdvCPU.dtcudaDeviceCVode %lf\n",mCPU->mdvCPU.dtcudaDeviceCVode);
 #endif
 #endif
   istate = CV_SUCCESS;
@@ -649,8 +637,9 @@ int cudaCVode(void *cvode_mem, realtype tout, N_Vector yout,
       int rank;
       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
       printf("cudaCVode2 kflag %d cell %d rank %d\n",istate,i,rank);
-      istate = cvHandleFailure_gpu(cv_mem, istate);
-      //Optional: call EXPORT_NETCDF after this fail
+      istate = cvHandleFailure_gpu(cv_mem, istate); // run the handler before MPI_Abort so it is not skipped
+      printf("Exiting...\n");
+      MPI_Abort(MPI_COMM_WORLD, 1);
     }
   }
   return(istate);
diff --git a/src/cuda/cvode_init.cu b/src/cuda/cvode_init.cu
index 8b5394e90..16c996f88 100644
--- a/src/cuda/cvode_init.cu
+++ b/src/cuda/cvode_init.cu
@@ -7,6 +7,7 @@
 extern "C" {
 #include "cvode_gpu.h"
 }
+#include <unistd.h>
 #ifdef CAMP_USE_MPI
 #include <mpi.h>
 #endif
@@ -34,39 +35,20 @@ void constructor_cvode_gpu(SolverData *sd){
   mCPU->env_size = CAMP_NUM_ENV_PARAM_ * n_cells * sizeof(double); //Temp and pressure
   size_t rxn_env_data_idx_size = (n_rxn+1) * sizeof(int);
   size_t map_state_deriv_size = n_dep_var * n_cells * sizeof(int);
-  int coresPerNode = 40;
-  int size;
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-  if (size > 40 && size % coresPerNode != 0) {
-    printf("ERROR: MORE THAN 40 MPI PROCESSES AND NOT MULTIPLE OF 40, WHEN CTE-POWER ONLY HAS 40 CORES PER NODE\n");
-    exit(0);
-  }
-  int nGPUsMax=4;
+  int nGPUsMax;
   cudaGetDeviceCount(&nGPUsMax);
-  if (sd->nGPUs > nGPUsMax) {
-    printf("ERROR: Not enough GPUs to launch, nGPUs %d nGPUsMax %d\n", sd->nGPUs, nGPUsMax);
-    exit(0);
-  }
-  if (size > sd->nGPUs*(coresPerNode/nGPUsMax)){
-    printf("ERROR: size,sd->nGPUs,coresPerNode,nGPUsMax %d %d %d %d "
-           "MORE MPI PROCESSES THAN DEVICES (FOLLOW PROPORTION, "
-           "FOR CTE-POWER IS 10 PROCESSES FOR EACH GPU)\n",size,sd->nGPUs,coresPerNode,nGPUsMax);
-    exit(0);
-  }
-  int rank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-  cudaSetDevice(0);
-  for (int i = 0; i < coresPerNode; i++) {
-    if (rank < coresPerNode / nGPUsMax * (i + 1) && rank >= coresPerNode / nGPUsMax * i && i<sd->nGPUs) {
-      cudaSetDevice(i);
-      mCPU->threads = 1024;
-      mCPU->blocks = (n_dep_var*n_cells + mCPU->threads - 1) / mCPU->threads;
-    }
-  }
+  int rankNode, sizeNode;
+  MPI_Comm commNode;
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &commNode);
+  MPI_Comm_rank(commNode, &rankNode);
+  MPI_Comm_size(commNode, &sizeNode);
+  // Round up: MPIsPerGPU stays >= 1 with fewer ranks than GPUs, and the device index stays < nGPUsMax.
+  int MPIsPerGPU = (sizeNode + nGPUsMax - 1) / nGPUsMax;
+  cudaSetDevice(rankNode / MPIsPerGPU);
   mGPU->n_rxn=md->n_rxn;
   mGPU->n_rxn_env_data=md->n_rxn_env_data;
-  cudaMalloc((void **) &mGPU->state, state_size);
-  cudaMalloc((void **) &mGPU->env, mCPU->env_size);
+  HANDLE_ERROR(cudaMalloc((void **) &mGPU->state, state_size));
+  HANDLE_ERROR(cudaMalloc((void **) &mGPU->env, mCPU->env_size));
   cudaMalloc((void **) &mGPU->rxn_env_data, md->n_rxn_env_data * n_cells * sizeof(double));
   cudaMalloc((void **) &mGPU->rxn_env_data_idx, rxn_env_data_idx_size);
   cudaMalloc((void **) &mGPU->map_state_deriv, map_state_deriv_size);
@@ -116,9 +98,9 @@ void constructor_cvode_gpu(SolverData *sd){
   Jacobian *jac = &sd->jac;
   JacobianGPU *jacgpu = &(mGPU->jac);
   cudaMalloc((void **) &jacgpu->num_elem, 1 * sizeof(jacgpu->num_elem));
-  cudaMemcpy(jacgpu->num_elem, &jac->num_elem, 1 * sizeof(jacgpu->num_elem), cudaMemcpyHostToDevice);
+  HANDLE_ERROR(cudaMemcpy(jacgpu->num_elem, &jac->num_elem, 1 * sizeof(jacgpu->num_elem), cudaMemcpyHostToDevice));
   int num_elem = jac->num_elem * n_cells;
-  cudaMalloc((void **) &(jacgpu->production_partials), num_elem * sizeof(double));
+  HANDLE_ERROR(cudaMalloc((void **) &(jacgpu->production_partials), num_elem * sizeof(double)));
   HANDLE_ERROR(cudaMalloc((void **) &(jacgpu->loss_partials), num_elem * sizeof(double)));
   double *aux=(double*)malloc(sizeof(double)*num_elem);
   for (int i = 0; i < num_elem; i++) {
diff --git a/src/debug_and_stats/camp_debug_2.h b/src/debug_and_stats/camp_debug_2.h
deleted file mode 100644
index acb947372..000000000
--- a/src/debug_and_stats/camp_debug_2.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * -----------------------------------------------------------------
- * Programmer(s): Christian G. Ruiz and Mario Acosta
- * -----------------------------------------------------------------
- * Copyright (C) 2022 Barcelona Supercomputing Center
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef CAMP_DEBUG_2_H
-#define CAMP_DEBUG_2_H
-
-#include "../camp_common.h"
-
-void init_export_state();
-void export_state(SolverData *sd);
-void join_export_state();
-void init_export_stats();
-void export_stats(SolverData *sd);
-void print_double(double *x, int len, const char *s);
-void print_int(int *x, int len, const char *s);
-
-#endif  // CAMP_DEBUG_2_H
diff --git a/src/rxn_solver.h b/src/rxn_solver.h
index 311818d46..0ab48281e 100644
--- a/src/rxn_solver.h
+++ b/src/rxn_solver.h
@@ -12,7 +12,6 @@
 #define RXN_SOLVER_H
 #include "Jacobian.h"
 #include "camp_common.h"
-#include "debug_and_stats/camp_debug_2.h"
 
 /** Public reaction functions **/
 
diff --git a/src/rxns.h b/src/rxns.h
index c1af29d3f..69ada0cbb 100644
--- a/src/rxns.h
+++ b/src/rxns.h
@@ -15,7 +15,6 @@
 #define RXNS_H_
 #include "Jacobian.h"
 #include "camp_common.h"
-#include "debug_and_stats/camp_debug_2.h"
 
 // aqueous_equilibrium
 void rxn_aqueous_equilibrium_get_used_jac_elem(int *rxn_int_data,
diff --git a/src/rxns/rxn_CMAQ_H2O2.c b/src/rxns/rxn_CMAQ_H2O2.c
index e939ee9cb..0c8a23275 100644
--- a/src/rxns/rxn_CMAQ_H2O2.c
+++ b/src/rxns/rxn_CMAQ_H2O2.c
@@ -146,7 +146,7 @@ void rxn_CMAQ_H2O2_calc_deriv_contrib(ModelData *model_data,
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_CMAQ_OH_HNO3.c b/src/rxns/rxn_CMAQ_OH_HNO3.c
index 7d3e0f051..1292c517c 100644
--- a/src/rxns/rxn_CMAQ_OH_HNO3.c
+++ b/src/rxns/rxn_CMAQ_OH_HNO3.c
@@ -153,7 +153,7 @@ void rxn_CMAQ_OH_HNO3_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_HL_phase_transfer.c b/src/rxns/rxn_HL_phase_transfer.c
index 29efbb4df..ddc60dd02 100644
--- a/src/rxns/rxn_HL_phase_transfer.c
+++ b/src/rxns/rxn_HL_phase_transfer.c
@@ -303,18 +303,18 @@ void rxn_HL_phase_transfer_calc_deriv_contrib(
 
     // this was replaced with transition-regime rate equation
 #if 0
-    long double cond_rate =
-        ((long double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
+    double cond_rate =
+        ((double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
                               4.0 * radius / (3.0 * MFP_M_));
 #endif
 
     // Calculate the rate constant for diffusion limited mass transfer to the
     // aerosol phase (1/s)
-    long double cond_rate =
+    double cond_rate =
         gas_aerosol_transition_rxn_rate_constant(DIFF_COEFF_, MFP_M_, radius, ALPHA_);
 
     // Calculate the evaporation rate constant (1/s)
-    long double evap_rate = cond_rate / (EQUIL_CONST_);
+    double evap_rate = cond_rate / (EQUIL_CONST_);
 
     // Calculate the evaporation and condensation rates (ppm/s)
     cond_rate *= state[GAS_SPEC_];
@@ -397,17 +397,17 @@ void rxn_HL_phase_transfer_calc_jac_contrib(ModelData *model_data, Jacobian jac,
 
     // this was replaced with transition-regime rate equation
 #if 0
-    long double cond_rate = 1.0 / (radius * radius / (3.0 * DIFF_COEFF_) +
+    double cond_rate = 1.0 / (radius * radius / (3.0 * DIFF_COEFF_) +
                                    4.0 * radius / (3.0 * MFP_M_));
 #endif
 
     // Calculate the rate constant for diffusion limited mass transfer to the
     // aerosol phase (1/s)
-    long double cond_rate =
+    double cond_rate =
         gas_aerosol_transition_rxn_rate_constant(DIFF_COEFF_, MFP_M_, radius, ALPHA_);
 
     // Calculate the evaporation rate constant (1/s)
-    long double evap_rate = cond_rate / (EQUIL_CONST_);
+    double evap_rate = cond_rate / (EQUIL_CONST_);
 
     // Change in the gas-phase is evaporation - condensation (ppm/s)
     if (JAC_ID_(0) >= 0)
@@ -447,14 +447,14 @@ void rxn_HL_phase_transfer_calc_jac_contrib(ModelData *model_data, Jacobian jac,
     // Calculate d_rate/d_effecive_radius and d_rate/d_number_concentration
     // ( This was replaced with transition-regime rate equation. )
 #if 0
-    long double d_rate_d_radius =
+    double d_rate_d_radius =
         -rate * cond_rate *
         (2.0 * radius / (3.0 * DIFF_COEFF_) + 4.0 / (3.0 * MFP_M_));
 #endif
-    long double d_cond_d_radius =
+    double d_cond_d_radius =
         d_gas_aerosol_transition_rxn_rate_constant_d_radius(
             DIFF_COEFF_, MFP_M_, radius, ALPHA_) * state[GAS_SPEC_];
-    long double d_evap_d_radius = d_cond_d_radius / state[GAS_SPEC_] /
+    double d_evap_d_radius = d_cond_d_radius / state[GAS_SPEC_] /
                                   (EQUIL_CONST_)*state[AERO_SPEC_(i_phase)] /
                                   state[AERO_WATER_(i_phase)];
 
diff --git a/src/rxns/rxn_SIMPOL_phase_transfer.c b/src/rxns/rxn_SIMPOL_phase_transfer.c
index 627fff458..2b3f14058 100644
--- a/src/rxns/rxn_SIMPOL_phase_transfer.c
+++ b/src/rxns/rxn_SIMPOL_phase_transfer.c
@@ -332,22 +332,22 @@ void rxn_SIMPOL_phase_transfer_calc_deriv_contrib(
     // This was replaced with the transition-regime condensation rate
     // equations
 #if 0
-    long double cond_rate =
-        ((long double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
+    double cond_rate =
+        ((double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
                               4.0 * radius / (3.0 * MFP_M_));
 #endif
 
     // Calculate the rate constant for diffusion limited mass transfer to the
     // aerosol phase (m3/#/s)
-    long double cond_rate =
+    double cond_rate =
         gas_aerosol_transition_rxn_rate_constant(DIFF_COEFF_, MFP_M_, radius, ALPHA_);
 
     // Calculate the evaporation rate constant (ppm_x*m^3/kg_x/s)
-    long double evap_rate =
+    double evap_rate =
         cond_rate * (EQUIL_CONST_ * aero_phase_avg_MW / aero_phase_mass);
 
     // Get the activity coefficient (if one exists)
-    long double act_coeff = 1.0;
+    double act_coeff = 1.0;
     if (AERO_ACT_ID_(i_phase) > -1) {
       act_coeff = state[AERO_ACT_ID_(i_phase)];
     }
@@ -470,22 +470,22 @@ void rxn_SIMPOL_phase_transfer_calc_jac_contrib(ModelData *model_data,
     // This was replaced with the transition-regime condensation rate
     // equations
 #if 0
-    long double cond_rate =
-        ((long double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
+    double cond_rate =
+        ((double)1.0) / (radius * radius / (3.0 * DIFF_COEFF_) +
                               4.0 * radius / (3.0 * MFP_M_));
 #endif
 
     // Calculate the rate constant for diffusion limited mass transfer to the
     // aerosol phase (m3/#/s)
-    long double cond_rate =
+    double cond_rate =
         gas_aerosol_transition_rxn_rate_constant(DIFF_COEFF_, MFP_M_, radius, ALPHA_);
 
     // Calculate the evaporation rate constant (ppm_x*m^3/kg_x/s)
-    long double evap_rate =
+    double evap_rate =
         cond_rate * (EQUIL_CONST_ * aero_phase_avg_MW / aero_phase_mass);
 
     // Get the activity coefficient (if one exists)
-    long double act_coeff = 1.0;
+    double act_coeff = 1.0;
     if (AERO_ACT_ID_(i_phase) > -1) {
       act_coeff = state[AERO_ACT_ID_(i_phase)];
     }
diff --git a/src/rxns/rxn_aqueous_equilibrium.c b/src/rxns/rxn_aqueous_equilibrium.c
index 044fd5f82..7e8697c86 100644
--- a/src/rxns/rxn_aqueous_equilibrium.c
+++ b/src/rxns/rxn_aqueous_equilibrium.c
@@ -275,25 +275,25 @@ void rxn_aqueous_equilibrium_update_env_state(ModelData *model_data,
  * \param rate_reverse [output] calculated reverse rate
  * \return reaction rate per mixing ratio of water [M_X/s*kg_H2O/m^3]
  */
-long double calc_standard_rate(int *rxn_int_data, double *rxn_float_data,
+double calc_standard_rate(int *rxn_int_data, double *rxn_float_data,
                                double *rxn_env_data, bool is_water_partial,
-                               long double *rate_forward,
-                               long double *rate_reverse) {
+                               double *rate_forward,
+                               double *rate_reverse) {
   int *int_data = rxn_int_data;
   double *float_data = rxn_float_data;
 
-  long double react_fact, prod_fact;
-  long double water = WATER_CONC_;
+  double react_fact, prod_fact;
+  double water = WATER_CONC_;
 
   // Get the product of all reactants
-  react_fact = (long double)REACT_CONC_(0) * MASS_FRAC_TO_M_(0);
+  react_fact = (double)REACT_CONC_(0) * MASS_FRAC_TO_M_(0);
   for (int i_react = 1; i_react < NUM_REACT_; i_react++) {
     react_fact *= REACT_CONC_(i_react) * MASS_FRAC_TO_M_(i_react) / water;
   }
 
   // Get the product of all product
-  prod_fact = (long double)PROD_CONC_(0) * MASS_FRAC_TO_M_(NUM_REACT_);
-  prod_fact *= (long double)ACTIVITY_COEFF_VALUE_;
+  prod_fact = (double)PROD_CONC_(0) * MASS_FRAC_TO_M_(NUM_REACT_);
+  prod_fact *= (double)ACTIVITY_COEFF_VALUE_;
   for (int i_prod = 1; i_prod < NUM_PROD_; i_prod++) {
     prod_fact *=
         PROD_CONC_(i_prod) * MASS_FRAC_TO_M_(NUM_REACT_ + i_prod) / water;
@@ -331,7 +331,7 @@ void rxn_aqueous_equilibrium_calc_deriv_contrib(
   // Calculate derivative contributions for each aerosol phase
   for (int i_phase = 0, i_deriv = 0; i_phase < NUM_AERO_PHASE_; i_phase++) {
     // If no aerosol water is present, no reaction occurs
-    long double water = state[WATER_(i_phase)];
+    double water = state[WATER_(i_phase)];
     if (water < MIN_WATER_ * SMALL_WATER_CONC_(i_phase)) {
       i_deriv += NUM_REACT_ + NUM_PROD_;
       continue;
@@ -350,8 +350,8 @@ void rxn_aqueous_equilibrium_calc_deriv_contrib(
     }
 
     // Get the rate using the standard calculation
-    long double rate_forward, rate_reverse;
-    long double rate =
+    double rate_forward, rate_reverse;
+    double rate =
         calc_standard_rate(rxn_int_data, rxn_float_data, rxn_env_data, false,
                            &rate_forward, &rate_reverse);
     if (rate == ZERO) {
@@ -413,21 +413,21 @@ void rxn_aqueous_equilibrium_calc_jac_contrib(ModelData *model_data,
   // Calculate Jacobian contributions for each aerosol phase
   for (int i_phase = 0, i_jac = 0; i_phase < NUM_AERO_PHASE_; i_phase++) {
     // If not aerosol water is present, no reaction occurs
-    long double water = state[WATER_(i_phase)];
+    double water = state[WATER_(i_phase)];
     if (water < MIN_WATER_ * SMALL_WATER_CONC_(i_phase)) {
       i_jac += (NUM_REACT_ + NUM_PROD_) * (NUM_REACT_ + NUM_PROD_ + 2);
       continue;
     }
 
     // Calculate the forward rate (M/s)
-    long double forward_rate = RATE_CONST_FORWARD_;
+    double forward_rate = RATE_CONST_FORWARD_;
     for (int i_react = 0; i_react < NUM_REACT_; i_react++) {
       forward_rate *= state[REACT_(i_phase * NUM_REACT_ + i_react)] *
                       MASS_FRAC_TO_M_(i_react) / water;
     }
 
     // Calculate the reverse rate (M/s)
-    long double reverse_rate = RATE_CONST_REVERSE_;
+    double reverse_rate = RATE_CONST_REVERSE_;
     for (int i_prod = 0; i_prod < NUM_PROD_; i_prod++) {
       reverse_rate *= state[PROD_(i_phase * NUM_PROD_ + i_prod)] *
                       MASS_FRAC_TO_M_(NUM_REACT_ + i_prod) / water;
diff --git a/src/rxns/rxn_arrhenius.c b/src/rxns/rxn_arrhenius.c
index 79bbe3905..03dda4f64 100644
--- a/src/rxns/rxn_arrhenius.c
+++ b/src/rxns/rxn_arrhenius.c
@@ -138,7 +138,7 @@ void rxn_arrhenius_calc_deriv_contrib(ModelData *model_data,
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
@@ -186,7 +186,7 @@ void rxn_arrhenius_calc_jac_contrib(ModelData *model_data, Jacobian jac,
   int i_elem = 0;
   for (int i_ind = 0; i_ind < NUM_REACT_; i_ind++) {
     // Calculate d_rate / d_i_ind
-    long double rate = RATE_CONSTANT_;
+    double rate = RATE_CONSTANT_;
     for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
       if (i_spec != i_ind) rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_condensed_phase_arrhenius.c b/src/rxns/rxn_condensed_phase_arrhenius.c
index d64dfb757..620a5343e 100644
--- a/src/rxns/rxn_condensed_phase_arrhenius.c
+++ b/src/rxns/rxn_condensed_phase_arrhenius.c
@@ -192,7 +192,7 @@ void rxn_condensed_phase_arrhenius_calc_deriv_contrib(
   // Calculate derivative contributions for each aerosol phase
   for (int i_phase = 0, i_deriv = 0; i_phase < NUM_AERO_PHASE_; i_phase++) {
     // If this is an aqueous reaction, get the unit conversion from mol/m3 -> M
-    long double unit_conv = 1.0;
+    double unit_conv = 1.0;
     if (WATER_(i_phase) >= 0) {
       unit_conv = state[WATER_(i_phase)];  // convert from kg/m3->L/m3
 
@@ -206,7 +206,7 @@ void rxn_condensed_phase_arrhenius_calc_deriv_contrib(
     }
 
     // Calculate the reaction rate rate (M/s or mol/m3/s)
-    long double rate = RATE_CONSTANT_;
+    double rate = RATE_CONSTANT_;
     for (int i_react = 0; i_react < NUM_REACT_; i_react++) {
       rate *= state[REACT_(i_phase * NUM_REACT_ + i_react)] *
               KGM3_TO_MOLM3_(i_react) * unit_conv;
diff --git a/src/rxns/rxn_condensed_phase_photolysis.c b/src/rxns/rxn_condensed_phase_photolysis.c
index 425b69303..1073c6de5 100644
--- a/src/rxns/rxn_condensed_phase_photolysis.c
+++ b/src/rxns/rxn_condensed_phase_photolysis.c
@@ -211,7 +211,7 @@ void rxn_condensed_phase_photolysis_calc_deriv_contrib(
   // Calculate derivative contributions for each aerosol phase
   for (int i_phase = 0, i_deriv = 0; i_phase < NUM_AERO_PHASE_; i_phase++) {
     // If this is an aqueous reaction, get the unit conversion from mol/m3 -> M
-    long double unit_conv = 1.0;
+    double unit_conv = 1.0;
     if (WATER_(i_phase) >= 0) {
       unit_conv = state[WATER_(i_phase)];  // convert from kg/m3->L/m3
 
@@ -225,7 +225,7 @@ void rxn_condensed_phase_photolysis_calc_deriv_contrib(
     }
 
     // Calculate the reaction rate rate (M/s or mol/m3/s)
-    long double rate = RATE_CONSTANT_;
+    double rate = RATE_CONSTANT_;
     for (int i_react = 0; i_react < NUM_REACT_; i_react++) {
       rate *= state[REACT_(i_phase * NUM_REACT_ + i_react)] *
               KGM3_TO_MOLM3_(i_react) * unit_conv;
diff --git a/src/rxns/rxn_emission.c b/src/rxns/rxn_emission.c
index 5b710c7fc..5d03c9ff8 100644
--- a/src/rxns/rxn_emission.c
+++ b/src/rxns/rxn_emission.c
@@ -141,7 +141,7 @@ void rxn_emission_calc_deriv_contrib(ModelData *model_data,
 
   // Add contributions to the time derivative
   if (DERIV_ID_ >= 0)
-    time_derivative_add_value(time_deriv, DERIV_ID_, (long double)RATE_);
+    time_derivative_add_value(time_deriv, DERIV_ID_, (double)RATE_);
 
   return;
 }
diff --git a/src/rxns/rxn_first_order_loss.c b/src/rxns/rxn_first_order_loss.c
index b73839865..af7d9404c 100644
--- a/src/rxns/rxn_first_order_loss.c
+++ b/src/rxns/rxn_first_order_loss.c
@@ -149,7 +149,7 @@ void rxn_first_order_loss_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_ * state[REACT_];
+  double rate = RATE_CONSTANT_ * state[REACT_];
 
   // Add contributions to the time derivative
   if (DERIV_ID_ >= 0) time_derivative_add_value(time_deriv, DERIV_ID_, -rate);
diff --git a/src/rxns/rxn_photolysis.c b/src/rxns/rxn_photolysis.c
index 6f468d307..00ebd5728 100644
--- a/src/rxns/rxn_photolysis.c
+++ b/src/rxns/rxn_photolysis.c
@@ -167,7 +167,7 @@ void rxn_photolysis_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_ternary_chemical_activation.c b/src/rxns/rxn_ternary_chemical_activation.c
index b00ff5d24..ff091f315 100644
--- a/src/rxns/rxn_ternary_chemical_activation.c
+++ b/src/rxns/rxn_ternary_chemical_activation.c
@@ -154,7 +154,7 @@ void rxn_ternary_chemical_activation_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_troe.c b/src/rxns/rxn_troe.c
index 09e1cf397..35edbb2a0 100644
--- a/src/rxns/rxn_troe.c
+++ b/src/rxns/rxn_troe.c
@@ -150,7 +150,7 @@ void rxn_troe_calc_deriv_contrib(ModelData *model_data,
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_wennberg_no_ro2.c b/src/rxns/rxn_wennberg_no_ro2.c
index d996c7acf..56dcb4898 100644
--- a/src/rxns/rxn_wennberg_no_ro2.c
+++ b/src/rxns/rxn_wennberg_no_ro2.c
@@ -167,9 +167,9 @@ void rxn_wennberg_no_ro2_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = 1.0;
-  long double k_a = ALKOXY_RATE_CONSTANT_;
-  long double k_n = NITRATE_RATE_CONSTANT_;
+  double rate = 1.0;
+  double k_a = ALKOXY_RATE_CONSTANT_;
+  double k_n = NITRATE_RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_wennberg_tunneling.c b/src/rxns/rxn_wennberg_tunneling.c
index 2816ec85e..6cd37a29a 100644
--- a/src/rxns/rxn_wennberg_tunneling.c
+++ b/src/rxns/rxn_wennberg_tunneling.c
@@ -137,7 +137,7 @@ void rxn_wennberg_tunneling_calc_deriv_contrib(
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the reaction rate
-  long double rate = RATE_CONSTANT_;
+  double rate = RATE_CONSTANT_;
   for (int i_spec = 0; i_spec < NUM_REACT_; i_spec++)
     rate *= state[REACT_(i_spec)];
 
diff --git a/src/rxns/rxn_wet_deposition.c b/src/rxns/rxn_wet_deposition.c
index 55db42142..58bde7c45 100644
--- a/src/rxns/rxn_wet_deposition.c
+++ b/src/rxns/rxn_wet_deposition.c
@@ -157,7 +157,7 @@ void rxn_wet_deposition_calc_deriv_contrib(
   // Add contributions to the time derivative
   for (int i_spec = 0; i_spec < NUM_SPEC_; i_spec++) {
     if (DERIV_ID_(i_spec) >= 0) {
-      long double rate = RATE_CONSTANT_ * state[REACT_(i_spec)];
+      double rate = RATE_CONSTANT_ * state[REACT_(i_spec)];
       time_derivative_add_value(time_deriv, DERIV_ID_(i_spec), -rate);
     }
   }
diff --git a/src/solver_stats.F90 b/src/solver_stats.F90
index d1248f7ce..1fa083849 100644
--- a/src/solver_stats.F90
+++ b/src/solver_stats.F90
@@ -49,14 +49,6 @@ module camp_solver_stats
     real(kind=dp) :: next_time_step__s
     !> Jacobian evaluation failures
     integer(kind=i_kind) :: Jac_eval_fails
-    !> Total calls to `f()`
-    integer(kind=i_kind) :: RHS_evals_total
-    !> Total calls to `Jac()`
-    integer(kind=i_kind) :: Jac_evals_total
-    !> Compute time for calls to `f()` [s]
-    real(kind=dp) :: RHS_time__s
-    !> Compute time for calls to `Jac()` [s]
-    real(kind=dp) :: Jac_time__s
     !> Maximum loss of precision on last deriv call
     real(kind=dp) :: max_loss_precision
 #ifdef CAMP_DEBUG
diff --git a/src/sub_models/sub_model_PDFiTE.c b/src/sub_models/sub_model_PDFiTE.c
index cdbec019f..0a9ffc16b 100644
--- a/src/sub_models/sub_model_PDFiTE.c
+++ b/src/sub_models/sub_model_PDFiTE.c
@@ -173,7 +173,7 @@ void sub_model_PDFiTE_calculate(int *sub_model_int_data,
   double *float_data = sub_model_float_data;
 
   // Calculate the water activity---i.e., relative humidity (0-1)
-  long double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
+  double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
 
   // Keep a_w within 0-1
   // TODO Filter =( try to remove
@@ -205,16 +205,16 @@ void sub_model_PDFiTE_calculate(int *sub_model_int_data,
       // across all other ion pairs (not i_ion_pair)
       // where v_x is the stoichiometric coefficient for species x in
       // the other ion_pair and N_x is its concentration.
-      long double omega = 0.0;
+      double omega = 0.0;
       for (int j_ion_pair = 0; j_ion_pair < NUM_ION_PAIRS_; ++j_ion_pair) {
         if (i_ion_pair == j_ion_pair) continue;
-        omega += (long double)2.0 *
+        omega += (double)2.0 *
                  (NUM_CATION_(j_ion_pair) + NUM_ANION_(j_ion_pair)) *
                  CATION_N_(j_ion_pair) * ANION_N_(j_ion_pair);
       }
 
       // Initialize ln(gamma)
-      long double ln_gamma = 0.0;
+      double ln_gamma = 0.0;
 
       // Add contributions from each interacting ion_pair
       for (int i_inter = 0; i_inter < NUM_INTER_(i_ion_pair); i_inter++) {
@@ -230,7 +230,7 @@ void sub_model_PDFiTE_calculate(int *sub_model_int_data,
         int j_ion_pair = INTER_SPEC_ID_(i_ion_pair, i_inter);
 
         // Calculate ln_gamma_inter
-        long double ln_gamma_inter = 0.0;
+        double ln_gamma_inter = 0.0;
         for (int i_B = 0; i_B < NUM_B_(i_ion_pair, i_inter); i_B++) {
           ln_gamma_inter += B_Z_(i_ion_pair, i_inter, i_B) * pow(a_w, i_B);
         }
@@ -317,16 +317,16 @@ void sub_model_PDFiTE_get_jac_contrib(int *sub_model_int_data,
       // across all other ion pairs (not i_ion_pair)
       // where v_x is the stoichiometric coefficient for species x in
       // the other ion_pair and N_x is its concentration.
-      long double omega = 0.0;
+      double omega = 0.0;
       for (int j_ion_pair = 0; j_ion_pair < NUM_ION_PAIRS_; ++j_ion_pair) {
         if (i_ion_pair == j_ion_pair) continue;
-        omega += (long double)2.0 *
+        omega += (double)2.0 *
                  (NUM_CATION_(j_ion_pair) + NUM_ANION_(j_ion_pair)) *
                  CATION_N_(j_ion_pair) * ANION_N_(j_ion_pair);
       }
 
       // Initialize ln(gamma)
-      long double ln_gamma = 0.0;
+      double ln_gamma = 0.0;
 
       // Add contributions from each interacting ion_pair
       for (int i_inter = 0; i_inter < NUM_INTER_(i_ion_pair); i_inter++) {
@@ -342,7 +342,7 @@ void sub_model_PDFiTE_get_jac_contrib(int *sub_model_int_data,
         int j_ion_pair = INTER_SPEC_ID_(i_ion_pair, i_inter);
 
         // Calculate ln_gamma_inter
-        long double ln_gamma_inter = 0.0;
+        double ln_gamma_inter = 0.0;
         for (int i_B = 0; i_B < NUM_B_(i_ion_pair, i_inter); i_B++) {
           ln_gamma_inter += B_Z_(i_ion_pair, i_inter, i_B) * pow(a_w, i_B);
         }
@@ -367,7 +367,7 @@ void sub_model_PDFiTE_get_jac_contrib(int *sub_model_int_data,
 
       }  // Loop on interacting ion_pairs
 
-      long double gamma_i = exp(ln_gamma);
+      double gamma_i = exp(ln_gamma);
 
       // Loop through the ion pairs to set the partial derivatives
       for (int i_inter = 0; i_inter < NUM_INTER_(i_ion_pair); i_inter++) {
@@ -383,8 +383,8 @@ void sub_model_PDFiTE_get_jac_contrib(int *sub_model_int_data,
         int j_ion_pair = INTER_SPEC_ID_(i_ion_pair, i_inter);
 
         // Calculate ln_gamma_inter and dln_gamma_inter_d_water
-        long double ln_gamma_inter = B_Z_(i_ion_pair, i_inter, 0);
-        long double d_ln_gamma_inter_d_water = 0.0;
+        double ln_gamma_inter = B_Z_(i_ion_pair, i_inter, 0);
+        double d_ln_gamma_inter_d_water = 0.0;
         for (int i_B = 1; i_B < NUM_B_(i_ion_pair, i_inter); i_B++) {
           ln_gamma_inter += B_Z_(i_ion_pair, i_inter, i_B) * pow(a_w, i_B);
           d_ln_gamma_inter_d_water +=
diff --git a/src/sub_models/sub_model_ZSR_aerosol_water.c b/src/sub_models/sub_model_ZSR_aerosol_water.c
index fd79ff369..fe7bca6f5 100644
--- a/src/sub_models/sub_model_ZSR_aerosol_water.c
+++ b/src/sub_models/sub_model_ZSR_aerosol_water.c
@@ -222,7 +222,7 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
   double *float_data = sub_model_float_data;
 
   // Calculate the water activity---i.e., relative humidity (0-1)
-  long double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
+  double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
 
   // Calculate the total aerosol water for each instance of the aerosol phase
   for (int i_phase = 0; i_phase < NUM_PHASE_; i_phase++) {
@@ -231,7 +231,7 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
 
     // Get the contribution from each ion pair
     for (int i_ion_pair = 0; i_ion_pair < NUM_ION_PAIR_; i_ion_pair++) {
-      long double molality, conc;
+      double molality, conc;
 
       // Determine which type of activity calculation should be used
       switch (TYPE_(i_ion_pair)) {
@@ -239,7 +239,7 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
         case ACT_TYPE_JACOBSON:;
 
           // Determine whether to use the minimum RH in the calculation
-          long double j_aw =
+          double j_aw =
               a_w > JACOB_low_RH_(i_ion_pair) ? a_w : JACOB_low_RH_(i_ion_pair);
 
           // Calculate the molality of the pure binary ion pair solution
@@ -249,11 +249,11 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
           molality *= molality;  // (mol/kg)
 
           // Calculate the water associated with this ion pair
-          long double cation =
+          double cation =
               state[PHASE_ID_(i_phase) + JACOB_CATION_ID_(i_ion_pair)] /
               JACOB_NUM_CATION_(i_ion_pair) / JACOB_CATION_MW_(i_ion_pair) /
               1000.0;  // (umol/m3)
-          long double anion =
+          double anion =
               state[PHASE_ID_(i_phase) + JACOB_ANION_ID_(i_ion_pair)] /
               JACOB_NUM_ANION_(i_ion_pair) / JACOB_ANION_MW_(i_ion_pair) /
               1000.0;  // (umol/m3)
@@ -265,8 +265,8 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
           //               (e^(alpha*cation) + e^(alpha*anion))
           // where alpha is a constant smoothing factor
           // orig eq: conc = (cation > anion ? anion : cation);
-          long double e_ac = exp(ALPHA_ * cation);
-          long double e_aa = exp(ALPHA_ * anion);
+          double e_ac = exp(ALPHA_ * cation);
+          double e_aa = exp(ALPHA_ * anion);
           conc = (cation * e_ac + anion * e_aa) / (e_ac + e_aa);
 
           *water += conc / molality * 1000.0;  // (ug/m3)
@@ -277,7 +277,7 @@ void sub_model_ZSR_aerosol_water_calculate(int *sub_model_int_data,
         case ACT_TYPE_EQSAM:;
 
           // Keep the water activity within the range specified in EQSAM
-          long double e_aw = a_w > 0.99 ? 0.99 : a_w;
+          double e_aw = a_w > 0.99 ? 0.99 : a_w;
           e_aw = e_aw < 0.001 ? 0.001 : e_aw;
 
           // Calculate the molality of the ion pair
@@ -323,15 +323,15 @@ void sub_model_ZSR_aerosol_water_get_jac_contrib(int *sub_model_int_data,
   double *env_data = model_data->grid_cell_env;
 
   // Calculate the water activity---i.e., relative humidity (0-1)
-  long double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
-  long double d_aw_d_wg = PPM_TO_RH_;
+  double a_w = PPM_TO_RH_ * state[GAS_WATER_ID_];
+  double d_aw_d_wg = PPM_TO_RH_;
 
   // Calculate the total aerosol water for each instance of the aerosol phase
   for (int i_phase = 0; i_phase < NUM_PHASE_; i_phase++) {
     // Get the contribution from each ion pair
     for (int i_ion_pair = 0; i_ion_pair < NUM_ION_PAIR_; i_ion_pair++) {
-      long double molality, d_molal_d_wg;
-      long double conc;
+      double molality, d_molal_d_wg;
+      double conc;
 
       // Determine which type of activity calculation should be used
       switch (TYPE_(i_ion_pair)) {
@@ -339,9 +339,9 @@ void sub_model_ZSR_aerosol_water_get_jac_contrib(int *sub_model_int_data,
         case ACT_TYPE_JACOBSON:;
 
           // Determine whether to use the minimum RH in the calculation
-          long double j_aw =
+          double j_aw =
               a_w > JACOB_low_RH_(i_ion_pair) ? a_w : JACOB_low_RH_(i_ion_pair);
-          long double d_jaw_d_wg =
+          double d_jaw_d_wg =
               a_w > JACOB_low_RH_(i_ion_pair) ? d_aw_d_wg : 0.0;
 
           // Calculate the molality of the pure binary ion pair solution
@@ -355,30 +355,30 @@ void sub_model_ZSR_aerosol_water_get_jac_contrib(int *sub_model_int_data,
           d_molal_d_wg *= d_jaw_d_wg;
 
           // Calculate the water associated with this ion pair
-          long double cation =
+          double cation =
               state[PHASE_ID_(i_phase) + JACOB_CATION_ID_(i_ion_pair)] /
               JACOB_NUM_CATION_(i_ion_pair) / JACOB_CATION_MW_(i_ion_pair) /
               1000.0;  // (umol/m3)
-          long double d_cation_d_C = 1.0 / JACOB_NUM_CATION_(i_ion_pair) /
+          double d_cation_d_C = 1.0 / JACOB_NUM_CATION_(i_ion_pair) /
                                      JACOB_CATION_MW_(i_ion_pair) / 1000.0;
-          long double anion =
+          double anion =
               state[PHASE_ID_(i_phase) + JACOB_ANION_ID_(i_ion_pair)] /
               JACOB_NUM_ANION_(i_ion_pair) / JACOB_ANION_MW_(i_ion_pair) /
               1000.0;  // (umol/m3)
-          long double d_anion_d_A = 1.0 / JACOB_NUM_ANION_(i_ion_pair) /
+          double d_anion_d_A = 1.0 / JACOB_NUM_ANION_(i_ion_pair) /
                                     JACOB_ANION_MW_(i_ion_pair) / 1000.0;
 
           // Calculate the smooth-maximum ion pair concentration
           // (see calculate() function for details)
-          long double e_ac = exp(ALPHA_ * cation);
-          long double e_aa = exp(ALPHA_ * anion);
+          double e_ac = exp(ALPHA_ * cation);
+          double e_aa = exp(ALPHA_ * anion);
           conc = (cation * e_ac + anion * e_aa) / (e_ac + e_aa);
-          long double denom = (e_ac + e_aa) * (e_ac + e_aa);
-          long double d_conc_d_cation =
+          double denom = (e_ac + e_aa) * (e_ac + e_aa);
+          double d_conc_d_cation =
               (e_ac * e_ac +
                e_ac * e_aa * (1.0 - ALPHA_ * anion + ALPHA_ * cation)) /
               denom;
-          long double d_conc_d_anion =
+          double d_conc_d_anion =
               (e_aa * e_aa +
                e_ac * e_aa * (1.0 - ALPHA_ * cation + ALPHA_ * anion)) /
               denom;
@@ -397,9 +397,9 @@ void sub_model_ZSR_aerosol_water_get_jac_contrib(int *sub_model_int_data,
         case ACT_TYPE_EQSAM:;
 
           // Keep the water activity within the range specified in EQSAM
-          long double e_aw = a_w > 0.99 ? 0.99 : a_w;
+          double e_aw = a_w > 0.99 ? 0.99 : a_w;
           e_aw = e_aw < 0.001 ? 0.001 : e_aw;
-          long double d_eaw_d_wg = a_w > 0.99 ? 0.0 : d_aw_d_wg;
+          double d_eaw_d_wg = a_w > 0.99 ? 0.0 : d_aw_d_wg;
           d_eaw_d_wg = a_w < 0.001 ? 0.0 : d_eaw_d_wg;
 
           // Calculate the molality of the ion pair
@@ -418,7 +418,7 @@ void sub_model_ZSR_aerosol_water_get_jac_contrib(int *sub_model_int_data,
           for (int i_ion = 0; i_ion < EQSAM_NUM_ION_(i_ion_pair); i_ion++) {
             conc = state[PHASE_ID_(i_phase) + EQSAM_ION_ID_(i_ion_pair, i_ion)];
             conc = (conc > 0.0 ? conc : 0.0);
-            long double d_conc_d_ion = (conc > 0.0 ? 1.0 : 0.0);
+            double d_conc_d_ion = (conc > 0.0 ? 1.0 : 0.0);
 
             // Gas-phase water contribution
             J[EQSAM_GAS_WATER_JAC_ID_(i_phase, i_ion_pair)] +=
diff --git a/src/time_derivative.c b/src/time_derivative.c
index 34c7bb7a8..0dd822f39 100644
--- a/src/time_derivative.c
+++ b/src/time_derivative.c
@@ -17,11 +17,11 @@ int time_derivative_initialize(TimeDerivative *time_deriv,
   if (num_spec <= 0) return 0;
 
   time_deriv->production_rates =
-      (long double *)malloc(num_spec * sizeof(long double));
+      (double *)malloc(num_spec * sizeof(double));
   if (time_deriv->production_rates == NULL) return 0;
 
   time_deriv->loss_rates =
-      (long double *)malloc(num_spec * sizeof(long double));
+      (double *)malloc(num_spec * sizeof(double));
   if (time_deriv->loss_rates == NULL) {
     free(time_deriv->production_rates);
     return 0;
@@ -45,8 +45,8 @@ void time_derivative_reset(TimeDerivative time_deriv) {
 
 void time_derivative_output(TimeDerivative time_deriv, double *dest_array,
                             double *deriv_est, unsigned int output_precision) {
-  long double *r_p = time_deriv.production_rates;
-  long double *r_l = time_deriv.loss_rates;
+  double *r_p = time_deriv.production_rates;
+  double *r_l = time_deriv.loss_rates;
 
 #ifdef CAMP_DEBUG
   time_deriv.last_max_loss_precision = 1.0;
@@ -56,7 +56,7 @@ void time_derivative_output(TimeDerivative time_deriv, double *dest_array,
     double prec_loss = 1.0;
     if (*r_p + *r_l != 0.0) {
       if (deriv_est) {
-        long double scale_fact;
+        double scale_fact;
         scale_fact =
             1.0 / (*r_p + *r_l) /
             (1.0 / (*r_p + *r_l) + MAX_PRECISION_LOSS / fabsl(*r_p - *r_l));
@@ -88,7 +88,7 @@ void time_derivative_output(TimeDerivative time_deriv, double *dest_array,
 }
 
 void time_derivative_add_value(TimeDerivative time_deriv, unsigned int spec_id,
-                               long double rate_contribution) {
+                               double rate_contribution) {
   if (rate_contribution > 0.0) {
     time_deriv.production_rates[spec_id] += rate_contribution;
   } else {
diff --git a/src/time_derivative.h b/src/time_derivative.h
index 74052a290..c3d42fb5a 100644
--- a/src/time_derivative.h
+++ b/src/time_derivative.h
@@ -20,8 +20,8 @@
 /* Time derivative for solver species */
 typedef struct {
   unsigned int num_spec;          // Number of species in the derivative
-  long double *production_rates;  // Production rates for all species
-  long double *loss_rates;        // Loss rates for all species
+  double *production_rates;  // Production rates for all species
+  double *loss_rates;        // Loss rates for all species
 #ifdef CAMP_DEBUG
   double last_max_loss_precision;  // Maximum loss of precision at last output
 #endif
@@ -62,7 +62,7 @@ void time_derivative_output(TimeDerivative time_deriv, double *dest_array,
  * spec_id
  */
 void time_derivative_add_value(TimeDerivative time_deriv, unsigned int spec_id,
-                               long double rate_contribution);
+                               double rate_contribution);
 
 #ifdef CAMP_DEBUG
 /** \brief Maximum loss of precision at the last output of the derivative
diff --git a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90 b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
index 027c607d1..f96cd7199 100644
--- a/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
+++ b/test/chemistry/cb05cl_ae5/test_cb05cl_ae5.F90
@@ -692,13 +692,6 @@ logical function run_standard_cb05cl_ae5_test() result(passed)
 
 #ifdef DEBUG
       if (i_repeat.eq.1) then
-        ! TIMERS - timers and counters are available here for the last call
-        ! to solve()
-        write(*,*) "Calls to f()", solver_stats%RHS_evals_total
-        write(*,*) "Calls to Jac()", solver_stats%Jac_evals_total
-        write(*,*) "Compute time f()", solver_stats%RHS_time__s, "s"
-        write(*,*) "Compute time Jac()", solver_stats%Jac_time__s, "s"
-
         call solver_stats%print()
 
         ! Check the Jacobian evaluations
diff --git a/test/monarch/TestMonarch.py b/test/monarch/TestMonarch.py
index 131354228..84fb69686 100644
--- a/test/monarch/TestMonarch.py
+++ b/test/monarch/TestMonarch.py
@@ -18,13 +18,10 @@ def all_timesteps():
   # conf.profileCuda = "nvprof"
   # conf.profileCuda = "nsight"
   conf.is_import = True
-  conf.mpiProcessesCaseBase = 1
-  conf.mpiProcessesCaseOptimList = [1]
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
-  conf.cells = [10]
-  # conf.cells = [100, 500, 1000, 5000, 10000]
-  conf.timeSteps = 5
+  conf.mpiProcessesCaseBase = 40
+  conf.mpiProcessesCaseOptimList = [40]
+  conf.cells = [100000]
+  conf.timeSteps = 720
   conf.caseBase = "CPU One-cell"
   # conf.caseBase = "GPU BDF"
   conf.casesOptim = []
@@ -34,5 +31,6 @@ def all_timesteps():
 
   run_main(conf)
 
+
 if __name__ == "__main__":
   all_timesteps()
diff --git a/test/monarch/TestMonarch1.py b/test/monarch/TestMonarch1.py
index b183870a7..74b9b813d 100644
--- a/test/monarch/TestMonarch1.py
+++ b/test/monarch/TestMonarch1.py
@@ -19,11 +19,8 @@ def all_timesteps():
   # conf.profileCuda = "nsight"
   conf.is_import = True
   conf.mpiProcessesCaseBase = 1
-  conf.mpiProcessesCaseOptimList = [10]
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
-  conf.cells = [100000]
-  # conf.cells = [100, 500, 1000, 5000, 10000]
+  conf.mpiProcessesCaseOptimList = [1]
+  conf.cells = [10000]
   conf.timeSteps = 720
   conf.caseBase = "CPU One-cell"
   # conf.caseBase = "GPU BDF"
@@ -34,5 +31,6 @@ def all_timesteps():
 
   run_main(conf)
 
+
 if __name__ == "__main__":
   all_timesteps()
diff --git a/test/monarch/TestMonarch2.py b/test/monarch/TestMonarch2.py
index 199f2d5d1..ffac564d3 100644
--- a/test/monarch/TestMonarch2.py
+++ b/test/monarch/TestMonarch2.py
@@ -20,10 +20,7 @@ def all_timesteps():
   conf.is_import = True
   conf.mpiProcessesCaseBase = 20
   conf.mpiProcessesCaseOptimList = [20]
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
-  conf.cells = [100000]
-  # conf.cells = [100, 500, 1000, 5000, 10000]
+  conf.cells = [10000]
   conf.timeSteps = 720
   conf.caseBase = "CPU One-cell"
   # conf.caseBase = "GPU BDF"
diff --git a/test/monarch/TestMonarch3.py b/test/monarch/TestMonarch3.py
index b0db80ee6..2107dc25e 100644
--- a/test/monarch/TestMonarch3.py
+++ b/test/monarch/TestMonarch3.py
@@ -20,10 +20,7 @@ def all_timesteps():
   conf.is_import = True
   conf.mpiProcessesCaseBase = 40
   conf.mpiProcessesCaseOptimList = [40]
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
-  conf.cells = [1000,5000,10000,50000,100000]
-  # conf.cells = [100, 500, 1000, 5000, 10000]
+  conf.cells = [1000,5000,10000]
   conf.timeSteps = 720
   conf.caseBase = "CPU One-cell"
   # conf.caseBase = "GPU BDF"
diff --git a/test/monarch/TestMonarch4.py b/test/monarch/TestMonarch4.py
index 304e38b92..c0f8f3368 100644
--- a/test/monarch/TestMonarch4.py
+++ b/test/monarch/TestMonarch4.py
@@ -20,10 +20,7 @@ def all_timesteps():
   conf.is_import = True
   conf.mpiProcessesCaseBase = 1
   conf.mpiProcessesCaseOptimList = [10,20,30,40]
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
-  conf.cells = [100000]
-  # conf.cells = [100, 500, 1000, 5000, 10000]
+  conf.cells = [10000]
   conf.timeSteps = 720
   conf.caseBase = "CPU One-cell"
   # conf.caseBase = "GPU BDF"
diff --git a/test/monarch/TestMonarch5.py b/test/monarch/TestMonarch5.py
new file mode 100644
index 000000000..0a5390783
--- /dev/null
+++ b/test/monarch/TestMonarch5.py
@@ -0,0 +1,36 @@
+#
+# Copyright (C) 2022 Barcelona Supercomputing Center and University of
+# Illinois at Urbana-Champaign
+# SPDX-License-Identifier: MIT
+#
+
+from mainMonarch import *
+
+
+def all_timesteps():
+  """Set up the single-process CPU vs. GPU comparison and run it.
+
+  Builds a TestMonarch configuration ("cb05_paperV2" mechanism,
+  "Realistic" cells, 100000 cells, 720 time steps) with "CPU One-cell"
+  as base case and "GPU BDF" as optimized case, then calls run_main().
+  """
+  settings = TestMonarch()
+  settings.chemFile = "cb05_paperV2"  # alternative: "monarch_cb05"
+  settings.diffCellsL = ["Realistic"]  # alternative: "Ideal"
+  settings.profileCuda = ""  # alternatives: "nvprof", "nsight"
+  # Ask the runner to import existing result files when possible
+  settings.is_import = True
+  # MPI processes for the base case and for each optimized case
+  settings.mpiProcessesCaseBase = 1
+  settings.mpiProcessesCaseOptimList = [1]
+  settings.cells = [100000]
+  settings.timeSteps = 720
+  settings.caseBase = "CPU One-cell"  # alternative: "GPU BDF"
+  settings.casesOptim = ["GPU BDF"]  # optionally also "CPU One-cell"
+  settings.plotYKey = "Speedup timecvStep"
+
+  run_main(settings)
+
+
+if __name__ == "__main__":
+  all_timesteps()
diff --git a/test/monarch/camp_monarch_interface.F90 b/test/monarch/camp_monarch_interface.F90
index 2f7073b98..d23094294 100644
--- a/test/monarch/camp_monarch_interface.F90
+++ b/test/monarch/camp_monarch_interface.F90
@@ -6,7 +6,7 @@
 !> The camp_monarch_interface_t object and related functions
 
 !> Interface for the MONACH model and CAMP-camp
-module camp_monarch_interface_2
+module camp_monarch_interface
 
   use camp_constants, only : i_kind
   use camp_mpi
@@ -82,7 +82,7 @@ function constructor(camp_config_file, output_file_title, &
     integer, optional :: n_cells
     type(camp_solver_data_t), pointer :: camp_solver_data
     character, allocatable :: buffer(:)
-    integer(kind=i_kind) :: pos, pack_size
+    integer(kind=i_kind) :: pos, pack_size, size
     integer(kind=i_kind) :: i_spec, i_photo_rxn, rank, n_ranks, ierr
     type(string_t), allocatable :: unique_names(:)
     character(len=:), allocatable :: spec_name, settings_interface_file
@@ -94,7 +94,7 @@ function constructor(camp_config_file, output_file_title, &
     type(aero_rep_update_data_modal_binned_mass_GSD_t) :: update_data_GSD
     real(kind=dp) :: comp_start, comp_end
     character(len=128) :: i_str
-    integer :: local_comm,use_cpu, nGPUs
+    integer :: local_comm,use_cpu
 
     if (present(mpi_comm)) then
       local_comm = mpi_comm
@@ -225,15 +225,13 @@ function constructor(camp_config_file, output_file_title, &
     end if
     deallocate(buffer)
     use_cpu=1
-    nGPUs=1
     open(unit=32, file='settings/config_variables_c_solver.txt', status='old')
     read(32,'(A)') i_str
     if(trim(i_str) == "USE_CPU=OFF") then
       use_cpu = 0
     end if
-    read(32, *) nGPUs
     close(32)
-    call this%camp_core%solver_initialize(use_cpu,nGPUs)
+    call this%camp_core%solver_initialize(use_cpu)
     this%camp_state => this%camp_core%new_state()
     if(this%output_file_title=="cb05_paperV2") then
       allocate(this%offset_photo_rates_cells(this%n_cells))
@@ -268,10 +266,7 @@ function constructor(camp_config_file, output_file_title, &
         call this%camp_core%update_data(update_data_GSD)
       end if
     end do
-    !unique_names=this%camp_core%unique_names()
-    !do i=1, size(unique_names)
-    !  print*,i,trim(unique_names(i)%string)
-    !end do
+    call this%camp_core%init_export_solver_state()
     if (MONARCH_PROCESS==0) then
       call cpu_time(comp_end)
       write(*,*) "Initialization time: ", comp_end-comp_start, " s"
@@ -374,8 +369,6 @@ subroutine integrate(this, curr_time, time_step, I_W, I_E, I_S, &
             end do
             this%camp_state%state_var(this%map_camp_id(:)) = &
                             MONARCH_conc(i,j,k,this%map_monarch_id(:))
-            !print*,"MONARCH_conc381",MONARCH_conc(i,j,k,this%map_monarch_id(:))
-            !print*,"state_var421",this%camp_state%state_var(:)
             if(this%output_file_title=="monarch_cb05") then
               this%camp_state%state_var(this%gas_phase_water_id) = &
               water_conc(1,1,1,water_vapor_index)
@@ -384,14 +377,12 @@ subroutine integrate(this, curr_time, time_step, I_W, I_E, I_S, &
                       water_conc(1,1,1,water_vapor_index) * &
                               mwair / mwwat * 1.e6
             end if
-            !print*,"state_var430",this%camp_state%state_var(:)
             if(this%output_file_title=="cb05_paperV2") then
               do r=1,size(this%specs_emi_id)
                 this%camp_state%state_var(this%specs_emi_id(r))=&
                         this%camp_state%state_var(this%specs_emi_id(r))&
                                 +this%specs_emi(r)*rate_emi(i_hour,z+1)*conv(i,j,k)
               end do
-            !print*,"state_var436",this%camp_state%state_var(1)
             end if
             call cpu_time(comp_start)
             call this%camp_core%solve(this%camp_state, real(time_step*60., kind=dp))
@@ -418,8 +409,6 @@ subroutine integrate(this, curr_time, time_step, I_W, I_E, I_S, &
             call this%camp_state%env_states(z+1)%set_pressure_Pa(real(pressure(i,j,k),kind=dp))
             this%camp_state%state_var(this%map_camp_id(:) + (z*state_size_per_cell))&
              = MONARCH_conc(i,j,k,this%map_monarch_id(:))
-            !print*,"MONARCH_conc381",MONARCH_conc(i,j,k,this%map_monarch_id(:))
-            !print*,"state_var421",this%camp_state%state_var(:)
             if(this%output_file_title=="monarch_cb05") then
               this%camp_state%state_var(this%gas_phase_water_id+(z*state_size_per_cell)) = &
                       water_conc(1,1,1,water_vapor_index)
@@ -427,7 +416,6 @@ subroutine integrate(this, curr_time, time_step, I_W, I_E, I_S, &
               this%camp_state%state_var(this%gas_phase_water_id+(z*state_size_per_cell)) = &
                       water_conc(1,1,1,water_vapor_index) * mwair / mwwat * 1.e6
             end if
-            !print*,"state_var430",this%camp_state%state_var(:)
             if(this%output_file_title=="cb05_paperV2") then
               do r=1,size(this%specs_emi_id)
                 this%camp_state%state_var(this%specs_emi_id(r)+z*state_size_per_cell)=&
@@ -435,7 +423,6 @@ subroutine integrate(this, curr_time, time_step, I_W, I_E, I_S, &
                                 +this%specs_emi(r)*rate_emi(i_hour,z+1)*conv(i,j,k)
               end do
             endif
-            !print*,"state_var436",this%camp_state%state_var(1+z*state_size_per_cell)
           end do
         end do
       end do
@@ -821,10 +808,28 @@ subroutine get_init_conc(this, MONARCH_conc, water_conc, &
     end if
   end subroutine get_init_conc
 
-
   elemental subroutine finalize(this)
     type(camp_monarch_interface_t), intent(inout) :: this
-    if (associated(this%camp_core)) deallocate(this%camp_core)
+    ! Release the components held by this interface object. Each one is
+    ! freed only if it is currently set: pointers are tested with
+    ! associated(), allocatable components with allocated().
+
+    ! Pointer components: CAMP core and CAMP state
+    if (associated(this%camp_core)) deallocate(this%camp_core)
+    if (associated(this%camp_state)) deallocate(this%camp_state)
+
+    ! Allocatable components: species names, id maps, initial concentrations
+    if (allocated(this%monarch_species_names)) deallocate(this%monarch_species_names)
+    if (allocated(this%map_monarch_id)) deallocate(this%map_monarch_id)
+    if (allocated(this%map_camp_id)) deallocate(this%map_camp_id)
+    if (allocated(this%init_conc_camp_id)) deallocate(this%init_conc_camp_id)
+    if (allocated(this%init_conc)) deallocate(this%init_conc)
+
+    ! Pointer components: species map data, initial concentration data
+    ! and property set
+    if (associated(this%species_map_data)) deallocate(this%species_map_data)
+    if (associated(this%init_conc_data)) deallocate(this%init_conc_data)
+    if (associated(this%property_set)) deallocate(this%property_set)
 
   end subroutine finalize
 
diff --git a/test/monarch/checkGPU.py b/test/monarch/checkGPU.py
index 25d8924af..e5122424b 100644
--- a/test/monarch/checkGPU.py
+++ b/test/monarch/checkGPU.py
@@ -12,12 +12,8 @@ def checkGPU():
   conf.chemFile = "cb05_paperV2"
   conf.diffCellsL = []
   conf.diffCellsL.append("Realistic")
-  conf.nGPUsCaseBase = 1
-  conf.nGPUsCaseOptimList = [1]
   conf.mpiProcessesCaseBase = 1
   conf.mpiProcessesCaseOptimList.append(1)
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
   conf.cells = [10]
   conf.timeSteps = 3
   conf.timeStepsDt = 2
diff --git a/test/monarch/checkGPU.sh b/test/monarch/checkGPU.sh
index 59ef7ffa4..cb3b05b4a 100644
--- a/test/monarch/checkGPU.sh
+++ b/test/monarch/checkGPU.sh
@@ -1,5 +1,3 @@
 #!/usr/bin/env bash
 set -e
-ln -rs -fL out ../../build/out
-ln -rs -fL settings ../../build/settings
 python checkGPU.py
\ No newline at end of file
diff --git a/test/monarch/diff_TestMonarch.py b/test/monarch/diff_TestMonarch.py
index 2c19f7330..a9fe143ed 100644
--- a/test/monarch/diff_TestMonarch.py
+++ b/test/monarch/diff_TestMonarch.py
@@ -15,12 +15,8 @@ def all_timesteps():
   conf.diffCellsL.append("Realistic")
   #conf.diffCellsL.append("Ideal")
   conf.profileCuda = ""
-  conf.nGPUsCaseBase = 1
-  conf.nGPUsCaseOptimList = [1]
   conf.mpiProcessesCaseBase = 1
   conf.mpiProcessesCaseOptimList.append(1)
-  conf.allocatedNodes = 1
-  conf.allocatedTasksPerNode = 160
   conf.cells = [2]
   conf.timeSteps = 3
   conf.caseBase = "CPU One-cell"
diff --git a/test/monarch/mainMonarch.py b/test/monarch/mainMonarch.py
index 1e5abc424..55c52f4cb 100644
--- a/test/monarch/mainMonarch.py
+++ b/test/monarch/mainMonarch.py
@@ -8,8 +8,6 @@
 import json
 import subprocess
 from pandas import read_csv as pd_read_csv
-import time
-from math import ceil
 
 
 class TestMonarch:
@@ -24,8 +22,6 @@ def __init__(self):
     self.caseGpuCpu = ""
     self.caseMulticellsOnecell = ""
     self.mpiProcesses = 1
-    self.allocatedNodes = 1
-    self.allocatedTasksPerNode = 160
     # Cases configuration
     self.diffCellsL = ""
     self.mpiProcessesCaseBase = 1
@@ -45,33 +41,9 @@ def __init__(self):
     self.campConf = "settings/config_variables_c_solver.txt"
 
 
-def write_camp_config_file(conf):
-  try:
-    file1 = open(conf.campConf, "w")
-    if conf.caseGpuCpu == "CPU":
-      file1.write("USE_CPU=ON\n")
-    else:
-      file1.write("USE_CPU=OFF\n")
-    file1.write(str(nGPUs) + "\n")
-    file1.close()
-  except Exception as e:
-    print("write_camp_config_file fails", e)
-
-
 # from line_profiler_pycharm import profile
 # @profile
 def run(conf):
-  if conf.caseGpuCpu == "GPU":
-    maxCoresPerNode = 40
-    if (conf.mpiProcesses > maxCoresPerNode and
-        conf.mpiProcesses % maxCoresPerNode != 0):
-      print(
-        "ERROR: MORE THAN 40 MPI PROCESSES AND NOT "
-        "MULTIPLE OF 40, WHEN CTE-POWER ONLY HAS 40 CORES "
-        "PER NODE\n");
-      raise
-  coresPerGPU = 10
-  nGPUs = ceil(conf.mpiProcesses / coresPerGPU)
   exec_str = ""
   try:
     ddt_pid = subprocess.check_output(
@@ -80,8 +52,13 @@ def run(conf):
       exec_str += 'ddt --connect '
   except Exception:
     pass
-  exec_str += "mpirun -v -np " + str(
-    conf.mpiProcesses) + " --bind-to core "
+  maxCoresPerNode = 40
+  if conf.mpiProcesses <= maxCoresPerNode:
+    exec_str += "mpirun -v -np " + str(
+      conf.mpiProcesses) + " --bind-to core " # for plogin (fails squeue)
+  else:
+    exec_str += "srun --cpu-bind=core -n " + str(
+      conf.mpiProcesses) + " " # for queue (slow plogin)
   if (conf.profileCuda == "nvprof" and conf.caseGpuCpu ==
       "GPU"):
     pathNvprof = ("../../compile/power9/" +
@@ -111,14 +88,13 @@ def run(conf):
       file1.write("USE_CPU=ON\n")
     else:
       file1.write("USE_CPU=OFF\n")
-    file1.write(str(nGPUs) + "\n")
     file1.close()
   except Exception as e:
     print("write_camp_config_file fails", e)
   print("exec_str:", exec_str, conf.diffCells,
         conf.caseGpuCpu,
         conf.caseMulticellsOnecell, "ncellsPerMPIProcess:",
-        conf.nCells, "nGPUs:", nGPUs)
+        conf.nCells)
   conf_name = "settings/TestMonarch.json"
   with open(conf_name, 'w', encoding='utf-8') as jsonFile:
     json.dump(conf.__dict__, jsonFile, indent=4,
@@ -126,10 +102,8 @@ def run(conf):
   nCellsStr = str(conf.nCells)
   if conf.nCells >= 1000:
     nCellsStr = str(int(conf.nCells / 1000)) + "k"
-  if conf.caseGpuCpu == "GPU":
-    caseGpuCpuName = str(nGPUs) + conf.caseGpuCpu
-  else:
-    caseGpuCpuName = str(conf.mpiProcesses) + "CPUcores"
+  caseGpuCpuName=conf.caseGpuCpu+str(conf.mpiProcesses) + "cores"
+  out = 0
   is_import = False
   data_path = ("out/stats" + caseGpuCpuName + nCellsStr +
                "cells" + str(conf.timeSteps) + "tsteps.csv")
@@ -137,30 +111,38 @@ def run(conf):
   data_path2 = ("out/state" + caseGpuCpuName + nCellsStr +
                 "cells" + str(conf.timeSteps) + "tsteps.csv")
   if conf.is_import and os.path.exists(data_path):
-    is_import = True
-    if conf.is_out and not os.path.exists(data_path2):
-      is_import = False
+    nRows_csv = (conf.timeSteps * conf.nCells *conf.mpiProcesses)
+    df = pd_read_csv(data_path, nrows=nRows_csv)
+    data = df.to_dict('list')
+    y_key_words = conf.plotYKey.split()
+    y_key = y_key_words[-1]
+    data = data[y_key]
+    print("data stats",data)
+    # Reuse the cached run only if the requested stats column is
+    # non-empty; an existing state file must not override that check,
+    # otherwise the final data[0] would raise IndexError on empty data.
+    is_import = bool(data)
+    # When the state output is requested it must be cached as well.
+    if conf.is_out and not os.path.exists(data_path2):
+      is_import = False
   if not is_import:
     os.system(exec_str)
-  data_path = ("out/stats" + caseGpuCpuName + nCellsStr +
-               "cells" + str(conf.timeSteps) + "tsteps.csv")
-  if not is_import:
     os.rename("out/stats.csv", data_path)
-  nRows_csv = (conf.timeSteps * conf.nCells *
-               conf.mpiProcesses)
-  df = pd_read_csv(data_path, nrows=nRows_csv)
-  data = df.to_dict('list')
-  y_key_words = conf.plotYKey.split()
-  y_key = y_key_words[-1]
-  data = data[y_key][0]
-  out = 0
-  if conf.is_out:
-    if not is_import:
+    if conf.is_out:
       os.rename("out/state.csv", data_path2)
-    df = pd_read_csv(data_path2, header=None,
-                     names=["Column1"])
-    out = df["Column1"].tolist()
-  return data, out
+    nRows_csv = (conf.timeSteps * conf.nCells *conf.mpiProcesses)
+    df = pd_read_csv(data_path, nrows=nRows_csv)
+    data = df.to_dict('list')
+    y_key_words = conf.plotYKey.split()
+    y_key = y_key_words[-1]
+    data = data[y_key]
+    print("data stats",data)
+  if conf.is_out:
+    if os.path.exists(data_path2):
+      df = pd_read_csv(data_path2, header=None,
+                       names=["Column1"])
+      out = df["Column1"].tolist()
+  return data[0], out
 
 
 # @profile
@@ -324,7 +306,7 @@ def plot_cases(conf, datay):
     datax = list(range(1, conf.timeSteps + 1, 1))
     plot_x_key = "Timesteps"
   namex = plot_x_key
-  print(namex, ":", datax)
+  print(namex, ":", datax[0],"to",datax[-1])
   if legend:
     print("plotTitle: ", plotTitle, " legend:", legend)
   else:
@@ -355,4 +337,4 @@ def run_main(conf):
         conf.mpiProcessesCaseOptimList[i] = cellsProcesses
 
   datay = run_diffCells(conf)
-  #plot_cases(conf, datay)
+  plot_cases(conf, datay)
diff --git a/test/monarch/mock_monarch.F90 b/test/monarch/mock_monarch.F90
index a194c9f70..23cc6cf7b 100644
--- a/test/monarch/mock_monarch.F90
+++ b/test/monarch/mock_monarch.F90
@@ -7,7 +7,7 @@ program mock_monarch_t
 
   use camp_constants, only: const
   use camp_util, only : assert_msg, almost_equal, to_string
-  use camp_monarch_interface_2
+  use camp_monarch_interface
   use camp_mpi
   use json_module
 
@@ -182,8 +182,8 @@ program mock_monarch_t
   curr_time = curr_time + TIME_STEP
   end do
   call camp_mpi_barrier()
-  call camp_interface%camp_core%export_solver_stats()
   call camp_interface%camp_core%join_solver_state()
+  call camp_interface%camp_core%export_solver_stats()
   call camp_mpi_barrier()
 
   if (camp_mpi_rank()==0) then
diff --git a/test/monarch/run.sh b/test/monarch/run.sh
index e49fbac4a..93cd97a19 100644
--- a/test/monarch/run.sh
+++ b/test/monarch/run.sh
@@ -15,11 +15,9 @@ make_and_check() {
 }
 
 make_run() {
-  ln -rs -fL out ../../build/out
-  ln -rs -fL settings ../../build/settings
   curr_path=$(pwd)
   cd ../../build
-  make
+  make -j 4
   cd $curr_path
   #python TestMonarch.py
   python checkGPU.py
diff --git a/test/monarch/sbatch_run.sh b/test/monarch/sbatch_run.sh
index e439f6060..cdc2f8ad4 100644
--- a/test/monarch/sbatch_run.sh
+++ b/test/monarch/sbatch_run.sh
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 ##SBATCH --qos=debug
+##SBATCH -t 00:09:00
 #SBATCH --job-name=camp_test_monarch
 #SBATCH --output=out_sbatch.txt
 #SBATCH --error=err_sbatch.txt
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=160
+#SBATCH --ntasks-per-node=40
+#SBATCH --cpus-per-task=4
 #SBATCH --gres=gpu:4
 #SBATCH --exclusive
 
@@ -12,7 +14,7 @@ set -e
 make_run(){
   curr_path=$(pwd)
   cd ../../build
-  make
+  make -j 4
   cd $curr_path
   python TestMonarch1.py
   python TestMonarch2.py
diff --git a/test/monarch/settings/TestMonarch.json b/test/monarch/settings/TestMonarch.json
index 7b5e3c4c0..624621fce 100644
--- a/test/monarch/settings/TestMonarch.json
+++ b/test/monarch/settings/TestMonarch.json
@@ -8,8 +8,6 @@
     "caseGpuCpu": "GPU",
     "caseMulticellsOnecell": "BDF",
     "mpiProcesses": 20,
-    "allocatedNodes": 1,
-    "allocatedTasksPerNode": 160,
     "diffCellsL": [
         "Realistic"
     ],
@@ -32,9 +30,5 @@
     "exportPath": "exports",
     "results_file": "_solver_stats.csv",
     "nCellsProcesses": 40,
-    "campConf": "settings/config_variables_c_solver.txt",
-    "nGPUsCaseBase": 1,
-    "nGPUsCaseOptimList": [
-        2
-    ]
+    "campConf": "settings/config_variables_c_solver.txt"
 }
\ No newline at end of file
diff --git a/test/monarch_output/stats_monarch_netcdf.py b/test/monarch_output/stats_monarch_netcdf.py
index 3d01023e1..fd51bf15f 100644
--- a/test/monarch_output/stats_monarch_netcdf.py
+++ b/test/monarch_output/stats_monarch_netcdf.py
@@ -54,8 +54,9 @@ def process_variable(dataset1, dataset2, var_name):
     quantile75, quantile95, max_error, relative_error
 
 
-file1_path_header = "../../../../monarch_out/cpu_tstep479_O3/"
-file2_path_header = "../../../../monarch_out/gpu_tstep479_O3/"
+file1_path_header = "/gpfs/scratch/bsc32/bsc32815/monarch_out/cpu_tstep6_O3/"
+#file2_path_header = "/gpfs/scratch/bsc32/bsc32815/monarch_out/gpu_tstep6_O3/"
+file2_path_header = "/gpfs/scratch/bsc32/bsc32815/a591/nmmb-monarch/OUTPUT/regional/000/20160721/CURRENT_RUN/"
 
 # Calculate the speedup
 file1 = file1_path_header + "out/stats.csv"
@@ -105,6 +106,6 @@ def process_variable(dataset1, dataset2, var_name):
 plt.ylabel("Relative Error [%]")
 plt.xticks(range(len(variable_names)), variable_names, rotation=90)
 plt.title("Box Plot of highest NRMSE for MONARCH-CAMP 2 GPUs 24h")
-plt.show()
+#plt.show()
 #worst_variables.to_csv("exports/summary_table.csv", index=False)
 
diff --git a/test/monarch_output/tmp.py b/test/monarch_output/tmp.py
deleted file mode 100644
index 085ff0168..000000000
--- a/test/monarch_output/tmp.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import matplotlib as mpl
-
-mpl.use('TkAgg')
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-import numpy as np
-
-
-# Sample data
-#data = sns.load_dataset("tips")  # Load a built-in dataset as an example
-
-data = np.array([1,3,5])
-# Create a box plot
-sns.boxplot(data=data, showfliers=False)
-print(data)
-
-# Add labels and title
-plt.xlabel("Day of the week")
-plt.ylabel("Total Bill Amount ($)")
-plt.title("Box Plot of Total Bill Amount by Day")
-
-# Show the plot
-plt.show()
-raise
-
-
-# Create a DataFrame from your CSV data
-#data = pd.read_csv('exports/summary_table.csv')
-
-array1= [1,3,5]
-array2=[2,7,4]
-# Create a DataFrame from two arrays of variables (replace with your own data)
-data = pd.DataFrame({'Variable1': array1, 'Variable2': array2})
-
-# Calculate the difference between the two variables
-data['Difference'] = data['Variable1'] - data['Variable2']
-
-# Create a violin plot
-sns.violinplot(x='Difference', data=data)
-
-# Print the standard deviation of the difference
-std_deviation = data['Difference'].std()
-print(f"Standard Deviation of Difference: {std_deviation}")
-
-# Customize the plot
-plt.title("Violin Plot of Difference")
-plt.xlabel("Difference")
-plt.ylabel("Density")
-plt.show()
\ No newline at end of file
diff --git a/test/unit_aero_rep_data/test_aero_rep_single_particle.F90 b/test/unit_aero_rep_data/test_aero_rep_single_particle.F90
index a0784b892..87e70f7c1 100644
--- a/test/unit_aero_rep_data/test_aero_rep_single_particle.F90
+++ b/test/unit_aero_rep_data/test_aero_rep_single_particle.F90
@@ -284,7 +284,6 @@ logical function build_aero_rep_data_set_test()
 
     deallocate(camp_state)
     deallocate(camp_core)
-
 #endif
 
   end function build_aero_rep_data_set_test
@@ -317,9 +316,7 @@ logical function eval_c_func(camp_core) result(passed)
     end select
 
     call camp_core%solver_initialize()
-
     camp_state => camp_core%new_state()
-
     camp_state%state_var(:) = 0.0
     call camp_state%env_states(1)%set_temperature_K(  298.0d0 )
     call camp_state%env_states(1)%set_pressure_Pa( 101325.0d0 )
diff --git a/test/unit_sub_model_data/test_sub_model_ZSR_aerosol_water.c b/test/unit_sub_model_data/test_sub_model_ZSR_aerosol_water.c
index 60018c051..88431c1ae 100644
--- a/test/unit_sub_model_data/test_sub_model_ZSR_aerosol_water.c
+++ b/test/unit_sub_model_data/test_sub_model_ZSR_aerosol_water.c
@@ -77,8 +77,8 @@ int test_sub_model_zsr_jac_calc(void *solver_data, double *state, double *env,
   double ppm_to_RH = pressure_Pa / water_vp / 1.0e6; // (1/ppm)
 
   // Water activity and d_aw / d_[H2O_g]
-  long double a_w = ppm_to_RH * CONC_H2O_G;
-  long double d_aw_d_wg = ppm_to_RH;
+  double a_w = ppm_to_RH * CONC_H2O_G;
+  double d_aw_d_wg = ppm_to_RH;
 
   // Jacobson ion pair
   double Y0 = -1.918004e2;
@@ -89,27 +89,27 @@ int test_sub_model_zsr_jac_calc(void *solver_data, double *state, double *env,
   double Y5 =  2.187103e4;
   double Y6 = -9.591577e3;
   double Y7 =  1.763672e3;
-  long double molality = Y0 + Y1*a_w + Y2*pow(a_w,2) + Y3*pow(a_w,3) + Y4*pow(a_w,4) +
+  double molality = Y0 + Y1*a_w + Y2*pow(a_w,2) + Y3*pow(a_w,3) + Y4*pow(a_w,4) +
                     Y5*pow(a_w,5) + Y6*pow(a_w,6) + Y7*pow(a_w,7);
-  long double d_molal_d_wg = Y1 + 2.0*Y2*a_w + 3.0*Y3*pow(a_w,2) + 4.0*Y4*pow(a_w,3) +
+  double d_molal_d_wg = Y1 + 2.0*Y2*a_w + 3.0*Y3*pow(a_w,2) + 4.0*Y4*pow(a_w,3) +
                         5.0*Y5*pow(a_w,4) + 6.0*Y6*pow(a_w,5) + 7.0*Y7*pow(a_w,6);
   d_molal_d_wg *= d_aw_d_wg;
 
-  long double cation = CONC_CA / MW_CA / 1000.0;
-  long double d_cation_d_C = 1.0 / MW_CA / 1000.0;
-  long double anion = CONC_CL / 2.0 / MW_CL / 1000.0;
-  long double d_anion_d_A = 1.0 / 2.0 / MW_CL / 1000.0;
+  double cation = CONC_CA / MW_CA / 1000.0;
+  double d_cation_d_C = 1.0 / MW_CA / 1000.0;
+  double anion = CONC_CL / 2.0 / MW_CL / 1000.0;
+  double d_anion_d_A = 1.0 / 2.0 / MW_CL / 1000.0;
 
-  long double e_ac = exp(ALPHA_ * cation);
-  long double e_aa = exp(ALPHA_ * anion );
-  long double conc = (cation * e_ac + anion * e_aa) / (e_ac + e_aa);
-  long double denom = (e_ac + e_aa) * (e_ac + e_aa);
-  long double d_conc_d_cation =
+  double e_ac = exp(ALPHA_ * cation);
+  double e_aa = exp(ALPHA_ * anion );
+  double conc = (cation * e_ac + anion * e_aa) / (e_ac + e_aa);
+  double denom = (e_ac + e_aa) * (e_ac + e_aa);
+  double d_conc_d_cation =
     (e_ac * e_ac +
      e_ac * e_aa *
      (1.0 - ALPHA_ * anion + ALPHA_ * cation)) /
     denom;
-  long double d_conc_d_anion =
+  double d_conc_d_anion =
     (e_aa * e_aa +
      e_ac * e_aa *
      (1.0 - ALPHA_ * cation + ALPHA_ * anion)) /