ZimmermanGroup · joshkamm · Jan 9, 2026 · Oct 13, 2025 · Oct 15, 2025 · Jan 8, 2026
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -7,32 +7,54 @@ target_include_directories(sgpu.exe PRIVATE
     "${CMAKE_SOURCE_DIR}/src/libcintw"
     "${CMAKE_SOURCE_DIR}/include"
     )
+
 set(TEST_COMP_FLAGS ${CMAKE_CXX_FLAGS})
+
+# CUDA/GPU dependencies - only when USE_ACC is enabled
 if(USE_ACC)
+    enable_language(CUDA)
+    find_package(CUDA REQUIRED)
+    find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+
     target_link_options(sgpu.exe PRIVATE 
         -acc=gpu
         )
     set(TEST_COMP_FLAGS ${TEST_COMP_FLAGS} 
         -DUSE_ACC=1 -acc=gpu
         )
+
+    # GPU-specific link libraries
+    set(GPU_LIBRARIES 
+        "${CUDART_LIBRARY}" 
+        "${CUDA_LIBRARIES}" 
+        "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" 
+        "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so"
+        )
+else()
+    # CPU-only mode
+    set(GPU_LIBRARIES "")
+    message(STATUS "Building CPU-only version (USE_ACC=OFF)")
 endif()
 
 #LAPACK cmake not right.
 #need to use this as a variable below
 find_package(LAPACK)
-
 find_package(BLAS)
-enable_language(CUDA)
-find_package(CUDA REQUIRED)
-find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-
-#enable_language(FORTRAN)
-#find_library(FORTRANLIBS fortranlibs ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 
 set_property(TARGET sgpu.exe PROPERTY CXX_STANDARD 14)
 target_compile_options(sgpu.exe PRIVATE ${TEST_COMP_FLAGS} -O2)
-#target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" )
-target_link_libraries(sgpu.exe PRIVATE SlaterGPU io cintw "${FORTRANLIBS}" "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so" "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs" )
+
+# Link libraries - GPU_LIBRARIES will be empty for CPU-only builds
+target_link_libraries(sgpu.exe PRIVATE 
+    SlaterGPU 
+    io 
+    cintw 
+    "${FORTRANLIBS}" 
+    ${GPU_LIBRARIES}
+    "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" 
+    "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs"
+    )
+
 file(COPY lih_VK1 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY lih_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY h2_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
@@ -42,6 +64,5 @@ file(COPY noplus_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY ch4_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 file(COPY run.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 
-# Add installation for the executable
 install(TARGETS sgpu.exe
-        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/examples/main.cpp b/examples/main.cpp
@@ -68,14 +68,20 @@ void compute_ps_integrals_to_disk(int natoms, int* atno, double* coords, vector<
   for (int j=0;j<N2;j++) Ssp[j] = Tsp[j] = Ensp[j] = pVpsp[j] = 0.;
 
   compute_STEn_ps(natoms,atno,coords,basis,nquad,nmu,nnu,nphi,Ssp,Tsp,Ensp,prl);
-  //compute_pVp_ps(natoms,atno,coords,basis,nquad,nmu,nnu,nphi,pVpsp,prl);
-  //compute_pVp_3c_ps(natoms,atno,coords,basis,nquad,nquad2,nsplit,nmu,nnu,nphi,pVpsp,prl);
+
+  bool x2c = read_int("X2C");
+  if (x2c)
+  {
+    compute_pVp_ps(natoms,atno,coords,basis,nquad,nmu,nnu,nphi,pVpsp,prl);
+    compute_pVp_3c_ps(natoms,atno,coords,basis,nquad,nquad2,nsplit,nmu,nnu,nphi,pVpsp,prl);
+  }
   compute_2c_ps(0,0,gamma,nbatch,natoms,atno,coords,basis_aux,nquad,nmu,nnu,nphi,Asp,prl);
   compute_3c_ps(0,0,gamma,nbatch,natoms,atno,coords,basis,basis_aux,nquad,nquad2,nsplit,nmu,nnu,nphi,Ensp,Csp,prl);
 
   if (prl > 0) printf("Printing PS Integral Files:\n");
   write_S_En_T(N,Ssp,Ensp,Tsp);
-  //write_square(N,pVpsp,"pVp",2);
+  if (x2c)
+    write_square(N,pVpsp,"pVp",2);
   write_square(Naux,Asp,"A",2);
   write_C(Naux,N2,Csp);
 
@@ -297,7 +303,9 @@ int main(int argc, char* argv[]) {
       if (prl > 0) printf("Printing Standard Integral Files:\n");
       write_S_En_T(N,S,En,T);
       write_square(Naux,A,"A",2);
-      write_square(N,pVp,"pVp",2);
+      bool x2c = read_int("X2C");
+      if (x2c)
+        write_square(N,pVp,"pVp",2);
       write_C(Naux, N2, C);
 
       if(c4 > 0)

diff --git a/include/cintwrapper.h b/include/cintwrapper.h
@@ -99,6 +99,10 @@ void get_overlap_ri(double * overlap, int Naux,
                     int natm, int nbas, int nbas_ri, int nenv,
                     int *atm, int* bas, double *env);
 
+void get_hcore(double *hcore, int N,
+               int natm, int nbas, int nenv,
+               int *atm, int *bas, double *env);
+
 void get_hcore(double *hcore, double *En, double *T, int N,
                int natm, int nbas, int nenv,
                int *atm, int *bas, double *env);

diff --git a/include/cpu_util.h b/include/cpu_util.h
@@ -52,6 +52,6 @@ void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K,
 void mat_times_mat_bt_cpu(float* C, float* A, float* B, int N);
 void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K);
 void mat_times_mat_bt_cpu(double* C, double* A, double* B, int N);
-
+void mat_times_mat_at_cpu(double* C, double* A, double* B, int M, int N, int K);
 
 #endif
diff --git a/include/cuda_util.h b/include/cuda_util.h
@@ -1,9 +1,16 @@
 #ifndef CUDA_UTILH
 #define CUDA_UTILH
 
-//#include <cublas.h>
+#if !USE_ACC
+#include <complex>
+typedef std::complex<double> cuDoubleComplex;
+typedef std::complex<float> cuFloatComplex;
+typedef void* cusolverDnHandle_t;
+typedef void* cublasHandle_t;
+#else
 #include <cublas_v2.h>
 #include <cusolverDn.h>
+#endif
 
 int invert_eigen_cusolver(int size, double* A, double eig_max, cusolverDnHandle_t& cu_hdl);
 int invert_stable_cusolver(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl);
@@ -41,4 +48,5 @@ void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first);
 void copy_to_all_gpu(int ngpu, int s1, float* A, int include_first);
 void copy_to_all_gpu(int ngpu, int s1, int s2, double** A, int include_first);
 
-#endif
+
+#endif // CUDA_UTILH
diff --git a/include/cusp.h b/include/cusp.h
@@ -24,6 +24,7 @@
 
 using namespace std;
 
+
 void compute_diatomic_symm(int natoms, int* atno, vector<vector<double> > basis, vector<double*>& pB_all, int prl);
 
 void compute_cusp(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* pB1, double* pB2, int prl);

diff --git a/include/integrals.h b/include/integrals.h
@@ -228,6 +228,7 @@ void copy_symm_4c_ps_cpu(int natoms, int* n2i, int N, double* olp);
 int get_natoms_with_basis(int natoms, int* atno, vector<vector<double> >& basis);
 
 vector<vector<double> > setup_integrals_gsgpu(vector<vector<double> >& basis_aux, int natoms, int* atno, double* coords, int& nbas, int& nenv, int& N, int& Naux, int& nbas_ri, int* &atm, int* &bas, double* &env, int prl);
+void compute_integrals_g(int natm, int nbas, int nenv, int N, int Naux, int nbas_ri, int* atm, int* bas, double* env, double* S, double* T, double* jH1, double* A, double* C, int prl);
 void compute_integrals_g(int natm, int nbas, int nenv, int N, int Naux, int nbas_ri, int* atm, int* bas, double* env, double* S, double* En, double* T, double* jH1, double* A, double* C, double* pvp, int prl);
 void compute_gaussian_integrals_to_disk(int N, int Naux, int natoms, int nbas, int nenv, int nbas_ri, int* atm, int* bas, double* env);
 

diff --git a/src/integrals/CMakeLists.txt b/src/integrals/CMakeLists.txt
@@ -1,4 +1,5 @@
-set(INTEGRALS_SOURCES
+# Common sources for both CPU and GPU builds
+set(INTEGRALS_COMMON_SOURCES
     integrals.cpp
     integrals_d.cpp
     integrals_aux.cpp
@@ -19,17 +20,27 @@ set(INTEGRALS_SOURCES
     ps_first_second_order.cpp
     cusp.cpp
     cpu_util.cpp
-    gpu_util.cpp
     grid_util.cpp
-    cuda_util.cpp
     scf_util.cpp
     symm.cpp
     gauss.cpp
     hess.cpp
     opt.cpp
     jellium_ints.cpp
+    gpu_util.cpp
+    cuda_util.cpp
     )
 
+
+# Combine sources based on USE_ACC flag
+if(USE_ACC)
+    set(INTEGRALS_SOURCES ${INTEGRALS_COMMON_SOURCES})
+    message(STATUS "Building SlaterGPU library with GPU support")
+else()
+    set(INTEGRALS_SOURCES ${INTEGRALS_COMMON_SOURCES})
+    message(STATUS "Building SlaterGPU library for CPU-only")
+endif()
+
 add_library(SlaterGPU STATIC "${INTEGRALS_SOURCES}")
 target_include_directories(SlaterGPU PUBLIC
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
@@ -52,4 +63,4 @@ else()
     target_compile_options(SlaterGPU PRIVATE
         ${SLATER_COMP_FLAGS} -DUSE_ACC=0
         )
-endif()
+endif()
diff --git a/src/integrals/cpu_util.cpp b/src/integrals/cpu_util.cpp
@@ -1120,3 +1120,23 @@ void cross(double* m, double* r1, double* r2)
 
   return;
 }
+
+// Extended version of mat_times_mat_at_cpu with M, N, K parameters
+void mat_times_mat_at_cpu(double* C, double* A, double* B, int M, int N, int K)
+{
+  // C = A^T * B
+  // A is K x M, B is K x N, C is M x N
+
+  int LDA = M;
+  int LDB = N;
+  int LDC = N;
+
+  double ALPHA = 1.0;
+  double BETA = 0.0;
+
+  char TA = 'T';
+  char TB = 'N';
+  dgemm_(&TB,&TA,&N,&M,&K,&ALPHA,B,&LDB,A,&LDA,&BETA,C,&LDC);
+
+  return;
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,6 +24,7 @@

		using namespace std;


		void compute_diatomic_symm(int natoms, int* atno, vector<vector<double> > basis, vector<double*>& pB_all, int prl);

		void compute_cusp(int natoms, int* atno, double* coords, vector<vector<double> > &basis, double* pB1, double* pB2, int prl);
Expand Down