diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 14be0eb..82b0f85 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -7,32 +7,54 @@ target_include_directories(sgpu.exe PRIVATE "${CMAKE_SOURCE_DIR}/src/libcintw" "${CMAKE_SOURCE_DIR}/include" ) + set(TEST_COMP_FLAGS ${CMAKE_CXX_FLAGS}) + +# CUDA/GPU dependencies - only when USE_ACC is enabled if(USE_ACC) + enable_language(CUDA) + find_package(CUDA REQUIRED) + find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + target_link_options(sgpu.exe PRIVATE -acc=gpu ) set(TEST_COMP_FLAGS ${TEST_COMP_FLAGS} -DUSE_ACC=1 -acc=gpu ) + + # GPU-specific link libraries + set(GPU_LIBRARIES + "${CUDART_LIBRARY}" + "${CUDA_LIBRARIES}" + "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" + "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so" + ) +else() + # CPU-only mode + set(GPU_LIBRARIES "") + message(STATUS "Building CPU-only version (USE_ACC=OFF)") endif() #LAPACK cmake not right. #need to use this as a variable below find_package(LAPACK) - find_package(BLAS) -enable_language(CUDA) -find_package(CUDA REQUIRED) -find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) - -#enable_language(FORTRAN) -#find_library(FORTRANLIBS fortranlibs ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) set_property(TARGET sgpu.exe PROPERTY CXX_STANDARD 14) target_compile_options(sgpu.exe PRIVATE ${TEST_COMP_FLAGS} -O2) -#target_link_libraries(sgpu.exe PRIVATE SlaterGPU io "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" ) -target_link_libraries(sgpu.exe PRIVATE SlaterGPU io cintw "${FORTRANLIBS}" "${CUDART_LIBRARY}" "${CUDA_LIBRARIES}" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcublas.so" "${CUDA_TOOLKIT_ROOT_DIR}/../../math_libs/lib64/libcusolver.so" "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs" ) + +# Link libraries - GPU_LIBRARIES will be empty for CPU-only builds +target_link_libraries(sgpu.exe PRIVATE + SlaterGPU + io + cintw + "${FORTRANLIBS}" + ${GPU_LIBRARIES} + "-L/home/paulzim/integrate/gpu/lib/lapack-3.9.0/lib/ -llapack -lblas" + "-L/export/apps/RockyOS8/nvidia_hpc/2024_249/Linux_x86_64/24.9/ -fortranlibs" + ) + file(COPY lih_VK1 DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(COPY lih_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(COPY h2_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) @@ -42,6 +64,5 @@ file(COPY noplus_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(COPY ch4_ri5 DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(COPY run.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) -# Add installation for the executable install(TARGETS sgpu.exe - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/examples/main.cpp b/examples/main.cpp index 26baeab..d48726d 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -68,14 +68,20 @@ void compute_ps_integrals_to_disk(int natoms, int* atno, double* coords, vector< for (int j=0;j 0) printf("Printing PS Integral Files:\n"); write_S_En_T(N,Ssp,Ensp,Tsp); - //write_square(N,pVpsp,"pVp",2); + if (x2c) + write_square(N,pVpsp,"pVp",2); write_square(Naux,Asp,"A",2); write_C(Naux,N2,Csp); @@ -297,7 +303,9 @@ int main(int argc, char* argv[]) { if (prl > 0) printf("Printing Standard Integral Files:\n"); write_S_En_T(N,S,En,T); write_square(Naux,A,"A",2); - write_square(N,pVp,"pVp",2); + bool x2c = read_int("X2C"); + if (x2c) + write_square(N,pVp,"pVp",2); write_C(Naux, N2, C); if(c4 > 0) diff --git a/include/cintwrapper.h b/include/cintwrapper.h index 8e6d642..041f835 100644 --- a/include/cintwrapper.h +++ b/include/cintwrapper.h @@ -99,6 +99,10 @@ void get_overlap_ri(double * overlap, int Naux, int natm, int nbas, int nbas_ri, int nenv, int *atm, int* bas, double *env); +void get_hcore(double *hcore, int N, + int natm, int nbas, int nenv, + int *atm, int *bas, double *env); + void get_hcore(double *hcore, double *En, double *T, int N, int natm, int nbas, int nenv, int *atm, int *bas, double *env); diff --git a/include/cpu_util.h b/include/cpu_util.h index 9a259d7..0fe6f21 100644 --- a/include/cpu_util.h +++ b/include/cpu_util.h @@ -52,6 +52,6 @@ void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K, void mat_times_mat_bt_cpu(float* C, float* A, float* B, int N); void mat_times_mat_bt_cpu(double* C, double* A, double* B, int M, int N, int K); void mat_times_mat_bt_cpu(double* C, double* A, double* B, int N); - +void mat_times_mat_at_cpu(double* C, double* A, double* B, int M, int N, int K); #endif diff --git a/include/cuda_util.h b/include/cuda_util.h index 7e35284..3f58030 100644 --- a/include/cuda_util.h +++ b/include/cuda_util.h @@ -1,9 +1,16 @@ #ifndef CUDA_UTILH #define CUDA_UTILH -//#include +#if !USE_ACC +#include +typedef std::complex cuDoubleComplex; +typedef std::complex cuFloatComplex; +typedef void* cusolverDnHandle_t; +typedef void* cublasHandle_t; +#else #include #include +#endif int invert_eigen_cusolver(int size, double* A, double eig_max, cusolverDnHandle_t& cu_hdl); int invert_stable_cusolver(int size, double* A, double delta, cusolverDnHandle_t& cu_hdl); @@ -41,4 +48,5 @@ void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first); void copy_to_all_gpu(int ngpu, int s1, float* A, int include_first); void copy_to_all_gpu(int ngpu, int s1, int s2, double** A, int include_first); -#endif + +#endif // CUDA_UTILH \ No newline at end of file diff --git a/include/cusp.h b/include/cusp.h index 2b070b7..fcefc37 100644 --- a/include/cusp.h +++ b/include/cusp.h @@ -24,6 +24,7 @@ using namespace std; + void compute_diatomic_symm(int natoms, int* atno, vector > basis, vector& pB_all, int prl); void compute_cusp(int natoms, int* atno, double* coords, vector > &basis, double* pB1, double* pB2, int prl); diff --git a/include/integrals.h b/include/integrals.h index b475b07..b6a0a57 100644 --- a/include/integrals.h +++ b/include/integrals.h @@ -228,6 +228,7 @@ void copy_symm_4c_ps_cpu(int natoms, int* n2i, int N, double* olp); int get_natoms_with_basis(int natoms, int* atno, vector >& basis); vector > setup_integrals_gsgpu(vector >& basis_aux, int natoms, int* atno, double* coords, int& nbas, int& nenv, int& N, int& Naux, int& nbas_ri, int* &atm, int* &bas, double* &env, int prl); +void compute_integrals_g(int natm, int nbas, int nenv, int N, int Naux, int nbas_ri, int* atm, int* bas, double* env, double* S, double* T, double* jH1, double* A, double* C, int prl); void compute_integrals_g(int natm, int nbas, int nenv, int N, int Naux, int nbas_ri, int* atm, int* bas, double* env, double* S, double* En, double* T, double* jH1, double* A, double* C, double* pvp, int prl); void compute_gaussian_integrals_to_disk(int N, int Naux, int natoms, int nbas, int nenv, int nbas_ri, int* atm, int* bas, double* env); diff --git a/src/integrals/CMakeLists.txt b/src/integrals/CMakeLists.txt index c11e217..0c0fcc3 100644 --- a/src/integrals/CMakeLists.txt +++ b/src/integrals/CMakeLists.txt @@ -1,4 +1,5 @@ -set(INTEGRALS_SOURCES +# Common sources for both CPU and GPU builds +set(INTEGRALS_COMMON_SOURCES integrals.cpp integrals_d.cpp integrals_aux.cpp @@ -19,17 +20,27 @@ set(INTEGRALS_SOURCES ps_first_second_order.cpp cusp.cpp cpu_util.cpp - gpu_util.cpp grid_util.cpp - cuda_util.cpp scf_util.cpp symm.cpp gauss.cpp hess.cpp opt.cpp jellium_ints.cpp + gpu_util.cpp + cuda_util.cpp ) + +# Combine sources based on USE_ACC flag +if(USE_ACC) + set(INTEGRALS_SOURCES ${INTEGRALS_COMMON_SOURCES}) + message(STATUS "Building SlaterGPU library with GPU support") +else() + set(INTEGRALS_SOURCES ${INTEGRALS_COMMON_SOURCES}) + message(STATUS "Building SlaterGPU library for CPU-only") +endif() + add_library(SlaterGPU STATIC "${INTEGRALS_SOURCES}") target_include_directories(SlaterGPU PUBLIC $ @@ -52,4 +63,4 @@ else() target_compile_options(SlaterGPU PRIVATE ${SLATER_COMP_FLAGS} -DUSE_ACC=0 ) -endif() +endif() \ No newline at end of file diff --git a/src/integrals/cpu_util.cpp b/src/integrals/cpu_util.cpp index 44b39dc..7afd923 100644 --- a/src/integrals/cpu_util.cpp +++ b/src/integrals/cpu_util.cpp @@ -1120,3 +1120,23 @@ void cross(double* m, double* r1, double* r2) return; } + +// Extended version of mat_times_mat_at_cpu with M, N, K parameters +void mat_times_mat_at_cpu(double* C, double* A, double* B, int M, int N, int K) +{ + // C = A^T * B + // A is K x M, B is K x N, C is M x N + + int LDA = M; + int LDB = N; + int LDC = N; + + double ALPHA = 1.0; + double BETA = 0.0; + + char TA = 'T'; + char TB = 'N'; + dgemm_(&TB,&TA,&N,&M,&K,&ALPHA,B,&LDB,A,&LDA,&BETA,C,&LDC); + + return; +} \ No newline at end of file diff --git a/src/integrals/cuda_util.cpp b/src/integrals/cuda_util.cpp index 98327e0..d3c6763 100644 --- a/src/integrals/cuda_util.cpp +++ b/src/integrals/cuda_util.cpp @@ -1,3 +1,5 @@ +//Vaibhav needs to fix this + #include #include #include @@ -10,7 +12,10 @@ #include "cuda_util.h" #include "cpu_util.h" +#if USE_ACC #include "cuda_runtime.h" +#include +#endif //these functions assume the matrices are already on the device, via ACC //hoping to reduce overhead by getting cusolver handle just once @@ -23,21 +28,32 @@ void print_square_sm(int N, float* A); void print_square_sm(int N, double* A); double read_float(string filename); -#include + void print_square_complex(int N, cuDoubleComplex* A) { + #if !USE_ACC + printf("\n ERROR: there is no CPU implementation for print_square_complex \n"); exit(-1); + #else for (int i=0;i > setup_integrals_gsgpu(vector >& basis_aux return basis; } +void compute_integrals_g(int natm, int nbas, int nenv, int N, int Naux, int nbas_ri, int* atm, int* bas, double* env, double* S, double* T, double* jH1, double* A, double* C, int prl) +{ + get_overlap(S, N, natm, nbas, nenv, atm, bas, env); + get_hcore(jH1, N, natm, nbas, nenv, atm, bas, env); + if (T!=NULL) + get_tcore(T, N, natm, nbas, nenv, atm, bas, env); + + if (prl>1) + { + printf("\n S: \n"); + print_square(N,S); + printf("\n jH1: \n"); + print_square(N,jH1); + } + + if (Naux>0) + { + gen_eri_2c(A, Naux, natm, nbas, nenv, nbas_ri, atm, bas, env); + gen_eri_3c(C, N, Naux, natm, nbas, nenv, nbas_ri, atm, bas, env); + } + else + { + printf(" no auxiliary basis \n"); + } + + if (prl>1) + { + printf("\n A: \n"); + for (int m=0;m1) { diff --git a/src/integrals/integrals_d.cpp b/src/integrals/integrals_d.cpp index c9cc065..c494e99 100644 --- a/src/integrals/integrals_d.cpp +++ b/src/integrals/integrals_d.cpp @@ -8,7 +8,9 @@ #include void auto_crash(); void print_duration(chrono::high_resolution_clock::time_point t1, chrono::high_resolution_clock::time_point t2, string name); -void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first); +#if USE_ACC + void copy_to_all_gpu(int ngpu, int s1, double* A, int include_first); +#endif void gather_12_d_En_0(int s1, int s2, int gs, int iN, float** valtx1, float** valtx2, float** valS1x, float** valS2x, float* wtt1, float* wtt2) { @@ -292,7 +294,9 @@ void compute_d_3c_para(int npgu, int natoms, int* atno, float* coords, vector