-
Notifications
You must be signed in to change notification settings - Fork 0
Support gpu #42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Support gpu #42
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,4 +15,4 @@ | |
| add_executable(velox_gpu_hash_table_test HashTableTest.cu) | ||
| target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags) | ||
| set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES | ||
| native) | ||
| 75) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you please clarify the reason for these changes, as well as for the following similar changes. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,10 +19,14 @@ set(CMAKE_PREFIX_PATH "$CONDA_PREFIX") | |
|
|
||
| find_package(Torch REQUIRED) | ||
| find_package(xgboost REQUIRED) | ||
| find_package(CUDA REQUIRED) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This may lead to a compilation error when compiling in a CPU-only configuration. I see Velox provides a flag, VELOX_ENABLE_GPU. I think it would be better to guard this configuration code with the VELOX_ENABLE_GPU flag. |
||
|
|
||
| add_library(mat_mul_cublas STATIC tests/GPUFunctions.cu) | ||
| target_link_libraries(mat_mul_cublas cublas ${CUDA_LIBRARIES}) | ||
| add_executable(ml_functions_test tests/MLFunctionsTest.cpp) | ||
| target_link_libraries( | ||
| ml_functions_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -40,6 +44,7 @@ target_link_libraries( | |
| add_executable(nn_tests tests/NNTest.cpp) | ||
| target_link_libraries( | ||
| nn_tests | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -57,6 +62,7 @@ target_link_libraries( | |
| add_executable(embedding_test tests/EmbeddingTest.cpp) | ||
| target_link_libraries( | ||
| embedding_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -74,6 +80,7 @@ target_link_libraries( | |
| add_executable(two_tower_model_test tests/TwoTowerModelTest.cpp) | ||
| target_link_libraries( | ||
| two_tower_model_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -92,6 +99,7 @@ target_link_libraries( | |
| add_executable(two_tower_model_pipeline_test tests/TowTowerModelPipelineTest.cpp) | ||
| target_link_libraries( | ||
| two_tower_model_pipeline_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -110,6 +118,7 @@ target_link_libraries( | |
| add_executable(decision_forest_prediction_test tests/DecisionForestTest.cpp) | ||
| target_link_libraries( | ||
| decision_forest_prediction_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -145,6 +154,7 @@ target_link_libraries( | |
| add_executable(ml_sql_test tests/MLSQLTest.cpp) | ||
| target_link_libraries( | ||
| ml_sql_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
@@ -162,6 +172,7 @@ target_link_libraries( | |
| add_executable(array_array_unnest_test tests/ArrayofArrayUnnestTest.cpp) | ||
| target_link_libraries( | ||
| array_array_unnest_test | ||
| mat_mul_cublas | ||
| velox_aggregates | ||
| velox_type | ||
| velox_vector | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| #pragma once | ||
|
|
||
| #include <iostream> | ||
| #include <cstdlib> | ||
| #include <cublas_v2.h> | ||
|
|
||
| template <typename T> | ||
| struct CublasType {}; | ||
|
|
||
| template <> | ||
| struct CublasType<float> { | ||
| static const cudaDataType_t type = CUDA_R_32F; | ||
| static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; | ||
| }; | ||
|
|
||
| template <> | ||
| struct CublasType<double> { | ||
| static const cudaDataType_t type = CUDA_R_64F; | ||
| static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; | ||
| }; | ||
|
|
||
| template <typename T> | ||
| void multiplyMatrices(int m, int n, int k, | ||
| const T* A, int lda, const T* B, int ldb, | ||
| T* C, int ldc); | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| #include "velox/ml_functions/gpufunctions.h" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please move your .cu file to the ml_functions folder. |
||
| #include "velox/experimental/gpu/Common.h" | ||
|
|
||
| #define CUBLAS_ERROR(x) do { if((x)!=CUBLAS_STATUS_SUCCESS) { \ | ||
| printf("Error %s at %s:%d\n", cublasGetStatusString(x), __FILE__, __LINE__);\ | ||
| exit(EXIT_FAILURE);} } while(0) | ||
| template <typename T> | ||
| void multiplyMatrices(int m, int n, int k, | ||
| const T* A, int lda, const T* B, int ldb, | ||
| T* C, int ldc) { | ||
|
|
||
| cublasHandle_t handle; | ||
| CUBLAS_ERROR(cublasCreate(&handle)); | ||
| // Allocate device memory | ||
| T *d_A, *d_B, *d_C; | ||
| CUDA_CHECK_FATAL(cudaMalloc((void**)&d_A, m * k * sizeof(T))); | ||
| CUDA_CHECK_FATAL(cudaMalloc((void**)&d_B, k * n * sizeof(T))); | ||
| CUDA_CHECK_FATAL(cudaMalloc((void**)&d_C, m * n * sizeof(T))); | ||
|
|
||
| // Copy data from host to device | ||
| CUDA_CHECK_FATAL(cudaMemcpy(d_A, A, m * k * sizeof(T), cudaMemcpyHostToDevice)); | ||
| CUDA_CHECK_FATAL(cudaMemcpy(d_B, B, k * n * sizeof(T), cudaMemcpyHostToDevice)); | ||
|
|
||
| T alpha = 1.0; | ||
| T beta = 0.0; | ||
| // Perform matrix multiplication on GPU | ||
| cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, | ||
| n, m, k, &alpha, | ||
| d_B, CublasType<T>::type, n, | ||
| d_A, CublasType<T>::type, k, | ||
| &beta, d_C, CublasType<T>::type, n, | ||
| CublasType<T>::type, CublasType<T>::algo); | ||
|
|
||
| CUBLAS_ERROR(status); | ||
|
|
||
| // Copy result from device to host | ||
| CUDA_CHECK_FATAL(cudaMemcpy(C, d_C, m * n * sizeof(T), cudaMemcpyDeviceToHost)); | ||
|
|
||
| // Free device memory | ||
| CUDA_CHECK_LOG(cudaFree(d_A)); | ||
| CUDA_CHECK_LOG(cudaFree(d_B)); | ||
| CUDA_CHECK_LOG(cudaFree(d_C)); | ||
|
|
||
| // Destroy cuBLAS handle | ||
| CUBLAS_ERROR(cublasDestroy(handle)); | ||
| } | ||
|
|
||
| // Explicit instantiation for float and double | ||
| template void multiplyMatrices<float>(int, int, int, | ||
| const float*, int, const float*, int, float*, int); | ||
| template void multiplyMatrices<double>(int, int, int, | ||
| const double*, int, const double*, int, double*, int); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please discard the hard-coded num_threads, it should be set automatically