From 27cba60d24bb87cfc7fa6130b9f4061fe52385fb Mon Sep 17 00:00:00 2001 From: xllgit Date: Thu, 14 Mar 2024 18:17:45 +0000 Subject: [PATCH 1/2] Update for gpu matrix multiply support --- Makefile | 3 +- velox/cost_model/CMakeLists.txt | 1 + velox/experimental/gpu/tests/CMakeLists.txt | 2 +- velox/experimental/wave/common/CMakeLists.txt | 2 +- .../wave/common/tests/CMakeLists.txt | 2 +- velox/experimental/wave/exec/CMakeLists.txt | 2 +- .../wave/exec/tests/CMakeLists.txt | 2 +- .../wave/vector/tests/CMakeLists.txt | 2 +- velox/ml_functions/CMakeLists.txt | 11 ++++ velox/ml_functions/functions.h | 29 +++++++--- velox/ml_functions/gpufunctions.h | 26 +++++++++ velox/ml_functions/tests/GPUFunctions.cu | 53 +++++++++++++++++++ velox/ml_functions/tests/MLFunctionsTest.cpp | 28 ++++++---- velox/optimizer/CMakeLists.txt | 9 ++++ 14 files changed, 148 insertions(+), 24 deletions(-) create mode 100644 velox/ml_functions/gpufunctions.h create mode 100644 velox/ml_functions/tests/GPUFunctions.cu diff --git a/Makefile b/Makefile index 3fb5eb15d..6f5d544db 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,8 @@ GENERATOR += -DVELOX_FORCE_COLORED_OUTPUT=ON endif endif -NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1) +#NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1) +NUM_THREADS = 2 CPU_TARGET ?= "avx" FUZZER_SEED ?= 123456 diff --git a/velox/cost_model/CMakeLists.txt b/velox/cost_model/CMakeLists.txt index dfac2fcdc..3a780a61d 100644 --- a/velox/cost_model/CMakeLists.txt +++ b/velox/cost_model/CMakeLists.txt @@ -23,6 +23,7 @@ find_package(Torch REQUIRED) add_executable(nn_test tests/NNTests.cpp) target_link_libraries( nn_test + mat_mul_cublas velox_aggregates velox_type velox_vector diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt index 8eaca86e7..2ed0f5eee 100644 --- a/velox/experimental/gpu/tests/CMakeLists.txt +++ b/velox/experimental/gpu/tests/CMakeLists.txt @@ -15,4 +15,4 @@ add_executable(velox_gpu_hash_table_test HashTableTest.cu) target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags) set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES - native) + 75) diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt index 205d94533..b69eaff28 100644 --- a/velox/experimental/wave/common/CMakeLists.txt +++ b/velox/experimental/wave/common/CMakeLists.txt @@ -15,7 +15,7 @@ add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp Type.cpp) -set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native) +set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES 75) target_link_libraries(velox_wave_common velox_exception velox_common_base velox_type) diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt index 159261e72..e5cb28574 100644 --- a/velox/experimental/wave/common/tests/CMakeLists.txt +++ b/velox/experimental/wave/common/tests/CMakeLists.txt @@ -16,7 +16,7 @@ add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu BlockTest.cpp BlockTest.cu) set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES - native) + 75) add_test(velox_wave_common_test velox_wave_common_test) diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt index a346b917a..39147e0da 100644 --- a/velox/experimental/wave/exec/CMakeLists.txt +++ b/velox/experimental/wave/exec/CMakeLists.txt @@ -26,7 +26,7 @@ add_library( Wave.cpp Project.cpp) -set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native) +set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES 75) target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common velox_exception velox_common_base velox_exec) diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index 74feddf6c..ad1ce9c43 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -14,7 +14,7 @@ add_executable(velox_wave_exec_test FilterProjectTest.cpp Main.cpp) -set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native) +set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES 75) add_test(velox_wave_exec_test velox_wave_exec_test) diff --git a/velox/experimental/wave/vector/tests/CMakeLists.txt b/velox/experimental/wave/vector/tests/CMakeLists.txt index 2d4201c4c..f2966ac0e 100644 --- a/velox/experimental/wave/vector/tests/CMakeLists.txt +++ b/velox/experimental/wave/vector/tests/CMakeLists.txt @@ -15,7 +15,7 @@ add_executable(velox_wave_vector_test VectorTest.cpp) set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES - native) + 75) add_test(veloxwave__vector_test velox_wave_vector_test) diff --git a/velox/ml_functions/CMakeLists.txt b/velox/ml_functions/CMakeLists.txt index 65375e80a..3755df137 100644 --- a/velox/ml_functions/CMakeLists.txt +++ b/velox/ml_functions/CMakeLists.txt @@ -19,10 +19,14 @@ set(CMAKE_PREFIX_PATH "$CONDA_PREFIX") find_package(Torch REQUIRED) find_package(xgboost REQUIRED) +find_package(CUDA REQUIRED) +add_library(mat_mul_cublas STATIC tests/GPUFunctions.cu) +target_link_libraries(mat_mul_cublas cublas ${CUDA_LIBRARIES}) add_executable(ml_functions_test tests/MLFunctionsTest.cpp) target_link_libraries( ml_functions_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -40,6 +44,7 @@ target_link_libraries( add_executable(nn_tests tests/NNTest.cpp) target_link_libraries( nn_tests + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -57,6 +62,7 @@ target_link_libraries( add_executable(embedding_test tests/EmbeddingTest.cpp) target_link_libraries( embedding_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -74,6 +80,7 @@ target_link_libraries( add_executable(two_tower_model_test tests/TwoTowerModelTest.cpp) target_link_libraries( two_tower_model_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -92,6 +99,7 @@ target_link_libraries( add_executable(two_tower_model_pipeline_test tests/TowTowerModelPipelineTest.cpp) target_link_libraries( two_tower_model_pipeline_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -110,6 +118,7 @@ target_link_libraries( add_executable(decision_forest_prediction_test tests/DecisionForestTest.cpp) target_link_libraries( decision_forest_prediction_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -145,6 +154,7 @@ target_link_libraries( add_executable(ml_sql_test tests/MLSQLTest.cpp) target_link_libraries( ml_sql_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -162,6 +172,7 @@ target_link_libraries( add_executable(array_array_unnest_test tests/ArrayofArrayUnnestTest.cpp) target_link_libraries( array_array_unnest_test + mat_mul_cublas velox_aggregates velox_type velox_vector diff --git a/velox/ml_functions/functions.h b/velox/ml_functions/functions.h index a3c4cedd4..6d79fc885 100644 --- a/velox/ml_functions/functions.h +++ b/velox/ml_functions/functions.h @@ -7,6 +7,7 @@ #include "velox/exec/Task.h" #include "velox/cost_model/CostEstimate.h" #include "velox/cost_model/UdfCostCoefficient.h" +#include "velox/ml_functions/gpufunctions.h" using namespace facebook::velox; @@ -90,17 +91,30 @@ class MatrixMultiply: public MLFunction { use_gpu = args[1]->as>()->valueAt(0); } + BaseVector::ensureWritable(rows, type, context.pool(), output); + + auto input_elements = args[0]->as()->elements(); + float* input_values = input_elements->values()->asMutable(); + int input_size = input_elements->size(); // results are expected to be stored as std::vector> std::vector> result; + int rows_A = input_size / dims[0]; + int cols_A = dims[0]; + int rows_B = dims[0]; + int cols_B = dims[1]; if (use_gpu) { - // TODO: implementation of matrix multiplication in GPU - throw std::runtime_error("GPU implementation of Matrix Multiple is not implemented."); + float *host_C = (float*) malloc(rows_A * cols_B * sizeof(float)); + multiplyMatrices(rows_A, cols_B, cols_A, + input_values, rows_A, weights_, cols_A, host_C, rows_A); + float *C = host_C; + for (int i = 0; i < rows_A; i++) { + std::vector row{C, C + cols_B}; + result.push_back(row); + C = C + cols_B; + } + free(host_C); + } else { - BaseVector::ensureWritable(rows, type, context.pool(), output); - - auto input_elements = args[0]->as()->elements(); - float* input_values = input_elements->values()->asMutable(); - int input_size = input_elements->size(); Eigen::Map> m1(input_values, input_size/dims[0], dims[0]); Eigen::Map> m2(weights_, dims[0], dims[1]); @@ -116,6 +130,7 @@ class MatrixMultiply: public MLFunction { result.push_back(row); } } + VectorMaker maker{context.pool()}; output = maker.arrayVector(result, REAL()); } diff --git a/velox/ml_functions/gpufunctions.h b/velox/ml_functions/gpufunctions.h new file mode 100644 index 000000000..a0421657d --- /dev/null +++ b/velox/ml_functions/gpufunctions.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +template +struct CublasType {}; + +template <> +struct CublasType { + static const cudaDataType_t type = CUDA_R_32F; + static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; +}; + +template <> +struct CublasType { + static const cudaDataType_t type = CUDA_R_64F; + static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; +}; + +template +void multiplyMatrices(int m, int n, int k, + const T* A, int lda, const T* B, int ldb, + T* C, int ldc); + diff --git a/velox/ml_functions/tests/GPUFunctions.cu b/velox/ml_functions/tests/GPUFunctions.cu new file mode 100644 index 000000000..e07cc56ba --- /dev/null +++ b/velox/ml_functions/tests/GPUFunctions.cu @@ -0,0 +1,53 @@ +#include "velox/ml_functions/gpufunctions.h" +#include "velox/experimental/gpu/Common.h" + +#define CUBLAS_ERROR(x) do { if((x)!=CUBLAS_STATUS_SUCCESS) { \ + printf("Error %s at %s:%d\n", cublasGetStatusString(x), __FILE__, __LINE__);\ + exit(EXIT_FAILURE);} } while(0) +template +void multiplyMatrices(int m, int n, int k, + const T* A, int lda, const T* B, int ldb, + T* C, int ldc) { + + cublasHandle_t handle; + CUBLAS_ERROR(cublasCreate(&handle)); + // Allocate device memory + T *d_A, *d_B, *d_C; + CUDA_CHECK_FATAL(cudaMalloc((void**)&d_A, m * k * sizeof(T))); + CUDA_CHECK_FATAL(cudaMalloc((void**)&d_B, k * n * sizeof(T))); + CUDA_CHECK_FATAL(cudaMalloc((void**)&d_C, m * n * sizeof(T))); + + // Copy data from host to device + CUDA_CHECK_FATAL(cudaMemcpy(d_A, A, m * k * sizeof(T), cudaMemcpyHostToDevice)); + CUDA_CHECK_FATAL(cudaMemcpy(d_B, B, k * n * sizeof(T), cudaMemcpyHostToDevice)); + + T alpha = 1.0; + T beta = 0.0; + // Perform matrix multiplication on GPU + cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, + n, m, k, &alpha, + d_B, CublasType::type, n, + d_A, CublasType::type, k, + &beta, d_C, CublasType::type, n, + CublasType::type, CublasType::algo); + + CUBLAS_ERROR(status); + + // Copy result from device to host + CUDA_CHECK_FATAL(cudaMemcpy(C, d_C, m * n * sizeof(T), cudaMemcpyDeviceToHost)); + + // Free device memory + CUDA_CHECK_LOG(cudaFree(d_A)); + CUDA_CHECK_LOG(cudaFree(d_B)); + CUDA_CHECK_LOG(cudaFree(d_C)); + + // Destroy cuBLAS handle + CUBLAS_ERROR(cublasDestroy(handle)); +} + +// Explicit instantiation for float and double +template void multiplyMatrices(int, int, int, + const float*, int, const float*, int, float*, int); +template void multiplyMatrices(int, int, int, + const double*, int, const double*, int, double*, int); + diff --git a/velox/ml_functions/tests/MLFunctionsTest.cpp b/velox/ml_functions/tests/MLFunctionsTest.cpp index c70e06bbf..e5155fccc 100644 --- a/velox/ml_functions/tests/MLFunctionsTest.cpp +++ b/velox/ml_functions/tests/MLFunctionsTest.cpp @@ -43,6 +43,8 @@ #include "velox/exec/tests/utils/TempDirectoryPath.h" #include "velox/common/memory/MemoryArbitrator.h" #include "velox/vector/fuzzer/VectorFuzzer.h" +#include "velox/ml_functions/gpufunctions.h" +#include @@ -79,7 +81,8 @@ class MLFunctionsTest : public HiveConnectorTestBase { /// Run the demo. void run(); - void test_mat_mul(); + void test_mat_mul(int, int, int); + //void test_mat_mul(); void test_mat_add(); void test_relu(); void test_softmax(); @@ -168,17 +171,17 @@ class MLFunctionsTest : public HiveConnectorTestBase { }; -void MLFunctionsTest::test_mat_mul() { +void MLFunctionsTest::test_mat_mul(int output_size, int input_size, int num_samples) { //Eigen::setNbThreads(48); - int output_size = 500; - int input_size = 100; - int num_samples = 500; + //int output_size = 2000; + //int input_size = 4000; + //int num_samples = 3000; int size = output_size*input_size; auto weights = maker.flatVector(size); auto col = maker.flatVector(num_samples); for(int i=0; i < size; i++){ - weights->set(i, i*10); + weights->set(i, i/1000.0); } std::vector> featureVectors; @@ -186,7 +189,7 @@ void MLFunctionsTest::test_mat_mul() { col->set(i, i* 7 - i*(i%3)); std::vector featureVector; for(int j=0; j < input_size; j++){ - featureVector.push_back(i*j); + featureVector.push_back(i*j/5000.0); } featureVectors.push_back(featureVector); } @@ -203,9 +206,9 @@ void MLFunctionsTest::test_mat_mul() { auto myPlan = exec::test::PlanBuilder(pool_.get()) .values({inputRowVector}) // using CPU for mat_mul - .project({"mat_mul(x)"}) + //.project({"mat_mul(x)"}) // using GPU for mat_mul - // .project({"mat_mul(x, true)"}) + .project({"mat_mul(x, true)"}) .planNode(); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); @@ -1725,7 +1728,12 @@ void MLFunctionsTest::test_land_cover_conv3() { } void MLFunctionsTest::run() { - test_mat_mul(); + int output = 100; + int input = 500; + for (int i = 10; i < 1000001; i *= 10){ + test_mat_mul(output, input, i); + + } // test_mat_add(); // test_relu(); // test_softmax() diff --git a/velox/optimizer/CMakeLists.txt b/velox/optimizer/CMakeLists.txt index abcd3d13d..764ebc432 100644 --- a/velox/optimizer/CMakeLists.txt +++ b/velox/optimizer/CMakeLists.txt @@ -27,6 +27,7 @@ find_package(jsoncpp REQUIRED) add_executable(velox_op ${SOURCES}) target_link_libraries( velox_op + mat_mul_cublas velox_type velox_vector velox_exec @@ -42,6 +43,7 @@ target_link_libraries( add_executable(traversal_test TraversalTest.cpp) target_link_libraries( traversal_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -61,6 +63,7 @@ target_link_libraries( add_executable(torch2twolayer_test TorchNN2TwoLayerUDFRewriteActionTest.cpp) target_link_libraries( torch2twolayer_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -78,6 +81,7 @@ target_link_libraries( add_executable(twolayer2torch_test TwoLayerUDF2TorchNNRewriteActionTest.cpp) target_link_libraries( twolayer2torch_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -95,6 +99,7 @@ target_link_libraries( add_executable(multilayer2torch_test MultiLayerUDF2TorchNNRewriteActionTest.cpp) target_link_libraries( multilayer2torch_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -113,6 +118,7 @@ target_link_libraries( add_executable(decisionforest2rel_test DecisionForestUDF2RelationRewriteActionTest.cpp) target_link_libraries( decisionforest2rel_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -130,6 +136,7 @@ target_link_libraries( add_executable(mul2joinagg_test Mul2JoinAggRewriteActionTest.cpp) target_link_libraries( mul2joinagg_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -147,6 +154,7 @@ target_link_libraries( add_executable(mul2joinagghorizontal_test Mul2JoinAggHorizontalRewriteActionTest.cpp) target_link_libraries( mul2joinagghorizontal_test + mat_mul_cublas velox_aggregates velox_type velox_vector @@ -164,6 +172,7 @@ target_link_libraries( add_executable(conv_test Conv2dActionTest.cpp) target_link_libraries( conv_test + mat_mul_cublas velox_aggregates velox_type velox_vector From 4954a22afa81a7cbafd281d949927aa61eb4707a Mon Sep 17 00:00:00 2001 From: xllgit Date: Tue, 19 Mar 2024 01:39:16 +0000 Subject: [PATCH 2/2] add gpu support for torchDNN_two_layers --- velox/ml_functions/functions.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/velox/ml_functions/functions.h b/velox/ml_functions/functions.h index 6d79fc885..a45e878c3 100644 --- a/velox/ml_functions/functions.h +++ b/velox/ml_functions/functions.h @@ -724,6 +724,17 @@ class TorchDNN2Level: public MLFunction { exec::EvalCtx& context, VectorPtr& output) const override { + bool use_gpu = false; + if (args.size() == 2) { + // an optional parameter can be passed to enable the GPU for mat_mul + use_gpu = args[1]->as>()->valueAt(0); + } + torch::Device device = torch::kCPU; + if(use_gpu){ + device = (torch::cuda::is_available() ? torch::kCUDA : torch::kCPU); + + } + std::cout << "Using device:" << device <weight.set_data(weightTensor2); dense1->bias.set_data(bias1); dense2->bias.set_data(bias2); + dense1->to(device); + dense2->to(device); auto input_elements = args[0]->as()->elements(); float* input_values = input_elements->values()->asMutable(); int input_size = input_elements->size(); torch::Tensor input = torch::from_blob(input_values, {rows.size(), dims[0]}); + input = input.to(device); torch::Tensor layer1_output = dense1->forward(input); torch::Tensor reluOutput = relu->forward(layer1_output); torch::Tensor layer2_output = dense2->forward(reluOutput); torch::Tensor softmax_output = torch::nn::functional::softmax(layer2_output, 1); + softmax_output = softmax_output.to(torch::kCPU); float* data = softmax_output.data_ptr(); std::vector> results;