From 27cba60d24bb87cfc7fa6130b9f4061fe52385fb Mon Sep 17 00:00:00 2001
From: xllgit <xll20131124@gmail.com>
Date: Thu, 14 Mar 2024 18:17:45 +0000
Subject: [PATCH 1/2] Update for gpu matrix multiply support

---
 Makefile                                      |  3 +-
 velox/cost_model/CMakeLists.txt               |  1 +
 velox/experimental/gpu/tests/CMakeLists.txt   |  2 +-
 velox/experimental/wave/common/CMakeLists.txt |  2 +-
 .../wave/common/tests/CMakeLists.txt          |  2 +-
 velox/experimental/wave/exec/CMakeLists.txt   |  2 +-
 .../wave/exec/tests/CMakeLists.txt            |  2 +-
 .../wave/vector/tests/CMakeLists.txt          |  2 +-
 velox/ml_functions/CMakeLists.txt             | 11 ++++
 velox/ml_functions/functions.h                | 29 +++++++---
 velox/ml_functions/gpufunctions.h             | 26 +++++++++
 velox/ml_functions/tests/GPUFunctions.cu      | 53 +++++++++++++++++++
 velox/ml_functions/tests/MLFunctionsTest.cpp  | 28 ++++++----
 velox/optimizer/CMakeLists.txt                |  9 ++++
 14 files changed, 148 insertions(+), 24 deletions(-)
 create mode 100644 velox/ml_functions/gpufunctions.h
 create mode 100644 velox/ml_functions/tests/GPUFunctions.cu

diff --git a/Makefile b/Makefile
index 3fb5eb15d..6f5d544db 100644
--- a/Makefile
+++ b/Makefile
@@ -63,7 +63,8 @@ GENERATOR += -DVELOX_FORCE_COLORED_OUTPUT=ON
 endif
 endif
 
-NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1)
+#NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1)
+NUM_THREADS = 2 
 CPU_TARGET ?= "avx"
 
 FUZZER_SEED ?= 123456
diff --git a/velox/cost_model/CMakeLists.txt b/velox/cost_model/CMakeLists.txt
index dfac2fcdc..3a780a61d 100644
--- a/velox/cost_model/CMakeLists.txt
+++ b/velox/cost_model/CMakeLists.txt
@@ -23,6 +23,7 @@ find_package(Torch REQUIRED)
 add_executable(nn_test tests/NNTests.cpp)
 target_link_libraries(
   nn_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt
index 8eaca86e7..2ed0f5eee 100644
--- a/velox/experimental/gpu/tests/CMakeLists.txt
+++ b/velox/experimental/gpu/tests/CMakeLists.txt
@@ -15,4 +15,4 @@
 add_executable(velox_gpu_hash_table_test HashTableTest.cu)
 target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
 set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
-                                                           native)
+                                                           75)
diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt
index 205d94533..b69eaff28 100644
--- a/velox/experimental/wave/common/CMakeLists.txt
+++ b/velox/experimental/wave/common/CMakeLists.txt
@@ -15,7 +15,7 @@
 add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
                               Type.cpp)
 
-set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)
+set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES 75)
 
 target_link_libraries(velox_wave_common velox_exception velox_common_base
                       velox_type)
diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt
index 159261e72..e5cb28574 100644
--- a/velox/experimental/wave/common/tests/CMakeLists.txt
+++ b/velox/experimental/wave/common/tests/CMakeLists.txt
@@ -16,7 +16,7 @@ add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
                                       BlockTest.cpp BlockTest.cu)
 
 set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
+                                                        75)
 
 add_test(velox_wave_common_test velox_wave_common_test)
 
diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt
index a346b917a..39147e0da 100644
--- a/velox/experimental/wave/exec/CMakeLists.txt
+++ b/velox/experimental/wave/exec/CMakeLists.txt
@@ -26,7 +26,7 @@ add_library(
   Wave.cpp
   Project.cpp)
 
-set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)
+set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES 75)
 
 target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common
                       velox_exception velox_common_base velox_exec)
diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt
index 74feddf6c..ad1ce9c43 100644
--- a/velox/experimental/wave/exec/tests/CMakeLists.txt
+++ b/velox/experimental/wave/exec/tests/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 add_executable(velox_wave_exec_test FilterProjectTest.cpp Main.cpp)
 
-set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native)
+set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES 75)
 
 add_test(velox_wave_exec_test velox_wave_exec_test)
 
diff --git a/velox/experimental/wave/vector/tests/CMakeLists.txt b/velox/experimental/wave/vector/tests/CMakeLists.txt
index 2d4201c4c..f2966ac0e 100644
--- a/velox/experimental/wave/vector/tests/CMakeLists.txt
+++ b/velox/experimental/wave/vector/tests/CMakeLists.txt
@@ -15,7 +15,7 @@
 add_executable(velox_wave_vector_test VectorTest.cpp)
 
 set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES
-                                                        native)
+                                                        75)
 
 add_test(veloxwave__vector_test velox_wave_vector_test)
 
diff --git a/velox/ml_functions/CMakeLists.txt b/velox/ml_functions/CMakeLists.txt
index 65375e80a..3755df137 100644
--- a/velox/ml_functions/CMakeLists.txt
+++ b/velox/ml_functions/CMakeLists.txt
@@ -19,10 +19,14 @@ set(CMAKE_PREFIX_PATH "$CONDA_PREFIX")
 
 find_package(Torch REQUIRED)
 find_package(xgboost REQUIRED)
+find_package(CUDA REQUIRED)
 
+add_library(mat_mul_cublas STATIC tests/GPUFunctions.cu)
+target_link_libraries(mat_mul_cublas cublas ${CUDA_LIBRARIES})
 add_executable(ml_functions_test tests/MLFunctionsTest.cpp)
 target_link_libraries(
   ml_functions_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -40,6 +44,7 @@ target_link_libraries(
 add_executable(nn_tests tests/NNTest.cpp)
 target_link_libraries(
   nn_tests
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -57,6 +62,7 @@ target_link_libraries(
 add_executable(embedding_test tests/EmbeddingTest.cpp)
 target_link_libraries(
   embedding_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -74,6 +80,7 @@ target_link_libraries(
 add_executable(two_tower_model_test tests/TwoTowerModelTest.cpp)
 target_link_libraries(
   two_tower_model_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -92,6 +99,7 @@ target_link_libraries(
 add_executable(two_tower_model_pipeline_test tests/TowTowerModelPipelineTest.cpp)
 target_link_libraries(
   two_tower_model_pipeline_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -110,6 +118,7 @@ target_link_libraries(
 add_executable(decision_forest_prediction_test tests/DecisionForestTest.cpp)
 target_link_libraries(
   decision_forest_prediction_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -145,6 +154,7 @@ target_link_libraries(
 add_executable(ml_sql_test tests/MLSQLTest.cpp)
 target_link_libraries(
   ml_sql_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -162,6 +172,7 @@ target_link_libraries(
 add_executable(array_array_unnest_test tests/ArrayofArrayUnnestTest.cpp)
 target_link_libraries(
   array_array_unnest_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
diff --git a/velox/ml_functions/functions.h b/velox/ml_functions/functions.h
index a3c4cedd4..6d79fc885 100644
--- a/velox/ml_functions/functions.h
+++ b/velox/ml_functions/functions.h
@@ -7,6 +7,7 @@
 #include "velox/exec/Task.h"
 #include "velox/cost_model/CostEstimate.h"
 #include "velox/cost_model/UdfCostCoefficient.h"
+#include "velox/ml_functions/gpufunctions.h"
 
 
 using namespace facebook::velox;
@@ -90,17 +91,30 @@ class MatrixMultiply: public MLFunction {
             use_gpu = args[1]->as<ConstantVector<bool>>()->valueAt(0);
         }
         
+        BaseVector::ensureWritable(rows, type, context.pool(), output);
+        
+        auto input_elements = args[0]->as<ArrayVector>()->elements();
+        float* input_values = input_elements->values()->asMutable<float>();
+        int input_size = input_elements->size();
         // results are expected to be stored as std::vector<std::vector<float>>
         std::vector<std::vector<float>> result;
+	    int rows_A = input_size / dims[0];
+	    int cols_A = dims[0];
+	    int rows_B = dims[0];
+	    int cols_B = dims[1];
         if (use_gpu) {
-            // TODO: implementation of matrix multiplication in GPU
-            throw std::runtime_error("GPU implementation of Matrix Multiple is not implemented.");
+	        float *host_C = (float*) malloc(rows_A * cols_B * sizeof(float)); 
+	        multiplyMatrices(rows_A, cols_B, cols_A,
+                             input_values, rows_A, weights_, cols_A, host_C, rows_A);
+            float *C = host_C;
+            for (int i = 0; i < rows_A; i++) {
+                std::vector<float> row{C, C + cols_B};
+                result.push_back(row);
+                C = C + cols_B;
+            }
+            free(host_C);
+
         } else {
-            BaseVector::ensureWritable(rows, type, context.pool(), output);
-            
-            auto input_elements = args[0]->as<ArrayVector>()->elements();
-            float* input_values = input_elements->values()->asMutable<float>();
-            int input_size = input_elements->size();
 
             Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> m1(input_values, input_size/dims[0], dims[0]);
             Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> m2(weights_, dims[0], dims[1]); 
@@ -116,6 +130,7 @@ class MatrixMultiply: public MLFunction {
                 result.push_back(row);
             }
         }
+
         VectorMaker maker{context.pool()};
         output = maker.arrayVector<float>(result, REAL());
     }
diff --git a/velox/ml_functions/gpufunctions.h b/velox/ml_functions/gpufunctions.h
new file mode 100644
index 000000000..a0421657d
--- /dev/null
+++ b/velox/ml_functions/gpufunctions.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <iostream>
+#include <cstdlib>
+#include <cublas_v2.h>
+
+template <typename T>
+struct CublasType {};
+
+template <>
+struct CublasType<float> {
+    static const cudaDataType_t type = CUDA_R_32F;
+    static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+};
+
+template <>
+struct CublasType<double> {
+    static const cudaDataType_t type = CUDA_R_64F;
+    static const cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
+};
+
+template <typename T>
+void multiplyMatrices(int m, int n, int k,
+                      const T* A, int lda, const T* B, int ldb,
+                      T* C, int ldc);
+
diff --git a/velox/ml_functions/tests/GPUFunctions.cu b/velox/ml_functions/tests/GPUFunctions.cu
new file mode 100644
index 000000000..e07cc56ba
--- /dev/null
+++ b/velox/ml_functions/tests/GPUFunctions.cu
@@ -0,0 +1,53 @@
+#include "velox/ml_functions/gpufunctions.h"
+#include "velox/experimental/gpu/Common.h"
+
+#define CUBLAS_ERROR(x) do { if((x)!=CUBLAS_STATUS_SUCCESS) { \
+    printf("Error %s at %s:%d\n", cublasGetStatusString(x), __FILE__, __LINE__);\
+    exit(EXIT_FAILURE);} } while(0)
+template <typename T>
+void multiplyMatrices(int m, int n, int k,
+                      const T* A, int lda, const T* B, int ldb,
+                      T* C, int ldc) {
+
+    cublasHandle_t handle;
+    CUBLAS_ERROR(cublasCreate(&handle));
+    // Allocate device memory
+    T *d_A, *d_B, *d_C;
+    CUDA_CHECK_FATAL(cudaMalloc((void**)&d_A, m * k * sizeof(T)));
+    CUDA_CHECK_FATAL(cudaMalloc((void**)&d_B, k * n * sizeof(T)));
+    CUDA_CHECK_FATAL(cudaMalloc((void**)&d_C, m * n * sizeof(T)));
+
+    // Copy data from host to device
+    CUDA_CHECK_FATAL(cudaMemcpy(d_A, A, m * k * sizeof(T), cudaMemcpyHostToDevice));
+    CUDA_CHECK_FATAL(cudaMemcpy(d_B, B, k * n * sizeof(T), cudaMemcpyHostToDevice));
+
+    T alpha = 1.0;
+    T beta = 0.0;
+    // Perform matrix multiplication on GPU
+    cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, 
+                                         n, m, k, &alpha,
+                                         d_B, CublasType<T>::type, n,
+                                         d_A, CublasType<T>::type, k,
+                                         &beta, d_C, CublasType<T>::type, n,
+                                         CublasType<T>::type, CublasType<T>::algo);
+
+    CUBLAS_ERROR(status);
+
+    // Copy result from device to host
+    CUDA_CHECK_FATAL(cudaMemcpy(C, d_C, m * n * sizeof(T), cudaMemcpyDeviceToHost));
+
+    // Free device memory
+    CUDA_CHECK_LOG(cudaFree(d_A));
+    CUDA_CHECK_LOG(cudaFree(d_B));
+    CUDA_CHECK_LOG(cudaFree(d_C));
+
+    // Destroy cuBLAS handle
+    CUBLAS_ERROR(cublasDestroy(handle));
+}
+
+// Explicit instantiation for float and double
+template void multiplyMatrices<float>(int, int, int,
+                                       const float*, int, const float*, int, float*, int);
+template void multiplyMatrices<double>(int, int, int,
+                                        const double*, int, const double*, int, double*, int);
+
diff --git a/velox/ml_functions/tests/MLFunctionsTest.cpp b/velox/ml_functions/tests/MLFunctionsTest.cpp
index c70e06bbf..e5155fccc 100644
--- a/velox/ml_functions/tests/MLFunctionsTest.cpp
+++ b/velox/ml_functions/tests/MLFunctionsTest.cpp
@@ -43,6 +43,8 @@
 #include "velox/exec/tests/utils/TempDirectoryPath.h"
 #include "velox/common/memory/MemoryArbitrator.h"
 #include "velox/vector/fuzzer/VectorFuzzer.h"
+#include "velox/ml_functions/gpufunctions.h"
+#include <cmath>
 
 
 
@@ -79,7 +81,8 @@ class MLFunctionsTest : public HiveConnectorTestBase {
 
   /// Run the demo.
   void run();
-  void test_mat_mul();
+  void test_mat_mul(int, int, int);
+  //void test_mat_mul();
   void test_mat_add();
   void test_relu();
   void test_softmax();
@@ -168,17 +171,17 @@ class MLFunctionsTest : public HiveConnectorTestBase {
 
 };
 
-void MLFunctionsTest::test_mat_mul() { 
+void MLFunctionsTest::test_mat_mul(int output_size, int input_size, int num_samples) { 
 //Eigen::setNbThreads(48); 
-  int output_size = 500;
-  int input_size = 100;
-  int num_samples = 500;
+  //int output_size = 2000;
+  //int input_size = 4000;
+  //int num_samples = 3000;
   int size = output_size*input_size;
   
   auto weights = maker.flatVector<float>(size);
   auto col = maker.flatVector<int>(num_samples);
   for(int i=0; i < size; i++){
-	  weights->set(i, i*10);
+	  weights->set(i, i/1000.0);
   } 
   
   std::vector<std::vector<float>> featureVectors;
@@ -186,7 +189,7 @@ void MLFunctionsTest::test_mat_mul() {
     col->set(i, i* 7 - i*(i%3));
     std::vector<float> featureVector;
     for(int j=0; j < input_size; j++){
-      featureVector.push_back(i*j);
+      featureVector.push_back(i*j/5000.0);
     }
     featureVectors.push_back(featureVector);
   }
@@ -203,9 +206,9 @@ void MLFunctionsTest::test_mat_mul() {
   auto myPlan = exec::test::PlanBuilder(pool_.get())
                   .values({inputRowVector})
                   // using CPU for mat_mul
-                  .project({"mat_mul(x)"})
+                  //.project({"mat_mul(x)"})
                   // using GPU for mat_mul
-                  // .project({"mat_mul(x, true)"})
+                    .project({"mat_mul(x, true)"})
 		              .planNode();
 
   std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
@@ -1725,7 +1728,12 @@ void MLFunctionsTest::test_land_cover_conv3() {
 }
 
 void MLFunctionsTest::run() {
-   test_mat_mul();
+    int output = 100;
+    int input = 500;
+    for (int i = 10; i < 1000001; i *= 10){
+        test_mat_mul(output, input, i);
+
+    }
   //  test_mat_add();
   //  test_relu();
   //  test_softmax()
diff --git a/velox/optimizer/CMakeLists.txt b/velox/optimizer/CMakeLists.txt
index abcd3d13d..764ebc432 100644
--- a/velox/optimizer/CMakeLists.txt
+++ b/velox/optimizer/CMakeLists.txt
@@ -27,6 +27,7 @@ find_package(jsoncpp REQUIRED)
 add_executable(velox_op ${SOURCES})
 target_link_libraries(
   velox_op
+  mat_mul_cublas
   velox_type
   velox_vector
   velox_exec
@@ -42,6 +43,7 @@ target_link_libraries(
 add_executable(traversal_test TraversalTest.cpp)
 target_link_libraries(
   traversal_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -61,6 +63,7 @@ target_link_libraries(
 add_executable(torch2twolayer_test TorchNN2TwoLayerUDFRewriteActionTest.cpp)
 target_link_libraries(
   torch2twolayer_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -78,6 +81,7 @@ target_link_libraries(
 add_executable(twolayer2torch_test TwoLayerUDF2TorchNNRewriteActionTest.cpp)
 target_link_libraries(
   twolayer2torch_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -95,6 +99,7 @@ target_link_libraries(
 add_executable(multilayer2torch_test MultiLayerUDF2TorchNNRewriteActionTest.cpp)
 target_link_libraries(
   multilayer2torch_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -113,6 +118,7 @@ target_link_libraries(
 add_executable(decisionforest2rel_test DecisionForestUDF2RelationRewriteActionTest.cpp)
 target_link_libraries(
   decisionforest2rel_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -130,6 +136,7 @@ target_link_libraries(
 add_executable(mul2joinagg_test Mul2JoinAggRewriteActionTest.cpp)
 target_link_libraries(
   mul2joinagg_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -147,6 +154,7 @@ target_link_libraries(
 add_executable(mul2joinagghorizontal_test Mul2JoinAggHorizontalRewriteActionTest.cpp)
 target_link_libraries(
   mul2joinagghorizontal_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector
@@ -164,6 +172,7 @@ target_link_libraries(
 add_executable(conv_test Conv2dActionTest.cpp)
 target_link_libraries(
   conv_test
+  mat_mul_cublas
   velox_aggregates
   velox_type
   velox_vector

From 4954a22afa81a7cbafd281d949927aa61eb4707a Mon Sep 17 00:00:00 2001
From: xllgit <xll20131124@gmail.com>
Date: Tue, 19 Mar 2024 01:39:16 +0000
Subject: [PATCH 2/2] add gpu support for torchDNN_two_layers

---
 velox/ml_functions/functions.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/velox/ml_functions/functions.h b/velox/ml_functions/functions.h
index 6d79fc885..a45e878c3 100644
--- a/velox/ml_functions/functions.h
+++ b/velox/ml_functions/functions.h
@@ -724,6 +724,17 @@ class TorchDNN2Level: public MLFunction {
         exec::EvalCtx& context,
         VectorPtr& output) const override {
 
+        bool use_gpu = false;
+        if (args.size() == 2) {
+            // an optional parameter can be passed to enable the GPU for mat_mul
+            use_gpu = args[1]->as<ConstantVector<bool>>()->valueAt(0);
+        }
+        torch::Device device = torch::kCPU;
+        if(use_gpu){
+            device = (torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
+
+        }
+        std::cout << "Using device:" << device <<std::endl;
         std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
         torch::nn::Linear dense1(dims[0], dims[1]);
         torch::nn::Linear dense2(dims[1],dims[2]);
@@ -738,17 +749,21 @@ class TorchDNN2Level: public MLFunction {
         dense2->weight.set_data(weightTensor2);
         dense1->bias.set_data(bias1);
         dense2->bias.set_data(bias2);
+        dense1->to(device);
+        dense2->to(device);
         
         auto input_elements = args[0]->as<ArrayVector>()->elements();
         float* input_values = input_elements->values()->asMutable<float>();
         int input_size = input_elements->size();
         
         torch::Tensor input = torch::from_blob(input_values, {rows.size(), dims[0]});
+        input = input.to(device);
 
         torch::Tensor layer1_output = dense1->forward(input);
         torch::Tensor reluOutput = relu->forward(layer1_output);
         torch::Tensor layer2_output = dense2->forward(reluOutput);
         torch::Tensor softmax_output = torch::nn::functional::softmax(layer2_output, 1);
+        softmax_output = softmax_output.to(torch::kCPU);
         float* data = softmax_output.data_ptr<float>();
 
         std::vector<std::vector<float>> results;