Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ GENERATOR += -DVELOX_FORCE_COLORED_OUTPUT=ON
endif
endif

NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1)
#NUM_THREADS ?= $(shell getconf _NPROCESSORS_CONF 2>/dev/null || echo 1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please discard the hard-coded NUM_THREADS value; it should be determined automatically (e.g. restore the getconf-based default).

NUM_THREADS = 2
CPU_TARGET ?= "avx"

FUZZER_SEED ?= 123456
Expand Down
1 change: 1 addition & 0 deletions velox/cost_model/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ find_package(Torch REQUIRED)
add_executable(nn_test tests/NNTests.cpp)
target_link_libraries(
nn_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/gpu/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@
add_executable(velox_gpu_hash_table_test HashTableTest.cu)
target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
native)
75)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please clarify the reason for this change, as well as the following similar changes that replace CUDA_ARCHITECTURES "native" with "75"?

2 changes: 1 addition & 1 deletion velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
Type.cpp)

set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)
set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES 75)

target_link_libraries(velox_wave_common velox_exception velox_common_base
velox_type)
Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
BlockTest.cpp BlockTest.cu)

set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
native)
75)

add_test(velox_wave_common_test velox_wave_common_test)

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/exec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ add_library(
Wave.cpp
Project.cpp)

set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)
set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES 75)

target_link_libraries(velox_wave_exec velox_wave_vector velox_wave_common
velox_exception velox_common_base velox_exec)
Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/exec/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

add_executable(velox_wave_exec_test FilterProjectTest.cpp Main.cpp)

set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native)
set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES 75)

add_test(velox_wave_exec_test velox_wave_exec_test)

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/vector/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
add_executable(velox_wave_vector_test VectorTest.cpp)

set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES
native)
75)

add_test(veloxwave__vector_test velox_wave_vector_test)

Expand Down
11 changes: 11 additions & 0 deletions velox/ml_functions/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ set(CMAKE_PREFIX_PATH "$CONDA_PREFIX")

find_package(Torch REQUIRED)
find_package(xgboost REQUIRED)
find_package(CUDA REQUIRED)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may lead to a compilation error when compiling with a CPU-only configuration. I see Velox provides a flag, VELOX_ENABLE_GPU; I think it would be better to guard this configuration code behind the VELOX_ENABLE_GPU flag.


add_library(mat_mul_cublas STATIC tests/GPUFunctions.cu)
target_link_libraries(mat_mul_cublas cublas ${CUDA_LIBRARIES})
add_executable(ml_functions_test tests/MLFunctionsTest.cpp)
target_link_libraries(
ml_functions_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -40,6 +44,7 @@ target_link_libraries(
add_executable(nn_tests tests/NNTest.cpp)
target_link_libraries(
nn_tests
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -57,6 +62,7 @@ target_link_libraries(
add_executable(embedding_test tests/EmbeddingTest.cpp)
target_link_libraries(
embedding_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -74,6 +80,7 @@ target_link_libraries(
add_executable(two_tower_model_test tests/TwoTowerModelTest.cpp)
target_link_libraries(
two_tower_model_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -92,6 +99,7 @@ target_link_libraries(
add_executable(two_tower_model_pipeline_test tests/TowTowerModelPipelineTest.cpp)
target_link_libraries(
two_tower_model_pipeline_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -110,6 +118,7 @@ target_link_libraries(
add_executable(decision_forest_prediction_test tests/DecisionForestTest.cpp)
target_link_libraries(
decision_forest_prediction_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand Down Expand Up @@ -145,6 +154,7 @@ target_link_libraries(
add_executable(ml_sql_test tests/MLSQLTest.cpp)
target_link_libraries(
ml_sql_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand All @@ -162,6 +172,7 @@ target_link_libraries(
add_executable(array_array_unnest_test tests/ArrayofArrayUnnestTest.cpp)
target_link_libraries(
array_array_unnest_test
mat_mul_cublas
velox_aggregates
velox_type
velox_vector
Expand Down
44 changes: 37 additions & 7 deletions velox/ml_functions/functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "velox/exec/Task.h"
#include "velox/cost_model/CostEstimate.h"
#include "velox/cost_model/UdfCostCoefficient.h"
#include "velox/ml_functions/gpufunctions.h"


using namespace facebook::velox;
Expand Down Expand Up @@ -90,17 +91,30 @@ class MatrixMultiply: public MLFunction {
use_gpu = args[1]->as<ConstantVector<bool>>()->valueAt(0);
}

BaseVector::ensureWritable(rows, type, context.pool(), output);

auto input_elements = args[0]->as<ArrayVector>()->elements();
float* input_values = input_elements->values()->asMutable<float>();
int input_size = input_elements->size();
// results are expected to be stored as std::vector<std::vector<float>>
std::vector<std::vector<float>> result;
int rows_A = input_size / dims[0];
int cols_A = dims[0];
int rows_B = dims[0];
int cols_B = dims[1];
if (use_gpu) {
// TODO: implementation of matrix multiplication in GPU
throw std::runtime_error("GPU implementation of Matrix Multiple is not implemented.");
float *host_C = (float*) malloc(rows_A * cols_B * sizeof(float));
multiplyMatrices(rows_A, cols_B, cols_A,
input_values, rows_A, weights_, cols_A, host_C, rows_A);
float *C = host_C;
for (int i = 0; i < rows_A; i++) {
std::vector<float> row{C, C + cols_B};
result.push_back(row);
C = C + cols_B;
}
free(host_C);

} else {
BaseVector::ensureWritable(rows, type, context.pool(), output);

auto input_elements = args[0]->as<ArrayVector>()->elements();
float* input_values = input_elements->values()->asMutable<float>();
int input_size = input_elements->size();

Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> m1(input_values, input_size/dims[0], dims[0]);
Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>> m2(weights_, dims[0], dims[1]);
Expand All @@ -116,6 +130,7 @@ class MatrixMultiply: public MLFunction {
result.push_back(row);
}
}

VectorMaker maker{context.pool()};
output = maker.arrayVector<float>(result, REAL());
}
Expand Down Expand Up @@ -709,6 +724,17 @@ class TorchDNN2Level: public MLFunction {
exec::EvalCtx& context,
VectorPtr& output) const override {

bool use_gpu = false;
if (args.size() == 2) {
// an optional parameter can be passed to enable the GPU for mat_mul
use_gpu = args[1]->as<ConstantVector<bool>>()->valueAt(0);
}
torch::Device device = torch::kCPU;
if(use_gpu){
device = (torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);

}
std::cout << "Using device:" << device <<std::endl;
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
torch::nn::Linear dense1(dims[0], dims[1]);
torch::nn::Linear dense2(dims[1],dims[2]);
Expand All @@ -723,17 +749,21 @@ class TorchDNN2Level: public MLFunction {
dense2->weight.set_data(weightTensor2);
dense1->bias.set_data(bias1);
dense2->bias.set_data(bias2);
dense1->to(device);
dense2->to(device);

auto input_elements = args[0]->as<ArrayVector>()->elements();
float* input_values = input_elements->values()->asMutable<float>();
int input_size = input_elements->size();

torch::Tensor input = torch::from_blob(input_values, {rows.size(), dims[0]});
input = input.to(device);

torch::Tensor layer1_output = dense1->forward(input);
torch::Tensor reluOutput = relu->forward(layer1_output);
torch::Tensor layer2_output = dense2->forward(reluOutput);
torch::Tensor softmax_output = torch::nn::functional::softmax(layer2_output, 1);
softmax_output = softmax_output.to(torch::kCPU);
float* data = softmax_output.data_ptr<float>();

std::vector<std::vector<float>> results;
Expand Down
26 changes: 26 additions & 0 deletions velox/ml_functions/gpufunctions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#pragma once

#include <iostream>
#include <cstdlib>
#include <cublas_v2.h>

/// Maps a host scalar type to the cuBLAS/CUDA descriptors needed by
/// cublasGemmEx. Only the explicitly specialized types (float, double) are
/// supported; instantiating CublasType<T> for any other T is a compile-time
/// error at the point of use, because the primary template declares no
/// members.
template <typename T>
struct CublasType {};

template <>
struct CublasType<float> {
  // constexpr members are implicitly inline in C++17, so no out-of-line
  // definition is needed even if these are ever ODR-used.
  static constexpr cudaDataType_t type = CUDA_R_32F;
  static constexpr cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
};

template <>
struct CublasType<double> {
  static constexpr cudaDataType_t type = CUDA_R_64F;
  static constexpr cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
};

/// Computes the matrix product C = A * B on the GPU via cuBLAS.
///
/// @param m   Number of rows of A and C.
/// @param n   Number of columns of B and C.
/// @param k   Number of columns of A / rows of B.
/// @param A   Host pointer to an m x k matrix (row-major).
/// @param lda Leading dimension of A as supplied by the caller.
/// @param B   Host pointer to a k x n matrix (row-major).
/// @param ldb Leading dimension of B as supplied by the caller.
/// @param C   Host pointer receiving the m x n result (row-major).
/// @param ldc Leading dimension of C as supplied by the caller.
///
/// NOTE(review): the current implementation assumes densely packed
/// row-major inputs and does not honor lda/ldb/ldc — confirm before
/// relying on strided matrices. Only float and double are instantiated.
template <typename T>
void multiplyMatrices(int m, int n, int k,
                      const T* A, int lda, const T* B, int ldb,
                      T* C, int ldc);

53 changes: 53 additions & 0 deletions velox/ml_functions/tests/GPUFunctions.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "velox/ml_functions/gpufunctions.h"
#include "velox/experimental/gpu/Common.h"

#include <cstdio>
#include <cstdlib>

// Abort the process on any cuBLAS failure, reporting the status, file and
// line. The status expression is bound to a local so it is evaluated exactly
// once: the previous version expanded `x` twice, which re-executed the
// wrapped cuBLAS call (e.g. cublasCreate) on the error path.
#define CUBLAS_ERROR(x)                                        \
  do {                                                         \
    const cublasStatus_t cublasError_ = (x);                   \
    if (cublasError_ != CUBLAS_STATUS_SUCCESS) {               \
      printf(                                                  \
          "Error %s at %s:%d\n",                               \
          cublasGetStatusString(cublasError_),                 \
          __FILE__,                                            \
          __LINE__);                                           \
      exit(EXIT_FAILURE);                                      \
    }                                                          \
  } while (0)

/// Computes C = A * B on the GPU: copies the row-major host inputs to device
/// memory, runs cublasGemmEx, and copies the result back to host memory.
///
/// cuBLAS is column-major, so we use the standard transpose trick: computing
/// B * A with swapped dimensions (n, m, k) in column-major order yields the
/// row-major m x n product A * B.
///
/// NOTE(review): lda/ldb/ldc are accepted for API symmetry but are not used —
/// the GEMM call assumes densely packed matrices (leading dims n, k, n).
/// Honoring the caller-supplied strides would change behavior for existing
/// callers; confirm intent before wiring them through. Likewise, the
/// computeType argument passes a cudaDataType_t, which relies on the legacy
/// cublasGemmEx overload — newer cuBLAS (>= 11) prefers cublasComputeType_t.
template <typename T>
void multiplyMatrices(int m, int n, int k,
                      const T* A, int lda, const T* B, int ldb,
                      T* C, int ldc) {

  cublasHandle_t handle;
  CUBLAS_ERROR(cublasCreate(&handle));

  // Allocate device buffers for the operands and the result. On any fatal
  // CUDA/cuBLAS error we exit the process, so the OS reclaims device memory.
  T *d_A, *d_B, *d_C;
  CUDA_CHECK_FATAL(cudaMalloc((void**)&d_A, m * k * sizeof(T)));
  CUDA_CHECK_FATAL(cudaMalloc((void**)&d_B, k * n * sizeof(T)));
  CUDA_CHECK_FATAL(cudaMalloc((void**)&d_C, m * n * sizeof(T)));

  // Copy operands from host to device.
  CUDA_CHECK_FATAL(cudaMemcpy(d_A, A, m * k * sizeof(T), cudaMemcpyHostToDevice));
  CUDA_CHECK_FATAL(cudaMemcpy(d_B, B, k * n * sizeof(T), cudaMemcpyHostToDevice));

  T alpha = 1.0;
  T beta = 0.0;

  // C^T = B^T * A^T in column-major == C = A * B in row-major.
  cublasStatus_t status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                       n, m, k, &alpha,
                                       d_B, CublasType<T>::type, n,
                                       d_A, CublasType<T>::type, k,
                                       &beta, d_C, CublasType<T>::type, n,
                                       CublasType<T>::type, CublasType<T>::algo);

  CUBLAS_ERROR(status);

  // Copy the result back to the caller's host buffer.
  CUDA_CHECK_FATAL(cudaMemcpy(C, d_C, m * n * sizeof(T), cudaMemcpyDeviceToHost));

  // Release device memory; failures here are logged rather than fatal since
  // the result has already been produced.
  CUDA_CHECK_LOG(cudaFree(d_A));
  CUDA_CHECK_LOG(cudaFree(d_B));
  CUDA_CHECK_LOG(cudaFree(d_C));

  CUBLAS_ERROR(cublasDestroy(handle));
}

// Explicit instantiation for float and double — the only types for which
// CublasType is specialized.
template void multiplyMatrices<float>(int, int, int,
     const float*, int, const float*, int, float*, int);
template void multiplyMatrices<double>(int, int, int,
     const double*, int, const double*, int, double*, int);

28 changes: 18 additions & 10 deletions velox/ml_functions/tests/MLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
#include "velox/exec/tests/utils/TempDirectoryPath.h"
#include "velox/common/memory/MemoryArbitrator.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"
#include "velox/ml_functions/gpufunctions.h"
#include <cmath>



Expand Down Expand Up @@ -79,7 +81,8 @@ class MLFunctionsTest : public HiveConnectorTestBase {

/// Run the demo.
void run();
void test_mat_mul();
void test_mat_mul(int, int, int);
//void test_mat_mul();
void test_mat_add();
void test_relu();
void test_softmax();
Expand Down Expand Up @@ -168,25 +171,25 @@ class MLFunctionsTest : public HiveConnectorTestBase {

};

void MLFunctionsTest::test_mat_mul() {
void MLFunctionsTest::test_mat_mul(int output_size, int input_size, int num_samples) {
//Eigen::setNbThreads(48);
int output_size = 500;
int input_size = 100;
int num_samples = 500;
//int output_size = 2000;
//int input_size = 4000;
//int num_samples = 3000;
int size = output_size*input_size;

auto weights = maker.flatVector<float>(size);
auto col = maker.flatVector<int>(num_samples);
for(int i=0; i < size; i++){
weights->set(i, i*10);
weights->set(i, i/1000.0);
}

std::vector<std::vector<float>> featureVectors;
for(int i=0; i < num_samples; i++){
col->set(i, i* 7 - i*(i%3));
std::vector<float> featureVector;
for(int j=0; j < input_size; j++){
featureVector.push_back(i*j);
featureVector.push_back(i*j/5000.0);
}
featureVectors.push_back(featureVector);
}
Expand All @@ -203,9 +206,9 @@ void MLFunctionsTest::test_mat_mul() {
auto myPlan = exec::test::PlanBuilder(pool_.get())
.values({inputRowVector})
// using CPU for mat_mul
.project({"mat_mul(x)"})
//.project({"mat_mul(x)"})
// using GPU for mat_mul
// .project({"mat_mul(x, true)"})
.project({"mat_mul(x, true)"})
.planNode();

std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
Expand Down Expand Up @@ -1725,7 +1728,12 @@ void MLFunctionsTest::test_land_cover_conv3() {
}

void MLFunctionsTest::run() {
test_mat_mul();
int output = 100;
int input = 500;
for (int i = 10; i < 1000001; i *= 10){
test_mat_mul(output, input, i);

}
// test_mat_add();
// test_relu();
// test_softmax()
Expand Down
Loading