207 changes: 152 additions & 55 deletions CMakeLists.txt
@@ -1,10 +1,11 @@
+cmake_minimum_required(VERSION 3.28)
+
option(USE_CUDA "Support NVIDIA CUDA" OFF)
option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
option(USE_OMP "Use OpenMP as backend for Eigen" ON)
option(USE_NCCL "Build project for distributed running" ON)
-cmake_minimum_required(VERSION 3.28)

-project(infini_train VERSION 0.3.0 LANGUAGES CXX)
+project(infini_train VERSION 0.5.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -13,90 +14,186 @@ set(CMAKE_CXX_EXTENSIONS OFF)
# Generate compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

-# Add gflags
+# ------------------------------------------------------------------------------
+# Third-party deps
+# ------------------------------------------------------------------------------
+
+# gflags
add_subdirectory(third_party/gflags)
include_directories(${gflags_SOURCE_DIR}/include)

+# glog
set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)

-# Add glog
add_subdirectory(third_party/glog)
include_directories(${glog_SOURCE_DIR}/src)

-# Add eigen
+# eigen
if(USE_OMP)
-find_package(OpenMP REQUIRED)
+    find_package(OpenMP REQUIRED)
endif()
# find_package(OpenBLAS REQUIRED)
# include_directories(${OpenBLAS_INCLUDE_DIR})
add_subdirectory(third_party/eigen)
include_directories(${PROJECT_SOURCE_DIR}/third_party/eigen)
# add_definitions(-DEIGEN_USE_BLAS)

include_directories(${PROJECT_SOURCE_DIR})
-file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
-list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")

if(PROFILE_MODE)
-add_compile_definitions(PROFILE_MODE=1)
+    add_compile_definitions(PROFILE_MODE=1)
endif()

-file (GLOB_RECURSE CPU_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/cpu/*.cc)
+# ------------------------------------------------------------------------------
+# Sources
+# ------------------------------------------------------------------------------
+
+# Framework core sources (*.cc), excluding cpu kernels (they are built separately)
+file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
+list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
+
+# CPU kernels (*.cc)
+file(GLOB_RECURSE CPU_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/kernels/cpu/*.cc)
+
+# ------------------------------------------------------------------------------
+# CPU kernels library
+# ------------------------------------------------------------------------------

add_library(infini_train_cpu_kernels STATIC ${CPU_KERNELS})
-target_link_libraries(infini_train_cpu_kernels glog Eigen3::Eigen)
+target_link_libraries(infini_train_cpu_kernels PUBLIC glog Eigen3::Eigen)

if(USE_OMP)
-add_compile_definitions(USE_OMP=1)
-target_link_libraries(infini_train_cpu_kernels OpenMP::OpenMP_CXX)
+    add_compile_definitions(USE_OMP=1)
+    target_link_libraries(infini_train_cpu_kernels PUBLIC OpenMP::OpenMP_CXX)
endif()
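Note that the kernel library's dependencies are now linked PUBLIC, so any target that links infini_train_cpu_kernels inherits the glog, Eigen, and (when USE_OMP is ON) OpenMP usage requirements automatically. A minimal sketch of a hypothetical consumer — the bench target and source path are illustrative only, not part of this PR:

# Hypothetical consumer: glog/Eigen include dirs and link deps propagate
# through the PUBLIC linkage above, so they need not be repeated here.
add_executable(bench_cpu_kernels bench/bench_cpu_kernels.cc)
target_link_libraries(bench_cpu_kernels PRIVATE infini_train_cpu_kernels)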

+# ------------------------------------------------------------------------------
+# CUDA kernels library (optional)
+# ------------------------------------------------------------------------------
+
+if(USE_CUDA)
+    add_compile_definitions(USE_CUDA=1)
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    include_directories(${CUDAToolkit_INCLUDE_DIRS})
+
+    # CUDA compilation options
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
+
+    # Only compile CUDA kernels / cuda sources here (your original used src/*.cu)
+    file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
+
+    add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
+    set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
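("75;80;90" compiles device code for SM 75, 80, and 90, i.e. Turing, Ampere, and Hopper GPUs.)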

+    target_link_libraries(infini_train_cuda_kernels
+        PUBLIC
+            glog
+            CUDA::cudart
+            CUDA::cublas
+            CUDA::cuda_driver
+    )
+
+    if(USE_NCCL)
+        message(STATUS "Add USE_NCCL, use NCCL with CUDA")
+        list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+        find_package(NCCL REQUIRED)
+        add_compile_definitions(USE_NCCL=1)
+        target_link_libraries(infini_train_cuda_kernels PUBLIC nccl)
+    endif()
+endif()
+
+# ------------------------------------------------------------------------------
+# Main framework library
+# ------------------------------------------------------------------------------
+
+add_library(infini_train STATIC ${SRC})
+target_link_libraries(infini_train
+    PUBLIC
+        glog
+        gflags
+        infini_train_cpu_kernels
+)

if(USE_CUDA)
-    add_compile_definitions(USE_CUDA=1)
-    enable_language(CUDA)
-    find_package(CUDAToolkit REQUIRED)
-    include_directories(${CUDAToolkit_INCLUDE_DIRS})
-
-    # enable CUDA-related compilation options
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
-    file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
-    add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
-    set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
-    target_link_libraries(infini_train_cuda_kernels glog CUDA::cudart CUDA::cublas CUDA::cuda_driver)
-
-    add_library(infini_train STATIC ${SRC})
-    target_link_libraries(infini_train glog gflags "-Wl,--whole-archive" infini_train_cpu_kernels infini_train_cuda_kernels "-Wl,--no-whole-archive")
-
-    if (USE_NCCL)
-        message(STATUS "Add USE_NCCL, use NCCL with CUDA")
-        list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
-        find_package(NCCL REQUIRED)
-        add_compile_definitions(USE_NCCL=1)
-        target_link_libraries(infini_train nccl)
-    endif()
-else()
-    add_library(infini_train STATIC ${SRC})
-    target_link_libraries(infini_train glog gflags "-Wl,--whole-archive" infini_train_cpu_kernels "-Wl,--no-whole-archive")
+    # infini_train contains cuda runtime wrappers (*.cc) like cuda_blas_handle.cc/cuda_guard.cc
+    # Those may need CUDA runtime/driver/cublas symbols at final link, so attach them here too.
+    target_link_libraries(infini_train
+        PUBLIC
+            infini_train_cuda_kernels
+            CUDA::cudart
+            CUDA::cublas
+            CUDA::cuda_driver
+    )
+
+    if(USE_NCCL)
+        # If your core library code also directly references NCCL symbols (not only kernels),
+        # keep this. Otherwise it's harmless.
+        target_link_libraries(infini_train PUBLIC nccl)
+    endif()
endif()
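The find_package(NCCL REQUIRED) calls above resolve through a custom find module in cmake/ (prepended to CMAKE_MODULE_PATH); that module is outside this diff. A minimal sketch of what such a cmake/FindNCCL.cmake could look like — the hint variables and search paths here are assumptions, not the repository's actual module:

# Sketch of cmake/FindNCCL.cmake (illustrative, not part of this PR)
find_path(NCCL_INCLUDE_DIR nccl.h
    HINTS $ENV{NCCL_ROOT}/include /usr/local/cuda/include)
find_library(NCCL_LIBRARY nccl
    HINTS $ENV{NCCL_ROOT}/lib /usr/local/cuda/lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY)

if(NCCL_FOUND AND NOT TARGET nccl)
    # Imported target named "nccl", matching target_link_libraries(... nccl) above
    add_library(nccl UNKNOWN IMPORTED)
    set_target_properties(nccl PROPERTIES
        IMPORTED_LOCATION "${NCCL_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}")
endif()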

+# ------------------------------------------------------------------------------
+# Helper: link libraries in a group to fix static lib one-pass resolution
+# (THIS is what fixes "undefined reference" from cuda_kernels -> core symbols)
+# ------------------------------------------------------------------------------
+function(link_infini_train_exe target_name)
+    if(USE_CUDA)
+        target_link_libraries(${target_name} PRIVATE
+            "-Wl,--start-group"
+            "-Wl,--whole-archive"
+            infini_train
+            infini_train_cpu_kernels
+            infini_train_cuda_kernels
+            "-Wl,--no-whole-archive"
+            "-Wl,--end-group"
+        )
+    else()
+        target_link_libraries(${target_name} PRIVATE
+            "-Wl,--start-group"
+            "-Wl,--whole-archive"
+            infini_train
+            infini_train_cpu_kernels
+            "-Wl,--no-whole-archive"
+            "-Wl,--end-group"
+        )
+    endif()
+endfunction()

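The raw -Wl,--start-group/--whole-archive flags in the helper above are GNU-ld specific: --whole-archive keeps otherwise-unreferenced objects (presumably kernels that self-register via static initializers), and the group lets the linker re-scan for circular symbol references. Since the project already requires CMake 3.28, the same intent could be written portably with link features (available since CMake 3.24); a sketch, assuming a hypothetical target app:

# Re-scan the group until circular symbol references between the archives resolve:
target_link_libraries(app PRIVATE
    "$<LINK_GROUP:RESCAN,infini_train,infini_train_cpu_kernels>")
# Force every object in the kernels archive to be kept (static registrars survive):
target_link_libraries(app PRIVATE
    "$<LINK_LIBRARY:WHOLE_ARCHIVE,infini_train_cuda_kernels>")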
+# ------------------------------------------------------------------------------
# Examples
+# ------------------------------------------------------------------------------

-add_executable(mnist example/mnist/main.cc example/mnist/dataset.cc example/mnist/net.cc)
-target_link_libraries(mnist infini_train)
+add_executable(mnist
+    example/mnist/main.cc
+    example/mnist/dataset.cc
+    example/mnist/net.cc
+)
+link_infini_train_exe(mnist)

-add_executable(gpt2 example/gpt2/main.cc example/common/tiny_shakespeare_dataset.cc example/common/utils.cc example/gpt2/net.cc example/common/tokenizer.cc)
-target_link_libraries(gpt2 infini_train)
+add_executable(gpt2
+    example/gpt2/main.cc
+    example/common/tiny_shakespeare_dataset.cc
+    example/common/utils.cc
+    example/gpt2/net.cc
+    example/common/tokenizer.cc
+)
+link_infini_train_exe(gpt2)

-add_executable(llama3 example/llama3/main.cc example/common/tiny_shakespeare_dataset.cc example/common/utils.cc example/llama3/net.cc example/common/tokenizer.cc)
-target_link_libraries(llama3 infini_train)
+add_executable(llama3
+    example/llama3/main.cc
+    example/common/tiny_shakespeare_dataset.cc
+    example/common/utils.cc
+    example/llama3/net.cc
+    example/common/tokenizer.cc
+)
+link_infini_train_exe(llama3)
+# Tools
+add_subdirectory(tools/infini_run)
+set_target_properties(infini_run PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

# Tests
add_executable(test_hook test/hook/test_hook.cc)
target_link_libraries(test_hook infini_train)

add_executable(test_precision_check test/hook/test_precision_check.cc)
target_link_libraries(test_precision_check infini_train)

-add_subdirectory(tools/infini_run)
-
-set_target_properties(infini_run PROPERTIES
-    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
-)

7 changes: 5 additions & 2 deletions example/common/tokenizer.cc
@@ -10,6 +10,9 @@
#include "glog/logging.h"

#include "example/common/utils.h"
#include "infini_train/include/nn/functional.h"
#include "infini_train/include/nn/modules/module.h"
#include "infini_train/include/tensor.h"

namespace infini_train {

@@ -103,7 +106,7 @@ std::string Tokenizer::Decode(uint32_t token_id) const {
}

void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_size, uint32_t sequence_length,
-                              uint32_t text_length, const Device *device) const {
+                              uint32_t text_length, Device device) const {
std::vector<int64_t> dims;
dims.assign({batch_size, sequence_length});
// x_tensor (FLAGS_batch_size, FLAGS_sequence_length) eq:(4, 64)
@@ -121,7 +124,7 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz
uint64_t kRngState = 0; // fixed seed for the sampling RNG; self-initialization reads an indeterminate value
LOG(INFO) << "start generate text:";

-    const auto *cpu_device = DeviceManager::Instance()->GetDefaultDevice();
+    auto cpu_device = Device();
for (int t = prompt_len; t < text_length; ++t) {
x = std::make_shared<infini_train::Tensor>(x->To(device)); // CPU->calc device
// TODO(jym): use no_grad forward later
10 changes: 4 additions & 6 deletions example/common/tokenizer.h
@@ -1,15 +1,13 @@
#include <cctype>
#include <cstdint>
#include <memory>
#include <vector>

#include "infini_train/include/device.h"
#include "infini_train/include/nn/functional.h"
#include "infini_train/include/nn/modules/module.h"
#include "infini_train/include/tensor.h"

namespace infini_train {

+namespace nn {
+class Module;
+}
class Tokenizer {
public:
enum class Version : uint32_t {
@@ -22,7 +20,7 @@ class Tokenizer {
std::string Decode(uint32_t token_id) const;

void GenerateText(infini_train::nn::Module &model, uint32_t batch_size, uint32_t sequence_length,
-                      uint32_t text_length, const Device *device) const;
+                      uint32_t text_length, Device device) const;

uint32_t GetEndToken() const { return eot_token_; };
