Skip to content
This repository was archived by the owner on Dec 18, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ project(XeTLA)
include(CTest)
enable_testing()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/tools/cmake")
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/tools/cmake")
find_package(MKL REQUIRED)

# debug option
Expand All @@ -22,9 +22,12 @@ if (${LOG} STREQUAL "on")
add_definitions(-DLOG_PRINT)
endif ()

add_compile_options(-fsycl -fp-model=precise -Wall -Wextra)
add_compile_options(-fsycl -ffp-model=precise -Wall -Wextra)
add_link_options(-fsycl -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm)
link_libraries(-lgtest -lgtest_main)

# Examples and tests are not needed when another project uses the library
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
add_subdirectory(tests)
add_subdirectory(examples)
endif()
2 changes: 1 addition & 1 deletion examples/01_gemm_universal/gemm_universal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ void gemm_universal_run(uint32_t iter) {
typename gemm_op_t::arguments_t gemm_arg(matrix_m, matrix_k, matrix_n, A,
matrix_k, B, matrix_n, C, matrix_n, Acc, Cnt);

cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);
if (!gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
<< std::endl;
Expand Down
6 changes: 3 additions & 3 deletions examples/02_basic_gemm/basic_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ void basic_gemm_run(uint32_t iter) {
uint32_t ldc = matrix_n;

// Ndrange and workgroup shape
cl::sycl::range<3> group_range {1, group_range_m, group_range_n};
cl::sycl::range<3> local_range {1, thread_range_m, thread_range_n};
sycl::range<3> group_range {1, group_range_m, group_range_n};
sycl::range<3> local_range {1, thread_range_m, thread_range_n};

cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
sycl::nd_range<3> nd_range(group_range * local_range, local_range);

constexpr uint32_t warmup = 10;
long ops = 2 * static_cast<long>(matrix_m) * matrix_n * matrix_k;
Expand Down
4 changes: 2 additions & 2 deletions examples/03_gemm_relu_bias/gemm_relu_bias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "tests/utils/utils.hpp"
#include "xetla.hpp"

using namespace cl::sycl;
using namespace sycl;
using namespace gpu::xetla;
using namespace gpu;

Expand Down Expand Up @@ -171,7 +171,7 @@ void gemm_relu_bias_run(uint32_t iter) {
// [ReLuBias] assign epilogue_args to gemm_op_t::arguments_t
typename gemm_op_t::arguments_t arg(matrix_m, matrix_k, matrix_n, A,
matrix_k, B, matrix_n, C, matrix_n, epilogue_args);
cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(arg);
sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(arg);
if (!gemm_op_t::can_implement(arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
<< std::endl;
Expand Down
4 changes: 2 additions & 2 deletions examples/04_gemm_polynomial/gemm_polynomial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

#include "gemm_polynomial.hpp"

using namespace cl::sycl;
using namespace sycl;
using namespace gpu::xetla;

template <typename data_type_a, typename data_type_b, typename data_type_c,
Expand Down Expand Up @@ -179,7 +179,7 @@ void gemm_polynomial_run(int iter) {
typename gemm_op_t::arguments_t gemm_arg(matrix_m, matrix_k, matrix_n, A,
matrix_k, B, matrix_n, C, matrix_n, {epilogue_args});

cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);

if (!gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
Expand Down
2 changes: 1 addition & 1 deletion examples/05_batch_gemm/batch_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void batch_gemm_run(uint32_t iter) {
typename batch_gemm_op_t::arguments_t gemm_arg(batch_size, matrix_m,
matrix_k, matrix_n, A, matrix_k, B, matrix_n, C, matrix_n);

cl::sycl::nd_range<3> nd_range = batch_gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> nd_range = batch_gemm_op_t::get_nd_range(gemm_arg);
if (!batch_gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
<< std::endl;
Expand Down
16 changes: 8 additions & 8 deletions examples/05_batch_gemm/batch_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,36 +180,36 @@ class batch_gemm_t {

/// @brief Host helper function to get the expected local range under the current BATCH_GEMM config.
/// @return Expected local range.
static cl::sycl::range<3> get_local_range() {
static sycl::range<3> get_local_range() {
uint32_t local_range_m = (wg_tile_m + sg_tile_m - 1) / sg_tile_m;
uint32_t local_range_n = (wg_tile_n + sg_tile_n - 1) / sg_tile_n;
std::cout << "Local range: {" << 1 << ", " << local_range_m << ", "
<< local_range_n << "} \n";
assert(local_range_m * local_range_n <= 32);
return cl::sycl::range<3> {1, local_range_m, local_range_n};
return sycl::range<3> {1, local_range_m, local_range_n};
};

/// @brief Host helper function to get the expected group range under the current BATCH_GEMM config.
/// @param matrix_m Is the size of the m dimension of the matrix multiplication (m x k x n).
/// @param matrix_n Is the size of the n dimension of the matrix multiplication (m x k x n).
/// @return Expected group range.
static cl::sycl::range<3> get_group_range(
static sycl::range<3> get_group_range(
uint32_t batch_size, uint32_t matrix_m, uint32_t matrix_n) {
uint32_t group_range_m = (matrix_m + wg_tile_m - 1) / wg_tile_m;
uint32_t group_range_n = (matrix_n + wg_tile_n - 1) / wg_tile_n;
std::cout << "Group range: {" << batch_size << ", " << group_range_m
<< ", " << group_range_n << "} \n";
return cl::sycl::range<3> {batch_size, group_range_m, group_range_n};
return sycl::range<3> {batch_size, group_range_m, group_range_n};
};

/// @brief Host helper function to get the expected nd_range under the current BATCH_GEMM config.
/// @param args Is the BATCH_GEMM arguments for application-related runtime variables.
/// @return Expected nd_range.
static cl::sycl::nd_range<3> get_nd_range(arguments_t &args) {
cl::sycl::range<3> local_range = get_local_range();
cl::sycl::range<3> group_range = get_group_range(
static sycl::nd_range<3> get_nd_range(arguments_t &args) {
sycl::range<3> local_range = get_local_range();
sycl::range<3> group_range = get_group_range(
args.batch_size, args.matrix_m, args.matrix_n);
return cl::sycl::nd_range<3> {group_range * local_range, local_range};
return sycl::nd_range<3> {group_range * local_range, local_range};
};

/// @brief Check if the arguments can be implemented.
Expand Down
10 changes: 5 additions & 5 deletions examples/06_gemm_softmax/gemm_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include "xetla.hpp"

using namespace gpu::xetla;
using namespace cl::sycl;
using namespace sycl;

#define SIMD 32

Expand Down Expand Up @@ -151,9 +151,9 @@ void gemm_softmax_run(uint32_t iter) {
<< ", group_num_z: " << batch_num << "\n";
std::cout << "group_size_x: " << subgroup_range_n
<< ", group_size_y: " << subgroup_range_m << std::endl;
cl::sycl::range<3> group_range {batch_num, group_range_m, group_range_n};
cl::sycl::range<3> local_range {1, subgroup_range_m, subgroup_range_n};
cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
sycl::range<3> group_range {batch_num, group_range_m, group_range_n};
sycl::range<3> local_range {1, subgroup_range_m, subgroup_range_n};
sycl::nd_range<3> nd_range(group_range * local_range, local_range);

uint32_t warmup = 10;
long ops
Expand Down Expand Up @@ -303,7 +303,7 @@ void gemm_softmax_run(uint32_t iter) {
prof.add_gpu_event(gpu_event);
}
}
} catch (cl::sycl::exception const &e) {
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
FAIL();
}
Expand Down
4 changes: 2 additions & 2 deletions examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "multi_layer_perceptron.hpp"
#include "tests/utils/utils.hpp"

using namespace cl::sycl;
using namespace sycl;
using namespace gpu::xetla;

// MLP input size
Expand Down Expand Up @@ -243,7 +243,7 @@ void mlp_run(uint32_t iter) {
typename mlp_op_t::arguments_t mlp_arg(matrix_m, matrix_k, matrix_n,
matrix_m, matrix_n, matrix_l, A, matrix_k, W, matrix_n, B, matrix_n,
V, matrix_l, C, matrix_l);
cl::sycl::nd_range<3> nd_range = mlp_op_t::get_nd_range(mlp_arg);
sycl::nd_range<3> nd_range = mlp_op_t::get_nd_range(mlp_arg);

if (!mlp_op_t::can_implement(mlp_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
Expand Down
16 changes: 8 additions & 8 deletions examples/07_multi_layer_perceptron/multi_layer_perceptron.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ class multi_layer_perceptron_t {

/// @brief Host helper function to get the expected local range under the current MLP config.
/// @return Expected local range.
static cl::sycl::range<3> get_local_range() {
static sycl::range<3> get_local_range() {
// make sure first layer and second layer use same subgroup number.
static_assert(work_group_layer1_t::size == work_group_layer2_t::size,
"we should make sure first gemm and second gemm use same "
Expand All @@ -296,14 +296,14 @@ class multi_layer_perceptron_t {
std::cout << "Local range: {" << 1 << ", " << local_range_m << ", "
<< local_range_n << "} \n";
assert(local_range_m * local_range_n <= 32);
return cl::sycl::range<3> {1, local_range_m, local_range_n};
return sycl::range<3> {1, local_range_m, local_range_n};
};

/// @brief Host helper function to get the expected group range under the current MLP config.
/// @param matrix_m Is the size of the m dimension of the matrix multiplication (m x k x n).
/// @param matrix_n Is the size of the n dimension of the matrix multiplication (m x k x n).
/// @return Expected group range.
static cl::sycl::range<3> get_group_range(arguments_t &args) {
static sycl::range<3> get_group_range(arguments_t &args) {
// make sure first layer and second layer meet the condition to be fused.
static_assert(wg_tile_m_layer1 == wg_tile_m_layer2,
"first gemm and second gemm should have the same wg_tile_m");
Expand All @@ -320,16 +320,16 @@ class multi_layer_perceptron_t {
/ wg_tile_n_layer1;
std::cout << "Group range: {1"
<< ", " << group_range_m << ", " << group_range_n << "} \n";
return cl::sycl::range<3> {1, group_range_m, group_range_n};
return sycl::range<3> {1, group_range_m, group_range_n};
};

/// @brief Host helper function to get the expected nd_range under the current MLP config.
/// @param args Is the MLP arguments for application-related runtime variables.
/// @return Expected nd_range.
static cl::sycl::nd_range<3> get_nd_range(arguments_t &args) {
cl::sycl::range<3> local_range = get_local_range();
cl::sycl::range<3> group_range = get_group_range(args);
return cl::sycl::nd_range<3> {group_range * local_range, local_range};
static sycl::nd_range<3> get_nd_range(arguments_t &args) {
sycl::range<3> local_range = get_local_range();
sycl::range<3> group_range = get_group_range(args);
return sycl::nd_range<3> {group_range * local_range, local_range};
};

/// @brief Check if the arguments can be implemented.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include "tests/utils/utils.hpp"

using namespace gpu::xetla;
using namespace cl::sycl;
using namespace sycl;

#define SIMD 32

Expand Down Expand Up @@ -227,9 +227,9 @@ void sdp_fwd_run(uint32_t iter) {
<< ", group_num_z: " << batch_cnt << "\n";
std::cout << "group_size_x: " << subgroup_range_n
<< ", group_size_y: " << subgroup_range_m << std::endl;
cl::sycl::range<3> group_range {batch_cnt, group_range_m, group_range_n};
cl::sycl::range<3> local_range {1, subgroup_range_m, subgroup_range_n};
cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
sycl::range<3> group_range {batch_cnt, group_range_m, group_range_n};
sycl::range<3> local_range {1, subgroup_range_m, subgroup_range_n};
sycl::nd_range<3> nd_range(group_range * local_range, local_range);

constexpr uint32_t warmup = 10;
long ops = long(4 * batch_num * head_num * sequence_len) * sequence_len
Expand Down Expand Up @@ -476,7 +476,7 @@ void sdp_fwd_run(uint32_t iter) {
prof.add_gpu_event(gpu_event);
}
}
} catch (cl::sycl::exception const &e) {
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
FAIL();
}
Expand Down
8 changes: 4 additions & 4 deletions examples/09_gate_recurrent_unit/gate_recurrent_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "kernel_func.hpp"
#include "tests/utils/utils.hpp"

using namespace cl::sycl;
using namespace sycl;
using namespace gpu::xetla;

template <typename data_type>
Expand Down Expand Up @@ -342,10 +342,10 @@ void gru_run(uint32_t iter) {

//***********dpcpp runtime setup && buffer allocation start ************//

cl::sycl::range<3> group_range {1, (N + wg_tile_m - 1) / wg_tile_m, 1};
cl::sycl::range<3> local_range {1, (wg_tile_m + sg_tile_m - 1) / sg_tile_m,
sycl::range<3> group_range {1, (N + wg_tile_m - 1) / wg_tile_m, 1};
sycl::range<3> local_range {1, (wg_tile_m + sg_tile_m - 1) / sg_tile_m,
(wg_tile_n + sg_tile_n - 1) / sg_tile_n};
cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
sycl::nd_range<3> nd_range(group_range * local_range, local_range);

std::cout << "Launch kernel:\n";
std::cout << "group_num_x: " << 1
Expand Down
2 changes: 1 addition & 1 deletion examples/10_gemm_large_n/gemm_large_n.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ void gemm_large_n_run(uint32_t iter) {
typename gemm_op_t::arguments_t gemm_arg(matrix_m, matrix_k, matrix_n, A,
matrix_k, B, matrix_n, C, matrix_n);

cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(gemm_arg);
if (!gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
<< std::endl;
Expand Down
4 changes: 2 additions & 2 deletions examples/11_stream_k_gemm/stream_k_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ void stream_k_gemm_run(uint32_t iter) {
matrix_k, B, matrix_n, C, matrix_n, Acc, matrix_n, Cnt, size_cnt,
stream_k);

cl::sycl::nd_range<3> NDRange = gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> NDRange = gemm_op_t::get_nd_range(gemm_arg);

if (!gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
Expand Down Expand Up @@ -382,7 +382,7 @@ void stream_k_gemm_relu_biasadd_run(uint32_t iter) {
matrix_k, B, matrix_n, C, matrix_n, Acc, matrix_n, Cnt, size_cnt,
stream_k, epilogue_args);

cl::sycl::nd_range<3> NDRange = gemm_op_t::get_nd_range(gemm_arg);
sycl::nd_range<3> NDRange = gemm_op_t::get_nd_range(gemm_arg);

if (!gemm_op_t::can_implement(gemm_arg)) {
std::cout << "The arguments cannot be supported, aborting ... "
Expand Down
4 changes: 2 additions & 2 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR})

add_subdirectory(01_gemm_universal)
add_subdirectory(02_basic_gemm)
Expand Down
2 changes: 1 addition & 1 deletion include/common/core/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#pragma once

#include <version.hpp>
#include <CL/sycl.hpp>
#include <sycl/sycl.hpp>
#include <ext/intel/esimd.hpp>

template <class T>
Expand Down
2 changes: 1 addition & 1 deletion include/common/core/debug.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#pragma once

#include "common/core/common.hpp"
#include <CL/sycl.hpp>
#include <sycl/sycl.hpp>
#include <ext/intel/esimd.hpp>

namespace gpu::xetla {
Expand Down
2 changes: 1 addition & 1 deletion include/common/core/explicit_conv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,4 @@ xetla_cvt(xetla_vector<T_src, N> src) {

/// @} xetla_core_conv

} // namespace gpu::xetla
} // namespace gpu::xetla
2 changes: 1 addition & 1 deletion include/common/utils/dict.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,4 @@ struct dict_t {
G>::type;
};

} // namespace gpu::xetla
} // namespace gpu::xetla
2 changes: 1 addition & 1 deletion include/common/utils/fastmath.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,4 @@ struct FastDivMod {
}
};

} // namespace gpu::xetla
} // namespace gpu::xetla
2 changes: 1 addition & 1 deletion include/experimental/common/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@

#pragma once

#include "experimental/common/base_types.hpp"
#include "experimental/common/base_types.hpp"
2 changes: 1 addition & 1 deletion include/experimental/experimental.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
#include "experimental/common/common.hpp"
#include "experimental/group/group.hpp"
#include "experimental/kernel/kernel.hpp"
#include "experimental/subgroup/subgroup.hpp"
#include "experimental/subgroup/subgroup.hpp"
2 changes: 1 addition & 1 deletion include/experimental/group/fused_op/fused_op.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@
#include "experimental/group/fused_op/layer_norm_fused_op_bwd_xe.hpp"
#include "experimental/group/fused_op/layer_norm_fused_op_fwd_xe.hpp"
#include "experimental/group/fused_op/row_reduction_fused_op_api.hpp"
#include "experimental/group/fused_op/row_reduction_fused_op_xe.hpp"
#include "experimental/group/fused_op/row_reduction_fused_op_xe.hpp"
2 changes: 1 addition & 1 deletion include/experimental/group/group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@
#include "experimental/group/dropout_mask_gen.hpp"
#include "experimental/group/fused_op/fused_op.hpp"
#include "experimental/group/gemm/gemm.hpp"
#include "experimental/group/reduction/reduction.hpp"
#include "experimental/group/reduction/reduction.hpp"
2 changes: 1 addition & 1 deletion include/experimental/group/reduction/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
#pragma once

#include "experimental/group/reduction/reduction_api.hpp"
#include "experimental/group/reduction/row_reduce_store_xe.hpp"
#include "experimental/group/reduction/row_reduce_store_xe.hpp"
Loading