Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sgl-kernel/csrc/cpu/bmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ void bmm_kernel_impl(
// out : [B, M, N]
// scale: [] 0-dim tensor for per tensor quant
//
void bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni,
std::optional<at::Tensor>& scale) {
void bmm_cpu(
at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, const std::optional<at::Tensor>& scale) {
RECORD_FUNCTION("sgl-kernel::bmm_cpu", std::vector<c10::IValue>({out, mat1, mat2}));

auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
Expand Down
4 changes: 2 additions & 2 deletions sgl-kernel/csrc/cpu/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,8 @@ at::Tensor convert_weight_packed(at::Tensor& weight) {
// bias : [N]
// out : [M, N]
//
at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2,
std::optional<at::Tensor>& bias, bool is_vnni) {
at::Tensor
weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, const std::optional<at::Tensor>& bias, bool is_vnni) {
RECORD_FUNCTION(
"sgl-kernel::weight_packed_linear", std::vector<c10::IValue>({mat1, mat2, bias}));

Expand Down
11 changes: 8 additions & 3 deletions sgl-kernel/csrc/cpu/gemm_fp8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,14 @@ void tinygemm_kernel(
INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16);
INSTANTIATE_TINYGEMM_TEMPLATE(at::Half);

at::Tensor fp8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2,
std::vector<int64_t> block_size, std::optional<at::Tensor>& bias,
at::ScalarType out_dtype, bool is_vnni) {
at::Tensor fp8_scaled_mm_cpu(
at::Tensor& mat1,
at::Tensor& mat2,
at::Tensor& scales2,
std::vector<int64_t> block_size,
const std::optional<at::Tensor>& bias,
at::ScalarType out_dtype,
bool is_vnni) {
RECORD_FUNCTION("sgl-kernel::fp8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales2, block_size, bias}));

auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
Expand Down
20 changes: 15 additions & 5 deletions sgl-kernel/csrc/cpu/gemm_int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,9 +310,14 @@ std::tuple<at::Tensor, at::Tensor> per_token_quant_int8_cpu(at::Tensor& A) {
// bias : [N]
// out : [M, N]
//
at::Tensor int8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2,
at::Tensor& scales1, at::Tensor& scales2,
std::optional<at::Tensor>& bias, at::ScalarType out_dtype, bool is_vnni) {
at::Tensor int8_scaled_mm_cpu(
at::Tensor& mat1,
at::Tensor& mat2,
at::Tensor& scales1,
at::Tensor& scales2,
const std::optional<at::Tensor>& bias,
at::ScalarType out_dtype,
bool is_vnni) {
RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales1, scales2, bias}));

auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
Expand Down Expand Up @@ -363,8 +368,13 @@ at::Tensor int8_scaled_mm_cpu(at::Tensor& mat1, at::Tensor& mat2,
}

// fused `per_token_quant_int8_cpu` and `int8_scaled_mm_cpu`
at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2,
std::optional<at::Tensor>& bias, at::ScalarType out_dtype, bool is_vnni) {
at::Tensor int8_scaled_mm_with_quant(
at::Tensor& mat1,
at::Tensor& mat2,
const at::Tensor& scales2,
const std::optional<at::Tensor>& bias,
at::ScalarType out_dtype,
bool is_vnni) {
RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales2, bias}));

auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
Expand Down
18 changes: 9 additions & 9 deletions sgl-kernel/csrc/cpu/moe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -925,10 +925,10 @@ at::Tensor fused_experts_cpu(
at::Tensor& topk_ids,
bool inplace,
bool use_int8_w8a8,
std::optional<at::Tensor>& w1_scale,
std::optional<at::Tensor>& w2_scale,
std::optional<at::Tensor>& a1_scale,
std::optional<at::Tensor>& a2_scale,
const std::optional<at::Tensor>& w1_scale,
const std::optional<at::Tensor>& w2_scale,
const std::optional<at::Tensor>& a1_scale,
const std::optional<at::Tensor>& a2_scale,
bool is_vnni) {
RECORD_FUNCTION(
"sgl-kernel::fused_experts_cpu", std::vector<c10::IValue>({hidden_states, w1, w2, topk_weights, topk_ids}));
Expand Down Expand Up @@ -1117,11 +1117,11 @@ at::Tensor shared_expert_cpu(
bool inplace,
bool use_int8_w8a8,
bool use_fp8_w8a16,
std::optional<at::Tensor>& w1_scale,
std::optional<at::Tensor>& w2_scale,
std::optional<std::vector<int64_t>> block_size,
std::optional<at::Tensor>& a1_scale,
std::optional<at::Tensor>& a2_scale,
const std::optional<at::Tensor>& w1_scale,
const std::optional<at::Tensor>& w2_scale,
const std::optional<std::vector<int64_t>> block_size,
const std::optional<at::Tensor>& a1_scale,
const std::optional<at::Tensor>& a2_scale,
bool is_vnni) {
RECORD_FUNCTION("sgl-kernel::shared_expert_cpu", std::vector<c10::IValue>({hidden_states, w1, w2}));

Expand Down
25 changes: 14 additions & 11 deletions sgl-kernel/csrc/cpu/qkv_proj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,15 +320,19 @@ void rotary_emb_kernel_impl(

} // anonymous namespace

extern at::Tensor weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2,
std::optional<at::Tensor>& bias, bool is_vnni);
extern at::Tensor
weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, const std::optional<at::Tensor>& bias, bool is_vnni);

extern at::Tensor int8_scaled_mm_with_quant(at::Tensor& mat1, at::Tensor& mat2, at::Tensor& scales2,
std::optional<at::Tensor>& bias, at::ScalarType out_dtype, bool is_vnni);

extern void bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni,
std::optional<at::Tensor>& scale);
extern at::Tensor int8_scaled_mm_with_quant(
at::Tensor& mat1,
at::Tensor& mat2,
const at::Tensor& scales2,
const std::optional<at::Tensor>& bias,
at::ScalarType out_dtype,
bool is_vnni);

extern void
bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, const std::optional<at::Tensor>& scale);

// NB: shapes in DeepSeek R1
//
Expand All @@ -352,11 +356,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope(
at::Tensor& cos_sin_cache,
double eps,
bool use_int8_w8a8,
std::optional<at::Tensor>& q_a_proj_scale,
std::optional<at::Tensor>& q_b_proj_scale,
std::optional<at::Tensor>& kv_a_proj_scale,
const std::optional<at::Tensor>& q_a_proj_scale,
const std::optional<at::Tensor>& q_b_proj_scale,
const std::optional<at::Tensor>& kv_a_proj_scale,
bool is_vnni) {

RECORD_FUNCTION("sgl-kernel::qkv_proj_with_rope", std::vector<c10::IValue>({
hidden_states, q_a_proj_weight, q_b_proj_weight, kv_a_proj_weight, w_kc}));

Expand Down
Loading
Loading