Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/excuter/op-mem-cuda/list.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
| cos | miaobyte | T3=cos(T1) | cos(tensor<float64|float32|float16|bfloat16> A)->(tensor<float64|float32|float16|bfloat16> C) |
| notequalscalar | miaobyte | T1!=scalar->mask | notequalscalar(tensor<any> A, var<any> scalar, var<float32> epsilon)->(tensor<bool> mask) |
| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor<any> A, var<any> scalar)->(tensor<any> C) |
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float64|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var<float32|int32> scalar, tensor<float64|float32> A)->(tensor<float64|float32> C) |
| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var<any> scalar, tensor<any> A)->(tensor<any> C) |
| less | miaobyte | mask=compare(T1, T2) | less(tensor<any> A, tensor<any> B)->(tensor<bool> mask) |
| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor<float64|float32> A, var<float64|int32> scalar)->(tensor<float64|float32> C) |
Expand Down
23 changes: 19 additions & 4 deletions excuter/op-mem-cuda/src/client/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ int main()
deepx::tf::TfFactory tf_factory;
register_all(tf_factory);



// 将op table输出到markdown文件
string docdir = "../../../doc/excuter/op-mem-cuda/";
std::ofstream md_file(docdir + "list.md");
Expand Down Expand Up @@ -68,13 +66,30 @@ int main()
{
opresp.error("op" + op.name + " not found");
server.resp(opresp.to_string());
cerr<<opresp.message<<endl;
cerr << opresp.message << endl;
continue;
}
(*src).init(op.name, op.args, op.returns);

memmutex.lock();
opresp.start_at = chrono::system_clock::now();
int ret = (*src).run(mem,opresp.message);
int ret = 0;
if ((*src).metadata.benchmark.repeat > 1)
{
for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
{
ret = (*src).run(mem, opresp.message);
if (ret != 0)
{
break;
}
}
}
else
{
ret = (*src).run(mem, opresp.message);
}

memmutex.unlock();
if (ret != 0)
{
Expand Down
2 changes: 1 addition & 1 deletion excuter/op-mem-cuda/src/client/tfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ namespace deepx::tf
// rpowscalar
tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
{
Param("scalar", DataCategory::Var, Precision::Float64 | Precision::Int32),
Param("scalar", DataCategory::Var, Precision::Float32 | Precision::Int32),
Param("A", DataCategory::Tensor, Precision::Float64 | Precision::Float32),
}),
vector<Param>(
Expand Down
22 changes: 18 additions & 4 deletions excuter/op-mem-ompsimd/src/client/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ int main()
client::udpserver server(8080);
deepx::tf::TfFactory tf_factory;
register_all(tf_factory);

// 将op table输出到markdown文件
string docdir = "../../../doc/excuter/op-mem-ompsimd/";
std::ofstream md_file(docdir + "list.md");
Expand Down Expand Up @@ -72,14 +72,28 @@ int main()
(*src).init(op.name, op.args, op.returns);
memmutex.lock();
opresp.start_at = chrono::system_clock::now();

int ret = (*src).run(mem,opresp.message);
int ret = 0;
if ((*src).metadata.benchmark.repeat > 1)
{
for (int i = 0; i < (*src).metadata.benchmark.repeat; i++)
{
ret = (*src).run(mem, opresp.message);
if (ret != 0)
{
break;
}
}
}
else
{
ret = (*src).run(mem, opresp.message);
}
memmutex.unlock();
if (ret != 0)
{
opresp.error(opresp.message);
server.resp(opresp.to_string());
cerr<<opresp.message<<endl;
cerr << opresp.message << endl;
continue;
}
opresp.finish("");
Expand Down
2 changes: 1 addition & 1 deletion excuter/op-mem-ompsimd/src/client/tfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ namespace deepx::tf
// rpowscalar author=miaobyte
tffactory.add_tf(std::make_shared<RpowScalar<miaobyte>>(vector<Param>(
{
Param("scalar", DataCategory::Var, Precision::Any),
Param("scalar", DataCategory::Var, Precision::Float32),
Param("A", DataCategory::Tensor, Precision::Any),
}),
vector<Param>(
Expand Down
41 changes: 23 additions & 18 deletions excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,29 @@ namespace deepx::tensorfunc
throw std::invalid_argument("A.shape could matmul with B.shape");
}
//TODO
//这里如果对二维矩阵运算,则omp并行不起来,因为C.shape.dim() - 2刚好=0
C.shape.rangeParallel(C.shape.dim() - 2, [&](const std::vector<int> &indices)
{
int aIdx=A.shape.linearat(indices);
int bIdx=B.shape.linearat(indices);
int cIdx=C.shape.linearat(indices);
int m=A.shape[-2];
int k=A.shape[-1];
int n=B.shape[-1];
for(int i=0;i<m;i++){
for(int j=0;j<n;j++){
T sum=0;
for(int l=0;l<k;l++){
sum+=A.data[aIdx+i*k+l]*B.data[bIdx+l*n+j];
}
C.data[cIdx+i*n+j]=sum;
}
} });
//这里需要进一步优化
// Matmul kernel, parallelised per output element: the callback receives the linear
// offset `idx` of one element of C together with its per-axis coordinates `indices`,
// and computes
//     C[..., i, j] = sum_l A[..., i, l] * B[..., l, j]
// with i = indices[rank-2] and j = indices[rank-1].
// tlv.get(0) / tlv.get(1) are per-thread scratch index vectors sized A.shape.dim() and
// B.shape.dim() (see the trailing initializer list) so no vector is allocated per element.
// NOTE(review): assumes A, B and C share the same rank (>= 2) and that the last two axes
// are stored row-major and contiguous — confirm against Shape::linearat.
C.shape.rangeParallel(C.shape.dim(), [&A,&B,&C](const int idx,const std::vector<int> &indices,ThreadLocalVectors &tlv) {
    const size_t rank=indices.size();
    const int k=A.shape[-1]; // shared inner dimension: columns of A, rows of B
    const int n=B.shape[-1]; // columns of B, i.e. the distance between consecutive rows of B

    // Offset of A[..., i, 0]: leading (batch) coordinates of the output, row i, column 0.
    // std::vector does not support negative indices, so the trailing axes are addressed via rank.
    std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin());
    tlv.get(0)[rank-2]=indices[rank-2];
    tlv.get(0)[rank-1]=0;
    const int aIdx=A.shape.linearat(tlv.get(0));

    // Offset of B[..., 0, j]: same leading coordinates, row 0, column j.
    std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin());
    tlv.get(1)[rank-2]=0;
    tlv.get(1)[rank-1]=indices[rank-1];
    const int bIdx=B.shape.linearat(tlv.get(1));

    // Dot product of row i of A with column j of B: step 1 along A's row, n along B's column.
    T sum=0;
    for(int l=0;l<k;l++){
        sum+=A.data[aIdx+l]*B.data[bIdx+l*n];
    }
    C.data[idx]=sum;
},{A.shape.dim(),B.shape.dim()});
}
};

Expand Down
40 changes: 13 additions & 27 deletions excuter/op-mem-ompsimd/src/deepx/tf/matmul.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#ifndef DEEPX_TF_MATMUL_HPP
#define DEEPX_TF_MATMUL_HPP

#include "deepx/tf/tf.hpp"
#include "deepx/dtype.hpp"
#include "deepx/dtype_ompsimd.hpp"
Expand All @@ -21,7 +21,7 @@ namespace deepx::tf
this->args = args;
this->returns = returns;
}

string math_formula() const override
{
return "T3=T1 @ T2";
Expand All @@ -30,7 +30,17 @@ namespace deepx::tf
{
return make_shared<MatMul<Author>>(*this);
}
int compute(shared_ptr<MemBase> mem, Precision a_type,string &error){

int run(shared_ptr<MemBase> mem, string &error) override
{
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
if (a_type != b_type || a_type != c_type)
{
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
return 1;
}
switch (a_type)
{
case Precision::Float64:
Expand All @@ -57,30 +67,6 @@ namespace deepx::tf
}
return 0;
}
int run(shared_ptr<MemBase> mem, string &error) override
{
Precision a_type = mem->gettensor(this->args[0].textvalue).get()->shape.dtype;
Precision b_type = mem->gettensor(this->args[1].textvalue).get()->shape.dtype;
Precision c_type = mem->gettensor(this->returns[0].textvalue).get()->shape.dtype;
if (a_type != b_type || a_type != c_type)
{
error = "Type mismatch: " + precision_str(a_type) + " != " + precision_str(b_type) + " != " + precision_str(c_type);
return 1;
}
if (metadata.benchmark.repeat > 0)
{
for (int i = 0; i < metadata.benchmark.repeat; i++)
{
if (compute(mem, a_type, error))
{
return 1;
}
}
}else{
return compute(mem, a_type, error);
}
return 0;
}
};
}

Expand Down
2 changes: 1 addition & 1 deletion front/py/deepx/nn/functional/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"sqrt","pow","exp","log",
"min","max",
"less","greater","equal","notequal",
"switch",
"switch","where",
"todtype",
"invert",
"matmul",
Expand Down
4 changes: 4 additions & 0 deletions front/py/deepx/nn/functional/elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,7 @@ def bool(input:Tensor)->Tensor:
from .leaffunc_elementwise import todtype
dest=newtensor(input.shape,dtype='bool',name=input.name)
return todtype(input,dest)

def where(condition:Tensor,x:Tensor,y:Tensor)->Tensor:
    """Choose between ``x`` and ``y`` according to ``condition``.

    Thin wrapper that forwards to ``switch_func``: the two candidates are
    packed into the tuple ``(x, y)`` and ``condition`` is passed as the
    selector. The result of ``switch_func`` is returned unchanged.

    NOTE(review): which candidate is taken where ``condition`` is true is
    decided entirely by ``switch_func`` -- confirm it matches the
    ``torch.where`` convention (true -> ``x``, false -> ``y``).
    """
    # Resolved at call time, inside the function body, rather than at module import.
    from .leaffunc_elementwise import switch_func
    candidates=(x,y)
    return switch_func(candidates,condition)
2 changes: 1 addition & 1 deletion front/py/deepx/nn/functional/leaffunc_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from .leaffunc_life import newtensor
from .authormap import defaultauthor

def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor:
outtensor=out
if isinstance(out,str) or out is None:
outshape=Shape.matmul(a.shape,b.shape)
Expand Down
2 changes: 1 addition & 1 deletion front/py/deepx/scheduler/client/udpconn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import select

class UDPConn:
def __init__(self, endpoint: str = "localhost:8080"):
def __init__(self, endpoint: str = "localhost:9090"):
# 解析endpoint
self._host, port_str = endpoint.split(':')
self._port = int(port_str)
Expand Down
24 changes: 15 additions & 9 deletions front/py/deepx/transformer/modeling_rope_utils.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
from typing import Tuple
import math
from deepx import arange,Tensor
from deepx import arange,Tensor,where

def _compute_default_rope_parameters(config:dict={
"base":10000.0,
"rope_theta":10000.0,
"head_dim":0,
"partial_rotary_factor":1.0,
}) -> Tuple[Tensor, float]:
dim = config.head_dim* config.partial_rotary_factor
partial_rotary_factor = config.get("partial_rotary_factor", 1.0)
dim = config["head_dim"]* partial_rotary_factor
# 计算逆频率
inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim))
base=config["rope_theta"]
inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
return inv_freq, 1.0

def _compute_llama3_parameters(config:dict={
"base":10000.0,
"rope_theta":10000.0,
"head_dim":0,
"partial_rotary_factor":1.0,
"factor":8,
Expand All @@ -25,18 +27,22 @@ def _compute_llama3_parameters(config:dict={
# Gets the default RoPE parameters
inv_freq, attention_factor = _compute_default_rope_parameters(config)

low_freq_wavelen = config.old_context_len / config.low_freq_factor
high_freq_wavelen = config.old_context_len / config.high_freq_factor
low_freq_factor = config["rope_scaling"]["low_freq_factor"] # `1` in the original implementation
high_freq_factor = config["rope_scaling"]["high_freq_factor"] # `4` in the original implementation
old_context_len = config["rope_scaling"]["original_max_position_embeddings"] # `8192` in the original implementation
low_freq_wavelen = old_context_len /low_freq_factor
high_freq_wavelen = old_context_len/ high_freq_factor

wavelen = 2 * math.pi / inv_freq
wavelen.print()
# wavelen < high_freq_wavelen: do nothing
# wavelen > low_freq_wavelen: divide by factor
inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq)
inv_freq_llama = where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq)
# otherwise: interpolate between the two, using a smooth factor
smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor)
smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama
is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
inv_freq_llama = where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)

return inv_freq_llama, attention_factor

Expand Down
4 changes: 4 additions & 0 deletions front/py/deepx/transformer/models/llama/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .embedding import *
# Names exported by a star-import of this package.
__all__ = ["LlamaRotaryEmbedding"]
4 changes: 2 additions & 2 deletions front/py/deepx/transformer/models/llama/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ def __init__(self,config:dict):
# 原始最大序列长度
self.original_max_seq_len = config["max_position_embeddings"]
# 旋转类型
self.rope_type=config["rope_scaling"]["type"]
self.rope_type=config["rope_scaling"]["rope_type"]
# 旋转初始化函数
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
# 旋转初始化函数
inv_freq, self.attention_scaling = self.rope_init_fn(self.config)
inv_freq, self.attention_scaling = self.rope_init_fn(config)
# 注册缓存
self.register_buffer("inv_freq", inv_freq, persistent=False)
# 原始旋转频率
Expand Down
24 changes: 11 additions & 13 deletions front/py/examples/2_ir/3_matmul.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
benchcnt=100

benchcnt=2000
shape=[4096,4096]
print()
from deepxutil.numpy import save_numpy
import numpy as np
np_T1 = np.random.randn(1024, 1024).astype(np.float32)
np_T2 = np.random.randn(1024, 1024).astype(np.float32)
np_T1 = np.random.randn(shape[0], shape[1]).astype(np.float32)
np_T2 = np.random.randn(shape[0], shape[1]).astype(np.float32)

npy_path = '/home/lipeng/model/deepxmodel/matmul/'
save_numpy(np_T1,npy_path+'t1')
Expand All @@ -13,35 +14,32 @@

import torch
import time
torch_t1 = torch.from_numpy(np_T1)
torch_t2 = torch.from_numpy(np_T2)
torch_t1 = torch.from_numpy(np_T1).to(torch.float32).to('cuda')
torch_t2 = torch.from_numpy(np_T2).to(torch.float32).to('cuda')
# warmup
_=torch_t1 @ torch_t2

torch_start = time.time()
for i in range(benchcnt):
torch_t3 = torch_t1 @ torch_t2

print(torch_t3)

torch_end = time.time()
print(f"PyTorch time: {torch_end - torch_start} seconds")
############-------DEEPX-------################

from deepx import uniform, matmul, zeros,load
from deepx.nn.functional import save,load
from deepx import matmul, zeros,load
print()

t1 = load(npy_path+'t1')
t2 = load(npy_path+'t2')
t3= zeros((1024,1024),dtype='float32',name="t3")
t3= zeros(tuple(shape),dtype='float32',name="t3")
from deepx.nn.functional import defaultauthor
defaultauthor['matmul']='miaobyte'
# warmup
matmul(t1,t2,out=t3)

deepx_start = time.time()
matmul(t1,t2,out=t3,bench=(benchcnt))
t3.print()
matmul(t1,t2,out=t3,bench=benchcnt)
deepx_end = time.time()
print(f"DeepX time: {deepx_end - deepx_start} seconds")

Expand Down
Loading
Loading