Skip to content

Commit b7a003c

Browse files
committed
complete hw06
1 parent 119ec63 commit b7a003c

File tree

5 files changed

+185
-72
lines changed

5 files changed

+185
-72
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
build
22
GNUmakefile
3+
benchmark

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
cmake_minimum_required(VERSION 3.10)
22

3-
set(CMAKE_CXX_STANDARD 17)
3+
set(CMAKE_CXX_STANDARD 17)
44
set(CMAKE_BUILD_TYPE Release)
55

66
project(main LANGUAGES CXX)
77

8+
# set(BUILD_SHARED_LIBS 1)
9+
# add_subdirectory(benchmark)
810
add_executable(main main.cpp)
911

1012
#find_package(OpenMP REQUIRED)
1113
#target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
1214

1315
find_package(TBB REQUIRED)
14-
target_link_libraries(main PUBLIC TBB::tbb)
16+
target_link_libraries(main PUBLIC tbb)
1517

1618
#find_package(benchmark REQUIRED)
1719
#target_link_libraries(main PUBLIC benchmark::benchmark)

main.cpp

Lines changed: 150 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,69 @@
55
#include <numeric>
66
#include <algorithm>
77
#include "ticktock.h"
8+
#include "tbb/tbb.h"
9+
#include <atomic>
10+
#include "pod.h"
811

12+
//显式定义线程数
13+
#define NUM_THREADS 8
914
// TODO: 并行化所有这些 for 循环
1015

1116
template <class T, class Func>
1217
std::vector<T> fill(std::vector<T> &arr, Func const &func) {
1318
TICK(fill);
14-
for (size_t i = 0; i < arr.size(); i++) {
15-
arr[i] = func(i);
16-
}
19+
// for (size_t i = 0; i < arr.size(); i++) {
20+
// arr[i] = func(i);
21+
// }
22+
tbb::parallel_for(
23+
tbb::blocked_range<size_t>(0,arr.size()),
24+
[&](tbb::blocked_range<size_t> r){
25+
for ( size_t i=r.begin();i<r.end();i++){
26+
arr[i]=func(i);
27+
}
28+
}
29+
);
1730
TOCK(fill);
1831
return arr;
1932
}
2033

2134
template <class T>
2235
void saxpy(T a, std::vector<T> &x, std::vector<T> const &y) {
2336
TICK(saxpy);
24-
for (size_t i = 0; i < x.size(); i++) {
25-
x[i] = a * x[i] + y[i];
26-
}
37+
// for (size_t i = 0; i < x.size(); i++) {
38+
// x[i] = a * x[i] + y[i];
39+
// }
40+
tbb::parallel_for(
41+
tbb::blocked_range<size_t> (0 , x.size()),
42+
[&](tbb::blocked_range<size_t> r){
43+
for ( size_t i=r.begin();i<r.end();i++){
44+
x[i] = a * x[i] + y[i];
45+
}
46+
}
47+
);
48+
2749
TOCK(saxpy);
2850
}
2951

3052
template <class T>
3153
T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
3254
TICK(sqrtdot);
33-
T ret = 0;
34-
for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
35-
ret += x[i] * y[i];
36-
}
55+
// T ret = 0;
56+
// for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
57+
// ret += x[i] * y[i];
58+
// }
59+
T ret=tbb::parallel_reduce(
60+
tbb::blocked_range<size_t>(0,std::min(x.size(), y.size())),T(0),
61+
[&](tbb::blocked_range<size_t> r,T local_ret){
62+
for (size_t i=r.begin();i<r.end();i++){
63+
local_ret+=x[i]*y[i];
64+
}
65+
return local_ret;
66+
},
67+
[](T a,T b){
68+
return a+b;
69+
}
70+
);
3771
ret = std::sqrt(ret);
3872
TOCK(sqrtdot);
3973
return ret;
@@ -42,48 +76,135 @@ T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
4276
template <class T>
4377
T minvalue(std::vector<T> const &x) {
4478
TICK(minvalue);
45-
T ret = x[0];
46-
for (size_t i = 1; i < x.size(); i++) {
47-
if (x[i] < ret)
48-
ret = x[i];
49-
}
79+
// T ret = x[0];
80+
// for (size_t i = 1; i < x.size(); i++) {
81+
// if (x[i] < ret)
82+
// ret = x[i];
83+
// }
84+
T ret=tbb::parallel_reduce(
85+
tbb::blocked_range<size_t>(0,x.size()),x[0],
86+
[&](tbb::blocked_range<size_t> r,T local_ret){
87+
for(size_t i=r.begin();i<r.end();i++){
88+
if(x[i]<local_ret)
89+
local_ret=x[i];
90+
}
91+
return local_ret;
92+
},
93+
[](T a,T b){
94+
return a<b?a:b;
95+
}
96+
);
5097
TOCK(minvalue);
5198
return ret;
5299
}
53100

54101
template <class T>
55-
std::vector<T> magicfilter(std::vector<T> const &x, std::vector<T> const &y) {
102+
std::vector<pod<T>> magicfilter(std::vector<T> const &x, std::vector<T> const &y) {
56103
TICK(magicfilter);
57-
std::vector<T> res;
58-
for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
59-
if (x[i] > y[i]) {
60-
res.push_back(x[i]);
61-
} else if (y[i] > x[i] && y[i] > 0.5f) {
62-
res.push_back(y[i]);
63-
res.push_back(x[i] * y[i]);
104+
// std::vector<T> res;
105+
// for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
106+
// if (x[i] > y[i]) {
107+
// res.push_back(x[i]);
108+
// } else if (y[i] > x[i] && y[i] > 0.5f) {
109+
// res.push_back(y[i]);
110+
// res.push_back(x[i] * y[i]);
111+
// }
112+
// }
113+
std::vector<pod<T>> res(2*std::min(x.size(), y.size()));
114+
std::atomic<size_t> index{0};
115+
tbb::parallel_for(
116+
tbb::blocked_range<size_t>(0,std::min(x.size(), y.size())),
117+
[&](tbb::blocked_range<size_t> r){
118+
std::vector<pod<T>> local_res;
119+
// local_res.reserve(y.size()/NUM_THREADS);
120+
for(size_t i=r.begin();i<r.end();i++){
121+
if (x[i] > y[i]) {
122+
local_res.push_back(x[i]);
123+
} else if (y[i] > x[i] && y[i] > 0.5f) {
124+
local_res.push_back(y[i]);
125+
local_res.push_back(x[i] * y[i]);
126+
}
127+
}
128+
int beg=index.fetch_add(local_res.size());
129+
std::memcpy(&res[beg],&local_res[0],local_res.size());
64130
}
65-
}
131+
);
132+
res.resize(index);
66133
TOCK(magicfilter);
67134
return res;
68135
}
69136

70137
template <class T>
71138
T scanner(std::vector<T> &x) {
72139
TICK(scanner);
73-
T ret = 0;
74-
for (size_t i = 0; i < x.size(); i++) {
75-
ret += x[i];
76-
x[i] = ret;
140+
// T ret = 0;
141+
// for (size_t i = 0; i < x.size(); i++) {
142+
// ret += x[i];
143+
// x[i] = ret;
144+
// }
145+
146+
//实测下面的手动划分task方式比auto_partiioner的parallel_scan更快
147+
// float ret = tbb::parallel_scan(tbb::blocked_range<size_t>(0, x.size()), (float)0,
148+
// [&] (tbb::blocked_range<size_t> r, float local_res, auto is_final) {
149+
// for (size_t i = r.begin(); i < r.end(); i++) {
150+
// local_res += x[i];
151+
// if (is_final) {
152+
// x[i] = local_res;
153+
// }
154+
// }
155+
// return local_res;
156+
// }, [] (float x, float y) {
157+
// return x + y;
158+
// });
159+
160+
//手动划分任务区间
161+
tbb::task_group tg;
162+
std::vector<T> local_res(NUM_THREADS);
163+
for (size_t k=0;k<NUM_THREADS;k++){
164+
size_t beg=k*(x.size()+NUM_THREADS-1)/NUM_THREADS;//应该向上取整
165+
size_t end=std::min((k+1)*(x.size()+NUM_THREADS-1)/NUM_THREADS,x.size());
166+
tg.run(
167+
[&,k,beg,end](){
168+
T tmp=0.f;
169+
for(size_t i=beg;i<end;i++){
170+
tmp+=x[i];
171+
x[i]=tmp;
172+
}
173+
local_res[k]=tmp;
174+
}
175+
);
176+
}
177+
tg.wait();
178+
T pre_sum=0.f;
179+
for (size_t k=0;k<NUM_THREADS;k++){
180+
pre_sum+=local_res[k];
181+
local_res[k]=pre_sum;
77182
}
183+
for(size_t k=1;k<NUM_THREADS;k++){
184+
size_t beg=k*(x.size()+NUM_THREADS-1)/NUM_THREADS;
185+
size_t end=std::min((k+1)*(x.size()+NUM_THREADS-1)/NUM_THREADS,x.size());
186+
tg.run(
187+
[&,k,beg,end](){
188+
for(size_t i=beg;i<end;i++){
189+
x[i]+=local_res[k-1];
190+
}
191+
}
192+
);
193+
}
194+
tg.wait();
195+
196+
78197
TOCK(scanner);
79-
return ret;
198+
return local_res[NUM_THREADS-1];
199+
// return ret;
80200
}
81201

82202
int main() {
83203
size_t n = 1<<26;
84204
std::vector<float> x(n);
85205
std::vector<float> y(n);
86206

207+
tbb::task_scheduler_init init(NUM_THREADS);
87208
fill(x, [&] (size_t i) { return std::sin(i); });
88209
fill(y, [&] (size_t i) { return std::cos(i); });
89210

run.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
#!/bin/sh
1+
#!/bin/bash
2+
if [[ $1 = "clean" && -d build ]];then
3+
rm -rf build
4+
fi
25
set -e
36
cmake -B build
4-
cmake --build build
7+
cmake --build build -- -j 4
58
build/main

score.md

Lines changed: 25 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,41 @@
11
# 原版
22

3-
fill: 0.691299s
4-
fill: 0.694203s
5-
saxpy: 0.0268882s
6-
sqrtdot: 0.0655007s
3+
fill: 0.728694s
4+
fill: 0.74406s
5+
saxpy: 0.0409412s
6+
sqrtdot: 0.0724723s
77
5165.4
8-
minvalue: 0.0654602s
8+
minvalue: 0.0691992s
99
-1.11803
10-
magicfilter: 0.280727s
10+
magicfilter: 0.371052s
1111
55924034
12-
scanner: 0.0651282s
12+
scanner: 0.0702286s
1313
6.18926e+07
1414

1515
# 2
1616

17-
fill: 0.135927s
18-
fill: 0.135436s
19-
saxpy: 0.0261193s
20-
sqrtdot: 0.0166558s
17+
fill: 0.110683s
18+
fill: 0.10929s
19+
saxpy: 0.0116205s
20+
sqrtdot: 0.011483s
2121
5792.62
22-
minvalue: 0.00855201s
22+
minvalue: 0.00942511s
2323
-1.11803
24-
magicfilter: 0.0343181s
24+
magicfilter: 0.0292501s
2525
55924034
26-
scanner: 0.0292899s
27-
6.19238e+07
26+
scanner: 0.0187589s
27+
6.19048e+07
2828

2929
# 3
30-
31-
fill: 0.151911s
32-
fill: 0.149576s
33-
saxpy: 0.0256344s
34-
sqrtdot: 0.0161882s
35-
5792.61
36-
minvalue: 0.00839197s
37-
-1.11803
38-
magicfilter: 0.174838s
39-
55924034
40-
scanner: 0.0305014s
41-
6.19266e+07
42-
43-
# 4
44-
45-
fill: 0.135299s
46-
fill: 0.135698s
47-
saxpy: 0.0259649s
48-
sqrtdot: 0.016133s
49-
5792.63
50-
minvalue: 0.0083628s
30+
只是针对parallel scan,发现手动划分task并行比parallel_scan更快,应该是我没找到最佳的partitioner,但是我测试了好几种都是手动划分更快
31+
fill: 0.111643s
32+
fill: 0.113284s
33+
saxpy: 0.0120721s
34+
sqrtdot: 0.0143203s
35+
5792.62
36+
minvalue: 0.00966027s
5137
-1.11803
52-
magicfilter: 0.0378731s
38+
magicfilter: 0.0297295s
5339
55924034
54-
scanner: 0.0257618s
55-
6.19406e+07
40+
scanner: 0.016231s
41+
6.19332e+07

0 commit comments

Comments
 (0)