55#include < numeric>
66#include < algorithm>
77#include " ticktock.h"
8+ #include " tbb/tbb.h"
9+ #include < atomic>
10+ #include " pod.h"
811
12+ // 显式定义线程数
13+ #define NUM_THREADS 8
914// TODO: 并行化所有这些 for 循环
1015
1116template <class T , class Func >
1217std::vector<T> fill (std::vector<T> &arr, Func const &func) {
1318 TICK (fill);
14- for (size_t i = 0 ; i < arr.size (); i++) {
15- arr[i] = func (i);
16- }
19+ // for (size_t i = 0; i < arr.size(); i++) {
20+ // arr[i] = func(i);
21+ // }
22+ tbb::parallel_for (
23+ tbb::blocked_range<size_t >(0 ,arr.size ()),
24+ [&](tbb::blocked_range<size_t > r){
25+ for ( size_t i=r.begin ();i<r.end ();i++){
26+ arr[i]=func (i);
27+ }
28+ }
29+ );
1730 TOCK (fill);
1831 return arr;
1932}
2033
2134template <class T >
2235void saxpy (T a, std::vector<T> &x, std::vector<T> const &y) {
2336 TICK (saxpy);
24- for (size_t i = 0 ; i < x.size (); i++) {
25- x[i] = a * x[i] + y[i];
26- }
37+ // for (size_t i = 0; i < x.size(); i++) {
38+ // x[i] = a * x[i] + y[i];
39+ // }
40+ tbb::parallel_for (
41+ tbb::blocked_range<size_t > (0 , x.size ()),
42+ [&](tbb::blocked_range<size_t > r){
43+ for ( size_t i=r.begin ();i<r.end ();i++){
44+ x[i] = a * x[i] + y[i];
45+ }
46+ }
47+ );
48+
2749 TOCK (saxpy);
2850}
2951
3052template <class T >
3153T sqrtdot (std::vector<T> const &x, std::vector<T> const &y) {
3254 TICK (sqrtdot);
33- T ret = 0 ;
34- for (size_t i = 0 ; i < std::min (x.size (), y.size ()); i++) {
35- ret += x[i] * y[i];
36- }
55+ // T ret = 0;
56+ // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
57+ // ret += x[i] * y[i];
58+ // }
59+ T ret=tbb::parallel_reduce (
60+ tbb::blocked_range<size_t >(0 ,std::min (x.size (), y.size ())),T (0 ),
61+ [&](tbb::blocked_range<size_t > r,T local_ret){
62+ for (size_t i=r.begin ();i<r.end ();i++){
63+ local_ret+=x[i]*y[i];
64+ }
65+ return local_ret;
66+ },
67+ [](T a,T b){
68+ return a+b;
69+ }
70+ );
3771 ret = std::sqrt (ret);
3872 TOCK (sqrtdot);
3973 return ret;
@@ -42,48 +76,135 @@ T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
4276template <class T >
4377T minvalue (std::vector<T> const &x) {
4478 TICK (minvalue);
45- T ret = x[0 ];
46- for (size_t i = 1 ; i < x.size (); i++) {
47- if (x[i] < ret)
48- ret = x[i];
49- }
79+ // T ret = x[0];
80+ // for (size_t i = 1; i < x.size(); i++) {
81+ // if (x[i] < ret)
82+ // ret = x[i];
83+ // }
84+ T ret=tbb::parallel_reduce (
85+ tbb::blocked_range<size_t >(0 ,x.size ()),x[0 ],
86+ [&](tbb::blocked_range<size_t > r,T local_ret){
87+ for (size_t i=r.begin ();i<r.end ();i++){
88+ if (x[i]<local_ret)
89+ local_ret=x[i];
90+ }
91+ return local_ret;
92+ },
93+ [](T a,T b){
94+ return a<b?a:b;
95+ }
96+ );
5097 TOCK (minvalue);
5198 return ret;
5299}
53100
54101template <class T >
55- std::vector<T > magicfilter (std::vector<T> const &x, std::vector<T> const &y) {
102+ std::vector<pod<T> > magicfilter (std::vector<T> const &x, std::vector<T> const &y) {
56103 TICK (magicfilter);
57- std::vector<T> res;
58- for (size_t i = 0 ; i < std::min (x.size (), y.size ()); i++) {
59- if (x[i] > y[i]) {
60- res.push_back (x[i]);
61- } else if (y[i] > x[i] && y[i] > 0 .5f ) {
62- res.push_back (y[i]);
63- res.push_back (x[i] * y[i]);
104+ // std::vector<T> res;
105+ // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
106+ // if (x[i] > y[i]) {
107+ // res.push_back(x[i]);
108+ // } else if (y[i] > x[i] && y[i] > 0.5f) {
109+ // res.push_back(y[i]);
110+ // res.push_back(x[i] * y[i]);
111+ // }
112+ // }
113+ std::vector<pod<T>> res (2 *std::min (x.size (), y.size ()));
114+ std::atomic<size_t > index{0 };
115+ tbb::parallel_for (
116+ tbb::blocked_range<size_t >(0 ,std::min (x.size (), y.size ())),
117+ [&](tbb::blocked_range<size_t > r){
118+ std::vector<pod<T>> local_res;
119+ // local_res.reserve(y.size()/NUM_THREADS);
120+ for (size_t i=r.begin ();i<r.end ();i++){
121+ if (x[i] > y[i]) {
122+ local_res.push_back (x[i]);
123+ } else if (y[i] > x[i] && y[i] > 0 .5f ) {
124+ local_res.push_back (y[i]);
125+ local_res.push_back (x[i] * y[i]);
126+ }
127+ }
128+ int beg=index.fetch_add (local_res.size ());
129+ std::memcpy (&res[beg],&local_res[0 ],local_res.size ());
64130 }
65- }
131+ );
132+ res.resize (index);
66133 TOCK (magicfilter);
67134 return res;
68135}
69136
70137template <class T >
71138T scanner (std::vector<T> &x) {
72139 TICK (scanner);
73- T ret = 0 ;
74- for (size_t i = 0 ; i < x.size (); i++) {
75- ret += x[i];
76- x[i] = ret;
140+ // T ret = 0;
141+ // for (size_t i = 0; i < x.size(); i++) {
142+ // ret += x[i];
143+ // x[i] = ret;
144+ // }
145+
146+ // 实测下面的手动划分task方式比auto_partiioner的parallel_scan更快
147+ // float ret = tbb::parallel_scan(tbb::blocked_range<size_t>(0, x.size()), (float)0,
148+ // [&] (tbb::blocked_range<size_t> r, float local_res, auto is_final) {
149+ // for (size_t i = r.begin(); i < r.end(); i++) {
150+ // local_res += x[i];
151+ // if (is_final) {
152+ // x[i] = local_res;
153+ // }
154+ // }
155+ // return local_res;
156+ // }, [] (float x, float y) {
157+ // return x + y;
158+ // });
159+
160+ // 手动划分任务区间
161+ tbb::task_group tg;
162+ std::vector<T> local_res (NUM_THREADS);
163+ for (size_t k=0 ;k<NUM_THREADS;k++){
164+ size_t beg=k*(x.size ()+NUM_THREADS-1 )/NUM_THREADS;// 应该向上取整
165+ size_t end=std::min ((k+1 )*(x.size ()+NUM_THREADS-1 )/NUM_THREADS,x.size ());
166+ tg.run (
167+ [&,k,beg,end](){
168+ T tmp=0 .f ;
169+ for (size_t i=beg;i<end;i++){
170+ tmp+=x[i];
171+ x[i]=tmp;
172+ }
173+ local_res[k]=tmp;
174+ }
175+ );
176+ }
177+ tg.wait ();
178+ T pre_sum=0 .f ;
179+ for (size_t k=0 ;k<NUM_THREADS;k++){
180+ pre_sum+=local_res[k];
181+ local_res[k]=pre_sum;
77182 }
183+ for (size_t k=1 ;k<NUM_THREADS;k++){
184+ size_t beg=k*(x.size ()+NUM_THREADS-1 )/NUM_THREADS;
185+ size_t end=std::min ((k+1 )*(x.size ()+NUM_THREADS-1 )/NUM_THREADS,x.size ());
186+ tg.run (
187+ [&,k,beg,end](){
188+ for (size_t i=beg;i<end;i++){
189+ x[i]+=local_res[k-1 ];
190+ }
191+ }
192+ );
193+ }
194+ tg.wait ();
195+
196+
78197 TOCK (scanner);
79- return ret;
198+ return local_res[NUM_THREADS-1 ];
199+ // return ret;
80200}
81201
82202int main () {
83203 size_t n = 1 <<26 ;
84204 std::vector<float > x (n);
85205 std::vector<float > y (n);
86206
207+ tbb::task_scheduler_init init (NUM_THREADS);
87208 fill (x, [&] (size_t i) { return std::sin (i); });
88209 fill (y, [&] (size_t i) { return std::cos (i); });
89210
0 commit comments