Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 102 additions & 37 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,99 +4,164 @@
#include <cmath>
#include <numeric>
#include <algorithm>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_scan.h>
#include <tbb/blocked_range.h>
#include "ticktock.h"

// TODO: parallelize all of these for loops
#include <atomic>
#include "pod.h"

// 1. fill - parallelized directly with tbb::parallel_for.
//
// Assigns arr[i] = func(i) for every index i in [0, arr.size()).
//
// Parameters:
//   arr  - vector overwritten in place; its size is not changed.
//   func - callable taking a size_t index. It is invoked from inside the
//          parallel_for body, so it must be safe to call concurrently.
// Returns: arr by value (the declared return type is std::vector<T>, so the
//          caller receives a copy of the filled vector).
template <class T, class Func>
std::vector<T> fill(std::vector<T> &arr, Func const &func) {
    TICK(fill);
    // Every task gets a disjoint index sub-range, so no two tasks ever write
    // the same element and no extra synchronization is needed.
    tbb::parallel_for(tbb::blocked_range<size_t>(0, arr.size()), [&](auto const &r) {
        for (size_t i = r.begin(); i < r.end(); i++) {
            arr[i] = func(i);
        }
    });
    TOCK(fill);
    return arr;
}

// 2. saxpy - parallelized directly with tbb::parallel_for.
//
// Performs x[i] = a * x[i] + y[i] in place, exactly once per element, for
// every index i in [0, x.size()).
//
// Parameters:
//   a - scalar multiplier applied to each element of x.
//   x - input/output vector, updated in place.
//   y - vector added element-wise; it is indexed up to x.size() - 1, so it
//       must hold at least x.size() elements.
template <class T>
void saxpy(T a, std::vector<T> &x, std::vector<T> const &y) {
    TICK(saxpy);
    // The update reads the old x[i], so it must run a single time per index;
    // disjoint sub-ranges keep the tasks independent of each other.
    tbb::parallel_for(tbb::blocked_range<size_t>(0, x.size()), [&](auto const &r) {
        for (size_t i = r.begin(); i < r.end(); i++) {
            x[i] = a * x[i] + y[i];
        }
    });
    TOCK(saxpy);
}

// 3. sqrtdot - parallel reduction with tbb::parallel_reduce.
//
// Returns sqrt(sum of x[i] * y[i]) over the first min(x.size(), y.size())
// indices.
//
// Parameters:
//   x, y - input vectors; only their common prefix is used.
// Returns: the square root of the dot product of that common prefix.
// Note: partial sums are combined per sub-range, so for floating-point T the
//       rounding may differ from a strictly left-to-right summation.
template <class T>
T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
    TICK(sqrtdot);
    size_t n = std::min(x.size(), y.size());
    T ret = tbb::parallel_reduce(
        tbb::blocked_range<size_t>(0, n), static_cast<T>(0),
        // Accumulate one sub-range on top of the running partial sum.
        [&](auto const &r, T local_res) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                local_res += x[i] * y[i];
            }
            return local_res;
        },
        // Join two partial sums.
        [](T a, T b) { return a + b; }
    );
    ret = std::sqrt(ret);
    TOCK(sqrtdot);
    return ret;
}

// 4. minvalue - finds the minimum with tbb::parallel_reduce.
//
// Parameters:
//   x - input vector. It must be non-empty: x[0] is read unconditionally to
//       seed the reduction.
// Returns: the smallest element of x according to operator<.
template <class T>
T minvalue(std::vector<T> const &x) {
    TICK(minvalue);
    T ret = tbb::parallel_reduce(
        // x[0] is itself a member of x, so using it as the starting value of
        // every partial result can never produce a value below the true minimum.
        tbb::blocked_range<size_t>(0, x.size()), x[0],
        [&](auto const &r, T local_min) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                if (x[i] < local_min) local_min = x[i];
            }
            return local_min;
        },
        // Join two partial minima.
        [](T a, T b) { return std::min(a, b); }
    );
    TOCK(minvalue);
    return ret;
}

// 5. magicfilter - atomic write index + pre-allocated output space.
//
// For each index i in [0, min(x.size(), y.size())):
//   - if x[i] > y[i], emits x[i];
//   - else if y[i] > x[i] and y[i] > 0.5f, emits y[i] followed by x[i] * y[i];
//   - otherwise emits nothing.
//
// Parameters:
//   x, y - input vectors; only their common prefix is examined.
// Returns: a vector holding all emitted values.
// Note: values produced by one sub-range stay contiguous and in index order,
//       but the order of the sub-ranges relative to each other is decided by
//       which task reaches the atomic counter first, so the overall element
//       order is not guaranteed to match a serial pass. The size is the same.
template <class T>
std::vector<T> magicfilter(std::vector<T> const &x, std::vector<T> const &y) {
    TICK(magicfilter);
    size_t n = std::min(x.size(), y.size());

    // 1. Pre-allocate scratch space large enough for the worst case: every
    //    index can emit at most two values, hence 2 * n.
    //    NOTE(review): pod<T> comes from pod.h and is presumably used to skip
    //    per-element initialization of the scratch buffer - confirm.
    std::vector<pod<T>> tmp(n * 2);
    std::atomic<size_t> counter{0};

    // 2. Filter in parallel. Each task collects its output locally and then
    //    reserves a contiguous slice of tmp with one atomic fetch_add.
    tbb::parallel_for(tbb::blocked_range<size_t>(0, n), [&](auto const &r) {
        // Local buffer: keeps contention on the shared counter down to one
        // atomic operation per sub-range instead of one per element.
        std::vector<T> local;
        local.reserve(r.size());

        for (size_t i = r.begin(); i < r.end(); i++) {
            if (x[i] > y[i]) {
                local.push_back(x[i]);
            } else if (y[i] > x[i] && y[i] > 0.5f) {
                local.push_back(y[i]);
                local.push_back(x[i] * y[i]);
            }
        }

        // Claim [base, base + local.size()) and copy the local results there;
        // slices claimed by different tasks never overlap.
        if (!local.empty()) {
            size_t base = counter.fetch_add(local.size());
            for (size_t j = 0; j < local.size(); j++) {
                tmp[base + j] = local[j];
            }
        }
    });

    // 3. Trim to the number of values actually written and convert back to
    //    a plain std::vector<T>.
    size_t final_size = counter.load();
    std::vector<T> res(final_size);
    tbb::parallel_for(tbb::blocked_range<size_t>(0, final_size), [&](auto const &r) {
        for (size_t i = r.begin(); i < r.end(); i++) {
            res[i] = tmp[i];
        }
    });

    TOCK(magicfilter);
    return res;
}

// 6. scanner - parallel inclusive prefix sum with tbb::parallel_scan.
//
// Replaces each x[i] with x[0] + x[1] + ... + x[i], exactly once.
//
// Parameters:
//   x - vector scanned in place.
// Returns: the sum of all original elements (T(0) for an empty vector).
// Note: partial sums are combined per sub-range, so for floating-point T the
//       rounding may differ from a strictly left-to-right scan.
template <class T>
T scanner(std::vector<T> &x) {
    TICK(scanner);
    T total_sum = tbb::parallel_scan(
        tbb::blocked_range<size_t>(0, x.size()), static_cast<T>(0),
        // parallel_scan may visit a sub-range twice: once as a pre-scan that
        // only accumulates, and once as the final scan. x must only be
        // overwritten in the final pass, otherwise a later pass would read
        // already-scanned values.
        [&](auto const &r, T sum, bool is_final) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                sum += x[i];
                if (is_final) x[i] = sum;
            }
            return sum;
        },
        // Combine the totals of two adjacent sub-ranges.
        [](T a, T b) { return a + b; }
    );
    TOCK(scanner);
    return total_sum;
}

// Test driver below - it must be kept!
//
// Builds two vectors of 2^26 floats, runs each routine once in sequence and
// prints one result line per step: sqrtdot, minvalue, the size of the
// magicfilter output, and the last element of x after the in-place scan.
int main() {
    size_t n = 1 << 26;
    std::vector<float> x(n);
    std::vector<float> y(n);

    fill(x, [](size_t i) { return std::sin(i); });
    fill(y, [](size_t i) { return std::cos(i); });

    saxpy(0.5f, x, y);

    std::cout << sqrtdot(x, y) << std::endl;
    std::cout << minvalue(x) << std::endl;

    auto res = magicfilter(x, y);
    std::cout << res.size() << std::endl;

    // After the inclusive scan the last element holds the total of the
    // pre-scan values of x.
    scanner(x);
    std::cout << x.back() << std::endl;

    return 0;
}