-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpartition.cpp
More file actions
263 lines (222 loc) · 7.15 KB
/
partition.cpp
File metadata and controls
263 lines (222 loc) · 7.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#include <map>
// #include <hash_map>
// #include <unordered_map>
#include <vector>
#include <algorithm>
#include <set>
#include <math.h>
#include "ms_dataset.h"
#include "parallel_struct.h"
using namespace std;
/* Global lookup tables. Neither is referenced anywhere in this file --
 * NOTE(review): presumably used by other translation units; confirm before removing. */
map<int, int> single_idx;
map<vector<int>, int> group_idx;
// hash_map<int, int> ms_map;
/*
struct str_hash{
size_t operator()(const string& str) const
{
return size_t(_h);
}
}
*/
/*
struct compare_str{
bool operator()(const char* p1, const char* p2) const{
return strcmp(p1, p2) == 0;
}
}
*/
/*
* get the hash value for single data
* & get the hash value for local ms_dataset
* template should be used for the type of data point is different
* at least spectra type
int ms_hash(vector<int> point)
{
/* get hash value for every preprocessed data point
hash_map<int, set<int> > hashed_ms;
// hashed value & corresponded idxes
}
hash_map<int, set<int> > multims_hash(ms_dataset local_ms, int aaa, int bbb)
{
set<int> ms_hash_set; a
for (int i = 0; i < count; ++i)
{
int hash_value = ms_hash(local_ms[i])
}
return local_hm;
}
*/
// the merged
/*
hash_map<vector<int>, set<int> > merge_group_ht(vector<hash_map<int, set<int> > > group_hm_tmp)
{
/* merge results from different hash functions in one group
hash_map<vector<int>, set<int> > merged_hm; // multi-hashed_values and data_idx
return merged_hm;
}
*/
/* marge
typedef vector<hash_map<vector<int>, set<int> > > global_ms_ht;
global_ms_ht merge_multig_ht(global_ms_ht ht_ori, global_ms_ht ht_new)
{
int m_groups = ht_ori.size();
global_ms_ht ht_merged;
for (int i = 0; i < m_groups; ++i)
{
}
return ht_merged;
}
*/
/* algorithms above is LSH related
* algorithms below is partiton based on heuristic method
*/
/* method description:
 * partition the data inside ms_dataset into multiple chunks by some method
 * the result is given in the form of a multimap
 * the same approach is equally applicable to LSH
 */
/* The hash function for the heuristic partition.
 * Builds a two-part key for spectrum `idx`:
 *   key[0] = precursor m/z quantized into bins of width tolerance_pre
 *   key[1] = mass of the num_group-th peak quantized into bins of width tolerance_peak
 * `num_group` is meant in a broad sense; its current concrete meaning is
 * "select the num_group-th highest peak" (translated from the original comment).
 * Precondition: num_group < dataset[idx].ions.size() -- callers must guard
 * (partition_heuristic clamps its loop bound accordingly).
 * Generalized: the bin widths are now parameters whose defaults keep the
 * historical behavior (3 for the precursor, 0.5 for the peak). */
vector<int> hash_func(int idx, int num_group, vector<spectra> &dataset,
                      double tolerance_pre = 3, double tolerance_peak = 0.5)
{
    vector<int> key(2, -1);
    // round() returns double; cast explicitly instead of relying on implicit narrowing
    key[0] = static_cast<int>(round(dataset[idx].precursor_mz / tolerance_pre));
    key[1] = static_cast<int>(round(dataset[idx].ions[num_group].mass_value / tolerance_peak));
    return key;
}
// Partition table: hash key (produced by hash_func) -> spectrum index.
// A multimap, because many spectra can share the same key.
typedef multimap<vector<int>, int> heu_hashmap;
/* Build the heuristic partition table: every spectrum is inserted once per
 * selected peak (up to group_num_heu peaks), keyed by hash_func, so that
 * spectra sharing a key end up in the same candidate group. */
heu_hashmap partition_heuristic(vector<spectra> dataset)
{
    const int group_num_heu = 4; // top k peaks to be selected
    heu_hashmap par_heu;
    int count = dataset.size();
    for (int i = 0; i < count; ++i)
    {
        // a spectrum with fewer ions than group_num_heu uses all of its ions
        int cur_group = group_num_heu;
        if (static_cast<int>(dataset[i].ions.size()) < cur_group)
        {
            cur_group = dataset[i].ions.size();
        }
        for (int j = 0; j < cur_group; ++j)
        {
            // BUGFIX: the original computed the hash, discarded it, and called
            // hash_func a second time inside insert(); hash once and reuse.
            vector<int> hash_value = hash_func(i, j, dataset);
            par_heu.insert(make_pair(hash_value, i));
        }
    }
    return par_heu;
}
/* Merge hashmaps collected from different nodes into one global table. */
/* Identical redeclaration of the file-level typedef so this unit also reads
 * (and compiles) stand-alone; redeclaring a typedef to the same type in the
 * same scope is legal C++. */
typedef std::multimap<std::vector<int>, int> heu_hashmap;

heu_hashmap merge_par_heu(std::vector<heu_hashmap> patch_hashmap)
{
    heu_hashmap merged_map;
    int count = patch_hashmap.size();
    for (int i = 0; i < count; ++i)
    {
        // multimap::insert(first, last) keeps duplicate keys, so this is
        // equivalent to the original element-by-element loop without
        // constructing a temporary pair per element.
        merged_map.insert(patch_hashmap[i].begin(), patch_hashmap[i].end());
    }
    return merged_map;
}
/* Convert a heu_hashmap into the dict_idx table format used for task
 * allocation (translated from the original Chinese comment).
 * One dict_idx entry is produced per distinct key; num_proc is initialized
 * to -1 and is filled in later by the allocation step. */
vector<dict_idx> trans_table_dict(heu_hashmap merged_map)
{
    vector<dict_idx> idx_table;
    heu_hashmap::iterator it = merged_map.begin();
    while(it != merged_map.end())
    {
        vector<int> key_curr = (*it).first;
        // equal_range yields [first, last) of all entries sharing key_curr in
        // one call; this replaces the original lower_bound/upper_bound pair and
        // the fragile second iterator that advanced in lock-step with `it`.
        pair<heu_hashmap::iterator, heu_hashmap::iterator> range = merged_map.equal_range(key_curr);
        vector<int> idx_tmp;
        for (heu_hashmap::iterator pos = range.first; pos != range.second; ++pos)
        {
            idx_tmp.push_back((*pos).second);
        }
        // upper_bound of one key equals lower_bound of the next, so jumping to
        // range.second lands on the first entry of the next distinct key.
        it = range.second;
        dict_idx dict_tmp(-1, key_curr, idx_tmp);
        idx_table.push_back(dict_tmp);
    }
    return idx_table;
}
/* Assign tasks by writing the num_proc field into the original structure
 * (in place), keeping the load balanced across processes. The current method
 * is a simple round-robin assignment, which balances the load to some extent.
 * (Translated from the original Chinese comments.) */
// NOTE: this function is currently unused (alloc_assign_full is used instead).
void alloc_assign(vector<dict_idx> &idx_table, int num_proc_comp)
{
    /* Fill in the dict_idx entries, trying to balance the load of every proc.
     * Note that the group_num is not important anymore. */
    int count = idx_table.size();
    /* the simplest method: round-robin over compute ranks 1..num_proc_comp
     * (rank 0 is reserved as the empty/master slot, see alloc_assign_full) */
    for (int i = 0; i < count; ++i)
    {
        idx_table[i].num_proc = i % num_proc_comp + 1;
    }
    /* TODO: a more effective method by optimization and planning.
     * The per-node size bookkeeping (global_datasize) was removed: it was
     * allocated on every call but never read. */
}
/* Allocate assignments across compute processes; result as dict_full.
 * Returns a vector of num_proc_comp + 1 dict_full entries indexed by process
 * rank: index 0 is a deliberately empty placeholder so that vector index ==
 * rank, which simplifies data sending (translated from the original comment).
 * Groups from idx_table are distributed round-robin over ranks
 * 1..num_proc_comp, which gives approximate load balance when the groups are
 * of similar size. */
vector<dict_full> alloc_assign_full(vector<dict_idx> idx_table, int num_proc_comp)
{
    vector<dict_full> dict_full_vec;
    for (int i = 0; i < num_proc_comp + 1; i++)
    {
        dict_full dict_tmp(i);
        dict_full_vec.push_back(dict_tmp);
    }
    int count = idx_table.size();
    /* the simplest method: balance load by round-robin distribution */
    for (int i = 0; i < count; ++i)
    {
        // declared at first use (was hoisted outside the loop); ranks are
        // 1-based so slot 0 stays empty
        int proc_assign = i % num_proc_comp + 1;
        dict_full_vec[proc_assign].multi_datasets.push_back(idx_table[i].idx);
    }
    return dict_full_vec;
}