-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTestLocalGaplessAlignmentGPU.cu
More file actions
139 lines (119 loc) · 4.94 KB
/
TestLocalGaplessAlignmentGPU.cu
File metadata and controls
139 lines (119 loc) · 4.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// remember!
#include <bits/stdc++.h>
#include "Utils.h"
#include "ScoreMatrix.h"
using namespace std;
#define MAX_LEN 1024
#define debug(A) cout << #A << ": " << A << endl
// Device __constant__ mirrors of the host lookup tables; filled in main()
// via cudaMemcpyToSymbol before any kernel reads them.
// NOTE(review): sized (int)'Z' - 'A' = 25 slots, so the letter 'Z' has no
// entry — confirm the alphabet intentionally excludes 'Z' (the dump loop in
// the test kernel also stops at 'Y', so the two are at least consistent).
__constant__ int _aa2num[(int)'Z' - 'A'];
// Flattened ALPH_SIZE x ALPH_SIZE substitution score matrix, row-major.
__constant__ int _score_matrix[ALPH_SIZE * ALPH_SIZE];
// Debug kernel: one thread per string. Reconstructs string `index` from the
// flattened buffer and prints it with its length.
//   flat_str - all strings concatenated back to back (no separators).
//   ids      - num_strs + 1 offsets; string i occupies [ids[i], ids[i+1]).
//   num_strs - number of strings; threads with index >= num_strs do nothing.
// Expects a 1-D block; uses only threadIdx.x.
__global__ void test_kernel_flattened(char* flat_str, int* ids, int num_strs) {
    int index = threadIdx.x;
    if (index >= num_strs) return;  // guard: launch may have spare threads
    int cur_str_len = ids[index + 1] - ids[index];
    // Device-heap allocation (debug-only path); must be freed below or each
    // launch permanently leaks device heap space.
    char* cur_str = (char*)malloc((cur_str_len + 1) * sizeof(char));
    if (cur_str == NULL) return;  // device heap exhausted
    memcpy(cur_str, flat_str + ids[index], cur_str_len);
    cur_str[cur_str_len] = '\0';
    printf("length: %d, string: %s\n", cur_str_len, cur_str);
    free(cur_str);  // was leaked in the original
}
// Debug kernel: one thread per string. Prints the device string whose
// pointer is stored at str_ptrs[index].
//   str_ptrs    - device array of num_strings device pointers, each to a
//                 NUL-terminated string.
//   num_strings - valid entry count; extra threads do nothing.
// Uses only threadIdx.x (expects a single 1-D block).
__global__ void test_kernel_ptr2ptr(char** str_ptrs, int num_strings) {
    int index = threadIdx.x;
    if (index < num_strings) {
        // Original dereferenced str_ptrs[index] before this bounds check,
        // which is an out-of-bounds read for spare threads.
        char* cur_str = str_ptrs[index];
        printf("string: %s\n", cur_str);
    }
}
// Flattens `strings` into one contiguous char buffer plus an offset table
// and copies both to the device.
//   *d_fstr_addr - receives a device buffer of all characters back to back
//                  (no NUL terminators; lengths come from the offsets).
//   *d_fids_addr - receives a device array of num_strs + 1 offsets; string i
//                  occupies [ids[i], ids[i+1]).
// Caller releases both with free_device_flattened().
// Note: the (misspelled) function name is kept so existing callers compile.
void allcoate_device_flattened(const vector<string>& strings, char ** d_fstr_addr, int ** d_fids_addr, int num_strs) {
    string flat_temp;
    // vector instead of raw malloc: the original host flat_ids buffer was
    // never freed (host memory leak).
    vector<int> flat_ids(num_strs + 1);
    int cur_ptr = 0;
    for (int i = 0; i < num_strs; i++) {  // original mixed strings.size() and num_strs
        flat_ids[i] = cur_ptr;
        flat_temp += strings[i];
        cur_ptr += (int)strings[i].size();
    }
    flat_ids[num_strs] = cur_ptr;  // sentinel so string num_strs-1 has an end offset
    int total_chars_num = cur_ptr;
    cudaMalloc(d_fstr_addr, total_chars_num * sizeof(char));
    cudaMalloc(d_fids_addr, (num_strs + 1) * sizeof(int));
    cudaMemcpy(*d_fstr_addr, flat_temp.c_str(), total_chars_num * sizeof(char), cudaMemcpyHostToDevice);
    cudaMemcpy(*d_fids_addr, flat_ids.data(), (num_strs + 1) * sizeof(int), cudaMemcpyHostToDevice);
}
// Releases the device-side flattened string buffer and its offset table
// (the pair produced by allcoate_device_flattened).
void free_device_flattened(char* d_fstr, int* d_fids) {
    cudaFree(d_fids);
    cudaFree(d_fstr);
}
// Allocates one device buffer per string plus a device array holding the
// per-string device pointers, and copies the string contents over.
// Returns the device pointer array.
// As the name says, the per-string device buffers are NOT tracked and so
// can never be freed — prefer allocate_device_ptr2ptr/free_device_ptr2ptr
// when cleanup matters.
char ** allocate_device_ptr2ptr_without_free(const vector<string>& strings, int num_strs) {
    char ** d_str_ptrs;
    cudaMalloc(&d_str_ptrs, num_strs * sizeof(char *));
    // vector instead of the original `char * d_temp_strs[num_strs]`:
    // runtime-length arrays are a non-standard extension in C++.
    vector<char *> d_temp_strs(num_strs);
    for (int i = 0; i < num_strs; i++) {
        int q_i_len = (int)strings[i].size();
        // +1 and copy from c_str() so the terminating '\0' lands on the
        // device: the consuming kernel prints with "%s", which read past the
        // buffer end in the original (no terminator was copied).
        cudaMalloc(&d_temp_strs[i], (q_i_len + 1) * sizeof(char));
        cudaMemcpy(d_temp_strs[i], strings[i].c_str(), (q_i_len + 1) * sizeof(char), cudaMemcpyHostToDevice);
        cudaMemcpy(d_str_ptrs + i, &d_temp_strs[i], sizeof(char *), cudaMemcpyHostToDevice);
    }
    return d_str_ptrs;
}
// Allocates one device buffer per string plus a device array of those
// pointers, and copies the string contents over.
//   *d_str_ptrs_addr  - receives the device array of device pointers.
//   *d_temp_strs_addr - receives a HOST-side malloc'd copy of the same
//                       pointers, kept so free_device_ptr2ptr can release
//                       each device buffer later.
// Pair every call with free_device_ptr2ptr().
void allocate_device_ptr2ptr(const vector<string>& strings, char *** d_str_ptrs_addr, char *** d_temp_strs_addr, int num_strs) {
    cudaMalloc(d_str_ptrs_addr, num_strs * sizeof(char *));
    *d_temp_strs_addr = (char **) malloc(num_strs * sizeof(char *));
    char** d_str_ptrs = *d_str_ptrs_addr;
    char** d_temp_strs = *d_temp_strs_addr;
    for (int i = 0; i < num_strs; i++) {
        int q_i_len = (int)strings[i].size();
        // +1 and copy from c_str() so the terminating '\0' reaches the
        // device: the consuming kernel prints with "%s", which read past the
        // allocation in the original (no terminator was copied).
        cudaMalloc(&d_temp_strs[i], (q_i_len + 1) * sizeof(char));
        cudaMemcpy(d_temp_strs[i], strings[i].c_str(), (q_i_len + 1) * sizeof(char), cudaMemcpyHostToDevice);
        cudaMemcpy(d_str_ptrs + i, &d_temp_strs[i], sizeof(char *), cudaMemcpyHostToDevice);
    }
}
// Releases everything produced by allocate_device_ptr2ptr: each per-string
// device buffer, the device pointer array, and the host bookkeeping array.
void free_device_ptr2ptr(char** d_str_ptrs, char** d_temp_strs, int num_strs) {
    for (int i = 0; i < num_strs; i++) {
        cudaFree(d_temp_strs[i]);
    }
    cudaFree(d_str_ptrs);
    // The host pointer array was malloc'd in allocate_device_ptr2ptr and
    // leaked in the original.
    free(d_temp_strs);
}
// Debug kernel: dumps the __constant__ score matrix and amino-acid lookup
// table from device code so they can be eyeballed against the host copies
// printed in main(). Launch with <<<1, 1>>> — no per-thread indexing.
__global__ void test() {
    for (int i = 0; i < ALPH_SIZE; i++) {
        for (int j = 0; j < ALPH_SIZE; j++)
            printf("%3d ", _score_matrix[i*ALPH_SIZE+j]);
        printf("\n");
    }
    // NOTE(review): `c < 'Z'` stops at 'Y'; _aa2num is declared with
    // 'Z'-'A' = 25 slots, so 'Z' indeed has no entry — confirm the alphabet
    // intentionally excludes 'Z' rather than the size being off by one.
    for (char c = 'A'; c < 'Z'; c++)
        printf("%c:%d, ", c, _aa2num[(int) c - 'A']);
    printf("\n");
}
// Entry point: initializes the scoring tables, mirrors them into device
// __constant__ memory, then exercises two device string-transfer layouts
// (flattened buffer + offsets, and array-of-device-pointers) via debug
// kernels that print what the device sees.
int main() {
    init_score_matrix();
    // Host-side reference dump of the score matrix, for visual comparison
    // with the device-side dump produced by the test kernel below.
    for (int i = 0; i < ALPH_SIZE; i++) {
        for (int j = 0; j < ALPH_SIZE; j++)
            printf("%3d ", score_matrix_flattened[i*ALPH_SIZE+j]);
        printf("\n");
    }
    printf("\n");
    cudaMemcpyToSymbol(_score_matrix, score_matrix_flattened, SCORE_MATRIX_SIZE * sizeof(int));
    cudaMemcpyToSymbol(_aa2num, aa2num, int('Z' - 'A') * sizeof(int));
    test<<<1, 1>>>();
    // Kernel launches return no status directly; launch/execution failures
    // only surface through the error API at the next synchronizing call.
    // The original checked nothing, so bad launches failed silently.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "test kernel failed: %s\n", cudaGetErrorString(err));
    vector<string> queries, targets;
    init_input_from_file("TestSamples/queries.txt", queries, false);
    init_input_from_file("TestSamples/targets.txt", targets, true);
    for (auto &q: queries)
        debug(q);
    int num_strs = queries.size();
    // Method 1: all strings flattened into one buffer plus an offset table.
    int * d_fids; char * d_fstr;
    allcoate_device_flattened(queries, &d_fstr, &d_fids, num_strs);
    test_kernel_flattened<<<1, num_strs>>> (d_fstr, d_fids, num_strs);
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "flattened kernel failed: %s\n", cudaGetErrorString(err));
    free_device_flattened(d_fstr, d_fids);
    // Method 2: device array of per-string device pointers.
    char ** d_str_ptrs;
    char ** d_temp_strs;
    allocate_device_ptr2ptr(queries, &d_str_ptrs, &d_temp_strs, num_strs);
    test_kernel_ptr2ptr<<<1, num_strs>>> (d_str_ptrs, num_strs);
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "ptr2ptr kernel failed: %s\n", cudaGetErrorString(err));
    free_device_ptr2ptr(d_str_ptrs, d_temp_strs, num_strs);
    return 0;
}