Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 27 additions & 19 deletions src/fatt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cstdlib>
#include <algorithm>
#include <numeric>
#include <regex>
#include <getopt.h>
#include <unistd.h>
#include <sys/types.h>
Expand Down Expand Up @@ -220,7 +221,7 @@ class FileLineBufferWithAutoExpansion
line_count = 0;
off_count = 0;
fileName = file_name;
return ist;
return true;
}
void close() {
ist.close();
Expand Down Expand Up @@ -1116,6 +1117,22 @@ void do_index(int argc, char** argv)
}
}

static bool check_read_conditions(long long param_start, const char* head_inf, set <string> readNamesToTake, regex re, bool flag_reverse_condition, long long number_of_sequences, long long param_end)
{
bool current_read_has_been_taken;
if(param_start == -1) {
string read_name = get_read_name_from_header(head_inf);
if(!readNamesToTake.empty()) {
current_read_has_been_taken = (readNamesToTake.count(read_name) != 0) ^ flag_reverse_condition;
} else {
current_read_has_been_taken = regex_search(head_inf, re) ^ flag_reverse_condition;
}
} else {
current_read_has_been_taken = param_start + 1 <= number_of_sequences && number_of_sequences <= param_end;
}
return current_read_has_been_taken;
}

void do_extract(int argc, char** argv)
{
bool flag_reverse_condition = false;
Expand All @@ -1132,6 +1149,7 @@ void do_extract(int argc, char** argv)
{"reverse", no_argument , 0, 'r'},
{"force", no_argument , 0, 'F'},
{"seq", required_argument, 0, 's'},
{"regex", required_argument, 0, 'x'}, //x is chosen because r and e was used for other options
{"file", required_argument, 0, 'f'},
{"stdin", no_argument, 0, 'c'},
{"unique", no_argument, 0, 'u'},
Expand All @@ -1144,6 +1162,7 @@ void do_extract(int argc, char** argv)
};

set<string> readNamesToTake;
std::regex re;
vector<string> fileInputs;

while(true) {
Expand All @@ -1163,6 +1182,9 @@ void do_extract(int argc, char** argv)
case 's':
readNamesToTake.insert(optarg);
break;
case 'x':
re = std::regex(optarg);
break;
case 'f':
fileInputs.push_back(optarg);
break;
Expand Down Expand Up @@ -1362,24 +1384,14 @@ void do_extract(int argc, char** argv)
if(f.getline()) {
number_of_sequences++;
size_t number_of_nucleotides_in_read = 0;
if(param_start == -1) {
current_read_has_been_taken = (readNamesToTake.count(get_read_name_from_header(f.b)) != 0) ^ flag_reverse_condition;
} else {
current_read_has_been_taken = param_start + 1 <= number_of_sequences && number_of_sequences <= param_end;
// NOTE: the latter condition never hold, if I properly implemented.
}
current_read_has_been_taken = check_read_conditions(param_start, f.b, readNamesToTake, re, flag_reverse_condition, number_of_sequences, param_end);
if(current_read_has_been_taken) cout << f.b << endl;
if(flag_output_unique) readNamesToTake.insert(get_read_name_from_header(f.b));
if(!f.looksLikeFASTQHeader()) {
while(f.getline()) {
if(f.looksLikeFASTAHeader()) {
number_of_sequences++;
if(param_start == -1) {
current_read_has_been_taken = (readNamesToTake.count(get_read_name_from_header(f.b)) != 0) ^ flag_reverse_condition;
} else {
current_read_has_been_taken = param_start + 1 <= number_of_sequences && number_of_sequences <= param_end;
// NOTE: the latter condition never hold, if I properly implemented.
}
current_read_has_been_taken = check_read_conditions(param_start, f.b, readNamesToTake, re, flag_reverse_condition, number_of_sequences, param_end);
if(current_read_has_been_taken) cout << f.b << endl;
if(flag_output_unique) readNamesToTake.insert(get_read_name_from_header(f.b));
number_of_nucleotides_in_read = 0;
Expand All @@ -1404,12 +1416,7 @@ void do_extract(int argc, char** argv)
if(!f.getline()) break;
f.registerHeaderLine();
++number_of_sequences;
if(param_start == -1) {
current_read_has_been_taken = (readNamesToTake.count(get_read_name_from_header(f.b)) != 0) ^ flag_reverse_condition;
} else {
current_read_has_been_taken = param_start + 1 <= number_of_sequences && number_of_sequences <= param_end;
// NOTE: the latter condition never hold, if I properly implemented.
}
current_read_has_been_taken = check_read_conditions(param_start, f.b, readNamesToTake, re, flag_reverse_condition, number_of_sequences, param_end);
if(param_end < number_of_sequences) // NOTE: param_end is 0-origin, number_of_sequences is 1-origin.
break;
if(current_read_has_been_taken) cout << f.b << endl;
Expand Down Expand Up @@ -3029,6 +3036,7 @@ void show_help(const char* subcommand)
cerr << "Usage: fatt extract [options...] <FAST(A|Q) files>\n\n";
cerr << "--unique\tOutput only unique reads. Reads with the same read name are removed.\n";
cerr << "--seq\tSpecify the name of the read to be retrieved. You can specify this option as many times as you wish.\n";
cerr << "--regex\tSpecify a regular expression. The deflines (i.e. read name AND description) are searched for a match. \n";
cerr << "--file\tSpecify a file in which you listed the read names. One line, one read.\n";
cerr << "--stdin\tRead the list of read names from stdin. It may be useful when you combine with *NIX pipe.\n";
cerr << "--reverse\tReverse the extracting condition. It is like -v option of grep.\n";
Expand Down
2 changes: 1 addition & 1 deletion wscript
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def configure(conf):
conf.check_perl_version((5,6,0))
conf.check_perl_ext_devel()
conf.check_python_version((2,4,2))
conf.env.append_unique('CXXFLAGS', ['-O2', '-DVERSION_STRING=' + VERSION])
conf.env.append_unique('CXXFLAGS', ['-std=c++11', '-O2', '-DVERSION_STRING=' + VERSION])
conf.env.INCLUDES += '.'
conf.env.LIB += ['pthread', 'dl']

Expand Down