diff --git a/src/Common.hpp b/src/Common.hpp index f81d446..7b68459 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -53,6 +53,8 @@ struct Bustools_opt { std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this double predict_t = 0.0; //this is how far to predict, t=10 means that we will predict the change in expression at 10 times the number of reads + /* clusterhist */ + std::string cluster_input_file; /* project */ std::string map; diff --git a/src/bustools_clusterhist.cpp b/src/bustools_clusterhist.cpp new file mode 100644 index 0000000..2bbb667 --- /dev/null +++ b/src/bustools_clusterhist.cpp @@ -0,0 +1,227 @@ +#include +#include +#include + +#include "Common.hpp" +#include "BUSData.h" + +#include "bustools_clusterhist.h" +#include +#include + +void bustools_clusterhist(Bustools_opt& opt) { + BUSHeader h; + size_t nr = 0; + size_t N = 100000; + uint32_t bclen = 0; + BUSData* p = new BUSData[N]; + + // read and parse the equivelence class files + + std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + std::vector> ecmap; + + std::unordered_map txnames; + parseTranscripts(opt.count_txp, txnames); + std::vector genemap(txnames.size(), -1); + std::unordered_map genenames; + parseGenes(opt.count_genes, txnames, genemap, genenames); + parseECs(opt.count_ecs, h); + ecmap = std::move(h.ecs); + ecmapinv.reserve(ecmap.size()); + for (int32_t ec = 0; ec < ecmap.size(); ec++) { + ecmapinv.insert({ ecmap[ec], ec }); + } + std::vector> ec2genes; + create_ec2genes(ecmap, genemap, ec2genes); + + + std::string gene_ofn = opt.output + "genes.txt"; + std::string histDir = opt.output + "cluster_hists/"; + + + std::vector v; + v.reserve(N); + uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; + //temporary data + std::vector ecs; + std::vector glist; + ecs.reserve(100); + std::vector u; + u.reserve(100); + + //Read the cluster file + std::vector clusterNames; + std::unordered_map bcClusters; + { + std::ifstream ifs(opt.cluster_input_file); + uint32_t flag = 0; + while (!ifs.eof()) { + std::string bc, cluster; + ifs >> bc >> cluster; + if (!cluster.empty()) { + uint64_t bcBin = stringToBinary(bc, flag); + std::size_t clust = -1; + auto it = std::find(clusterNames.begin(), clusterNames.end(), cluster); + if (it != clusterNames.end()) { + clust = (it - clusterNames.begin()); + } + else { + clust = clusterNames.size(); + clusterNames.push_back(cluster); + } + bcClusters[bcBin] = clust; + } + } + } +// for (size_t i = 0; i < 5; ++i) { +// std::cout << "rc: " << rc << "\n"; +// } + + //Allocate histograms + + //Indexed as gene*histmax + histIndex + size_t n_genes = genenames.size(); + const uint32_t histmax = 100;//set molecules with more than histmax copies to histmax + typedef std::vector Histograms; + std::vector clusterHistograms; + for (std::size_t i = 0; i < clusterNames.size(); ++i) { + clusterHistograms.push_back(std::vector(n_genes * histmax, 0)); + } + + + //barcodes + auto write_barcode = [&](const std::vector& v) { + if (v.empty()) { + return; + } + auto it = bcClusters.find(v[0].barcode); + if (it == bcClusters.end()) { + return; //This is ok, could be a cell that has been filtered in quality check + } + + auto& hist = clusterHistograms[it->second]; + + size_t n = v.size(); + + for (size_t i = 0; i < n; ) { + size_t j = i + 1; + for (; j < n; j++) { + if (v[i].UMI != v[j].UMI) { + break; + } + } + + // v[i..j-1] share the same UMI + ecs.resize(0); + uint32_t counts = 0; + for (size_t k = i; k < j; k++) { + ecs.push_back(v[k].ec); + counts += v[k].count; + } + + intersect_genes_of_ecs(ecs, ec2genes, glist); + if (glist.size() == 1) { + //Fill in histograms for prediction. + if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise + hist[glist[0] * histmax + std::min(counts - 1, histmax - 1)] += 1.0; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth + } + else { + std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; + } + } + i = j; // increment + } + }; + + for (const auto& infn : opt.files) { + std::streambuf* inbuf; + std::ifstream inf; + if (!opt.stream_in) { + inf.open(infn.c_str(), std::ios::binary); + inbuf = inf.rdbuf(); + } + else { + inbuf = std::cin.rdbuf(); + } + std::istream in(inbuf); + + parseHeader(in, h); + bclen = h.bclen; + + int rc = 0; + while (true) { + in.read((char*)p, N * sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + //std::cout << "rc: " << rc << "\n"; + nr += rc; + if (rc == 0) { + break; + } + + + for (size_t i = 0; i < rc; i++) { + if (p[i].barcode != current_bc) { + // output whatever is in v + if (!v.empty()) { + write_barcode(v); + } + v.clear(); + current_bc = p[i].barcode; + } + v.push_back(p[i]); + + } + } + if (!v.empty()) { + write_barcode(v); + } + + if (!opt.stream_in) { + inf.close(); + } + } + delete[] p; p = nullptr; + + + + // write genes file + writeGenes(gene_ofn, genenames); + + //write histogram files + for (std::size_t f = 0; f < clusterNames.size(); ++f) { + + auto& histograms = clusterHistograms[f]; + std::string hist_ofn = histDir + clusterNames[f] + ".txt"; + + std::ofstream histof; + histof.open(hist_ofn); + + for (std::size_t g = 0; g < genenames.size(); ++g) { + //Indexed as gene*histmax + histIndex + std::size_t offs = g * histmax; + + //first figure out the length of the histogram, don't write that to make the file smaller + std::size_t histEnd = histmax - 1; + for (; histEnd != 0; --histEnd) { + if (histograms[offs + histEnd] != 0) { + break; + } + } + for (size_t c = 0; c <= histEnd; ++c) { + if (c != 0) { + histof << '\t'; + } + histof << histograms[offs + c]; + } + + histof << "\n"; + } + histof.close(); + } + + + //std::cerr << "bad counts = " << bad_count <<", rescued =" << rescued << ", compacted = " << compacted << std::endl; + + //std::cerr << "Read in " << nr << " BUS records" << std::endl; +} diff --git a/src/bustools_clusterhist.h b/src/bustools_clusterhist.h new file mode 100644 index 0000000..5a1a255 --- /dev/null +++ b/src/bustools_clusterhist.h @@ -0,0 +1,3 @@ +#include "Common.hpp" + +void bustools_clusterhist(Bustools_opt &opt); diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp new file mode 100644 index 0000000..2111c9b --- /dev/null +++ b/src/bustools_collapse.cpp @@ -0,0 +1,163 @@ +#include +#include +#include + +#include "Common.hpp" +#include "BUSData.h" + +#include "bustools_collapse.h" + + +void bustools_collapse(Bustools_opt &opt) { + BUSHeader h; + size_t nr = 0; + size_t N = 100000; + uint32_t bclen = 0; + BUSData* p = new BUSData[N]; + + // read and parse the equivelence class files + + std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + std::vector> ecmap; + + std::unordered_map txnames; + parseTranscripts(opt.count_txp, txnames); + std::vector genemap(txnames.size(), -1); + std::unordered_map genenames; + parseGenes(opt.count_genes, txnames, genemap, genenames); + parseECs(opt.count_ecs, h); + ecmap = std::move(h.ecs); + ecmapinv.reserve(ecmap.size()); + for (int32_t ec = 0; ec < ecmap.size(); ec++) { + ecmapinv.insert({ecmap[ec], ec}); + } + std::vector> ec2genes; + create_ec2genes(ecmap, genemap, ec2genes); + + + std::string bug_ofn = opt.output + ".bus"; + std::string gene_ofn = opt.output + ".genes.txt"; + + std::streambuf* buf = nullptr; + std::ofstream of; + + if (!opt.stream_out) { + of.open(bug_ofn, std::ios::binary); + if (of.fail()) { + std::cerr << "Failed to open file for writing: " << bug_ofn << std::endl; + } + buf = of.rdbuf(); + } + else { + buf = std::cout.rdbuf(); + } + std::ostream bus_out(buf); + + + std::vector v; + v.reserve(N); + uint64_t current_umi = 0xFFFFFFFFFFFFFFFFULL; + uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; + //temporary data + std::vector ecs; + std::vector glist; + ecs.reserve(100); + std::vector> column_vp; + column_vp.reserve(N); + glist.reserve(100); + + int discarded = 0; + + auto write_umi_bug = [&](const std::vector &v) { + if(v.empty()) { + return; + } + column_vp.resize(0); + + double val = 0.0; + size_t n = v.size(); + + // v[i..j-1] share the same UMI + ecs.resize(0); + int32_t copies = 0; + for (size_t i = 0; i < n; ++i) { + ecs.push_back(v[i].ec); + copies += v[i].count; + } + + //There's room for something smarter here to avoid discarding UMIs with one + //read pointing to a non-fitting gene... I guess read errors could lead to this? + + intersect_genes_of_ecs(ecs,ec2genes, glist); + if (glist.size() == 1) { + auto entry = v[0]; + entry.count = copies; + entry.ec = glist[0]; + bus_out.write((char*)&entry, sizeof(entry)); + } else { + discarded++; + } + }; + + bool bHeaderWritten = false; + + for (const auto& infn : opt.files) { + std::streambuf *inbuf; + std::ifstream inf; + if (!opt.stream_in) { + inf.open(infn.c_str(), std::ios::binary); + inbuf = inf.rdbuf(); + } else { + inbuf = std::cin.rdbuf(); + } + std::istream in(inbuf); + + parseHeader(in, h); + bclen = h.bclen; + + if (!bHeaderWritten) { + bHeaderWritten = true; + // write out the initial header + writeHeader(bus_out, h);// TODO: Think through what to write here, i.e. if the header should be a "bug" header of some kind + } + + + int rc = 0; + while (true) { + in.read((char*)p, N*sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + nr += rc; + if (rc == 0) { + break; + } + + for (size_t i = 0; i < rc; i++) { + if (p[i].UMI != current_umi || p[i].barcode != current_bc) { + // output whatever is in v + if (!v.empty()) { + write_umi_bug(v); + } + v.clear(); + current_umi = p[i].UMI; + current_bc = p[i].barcode; + } + v.push_back(p[i]); + + } + } + if (!v.empty()) { + write_umi_bug(v); + } + + if (!opt.stream_in) { + inf.close(); + } + } + delete[] p; p = nullptr; + + //write the genes file: + writeGenes(gene_ofn, genenames); + + + std::cerr << "Read in " << nr << " BUS records, discarded " << discarded << " UMIs" << std::endl; +} diff --git a/src/bustools_collapse.h b/src/bustools_collapse.h new file mode 100644 index 0000000..b61274c --- /dev/null +++ b/src/bustools_collapse.h @@ -0,0 +1,3 @@ +#include "Common.hpp" + +void bustools_collapse(Bustools_opt &opt); diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 3e9a319..11a7dee 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -42,6 +42,7 @@ void bustools_count(Bustools_opt &opt) { std::string ec_ofn = opt.output + ".ec.txt"; std::string gene_ofn = opt.output + ".genes.txt"; std::string hist_ofn = opt.output + ".hist.txt"; + std::string cu_per_cell_ofn = opt.output + ".CUPerCell.txt"; std::string cu_ofn = opt.output + ".cu.txt"; of.open(mtx_ofn); @@ -82,8 +83,13 @@ void bustools_count(Bustools_opt &opt) { size_t n_genes = genenames.size(); const uint32_t histmax = 100;//set molecules with more than histmax copies to histmax std::vector histograms; + std::vector cellUMIs; //used for calculating CU per cell + std::vector cellCounts; //used for calculating CU per cell if (opt.count_gen_hist) { + histograms = std::vector(n_genes * histmax, 0); + cellUMIs.reserve(100000); + cellCounts.reserve(100000); } //set up random number generator for downsampling @@ -174,7 +180,9 @@ void bustools_count(Bustools_opt &opt) { } column_vp.resize(0); n_rows+= 1; - + cellUMIs.push_back(0); + cellCounts.push_back(0); + barcodes.push_back(v[0].barcode); double val = 0.0; size_t n = v.size(); @@ -188,7 +196,7 @@ void bustools_count(Bustools_opt &opt) { break; } } - + // v[i..j-1] share the same UMI ecs.resize(0); uint32_t counts = 0; @@ -211,22 +219,25 @@ void bustools_count(Bustools_opt &opt) { gn = 0;//trick to skip quantification below } - } + } if (gn > 0) { if (opt.count_gene_multimapping) { for (auto x : glist) { column_vp.push_back({x, (opt.count_raw_counts ? counts : 1.0)/gn}); - - //Fill in histograms for prediction. - if (opt.count_gen_hist) { + } + + //Fill in histograms for prediction. + if (opt.count_gen_hist) { + for (auto x : glist) { if (x < n_genes) { //crasches with an invalid gene file otherwise histograms[x * histmax + std::min(counts - 1, histmax - 1)] += 1.0 / gn; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth - } - else { + } else { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } } - } + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; + } } else { if (gn==1) { column_vp.push_back({glist[0],opt.count_raw_counts ? counts : 1.0}); @@ -234,6 +245,8 @@ void bustools_count(Bustools_opt &opt) { if (opt.count_gen_hist) { if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise histograms[glist[0] * histmax + std::min(counts - 1, histmax - 1)] += 1.0; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; } else { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } @@ -250,6 +263,8 @@ void bustools_count(Bustools_opt &opt) { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } } + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; } } } @@ -464,8 +479,9 @@ void bustools_count(Bustools_opt &opt) { histof.close(); } - //write mean counts per UMI file if (opt.count_gen_hist) { + //write mean counts per UMI file (per gene) + std::ofstream cuof; cuof.open(cu_ofn); //write header @@ -500,6 +516,18 @@ void bustools_count(Bustools_opt &opt) { } } cuof.close(); + + //write cu per cell file + + std::ofstream cupcof; + cupcof.open(cu_per_cell_ofn); + //write header + cupcof << "barcode\tCU\tUMIs\n"; + + for (size_t bc = 0; bc < barcodes.size(); ++bc) { + cupcof << binaryToString(barcodes[bc], bclen) << '\t' << double(cellCounts[bc]) / double(cellUMIs[bc]) << '\t' << cellUMIs[bc] << '\n'; + } + cupcof.close(); } diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index 8a76738..48511a7 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -36,6 +36,8 @@ #include "bustools_text.h" #include "bustools_umicorrect.h" #include "bustools_predict.h" +#include "bustools_collapse.h" +#include "bustools_clusterhist.h" @@ -706,6 +708,99 @@ void parse_ProgramOptions_linker(int argc, char **argv, Bustools_opt &opt) { } } +void parse_ProgramOptions_collapse(int argc, char** argv, Bustools_opt& opt) { + const char* opt_string = "o:pg:e:t:"; + int gene_flag = 0; + int em_flag = 0; + static struct option long_options[] = { + {"output", required_argument, 0, 'o'}, + {"pipe", no_argument, 0, 'p'}, + {"genemap", required_argument, 0, 'g'}, + {"ecmap", required_argument, 0, 'e'}, + {"txnames", required_argument, 0, 't'}, + {0, 0, 0, 0 } + }; + int option_index = 0, c; + + while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1) { + + switch (c) { + case 'o': + opt.output = optarg; + break; + case 'p': + opt.stream_out = true; + break; + case 'g': + opt.count_genes = optarg; + break; + case 't': + opt.count_txp = optarg; + break; + case 'e': + opt.count_ecs = optarg; + break; + default: + break; + } + } + + while (optind < argc) opt.files.push_back(argv[optind++]); + + if (opt.files.size() == 1 && opt.files[0] == "-") { + opt.stream_in = true; + } +} + +void parse_ProgramOptions_clusterhist(int argc, char** argv, Bustools_opt& opt) { + const char* opt_string = "o:pg:e:t:"; + int gene_flag = 0; + int em_flag = 0; + static struct option long_options[] = { + {"output", required_argument, 0, 'o'}, + {"pipe", no_argument, 0, 'p'}, + {"genemap", required_argument, 0, 'g'}, + {"ecmap", required_argument, 0, 'e'}, + {"txnames", required_argument, 0, 't'}, + {"clusterfile", required_argument, 0, 'c'}, + {0, 0, 0, 0 } + }; + int option_index = 0, c; + + while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1) { + + switch (c) { + case 'o': + opt.output = optarg; + break; + case 'p': + opt.stream_out = true; + break; + case 'g': + opt.count_genes = optarg; + break; + case 't': + opt.count_txp = optarg; + break; + case 'e': + opt.count_ecs = optarg; + break; + case 'c': + opt.cluster_input_file = optarg; + break; + default: + break; + } + } + + while (optind < argc) opt.files.push_back(argv[optind++]); + + if (opt.files.size() == 1 && opt.files[0] == "-") { + opt.stream_in = true; + } +} + + void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt) { /* Parse options. */ @@ -1484,6 +1579,184 @@ bool check_ProgramOptions_linker(Bustools_opt &opt) { return ret; } +bool check_ProgramOptions_collapse(Bustools_opt& opt) { + bool ret = true; + + // check for output directory + if (opt.output.empty()) { + std::cerr << "Error: Missing output directory" << std::endl; + ret = false; + } + else { + bool isDir = false; + if (checkDirectoryExists(opt.output)) { + isDir = true; + } + else { + if (opt.output.at(opt.output.size() - 1) == '/') { + if (my_mkdir(opt.output.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } + else { + isDir = true; + } + } + } + + if (isDir) { + opt.output += "output"; + } + } + + if (opt.files.size() == 0) { + std::cerr << "Error: Missing BUS input files" << std::endl; + ret = false; + } + else { + if (!opt.stream_in) { + for (const auto& it : opt.files) { + if (!checkFileExists(it)) { + std::cerr << "Error: File not found, " << it << std::endl; + ret = false; + } + } + } + } + + if (opt.count_genes.size() == 0) { + std::cerr << "Error: missing gene mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_genes)) { + std::cerr << "Error: File not found " << opt.count_genes << std::endl; + ret = false; + } + } + + if (opt.count_ecs.size() == 0) { + std::cerr << "Error: missing equivalence class mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_ecs)) { + std::cerr << "Error: File not found " << opt.count_ecs << std::endl; + ret = false; + } + } + + if (opt.count_txp.size() == 0) { + std::cerr << "Error: missing transcript name file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_txp)) { + std::cerr << "Error: File not found " << opt.count_txp << std::endl; + ret = false; + } + } + + return ret; +} + +bool check_ProgramOptions_clusterhist(Bustools_opt& opt) { + bool ret = true; + + // check for output directory + if (opt.output.empty()) { + std::cerr << "Error: Missing output directory" << std::endl; + ret = false; + } + else { + bool isDir = false; + if (checkDirectoryExists(opt.output)) { + isDir = true; + } + else { + if (opt.output.at(opt.output.size() - 1) == '/') { + if (my_mkdir(opt.output.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } + else { + isDir = true; + } + } + } + + std::string histDir = opt.output + "cluster_hists/"; + //generate directory + if (!checkDirectoryExists(histDir)) { + if (my_mkdir(histDir.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } + } + } + + if (opt.files.size() == 0) { + std::cerr << "Error: Missing BUS input files" << std::endl; + ret = false; + } + else { + if (!opt.stream_in) { + for (const auto& it : opt.files) { + if (!checkFileExists(it)) { + std::cerr << "Error: File not found, " << it << std::endl; + ret = false; + } + } + } + } + + if (opt.count_genes.size() == 0) { + std::cerr << "Error: missing gene mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_genes)) { + std::cerr << "Error: File not found " << opt.count_genes << std::endl; + ret = false; + } + } + + if (opt.count_ecs.size() == 0) { + std::cerr << "Error: missing equivalence class mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_ecs)) { + std::cerr << "Error: File not found " << opt.count_ecs << std::endl; + ret = false; + } + } + + if (opt.count_txp.size() == 0) { + std::cerr << "Error: missing transcript name file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_txp)) { + std::cerr << "Error: File not found " << opt.count_txp << std::endl; + ret = false; + } + } + + if (opt.cluster_input_file.size() == 0) { + std::cerr << "Error: missing cluster file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.cluster_input_file)) { + std::cerr << "Error: File not found " << opt.cluster_input_file << std::endl; + ret = false; + } + } + + return ret; +} + bool check_ProgramOptions_extract(Bustools_opt &opt) { bool ret = true; @@ -1567,6 +1840,8 @@ void Bustools_Usage() { << "text Convert a binary BUS file to a tab-delimited text file" << std::endl << "extract Extract FASTQ reads correspnding to reads in BUS file" << std::endl << "predict Correct the count matrix using prediction of unseen species" << std::endl + << "collapse Turn BUS files into a BUG file" << std::endl + << "clusterhist Create UMI histograms per cluster" << std::endl << "linker Remove section of barcodes in BUS files" << std::endl << "version Prints version number" << std::endl << "cite Prints citation information" << std::endl @@ -1723,6 +1998,29 @@ void Bustools_linker_Usage() { << std::endl; } +void Bustools_collapse_Usage() { + std::cout << "Usage: bustools collapse [options] sorted-bus-files" << std::endl << std::endl + << "Options: " << std::endl + << "-o, --output Output directory gene matrix files" << std::endl + << "-g, --genemap File for mapping transcripts to genes" << std::endl + << "-e, --ecmap File for mapping equivalence classes to transcripts" << std::endl + << "-t, --txnames File with names of transcripts" << std::endl + << "-p, --pipe Write to standard output" << std::endl + << std::endl; +} + +void Bustools_clusterhist_Usage() { + std::cout << "Usage: bustools collapse [options] sorted-bus-files" << std::endl << std::endl + << "Options: " << std::endl + << "-o, --output Output directory gene matrix files" << std::endl + << "-g, --genemap File for mapping transcripts to genes" << std::endl + << "-e, --ecmap File for mapping equivalence classes to transcripts" << std::endl + << "-t, --txnames File with names of transcripts" << std::endl + << "-c, --clusterfile File with cell cluster assignments" << std::endl + << "-p, --pipe Write to standard output" << std::endl + << std::endl; +} + void Bustools_extract_Usage() { std::cout << "Usage: bustools extract [options] sorted-bus-file" << std::endl << " Note: BUS file should be sorted by flag using bustools sort --flag" << std::endl << std::endl @@ -1919,6 +2217,30 @@ int main(int argc, char **argv) { Bustools_linker_Usage(); exit(1); } + } else if (cmd == "collapse") { + if (disp_help) { + Bustools_collapse_Usage(); + exit(0); + } + parse_ProgramOptions_collapse(argc-1, argv+1, opt); + if (check_ProgramOptions_collapse(opt)) { //Program options are valid + bustools_collapse(opt); + } else { + Bustools_collapse_Usage(); + exit(1); + } + } else if (cmd == "clusterhist") { + if (disp_help) { + Bustools_clusterhist_Usage(); + exit(0); + } + parse_ProgramOptions_clusterhist(argc-1, argv+1, opt); + if (check_ProgramOptions_clusterhist(opt)) { //Program options are valid + bustools_clusterhist(opt); + } else { + Bustools_clusterhist_Usage(); + exit(1); + } } else if (cmd == "extract") { if (disp_help) { Bustools_extract_Usage(); diff --git a/test/test_cases/runtests.txt b/test/test_cases/runtests.txt index dc0fc00..03f8916 100644 --- a/test/test_cases/runtests.txt +++ b/test/test_cases/runtests.txt @@ -2,6 +2,19 @@ #to be able to run this file, you need to run the line chmod +x ./test/test_cases/runtests.txt mkdir tmp +#Test collapse +./build/src/bustools fromtext -p ./test/test_cases/tc0002CollapseInput.txt | ./build/src/bustools collapse -o ./tmp/tc2mid2 -p -t ./test/test_cases/transcripts.txt -g ./test/test_cases/transcripts_to_genes.txt -e ./test/test_cases/matrix.ec - | ./build/src/bustools text -o ./tmp/tc0002output.txt - +diff -w -B -s ./test/test_cases/tc0002ExpResult.txt ./tmp/tc0002output.txt +rm ./tmp/* +#or separately +./build/src/bustools fromtext -o ./tmp/tc2mid.bus ./test/test_cases/tc0002CollapseInput.txt +./build/src/bustools collapse -o ./tmp/tc2mid2 -t ./test/test_cases/transcripts.txt -g ./test/test_cases/transcripts_to_genes.txt -e ./test/test_cases/matrix.ec ./tmp/tc2mid.bus +./build/src/bustools text -o ./tmp/tc0002output.txt ./tmp/tc2mid2.bus +#check that the files are identical +diff -w -B -s ./test/test_cases/tc0002ExpResult.txt ./tmp/tc0002output.txt +rm ./tmp/* + + #Test umicorrect #tc0003 ./build/src/bustools fromtext -p ./test/test_cases/tc0003.txt | ./build/src/bustools umicorrect -e ./test/test_cases/matrix.ec -g ./test/test_cases/transcripts_to_genes.txt -t ./test/test_cases/transcripts.txt -p - | ./build/src/bustools text -o ./tmp/tc0003output.txt - @@ -24,6 +37,7 @@ diff -w -B -s ./test/test_cases/tc0004ExpResultsGenes.txt ./tmp/tc0004output/out diff -w -B -s ./test/test_cases/tc0004ExpResultsBarcodes.txt ./tmp/tc0004output/output.barcodes.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsHist.txt ./tmp/tc0004output/output.hist.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsCU.txt ./tmp/tc0004output/output.cu.txt +diff -w -B -s ./test/test_cases/tc0004ExpResultsCUPerCell.txt ./tmp/tc0004output/output.CUPerCell.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsCountRaw.txt ./tmp/tc0004outputraw/output.mtx diff --git a/test/test_cases/tc0002CollapseInput - explained.txt b/test/test_cases/tc0002CollapseInput - explained.txt new file mode 100644 index 0000000..0af68de --- /dev/null +++ b/test/test_cases/tc0002CollapseInput - explained.txt @@ -0,0 +1,27 @@ +#Test for the collapse function. +#What we need to test is that +#1. Different ecs with the same UMI and barcode leads to that they are merged to the same gene - these two should be merged +AAACCTGAGAAGCCCA ATGGAAATTT 1 4 +AAACCTGAGAAGCCCA ATGGAAATTT 2 3 + +#2. Identical ecs with same barcode and different UMI should not be merged +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 + +#3. Identical barcode and UMI should lead to the set intersection of genes - these should be merged with gene set to 2 +AAACCTGAGAAGCCCA GTGGAAATTT 9 4 +AAACCTGAGAAGCCCA GTGGAAATTT 10 3 +AAACCTGAGAAGCCCA GTGGAAATTT 13 3 + +#4. Sometimes you get more than one gene - should be merged with gene set to 2,4, but discarded here since we do not keep multimapped molecules +AAACCTGAGAAGCCCA ATGGAAATTT 13 4 +AAACCTGAGAAGCCCA ATGGAAATTT 10 3 + +#5. No overlap between genes, should be discarded +AAACCTGAGAAGCCCA CTGGAAATTT 2 4 +AAACCTGAGAAGCCCA CTGGAAATTT 10 3 + +#6. Identical ecs with different barcode and same UMI should not be merged +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 + diff --git a/test/test_cases/tc0002CollapseInput.txt b/test/test_cases/tc0002CollapseInput.txt new file mode 100644 index 0000000..46159e1 --- /dev/null +++ b/test/test_cases/tc0002CollapseInput.txt @@ -0,0 +1,14 @@ +AAACCTGAGAAGCCCA ATGGAAATTT 1 4 +AAACCTGAGAAGCCCA ATGGAAATTT 2 3 +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 +AAACCTGAGAAGCCCA GTGGAAATTT 9 4 +AAACCTGAGAAGCCCA GTGGAAATTT 10 3 +AAACCTGAGAAGCCCA GTGGAAATTT 13 3 +AAACCTGAGAAGCCCA ATGGAAATTT 13 4 +AAACCTGAGAAGCCCA ATGGAAATTT 10 3 +AAACCTGAGAAGCCCA CTGGAAATTT 2 4 +AAACCTGAGAAGCCCA CTGGAAATTT 10 3 +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 + diff --git a/test/test_cases/tc0002ExpResult.txt b/test/test_cases/tc0002ExpResult.txt new file mode 100644 index 0000000..fd4113b --- /dev/null +++ b/test/test_cases/tc0002ExpResult.txt @@ -0,0 +1,6 @@ +AAACCTGAGAAGCCCA ATGGAAATTT 1 7 +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 +AAACCTGAGAAGCCCA GTGGAAATTT 2 10 +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 diff --git a/test/test_cases/tc0004CUPerCellCalc.xlsx b/test/test_cases/tc0004CUPerCellCalc.xlsx new file mode 100644 index 0000000..dbefa42 Binary files /dev/null and b/test/test_cases/tc0004CUPerCellCalc.xlsx differ diff --git a/test/test_cases/tc0004ExpResultsCUPerCell.txt b/test/test_cases/tc0004ExpResultsCUPerCell.txt new file mode 100644 index 0000000..46e42f3 --- /dev/null +++ b/test/test_cases/tc0004ExpResultsCUPerCell.txt @@ -0,0 +1,7 @@ +barcode CU UMIs +TTTTTTTTTTTTTTTT 1.28571 7 +AAACCTGAGAAGCCCA 3.27273 11 +CCACCTGAGAAGCCCA 1.66667 3 +TTACCTGAGAAGCCCA 1.5 2 +TTTTCTGAGAAGCCCA 2 1 +AATTTTTTTTTTTTTT 3.625 8 \ No newline at end of file