From 6e61ba5741d2aa59e0d1bcd39457d5c980a40e36 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 22 Apr 2021 13:52:28 +0200 Subject: [PATCH 1/9] feat: Added collapse from the butterfly branch, but modified to not include multimapped molecules. --- src/bustools_collapse.cpp | 162 ++++++++++++++++++++++++++++++++++++++ src/bustools_collapse.h | 3 + src/bustools_main.cpp | 151 +++++++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+) create mode 100644 src/bustools_collapse.cpp create mode 100644 src/bustools_collapse.h diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp new file mode 100644 index 0000000..a289773 --- /dev/null +++ b/src/bustools_collapse.cpp @@ -0,0 +1,162 @@ +#include +#include +#include + +#include "Common.hpp" +#include "BUSData.h" + +#include "bustools_collapse.h" + + +void bustools_collapse(Bustools_opt &opt) { + BUSHeader h; + size_t nr = 0; + size_t N = 100000; + uint32_t bclen = 0; + BUSData* p = new BUSData[N]; + + // read and parse the equivelence class files + + std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + std::vector> ecmap; + + std::unordered_map txnames; + parseTranscripts(opt.count_txp, txnames); + std::vector genemap(txnames.size(), -1); + std::unordered_map genenames; + parseGenes(opt.count_genes, txnames, genemap, genenames); + parseECs(opt.count_ecs, h); + ecmap = std::move(h.ecs); + ecmapinv.reserve(ecmap.size()); + for (int32_t ec = 0; ec < ecmap.size(); ec++) { + ecmapinv.insert({ecmap[ec], ec}); + } + std::vector> ec2genes; + create_ec2genes(ecmap, genemap, ec2genes); + + + std::string bug_ofn = opt.output + ".bus"; + std::string gene_ofn = opt.output + ".genes.txt"; + + std::streambuf* buf = nullptr; + std::ofstream of; + + if (!opt.stream_out) { + of.open(bug_ofn, std::ios::binary); + if (of.fail()) { + std::cerr << "Failed to open file for writing: " << bug_ofn << std::endl; + } + buf = of.rdbuf(); + } + else { + buf = std::cout.rdbuf(); + } + std::ostream bus_out(buf); + + + std::vector v; + v.reserve(N); + uint64_t current_umi = 0xFFFFFFFFFFFFFFFFULL; + uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; + //temporary data + std::vector ecs; + std::vector glist; + ecs.reserve(100); + std::vector> column_vp; + column_vp.reserve(N); + glist.reserve(100); + + int discarded = 0; + + auto write_umi_bug = [&](const std::vector &v) { + if(v.empty()) { + return; + } + column_vp.resize(0); + + double val = 0.0; + size_t n = v.size(); + + // v[i..j-1] share the same UMI + ecs.resize(0); + int32_t copies = 0; + for (size_t i = 0; i < n; ++i) { + ecs.push_back(v[i].ec); + copies += v[i].count; + } + + //There's room for something smarter here to avoid discarding UMIs with one + //read pointing to a non-fitting gene... I guess read errors could lead to this? + + intersect_genes_of_ecs(ecs,ec2genes, glist); + if (glist.size() == 1) { + auto entry = v[0]; + entry.count = copies; + bus_out.write((char*)&entry, sizeof(entry)); + } else { + discarded++; + } + }; + + bool bHeaderWritten = false; + + for (const auto& infn : opt.files) { + std::streambuf *inbuf; + std::ifstream inf; + if (!opt.stream_in) { + inf.open(infn.c_str(), std::ios::binary); + inbuf = inf.rdbuf(); + } else { + inbuf = std::cin.rdbuf(); + } + std::istream in(inbuf); + + parseHeader(in, h); + bclen = h.bclen; + + if (!bHeaderWritten) { + bHeaderWritten = true; + // write out the initial header + writeHeader(bus_out, h);// TODO: Think through what to write here, i.e. if the header should be a "bug" header of some kind + } + + + int rc = 0; + while (true) { + in.read((char*)p, N*sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + nr += rc; + if (rc == 0) { + break; + } + + for (size_t i = 0; i < rc; i++) { + if (p[i].UMI != current_umi || p[i].barcode != current_bc) { + // output whatever is in v + if (!v.empty()) { + write_umi_bug(v); + } + v.clear(); + current_umi = p[i].UMI; + current_bc = p[i].barcode; + } + v.push_back(p[i]); + + } + } + if (!v.empty()) { + write_umi_bug(v); + } + + if (!opt.stream_in) { + inf.close(); + } + } + delete[] p; p = nullptr; + + //write the genes file: + writeGenes(gene_ofn, genenames); + + + std::cerr << "Read in " << nr << " BUS records, discarded " << discarded << " UMIs" << std::endl; +} diff --git a/src/bustools_collapse.h b/src/bustools_collapse.h new file mode 100644 index 0000000..b61274c --- /dev/null +++ b/src/bustools_collapse.h @@ -0,0 +1,3 @@ +#include "Common.hpp" + +void bustools_collapse(Bustools_opt &opt); diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index 8a76738..b6bb108 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -36,6 +36,7 @@ #include "bustools_text.h" #include "bustools_umicorrect.h" #include "bustools_predict.h" +#include "bustools_collapse.h" @@ -706,6 +707,51 @@ void parse_ProgramOptions_linker(int argc, char **argv, Bustools_opt &opt) { } } +void parse_ProgramOptions_collapse(int argc, char** argv, Bustools_opt& opt) { + const char* opt_string = "o:pg:e:t:"; + int gene_flag = 0; + int em_flag = 0; + static struct option long_options[] = { + {"output", required_argument, 0, 'o'}, + {"pipe", no_argument, 0, 'p'}, + {"genemap", required_argument, 0, 'g'}, + {"ecmap", required_argument, 0, 'e'}, + {"txnames", required_argument, 0, 't'}, + {0, 0, 0, 0 } + }; + int option_index = 0, c; + + while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1) { + + switch (c) { + case 'o': + opt.output = optarg; + break; + case 'p': + opt.stream_out = true; + break; + case 'g': + opt.count_genes = optarg; + break; + case 't': + opt.count_txp = optarg; + break; + case 'e': + opt.count_ecs = optarg; + break; + default: + break; + } + } + + while (optind < argc) opt.files.push_back(argv[optind++]); + + if (opt.files.size() == 1 && opt.files[0] == "-") { + opt.stream_in = true; + } +} + + void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt) { /* Parse options. */ @@ -1484,6 +1530,87 @@ bool check_ProgramOptions_linker(Bustools_opt &opt) { return ret; } +bool check_ProgramOptions_collapse(Bustools_opt& opt) { + bool ret = true; + + // check for output directory + if (opt.output.empty()) { + std::cerr << "Error: Missing output directory" << std::endl; + ret = false; + } + else { + bool isDir = false; + if (checkDirectoryExists(opt.output)) { + isDir = true; + } + else { + if (opt.output.at(opt.output.size() - 1) == '/') { + if (my_mkdir(opt.output.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } + else { + isDir = true; + } + } + } + + if (isDir) { + opt.output += "output"; + } + } + + if (opt.files.size() == 0) { + std::cerr << "Error: Missing BUS input files" << std::endl; + ret = false; + } + else { + if (!opt.stream_in) { + for (const auto& it : opt.files) { + if (!checkFileExists(it)) { + std::cerr << "Error: File not found, " << it << std::endl; + ret = false; + } + } + } + } + + if (opt.count_genes.size() == 0) { + std::cerr << "Error: missing gene mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_genes)) { + std::cerr << "Error: File not found " << opt.count_genes << std::endl; + ret = false; + } + } + + if (opt.count_ecs.size() == 0) { + std::cerr << "Error: missing equivalence class mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_ecs)) { + std::cerr << "Error: File not found " << opt.count_ecs << std::endl; + ret = false; + } + } + + if (opt.count_txp.size() == 0) { + std::cerr << "Error: missing transcript name file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_txp)) { + std::cerr << "Error: File not found " << opt.count_txp << std::endl; + ret = false; + } + } + + return ret; +} + bool check_ProgramOptions_extract(Bustools_opt &opt) { bool ret = true; @@ -1567,6 +1694,7 @@ void Bustools_Usage() { << "text Convert a binary BUS file to a tab-delimited text file" << std::endl << "extract Extract FASTQ reads correspnding to reads in BUS file" << std::endl << "predict Correct the count matrix using prediction of unseen species" << std::endl + << "collapse Turn BUS files into a BUG file" << std::endl << "linker Remove section of barcodes in BUS files" << std::endl << "version Prints version number" << std::endl << "cite Prints citation information" << std::endl @@ -1723,6 +1851,17 @@ void Bustools_linker_Usage() { << std::endl; } +void Bustools_collapse_Usage() { + std::cout << "Usage: bustools collapse [options] sorted-bus-files" << std::endl << std::endl + << "Options: " << std::endl + << "-o, --output Output directory gene matrix files" << std::endl + << "-g, --genemap File for mapping transcripts to genes" << std::endl + << "-e, --ecmap File for mapping equivalence classes to transcripts" << std::endl + << "-t, --txnames File with names of transcripts" << std::endl + << "-p, --pipe Write to standard output" << std::endl + << std::endl; +} + void Bustools_extract_Usage() { std::cout << "Usage: bustools extract [options] sorted-bus-file" << std::endl << " Note: BUS file should be sorted by flag using bustools sort --flag" << std::endl << std::endl @@ -1919,6 +2058,18 @@ int main(int argc, char **argv) { Bustools_linker_Usage(); exit(1); } + } else if (cmd == "collapse") { + if (disp_help) { + Bustools_collapse_Usage(); + exit(0); + } + parse_ProgramOptions_collapse(argc-1, argv+1, opt); + if (check_ProgramOptions_collapse(opt)) { //Program options are valid + bustools_collapse(opt); + } else { + Bustools_collapse_Usage(); + exit(1); + } } else if (cmd == "extract") { if (disp_help) { Bustools_extract_Usage(); From ed76e0ce8609ee96ed9a7ad761ac2f12b05944fa Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 22 Apr 2021 14:26:19 +0200 Subject: [PATCH 2/9] test: copied the test case for collapse from the butterfly branch and modified it slightly. --- test/test_cases/runtests.txt | 13 +++++++++ .../tc0002CollapseInput - explained.txt | 27 +++++++++++++++++++ test/test_cases/tc0002CollapseInput.txt | 14 ++++++++++ test/test_cases/tc0002ExpResult.txt | 6 +++++ 4 files changed, 60 insertions(+) create mode 100644 test/test_cases/tc0002CollapseInput - explained.txt create mode 100644 test/test_cases/tc0002CollapseInput.txt create mode 100644 test/test_cases/tc0002ExpResult.txt diff --git a/test/test_cases/runtests.txt b/test/test_cases/runtests.txt index dc0fc00..5c1f54e 100644 --- a/test/test_cases/runtests.txt +++ b/test/test_cases/runtests.txt @@ -2,6 +2,19 @@ #to be able to run this file, you need to run the line chmod +x ./test/test_cases/runtests.txt mkdir tmp +#Test collapse +./build/src/bustools fromtext -p ./test/test_cases/tc0002CollapseInput.txt | ./build/src/bustools collapse -o ./tmp/tc2mid2 -p -t ./test/test_cases/transcripts.txt -g ./test/test_cases/transcripts_to_genes.txt -e ./test/test_cases/matrix.ec - | ./build/src/bustools text -o ./tmp/tc0002output.txt - +diff -w -B -s ./test/test_cases/tc0002ExpResult.txt ./tmp/tc0002output.txt +rm ./tmp/* +#or separately +./build/src/bustools fromtext -o ./tmp/tc2mid.bus ./test/test_cases/tc0002CollapseInput.txt +./build/src/bustools collapse -o ./tmp/tc2mid2 -t ./test/test_cases/transcripts.txt -g ./test/test_cases/transcripts_to_genes.txt -e ./test/test_cases/matrix.ec ./tmp/tc2mid.bus +./build/src/bustools text -o ./tmp/tc0002output.txt ./tmp/tc2mid2.bus +#check that the files are identical +diff -w -B -s ./test/test_cases/tc0002ExpResult.txt ./tmp/tc0002output.txt +rm ./tmp/* + + #Test umicorrect #tc0003 ./build/src/bustools fromtext -p ./test/test_cases/tc0003.txt | ./build/src/bustools umicorrect -e ./test/test_cases/matrix.ec -g ./test/test_cases/transcripts_to_genes.txt -t ./test/test_cases/transcripts.txt -p - | ./build/src/bustools text -o ./tmp/tc0003output.txt - diff --git a/test/test_cases/tc0002CollapseInput - explained.txt b/test/test_cases/tc0002CollapseInput - explained.txt new file mode 100644 index 0000000..0af68de --- /dev/null +++ b/test/test_cases/tc0002CollapseInput - explained.txt @@ -0,0 +1,27 @@ +#Test for the collapse function. +#What we need to test is that +#1. Different ecs with the same UMI and barcode leads to that they are merged to the same gene - these two should be merged +AAACCTGAGAAGCCCA ATGGAAATTT 1 4 +AAACCTGAGAAGCCCA ATGGAAATTT 2 3 + +#2. Identical ecs with same barcode and different UMI should not be merged +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 + +#3. Identical barcode and UMI should lead to the set intersection of genes - these should be merged with gene set to 2 +AAACCTGAGAAGCCCA GTGGAAATTT 9 4 +AAACCTGAGAAGCCCA GTGGAAATTT 10 3 +AAACCTGAGAAGCCCA GTGGAAATTT 13 3 + +#4. Sometimes you get more than one gene - should be merged with gene set to 2,4, but discarded here since we do not keep multimapped molecules +AAACCTGAGAAGCCCA ATGGAAATTT 13 4 +AAACCTGAGAAGCCCA ATGGAAATTT 10 3 + +#5. No overlap between genes, should be discarded +AAACCTGAGAAGCCCA CTGGAAATTT 2 4 +AAACCTGAGAAGCCCA CTGGAAATTT 10 3 + +#6. Identical ecs with different barcode and same UMI should not be merged +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 + diff --git a/test/test_cases/tc0002CollapseInput.txt b/test/test_cases/tc0002CollapseInput.txt new file mode 100644 index 0000000..46159e1 --- /dev/null +++ b/test/test_cases/tc0002CollapseInput.txt @@ -0,0 +1,14 @@ +AAACCTGAGAAGCCCA ATGGAAATTT 1 4 +AAACCTGAGAAGCCCA ATGGAAATTT 2 3 +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 +AAACCTGAGAAGCCCA GTGGAAATTT 9 4 +AAACCTGAGAAGCCCA GTGGAAATTT 10 3 +AAACCTGAGAAGCCCA GTGGAAATTT 13 3 +AAACCTGAGAAGCCCA ATGGAAATTT 13 4 +AAACCTGAGAAGCCCA ATGGAAATTT 10 3 +AAACCTGAGAAGCCCA CTGGAAATTT 2 4 +AAACCTGAGAAGCCCA CTGGAAATTT 10 3 +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 + diff --git a/test/test_cases/tc0002ExpResult.txt b/test/test_cases/tc0002ExpResult.txt new file mode 100644 index 0000000..fd4113b --- /dev/null +++ b/test/test_cases/tc0002ExpResult.txt @@ -0,0 +1,6 @@ +AAACCTGAGAAGCCCA ATGGAAATTT 1 7 +AAACCTGAGAAGCCCA ATGAAAATTT 1 1 +AAACCTGAGAAGCCCA ATGAAAATTA 1 1 +AAACCTGAGAAGCCCA GTGGAAATTT 2 10 +AAACCTGAGAAGCCCA ATGAAAATAA 1 1 +AAACCTGAGAAGCCCT ATGAAAATAA 1 1 From 17b65e6b41dbd0314f2f14838aa77c0b89a6a89b Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 22 Apr 2021 14:35:05 +0200 Subject: [PATCH 3/9] fix: bug in collapse, the gene ids were incorrect. --- src/bustools_collapse.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp index a289773..3c045e9 100644 --- a/src/bustools_collapse.cpp +++ b/src/bustools_collapse.cpp @@ -92,6 +92,7 @@ void bustools_collapse(Bustools_opt &opt) { if (glist.size() == 1) { auto entry = v[0]; entry.count = copies; + entry.ec = glist[1]; bus_out.write((char*)&entry, sizeof(entry)); } else { discarded++; From 7cb767593c9d94de1d05f6e8ed2925023058e1a5 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 22 Apr 2021 15:04:49 +0200 Subject: [PATCH 4/9] fix: Fixed a bug in collapse --- src/bustools_collapse.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp index 3c045e9..2111c9b 100644 --- a/src/bustools_collapse.cpp +++ b/src/bustools_collapse.cpp @@ -92,7 +92,7 @@ void bustools_collapse(Bustools_opt &opt) { if (glist.size() == 1) { auto entry = v[0]; entry.count = copies; - entry.ec = glist[1]; + entry.ec = glist[0]; bus_out.write((char*)&entry, sizeof(entry)); } else { discarded++; From 3386c0d1aa9a7d3245f92f72f6a3895fa581dde2 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Sun, 25 Apr 2021 08:00:50 +0200 Subject: [PATCH 5/9] feat: Added clusterhist - a command that creates UMI histograms per cluster --- src/Common.hpp | 2 + src/bustools_clusterhist.cpp | 227 +++++++++++++++++++++++++++++++++++ src/bustools_clusterhist.h | 3 + src/bustools_main.cpp | 172 ++++++++++++++++++++++++++ 4 files changed, 404 insertions(+) create mode 100644 src/bustools_clusterhist.cpp create mode 100644 src/bustools_clusterhist.h diff --git a/src/Common.hpp b/src/Common.hpp index f81d446..7b68459 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -53,6 +53,8 @@ struct Bustools_opt { std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this double predict_t = 0.0; //this is how far to predict, t=10 means that we will predict the change in expression at 10 times the number of reads + /* clusterhist */ + std::string cluster_input_file; /* project */ std::string map; diff --git a/src/bustools_clusterhist.cpp b/src/bustools_clusterhist.cpp new file mode 100644 index 0000000..2bbb667 --- /dev/null +++ b/src/bustools_clusterhist.cpp @@ -0,0 +1,227 @@ +#include +#include +#include + +#include "Common.hpp" +#include "BUSData.h" + +#include "bustools_clusterhist.h" +#include +#include + +void bustools_clusterhist(Bustools_opt& opt) { + BUSHeader h; + size_t nr = 0; + size_t N = 100000; + uint32_t bclen = 0; + BUSData* p = new BUSData[N]; + + // read and parse the equivelence class files + + std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + std::vector> ecmap; + + std::unordered_map txnames; + parseTranscripts(opt.count_txp, txnames); + std::vector genemap(txnames.size(), -1); + std::unordered_map genenames; + parseGenes(opt.count_genes, txnames, genemap, genenames); + parseECs(opt.count_ecs, h); + ecmap = std::move(h.ecs); + ecmapinv.reserve(ecmap.size()); + for (int32_t ec = 0; ec < ecmap.size(); ec++) { + ecmapinv.insert({ ecmap[ec], ec }); + } + std::vector> ec2genes; + create_ec2genes(ecmap, genemap, ec2genes); + + + std::string gene_ofn = opt.output + "genes.txt"; + std::string histDir = opt.output + "cluster_hists/"; + + + std::vector v; + v.reserve(N); + uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; + //temporary data + std::vector ecs; + std::vector glist; + ecs.reserve(100); + std::vector u; + u.reserve(100); + + //Read the cluster file + std::vector clusterNames; + std::unordered_map bcClusters; + { + std::ifstream ifs(opt.cluster_input_file); + uint32_t flag = 0; + while (!ifs.eof()) { + std::string bc, cluster; + ifs >> bc >> cluster; + if (!cluster.empty()) { + uint64_t bcBin = stringToBinary(bc, flag); + std::size_t clust = -1; + auto it = std::find(clusterNames.begin(), clusterNames.end(), cluster); + if (it != clusterNames.end()) { + clust = (it - clusterNames.begin()); + } + else { + clust = clusterNames.size(); + clusterNames.push_back(cluster); + } + bcClusters[bcBin] = clust; + } + } + } +// for (size_t i = 0; i < 5; ++i) { +// std::cout << "rc: " << rc << "\n"; +// } + + //Allocate histograms + + //Indexed as gene*histmax + histIndex + size_t n_genes = genenames.size(); + const uint32_t histmax = 100;//set molecules with more than histmax copies to histmax + typedef std::vector Histograms; + std::vector clusterHistograms; + for (std::size_t i = 0; i < clusterNames.size(); ++i) { + clusterHistograms.push_back(std::vector(n_genes * histmax, 0)); + } + + + //barcodes + auto write_barcode = [&](const std::vector& v) { + if (v.empty()) { + return; + } + auto it = bcClusters.find(v[0].barcode); + if (it == bcClusters.end()) { + return; //This is ok, could be a cell that has been filtered in quality check + } + + auto& hist = clusterHistograms[it->second]; + + size_t n = v.size(); + + for (size_t i = 0; i < n; ) { + size_t j = i + 1; + for (; j < n; j++) { + if (v[i].UMI != v[j].UMI) { + break; + } + } + + // v[i..j-1] share the same UMI + ecs.resize(0); + uint32_t counts = 0; + for (size_t k = i; k < j; k++) { + ecs.push_back(v[k].ec); + counts += v[k].count; + } + + intersect_genes_of_ecs(ecs, ec2genes, glist); + if (glist.size() == 1) { + //Fill in histograms for prediction. + if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise + hist[glist[0] * histmax + std::min(counts - 1, histmax - 1)] += 1.0; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth + } + else { + std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; + } + } + i = j; // increment + } + }; + + for (const auto& infn : opt.files) { + std::streambuf* inbuf; + std::ifstream inf; + if (!opt.stream_in) { + inf.open(infn.c_str(), std::ios::binary); + inbuf = inf.rdbuf(); + } + else { + inbuf = std::cin.rdbuf(); + } + std::istream in(inbuf); + + parseHeader(in, h); + bclen = h.bclen; + + int rc = 0; + while (true) { + in.read((char*)p, N * sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + //std::cout << "rc: " << rc << "\n"; + nr += rc; + if (rc == 0) { + break; + } + + + for (size_t i = 0; i < rc; i++) { + if (p[i].barcode != current_bc) { + // output whatever is in v + if (!v.empty()) { + write_barcode(v); + } + v.clear(); + current_bc = p[i].barcode; + } + v.push_back(p[i]); + + } + } + if (!v.empty()) { + write_barcode(v); + } + + if (!opt.stream_in) { + inf.close(); + } + } + delete[] p; p = nullptr; + + + + // write genes file + writeGenes(gene_ofn, genenames); + + //write histogram files + for (std::size_t f = 0; f < clusterNames.size(); ++f) { + + auto& histograms = clusterHistograms[f]; + std::string hist_ofn = histDir + clusterNames[f] + ".txt"; + + std::ofstream histof; + histof.open(hist_ofn); + + for (std::size_t g = 0; g < genenames.size(); ++g) { + //Indexed as gene*histmax + histIndex + std::size_t offs = g * histmax; + + //first figure out the length of the histogram, don't write that to make the file smaller + std::size_t histEnd = histmax - 1; + for (; histEnd != 0; --histEnd) { + if (histograms[offs + histEnd] != 0) { + break; + } + } + for (size_t c = 0; c <= histEnd; ++c) { + if (c != 0) { + histof << '\t'; + } + histof << histograms[offs + c]; + } + + histof << "\n"; + } + histof.close(); + } + + + //std::cerr << "bad counts = " << bad_count <<", rescued =" << rescued << ", compacted = " << compacted << std::endl; + + //std::cerr << "Read in " << nr << " BUS records" << std::endl; +} diff --git a/src/bustools_clusterhist.h b/src/bustools_clusterhist.h new file mode 100644 index 0000000..5a1a255 --- /dev/null +++ b/src/bustools_clusterhist.h @@ -0,0 +1,3 @@ +#include "Common.hpp" + +void bustools_clusterhist(Bustools_opt &opt); diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index b6bb108..b045415 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -37,6 +37,7 @@ #include "bustools_umicorrect.h" #include "bustools_predict.h" #include "bustools_collapse.h" +#include "bustools_clusterhist.h" @@ -751,6 +752,54 @@ void parse_ProgramOptions_collapse(int argc, char** argv, Bustools_opt& opt) { } } +void parse_ProgramOptions_clusterhist(int argc, char** argv, Bustools_opt& opt) { + const char* opt_string = "o:pg:e:t:"; + int gene_flag = 0; + int em_flag = 0; + static struct option long_options[] = { + {"output", required_argument, 0, 'o'}, + {"pipe", no_argument, 0, 'p'}, + {"genemap", required_argument, 0, 'g'}, + {"ecmap", required_argument, 0, 'e'}, + {"txnames", required_argument, 0, 't'}, + {"clusterfile", required_argument, 0, 'c'}, + {0, 0, 0, 0 } + }; + int option_index = 0, c; + + while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1) { + + switch (c) { + case 'o': + opt.output = optarg; + break; + case 'p': + opt.stream_out = true; + break; + case 'g': + opt.count_genes = optarg; + break; + case 't': + opt.count_txp = optarg; + break; + case 'e': + opt.count_ecs = optarg; + break; + case 'c': + opt.cluster_input_file = optarg; + break; + default: + break; + } + } + + while (optind < argc) opt.files.push_back(argv[optind++]); + + if (opt.files.size() == 1 && opt.files[0] == "-") { + opt.stream_in = true; + } +} + void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt) { @@ -1611,6 +1660,104 @@ bool check_ProgramOptions_collapse(Bustools_opt& opt) { return ret; } +bool check_ProgramOptions_clusterhist(Bustools_opt& opt) { + bool ret = true; + + // check for output directory + if (opt.output.empty()) { + std::cerr << "Error: Missing output directory" << std::endl; + ret = false; + } + else { + bool isDir = false; + if (checkDirectoryExists(opt.output)) { + isDir = true; + } + else { + if (opt.output.at(opt.output.size() - 1) == '/') { + if (my_mkdir(opt.output.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } + else { + isDir = true; + } + } + } + + if (isDir) { + opt.output += "output"; + } + std::string histDir = opt.output + "cluster_hists/"; + //generate directory + if (my_mkdir(histDir.c_str(), 0777) == -1) { + std::cerr << "Error: could not create cluster_hists directory " << opt.output << std::endl; + ret = false; + } + } + + if (opt.files.size() == 0) { + std::cerr << "Error: Missing BUS input files" << std::endl; + ret = false; + } + else { + if (!opt.stream_in) { + for (const auto& it : opt.files) { + if (!checkFileExists(it)) { + std::cerr << "Error: File not found, " << it << std::endl; + ret = false; + } + } + } + } + + if (opt.count_genes.size() == 0) { + std::cerr << "Error: missing gene mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_genes)) { + std::cerr << "Error: File not found " << opt.count_genes << std::endl; + ret = false; + } + } + + if (opt.count_ecs.size() == 0) { + std::cerr << "Error: missing equivalence class mapping file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_ecs)) { + std::cerr << "Error: File not found " << opt.count_ecs << std::endl; + ret = false; + } + } + + if (opt.count_txp.size() == 0) { + std::cerr << "Error: missing transcript name file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.count_txp)) { + std::cerr << "Error: File not found " << opt.count_txp << std::endl; + ret = false; + } + } + + if (opt.cluster_input_file.size() == 0) { + std::cerr << "Error: missing transcript name file" << std::endl; + ret = false; + } + else { + if (!checkFileExists(opt.cluster_input_file)) { + std::cerr << "Error: File not found " << opt.cluster_input_file << std::endl; + ret = false; + } + } + + return ret; +} + bool check_ProgramOptions_extract(Bustools_opt &opt) { bool ret = true; @@ -1695,6 +1842,7 @@ void Bustools_Usage() { << "extract Extract FASTQ reads correspnding to reads in BUS file" << std::endl << "predict Correct the count matrix using prediction of unseen species" << std::endl << "collapse Turn BUS files into a BUG file" << std::endl + << "clusterhist Create UMI histograms per cluster" << std::endl << "linker Remove section of barcodes in BUS files" << std::endl << "version Prints version number" << std::endl << "cite Prints citation information" << std::endl @@ -1862,6 +2010,18 @@ void Bustools_collapse_Usage() { << std::endl; } +void Bustools_clusterhist_Usage() { + std::cout << "Usage: bustools collapse [options] sorted-bus-files" << std::endl << std::endl + << "Options: " << std::endl + << "-o, --output Output directory gene matrix files" << std::endl + << "-g, --genemap File for mapping transcripts to genes" << std::endl + << "-e, --ecmap File for mapping equivalence classes to transcripts" << std::endl + << "-t, --txnames File with names of transcripts" << std::endl + << "-c, --clusterfile File with cell cluster assignments" << std::endl + << "-p, --pipe Write to standard output" << std::endl + << std::endl; +} + void Bustools_extract_Usage() { std::cout << "Usage: bustools extract [options] sorted-bus-file" << std::endl << " Note: BUS file should be sorted by flag using bustools sort --flag" << std::endl << std::endl @@ -2070,6 +2230,18 @@ int main(int argc, char **argv) { Bustools_collapse_Usage(); exit(1); } + } else if (cmd == "clusterhist") { + if (disp_help) { + Bustools_clusterhist_Usage(); + exit(0); + } + parse_ProgramOptions_clusterhist(argc-1, argv+1, opt); + if (check_ProgramOptions_clusterhist(opt)) { //Program options are valid + bustools_clusterhist(opt); + } else { + Bustools_clusterhist_Usage(); + exit(1); + } } else if (cmd == "extract") { if (disp_help) { Bustools_extract_Usage(); From c2b7ac2703bbd4680c24913f60e5ea2198d94ae9 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Sun, 25 Apr 2021 08:28:33 +0200 Subject: [PATCH 6/9] fix: Fixed an issue with directory creation. --- src/bustools_main.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index b045415..12fdee0 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -1685,14 +1685,13 @@ bool check_ProgramOptions_clusterhist(Bustools_opt& opt) { } } - if (isDir) { - opt.output += "output"; - } std::string histDir = opt.output + "cluster_hists/"; //generate directory - if (my_mkdir(histDir.c_str(), 0777) == -1) { - std::cerr << "Error: could not create cluster_hists directory " << opt.output << std::endl; - ret = false; + if (!checkDirectoryExists(histDir)) { + if (my_mkdir(histDir.c_str(), 0777) == -1) { + std::cerr << "Error: could not create directory " << opt.output << std::endl; + ret = false; + } } } From 8c43cfcc6f0688870048aec28b6169d9bbc86bf7 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Tue, 4 May 2021 20:37:55 +0200 Subject: [PATCH 7/9] feat: count --hist now also writes a CU per cell file. Test added to tc0004. --- src/bustools_count.cpp | 44 ++++++++++++++---- test/test_cases/runtests.txt | 1 + test/test_cases/tc0004CUPerCellCalc.xlsx | Bin 0 -> 10494 bytes test/test_cases/tc0004ExpResultsCUPerCell.txt | 7 +++ 4 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 test/test_cases/tc0004CUPerCellCalc.xlsx create mode 100644 test/test_cases/tc0004ExpResultsCUPerCell.txt diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 3e9a319..0d9989a 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -42,6 +42,7 @@ void bustools_count(Bustools_opt &opt) { std::string ec_ofn = opt.output + ".ec.txt"; std::string gene_ofn = opt.output + ".genes.txt"; std::string hist_ofn = opt.output + ".hist.txt"; + std::string cu_per_cell_ofn = opt.output + ".CUPerCell.txt"; std::string cu_ofn = opt.output + ".cu.txt"; of.open(mtx_ofn); @@ -82,8 +83,13 @@ void bustools_count(Bustools_opt &opt) { size_t n_genes = genenames.size(); const uint32_t histmax = 100;//set molecules with more than histmax copies to histmax std::vector histograms; + std::vector cellUMIs; //used for calculating CU per cell + std::vector cellCounts; //used for calculating CU per cell if (opt.count_gen_hist) { + histograms = std::vector(n_genes * histmax, 0); + cellUMIs.reserve(100000); + cellCounts.reserve(100000); } //set up random number generator for downsampling @@ -188,7 +194,7 @@ void bustools_count(Bustools_opt &opt) { break; } } - + // v[i..j-1] share the same UMI ecs.resize(0); uint32_t counts = 0; @@ -211,22 +217,25 @@ void bustools_count(Bustools_opt &opt) { gn = 0;//trick to skip quantification below } - } + } if (gn > 0) { if (opt.count_gene_multimapping) { for (auto x : glist) { column_vp.push_back({x, (opt.count_raw_counts ? counts : 1.0)/gn}); - - //Fill in histograms for prediction. - if (opt.count_gen_hist) { + } + + //Fill in histograms for prediction. + if (opt.count_gen_hist) { + for (auto x : glist) { if (x < n_genes) { //crasches with an invalid gene file otherwise histograms[x * histmax + std::min(counts - 1, histmax - 1)] += 1.0 / gn; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth - } - else { + } else { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } } - } + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; + } } else { if (gn==1) { column_vp.push_back({glist[0],opt.count_raw_counts ? counts : 1.0}); @@ -234,6 +243,8 @@ void bustools_count(Bustools_opt &opt) { if (opt.count_gen_hist) { if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise histograms[glist[0] * histmax + std::min(counts - 1, histmax - 1)] += 1.0; //histmax-1 since histograms[g][0] is the histogram value for 1 copy and so forth + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; } else { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } @@ -250,6 +261,8 @@ void bustools_count(Bustools_opt &opt) { std::cerr << "Mismatch between gene file and bus file, the bus file contains gene indices that is outside the gene range!\n"; } } + cellUMIs[barcodes.size()-1]++; + cellCounts[barcodes.size()-1] += counts; } } } @@ -464,8 +477,9 @@ void bustools_count(Bustools_opt &opt) { histof.close(); } - //write mean counts per UMI file if (opt.count_gen_hist) { + //write mean counts per UMI file (per gene) + std::ofstream cuof; cuof.open(cu_ofn); //write header @@ -500,6 +514,18 @@ void bustools_count(Bustools_opt &opt) { } } cuof.close(); + + //write cu per cell file + + std::ofstream cupcof; + cupcof.open(cu_per_cell_ofn); + //write header + cupcof << "barcode\tCU\tUMIs\n"; + + for (size_t bc = 0; bc < barcodes.size(); ++bc) { + cupcof << binaryToString(barcodes[bc], bclen) << '\t' << double(cellCounts[bc]) / double(cellUMIs[bc]) << '\t' << cellUMIs[bc] << '\n'; + } + cupcof.close(); } diff --git a/test/test_cases/runtests.txt b/test/test_cases/runtests.txt index 5c1f54e..03f8916 100644 --- a/test/test_cases/runtests.txt +++ b/test/test_cases/runtests.txt @@ -37,6 +37,7 @@ diff -w -B -s ./test/test_cases/tc0004ExpResultsGenes.txt ./tmp/tc0004output/out diff -w -B -s ./test/test_cases/tc0004ExpResultsBarcodes.txt ./tmp/tc0004output/output.barcodes.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsHist.txt ./tmp/tc0004output/output.hist.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsCU.txt ./tmp/tc0004output/output.cu.txt +diff -w -B -s ./test/test_cases/tc0004ExpResultsCUPerCell.txt ./tmp/tc0004output/output.CUPerCell.txt diff -w -B -s ./test/test_cases/tc0004ExpResultsCountRaw.txt ./tmp/tc0004outputraw/output.mtx diff --git a/test/test_cases/tc0004CUPerCellCalc.xlsx b/test/test_cases/tc0004CUPerCellCalc.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ab1cdd06aea4ca9eddbda3599ef88421e6700efc GIT binary patch literal 10494 zcmeHt^;;d;(lr{~HMmQ#gS!L`et_UE2Rpc1aF^g32$lfBg1fsr!68_14{q<7x%YlE zlbP=?xVL}kep*&Ny?58DRb91#^008YP%oemp`f5Bp*}L(*n^>=pfKT~pm3lNVRR+z z?OZ^1E{1PC9YD_dtRA*D6nStk3^`CRkmvum{TI(bX;PS?v z?F4dCOp3bZ@rTG}c3^xVNC@!8yNzbK8*9%GR(X6FD?uTavac1I zr0Gz$X1oy}dPWzO>tzt^!_4t|T-Si7A`6EDmQvHs!xwvkPR~zB=%;@ECs?-C&dTDL zd7;yBZrt~S@deKtI1=9N3_#LVp%sg1Hi&yYKf^)+|H9E)bpZ8G$Q=a;1yLXzHFN^m zIJ2?-w*MbD|BDIvr=gc7C@TMpn-5_F*Rv~enBwy8uirFNzV-E&UB;-5$)_P+X{RT_ zd`lDrE92Md`!KY$A{4VfKzX&nRT_njD@a}EUKW~q=je*?iq0uT+Oc%K2g_~ta`q}+ zM$VJLtu^*lNkdVt{NNh3^z@lzCFVG*CJ_?OXOd8S;q(BVK1JAvySRYfb;>Wx%WBv(tm^F@T*#q`3OUzS~ihnC}$5tJDHj(OQz@})Xv@FKQnm-k`- zXjDobSE09+#rw4^Em%%{X})~(#!X(!;uzD!5%q=PS8Z8N19X<78Fa~sP=*{f@V8;4 zGM3l?aD)e;SIL$w1}4)tz$I5;hO}-U!8Iv?*{uJP?$%3P=vWNDEwN1{`hdL^6&5D= zRG%^>sKwQI-yKmWo8Z-8qkEAT{glSK6EX$P9!9he7**#OT2Cm1PQJuw0X5QPJ(w*R zd7cP%<(_U$=4h=evU3#go2A~l>j{TbILcu`Dt69c%7TrCFB6z&fx zc9@Lcx+sQ9vxp7g2)zwGktr{94+M-6OG(M#DtZ=*g?YF8YC2Q9={N%$92K8Q({g}X zIc%qRI;wqUv|q!EV_tJPbH=2)1A~XFPEtu%XbkP~GkKjFi9Qh_Ah#Io|In4CNgmdW zq&*qlB;b;lBSj>psx)mTekR1DVA+sO)s!Oyv<-+z*x3zGn<4)M9--XAsABng_ch{d zQ1xgW7Y2l^P}O!>rf3<&BR zl5;|y{qNpN6NasO0H{*eAx}Y*ZH^r8fT51&ld5U9AAJ2ClK2!%5Om*doq$yZ20P;H zs@UIjU6VyLFZsEb-O!K|S>Tz;A_tbbflR^hC%wa?)ZrbL@-|tt@UZanJ9C=Nn%41~ zg+>`7;YB7vFx|`cpHUZqVU&)-3w&QPPvT7Xzzg{zCW=mp1sGG~+!UhIxM8XRqDl1> zk-4#B&*0`_Yc65I$I19@6nu`;i9UGAfHpdfQ(gLatS!*}XvP)ExOn2d*3j8#3SkxZ z%eG9>7ZN=<{a^65__Tu?4RNGROR`aU+yeP)Q&^qqtr<|gUaigdkBnEd+dp>>RL}2- z{-X~G@KS8sB0xc{(?CHHLZ0|{oOiYafn1!~em}7P7VtB5uPlT=Ys~?@anmv$r&oS|M*tG3u+fMDZ@#TebRDIQ*#>sih zOaCy_)}L;Q&KdWZMZpL03)#v1sn9+xIol50~Y9vljMYhxqPcHFsA|O~XUoNP0qe_j$Lg0+%ns zY@YURwKVpo$u@lHjWf_znQmm({T$j9nC8k|7^;d(hiy6G#zp!|!$dDEd^7Zg_RiL} zLW6yi+wpB+1T}y5Q^fCXM7fOxlDqJ<@tN)w|6uDi-+L59CL7o?Zf(my2n%aIrKE7@ z#9mob>^h4clQz4GS9reAh*E^+9LXbkd+WCMj^?7D;q+lPk0qx)@}Vq61S8|;o_yPQ zHCI!xbPQWcNb4;)=&3ODn6tZ2$oBDyVC zZk?O2ahnRe#~@M_SFi~Yma~<`0qiFf)`6j;hK~72`EM%0QACc8U*HLt!>~QE-%G-P%ALJE&oNdj8w1cKOn;YIvA>+9T`T**>Qe7zn$%)pT8AHcE zCV(4gE9x^j5;5E*U za{Vj0akY7+bE9K&9jxjBN!9_O4uY(Jx{U-^%knuKYF5_T?>C_FVh(?9XPc?<>eoZA77$bI` z4SMhHHZyBN*dw+HnvRr0_%OQZ|4~3(}|3nMs#qHc`!lqp`j6j@!#R?YYAq$wGr78`4KE`% zrXNR)7|yLM|h`P|{2@H3p71<#6!o%gpsFj;Qn2dYB^XsEy;8EGZ_BaLi3!R-&qFNlWza z!JR-E0S#9MeH=;k$qj+pUsf(14LWMjP4DP53Zf3a-|fPT;WkCgb~tKum_eJ~bqtz0 zlIoFb14Eckh2bn?)cbo^TuPX}MKD>CYcj|wXoDw+(yy3#-A(r;@kbWr@FP{j=-)td zvPAFe;whQEz8?@*!S*44p|iPsMU)OFf2iZv>43L}eG)3lGr})IL1g513xZtUi`)&zl8C)xD=S^8xiafJ3mkwJHr zheV6hTV*;jIz>ECq`3C&6w=tRE8oC&=TCrZ|ND0CBUt0zLMytT_S9REvzarx?MDb-GFUKWU5636x_gdRz)yqcRU8`#kQYsAOQJxsTra56phGf$c&La@iN- zR+oAt2c05o;-S(KmYjHgsdGkQLaW;(?W%*Vjx zqx>cb@{N(~Zc&&Y^N5&P$1ZzXMVQ^;lcmtNubB8J8{beQu6tkFgHzv-gR z#_uN~4O&gZXZoyKMo67-V-RQp^nS-dNqvZxEa&I_rA2w2Jzg9wnqf_|xRN%vaX3dA5C@6U#?MSA_02ah`KbyJ1VmRy0!1$FZR2g!DE#%RF1+1zH$6ZK*I)d@rW z>PFJy*pRuLw|*6&>ZqT=1wi9Y5Kf%1n@rIFnj>_t_cK&)zr zV&MPYvqU?Mh;1K=O=r~bmJv-({pPDkz*7NrWF3IxQ?kBMgch;tm4`{sphDdPQ_d_e z1=~kq@2Atfi?DWKzn{Ce-G)_-wXfvu5NOrZj%Q;HpC2CyRt?*qPWR7v_h#rJe{=!n?}14x(j?S z&KN#Db@-^Tl6cV=(Ht=m4?bpzq<+uat-;Eoqhr z-<}iZ*ifj5_<4-^H7IH}MR+Y6z!X$jS3q%pTcs{yn`?b3WJS*&DMr{mv4}fp@ zm+ih`OIDRbReni!Zql|Wvn>uMzfKvtayA-CuSmv=J5%Oh`5yvdmqf4Ikx@6%@5%Ig zcoO8DcBdMn2sz`0-R5I}Le7f{2?IU8bm){)@$Nyg1U|6~f|9}#B)}^6<$9#Lu19mK z2KX)Oj3oN-JJ1+uWlMB=D>;EnU_}&YQK)Xdb~J)hP~hTA@?tZgnoD#ap?AHAv(|($ z=kQMBq$ZV7GU=gQDbv81`emjOG$^4M{Eg8m6_E*aeU@ZfVPWqW=z-4=fwlJOmPm3% zPM>c(^Q~9bdvwL`+%VhZTT#=DbMU)>gyweHgBV0@i&QWCV;_A7CSUvNej~$G;iERV zmp;}nDX!|TMln^ufkMWih{&h}D}mv7PkNGBB|0|8Ngw7v@L<5=N^7*@ByPp@jz?6{ zKe!K(xkEd@fREKZyH67MzyR8j45&uBCT9kFS$3A5vw^`i%9bC&~GyL!Cbjd?kVnq||GPL!TwF?Iq~M*_6ckr}CQ`PPGzF30p!m@5-r_C(pe(uGHC%(Rk4 zV`)>y7Zmt}TMtIP7>QK5f?XZqlhv}B1K-`Z?%Js@7#5}pjTOv^Ra^;G4AI4L4J?8U z7SgpQU9={F`>DA)(vI8609!mfkL|1PQmxxzuNPm3nv*2$nBo=Iq_;$uBafv^dnIyy zNrwiSuyInvx~MHQIeJ>VS{1iSePW@GcBzg5OaTH3)|mxIQ|`6s5JC~9U@XmJuHSSe#vNDlO%bzOq%|fB08HHVoc;895G!42Rt}BsjWg+ z2vt5Rkq4}+l{-8Y_pXta(kzg4wUWogw!#YR@M3+^(qdymcpu>(738!(${z5c0xy31 z)_uch$a1bFa*99cJ+(j-&@C^`>|+;qMES_1G7~wW@hv5ricfUc#NW!6SZRy*IM}LS)P3vZ4g%E=2?=6jN38a2C3#^)s=*}5o!!S zR_N`!zQXu!6tAL(K7DWpt!F$Te=+@dS$^ z>%MjfwIm;tU;DSzVdn?Q&`neGXi*Gk1CiO^{)H5&TjtR3)x7rEA{UKi1t z7UY|+U4)PSEiOFz8kwG+##aQ3dy^@;& zi9p??Z-mm$wDb6iv+C3bCFB~@KBqDc68#WDcJ=+p62;JV5xh=R>Vu-Br-Zkn)qPZhiM zc~WEU7}+7B9BfdP>at^|Wa%jlvo%L4g@m*jwJ3NY_X&yJroYvPPZ?a6DfQVd8J0bu z$i8z!d+((9G`qz)d`Rsc_NWL-Iau~%ijHLv$s2^M1K~(H9?50Z&biS0iVQi{t02Lv z_XbB2nOhUyCo~`XjVm3(Q5;S1ykWi%sKt%5=OfSO1U=T+Q?v%k5*yt$_(|HMuE;oY z2#DPUdp3P52pkl--rOBh>$FCYL|e0Y+;@R(2Op6XWf$TV5|`G9OxuD4UOq1&qtfL50%n?7l9E@y-4s$c`trWusi$F z6%W9VfLLSK=%Du)05H;6X@t+!?6@tNcjNiVot(YH;%TBdtvFsy*4`9F$`sX^BLD@FAcju>~V(lg_J_X z9Y!ipBqA2QY60R7=;b&T6-}nIM~a>kI-b;b%p2^sJ2zUz?CYNmWb;Q_?4D3>!ac$1Kah&z|3`T?f34F3#OYK)Uss|*i! z7YcoJ_@E?{c&Ijnpaf0JFdzy>mVZbwpl+2ovT{>5ZiKcoH?ZNG1_$|jw;;4lLmOB= z`DtC|PR>rDZWaI!EEZ{eAjX^OYF7kI5(zZyB=_<3 z^Q=|oV{F}Hb9mv+-SE$i^ry`NkJJ=-m~mIGN<;TsPsX)kpcHc&r9wxqYAVhQa~F<# z8|sI2d)RA&bHc4P$%M&07fB)LA(HHO3FrU#TWlJuwcxn9MTx{Ls%!1c` z%$B#uouupviGbnHxXE5%#C9GoJRC8Y;40Z;ensCG4?Y2@_KaPO_Pzgxw4smc2NE1J zCx3VuxKc^^Bn{O(+g#2DBmRo1p5>PtJSQL;Pj7yD!*6*V!AFKhR$W`jPbp}6>Y*lOn8*kZoOl>131h!bA6=;6Exgi!2-=LWJ;yzz*-A!5 z*xl?!J3#1Z^Jn+vxtEGB>#e>8C+*r()f38pf+sW#3q*7Od9B!gKG%Pa|FCilDF0Uj ze_hD(AK-7tPY_Q0X<5s!z+apDe?SKzN$xML{$Ig=?K=Jeg@Uq0{vG`P=|TQ#=hx2D zAC@Xn|KCgetq=99m0ycve^_}7$!;J+`L$H`tAStB`#%g!L-Hqxfj^S{U!lLIQhz}6 z$o>WWHL?2D!moM49~KZOez)+Ktl?MmU-kD7EELoc1pPNP{uTb$0QzS*nEFrfe}_|` WJUqlVep@kx0@Vf){8x0po&6sdNkh8; literal 0 HcmV?d00001 diff --git a/test/test_cases/tc0004ExpResultsCUPerCell.txt b/test/test_cases/tc0004ExpResultsCUPerCell.txt new file mode 100644 index 0000000..46e42f3 --- /dev/null +++ b/test/test_cases/tc0004ExpResultsCUPerCell.txt @@ -0,0 +1,7 @@ +barcode CU UMIs +TTTTTTTTTTTTTTTT 1.28571 7 +AAACCTGAGAAGCCCA 3.27273 11 +CCACCTGAGAAGCCCA 1.66667 3 +TTACCTGAGAAGCCCA 1.5 2 +TTTTCTGAGAAGCCCA 2 1 +AATTTTTTTTTTTTTT 3.625 8 \ No newline at end of file From b0654148c641be600df35ca673fd84dfba3535a4 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Tue, 4 May 2021 21:18:45 +0200 Subject: [PATCH 8/9] fix: Fixed a bug related to CU per cell; had forgotten to extend the UMI/counts vectors per cell for each encountered cell. --- src/bustools_count.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index 0d9989a..11a7dee 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -180,7 +180,9 @@ void bustools_count(Bustools_opt &opt) { } column_vp.resize(0); n_rows+= 1; - + cellUMIs.push_back(0); + cellCounts.push_back(0); + barcodes.push_back(v[0].barcode); double val = 0.0; size_t n = v.size(); From 3c45325ff6fba48245e2be5be12eceb976a199ab Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Sun, 30 May 2021 09:43:57 +0200 Subject: [PATCH 9/9] fix: some minor things --- src/bustools_main.cpp | 2 +- test/test_cases/tc0004CUPerCellCalc.xlsx | Bin 10494 -> 10524 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index 12fdee0..48511a7 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -1744,7 +1744,7 @@ bool check_ProgramOptions_clusterhist(Bustools_opt& opt) { } if (opt.cluster_input_file.size() == 0) { - std::cerr << "Error: missing transcript name file" << std::endl; + std::cerr << "Error: missing cluster file" << std::endl; ret = false; } else { diff --git a/test/test_cases/tc0004CUPerCellCalc.xlsx b/test/test_cases/tc0004CUPerCellCalc.xlsx index ab1cdd06aea4ca9eddbda3599ef88421e6700efc..dbefa42a7eae81595dac7d9a21eca24ff421ca11 100644 GIT binary patch delta 3348 zcmV+v4eRp$QJhk+*9Hkbk|X7P0{{SCli&sze{E0WI1v7R(*1`h-$UXwZeB!LHA$mG zT2=_U@TC$mj!P_(IOjOD2Ty2l)QbI+25Ci6WLa;NL!Lxv}|TSv)=x4_U9j`8zCR>g?O|8pzUXNrBs)=Zci)L ze~|t}bgTu-TF8bdSjq>t?<6B-Uoobd+Vwqe>NbS8_6{MD$6$!{np2hwIy9_RI}pig zqJVy1@va}y8hQ*|L*(PoIaD+qK-}}1t7ijhTMbRlA6g;FeGP7J5w^AxepC2I9t{3? z6PP|+)o?0BFV^Y=pxqtm5B?+1Md)1!f3_cEa2!(LO7_Hc4iBv4^pmWn2dwEU81jA_ z4S}?bRuTx_C(-bMn{S_;u6fOFcG9&-*L@?6&Qxt%HR;u&YIm26FhS}j4yBRo{bbQ=3up9EpdYsF+s6w5@bK!-YAxQERX-7SweLnT!_Su?e6*Kr~ql1jJmv3(t_vn{43<o`w#}f3k@aghl4WY3w;UT7>B&O(PsGVt=1#*4TAbM*YvFxq}-hjqKV;cU~t%0(}&?JaLhu>m#35nTQ} zE`tbM+!)&}FI5ExFbWTQe=+`00bvw`kYe3ZG_jfeGLLZKW$n&T$gxoibOkV|NX0}>W3)i~WB52@xHZis>ndm#@d5KI8$JZ#SDQI6$=3tCG zKejC@S~^Gx8qdI@a|Ur>TlKY9W(Fm4V2lE99(r-$c#8=J#x}77Me=_%Oj8ia_zlv;e=8!@QWE+IU#=^*PC@YyTH_<|eM>gQQR+>AVo;#M2@ohN zO{al_^J0QSlxGVJ;T_59h^#-^t|HgG&4?PHlyw!Gb)v^&zkAqR?>6?SdsmIgRjwV` zzx{6qufhA89eKF8Ir3ok?P7UhLitMvToYhD8d|>k{gl~JMm+@yyeB$a-XKv06d(^rL(Vps`K1+%VeJ(<8qjv(-UZR zd2x&@_*y)><+(`XZ9V9CaJ|uJ>>lInLl{^Iei3&*0)IJ7m?j+Nm=*Ci9Qar@V-t4iNV{3*uz(Ucf&O{`<{k zYPoi$W0n+ud71C3A$WE7hhiM%>Pbj z>GPn9pFXFDG^tX5xCnCwRet_0-Bs)KaDc%+NU&m` z^870f{Rl=QxJX$K!HdL6mA$6id=p^R|5INVKf^`c%T?vi4T4Qw<-Zq$ZMutJ4%Hv| z+plT%ysrRC2(e@F%(vg7v`nCC;nG7dM6XGH02lCcaLjP|LDj|Y_3yWATkV&QHyTdE zFc?pNC*bBOEvrq2KAl1GQdar#Z*IkeFYX0quJ!`Ig%=jwJa{Yy!|w)yH=MB1%tr+J zRb3uJot-GI;?<(a-v&_1fU!Ks*cjM+VoL$UjpX(XGveZZQVT9vBDdoLV=FFM=M!AM5f_(|+-^|1 zTYdiW>8FUzqsh+;cZbt6ySP-+3Ke!cd@#1cmvsSw9~j|tDTU9a6V=0`?44OcfjYlE z6fm}iBI`mDN@xs)OKB)5-K{phyMct%lfZ`FG@P*D@^fmQ6U%_HAyTA1Hfu$tvw+`wc4(BhDPl}w{8&IF{rMc~mGhY%Alj|u==Gh@AY1+DFs zBP;coIE#=z7lB7(Oq^XdjJX>I)mC>kf9Oq=zm^7#+nFA#LGJn5JSl`gX&f$OTn$vL zR#7kh!0n+u_N@khs=(hr$LWOb0ANs8 z-N}4$9bFe+c(PKv>*KsYX&uINO{`28*THqs1)M3g6RUyatU$>E#&m&`$>KV=F0$|x zvY_`6pfy$<7q`=`uGn5&;dOC^qNJ3y2Cb248@ag~d%eg4t*vCCD0O&$f)psz!l>^A-C+J@I3MMjR0EP;VEQ6lLtd< zc$8^{+_EzKUTlHZc6bU~kUiEKh9W##c|vSE>gu(B_m!4QPX|>kz0m?K!qbvRkBpSI z;`iFXqxB_uo(50RYVde}^Cd}*z@=gPvZ=-5e%24*hIZQA(6&uo4W!pr-UZ+uk+nRuVGod zL9a%N)@@MqR6@~zY855UOz15H3^G;}ikr1Do=2hIFoI`$Ow+5m21@(XB2l#TQH|)K zS|sM)3Rl7aEsrtk$FVm*OhoIB{L5PH0h8#F8fq>lrG}4&`O+A}(tz3#r81 z)e8Rdz&U?<)RecCKwF? z-7e3o<_o?EzHxb7Y4twt4HMeHi_r}jSAq>ZNm(YE3~ilj7{*p?q2@O7 z4&}wQHZ^yD{%AJ1;o^&L0LO&Pa1D-2?l|s%VKWlZ5h=xpY<8yV0a9S7pa$hErV+)z zUI(VO7YBM};;c(|Xndz*+$md%K^)zCTaL!IZKF2nWBB|Of2^Kg`>J87x?KrCc`5}H z2DsBEf6`m7g2$cL*4Ap3;)&QTvf|dQ{La@U$h*Tu1^>Cn^ZPo(BOn7GkPUTC%6oGE zP%Kd%lbD9&HY7I%oih@%Wd7Mj_%q)H9xm!*kbe*pPA7rs z-BB$o1($rSP5|28k$&eta%_a&gkbwVe+I`PVO!8grgL~;CHxPv@()=43m9^K8x4W9 zMyOIAbtOZv^rCbX*5Ktykd%0b2ES`-_SP;V(62BHDGmT z$PG;U>>%eWVN`TYhDI$w$P+wNhVMtNr+0!d&KpHVM-)wYr$C2$zkXM9BX(MIe^9t_ zP5&L3pio$#FYpywQpRK@uZXIRK{RtSjhA;&gv+w;mmS?CUvCziH*&ceq#6ebyk6?& zTS+99%U6me@5|#CG)qby2pb}?W4jyw7!@Eb;z^fRN*FM{$UZ}EZpb5K3iMFz3;6<4 z5q^8dxf9|d#8&R7frZ^7wZdqYe^_3SpfHHiJVW?9cp?0_qtyMJEg5K)33%RAUbkWWX3<=R9yCY&rSN1ICeSg?Q##DbC#73-T;aJe@K9jfnrm zO*vLVJbipL@tO#=5Tty7e^1smT_+%Q?XMOn@O@1-@xsKJ0YzM(!onzWtR(TVap z3s9Ek9)x!yt0S`h;5`-ELWjTkKa4i|gIOK6P)(Fuu;TBm1}iZ{rPk z-_RpB7q>@lOut+%F2_**)PY@qbvw$!p3-vm9{>OV|NjF3P)h>@le-KGvw;dk0tq@d zAmwcb007~W&I}=cOOvBE5Wc4>|G~v6IgGIc_*J&IcFZu@!&arT+2qV%Jf_w*4ls|c zRQ`KfNH$VS!q)C%#_n!4U$?TX$KwlJ{^t5{vj=n9``V>Zt+oWw?&r6W%hE&j}3QH<@>Y&{PnIl z)n{*q>=t*2wEFgPy35ML30yoC`=a^YI`syJZ2sx_SXSxN9^!r{VVVuzEBHs@e{jA` zE7z`cD6*=5tjldP1kXOltcO4IXMUPpJn8Y@xYNC@F5^~GZNk3O~Ubv&PHVRkxc*dR*OvmGt8nI zRQY!4eI)b8DDeE%qAj4miu|pXJ`bAo=}W%PvnGdsix6+X>hq~AzcIs4AR2+ox;0=f z)2u08^L4)8FTEt3L5=^@UWI*zOZb;-$e+*GOWKT1^gTw3bxpw#?p5Vw%9bgrAJ4@X%vOy=>*(7<#m%3 z%%?Yh$X@EEJp9eAi15X|z&y2H;P;!*hc^!%i^1?)45oBAA){Hq5a?I+c?fNG;iDZK;Z($`joOZK;{!$3Lti7`w@&GtQscdbv0V}uaM(R=A{0tw z424T^C``ItC4rwnLfT0nLpqHnB)t5bnr8)8@$I32u{9J~H%6h1jiGR9w=k4yfrfB@ zJ1#J`;*xb}uGbSIE-tm;Vr~ECrUk|qTwVU#hITLOx-x5Z>sgBnO>&H-O1=ne!W_CO zSX+@XR~IEg&2|l1w#ZRGi-p+Ix`~2v2quEIImenX%wN~x^${3z)iq)*d4jDF z<;E~dUG3`DTV}DV@Ms^4DN2I(iJ&z!omdmA=wueVj;@czsKR0zI!;%ZV_{Iwx}Y*Q zT}Ri)P0Up{wRmF+T2s#xV%trBmCQ2N(e<$mRaizt$LR#`0ANsG-N}4$9bF$^P+6(n z^>JRnv<_qXCRV14>)`t60?rh=iPgYyR={KdWBS0!WN{r_A6Za^Ebu)9XpL3J#octP zE7plCXdhQ7N=#X6&>ETcEH`&!>O>aI+DaCRQimr<0W&R(>RS~~WI_9X$U;#Pd>;l{ zfs+SATX>jhh1{|-0w=a$)^>Ob zTcADG8iq1>xblS9cGTsxe-D(Fi%$ntExyqLErX{e4<8vRZ6$Epz%%Ph@~8$+(Q5E` z^F>LG!KGpQlC)xRKO2O9a6>yi-_UkVT@A!(s~^Y>S`S3gVse483^6+Ch=o`?S?ulV zfz#m-v+ha1qSa7~wvHw(2A{_KGch}di~NKgA&8T>vHGX0HFYjzGPrsk9>`>%&xNAZ z&Ui)F*Q%}=EfPhmMIv_WxPHK=seXjm!YHBB;S;m&nE*wL_s}r5g@;cd&vHBg%pA%-9KrcB zj1gsT={ccOZ)etB7>GsgRjn3=XyW)j3Pu^W8;M!`s9wXectfW~%B;Jf@Tr8N)hbGy znc!Op7-Xy{CO)r!jqyB=f-@sf+hZE1<{B#PQ;S5=;zu>2hiZ|Sdn;Ut1!#GU5jr(b zX5Ev2S*tx@5=*P&?ujb4+cE!)eRS9zyrTA!{)Qz|gi{v^coj zvTKZ-_T0?c4o}hIyFE}E;R$(%TA|R$Y0u59d*I1hyyu30p=NkoZkgQ3Y0u59d*CTr z`~V5Yn(^VyxSBqvGa7iYq7R?)kQqNjg0W_NT#lP8f3wYQ;{0s~uM*|(Ho<5J=(c6qoWHQwH!d$Ht^Nl90RR63 z08mQ<1QY<1y$lGmd=!}s1>6d>EJw2%AtnKTUrWO<6vf{KzeCA;o2GLzLfe7L1aTlZ zQG5%@-BxV=kYslI_NMJtE8ByQ$vO9zb1uo}ds#={U|pqEfk={~2o%?%Qkw$3trqbN zMb0xNSgjQl=l~AQ%j>J08_KnX7po0eUxAAPNjb_*fwtZoim~Hckjy2)p@O*9Rx%%d zj&_3$<6mq8I7`!8EWtCuJi{$8HUkkIk%A9Mv$J&v5FA4d5|nqCBn1C@Jy_{34s^=M zDJy?4q0=$$m@RlO4(`3H24mASNi*qUg!}}5ET3Pys&S>-U2#BVE;!{DnAf&^(p#pY z$6e6Y&S{n7k=QP>X3j6e&es))yTb)R|G6jdyE^?NAfgbE_H|Cm`{e#%wLoQ-W+YCh zaeA{NGn!^}GW%>J{F!e9_ZRXQ