diff --git a/.gitignore b/.gitignore index 3751d5ea..8b72265c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ pll/libpll.a pll_test/ workspace.code-workspace .vscode +build/** diff --git a/CMakeLists.txt b/CMakeLists.txt index fce4a26b..16d49c50 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -516,3 +516,5 @@ set(CPACK_STRIP_FILES TRUE) include (CPack) add_custom_target(dist COMMAND ${CMAKE_MAKE_PROGRAM} package_source) + +target_link_libraries(mpboot pthread) diff --git a/README.md b/README.md index d8c5336a..6a825dfc 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,33 @@ MPBoot: Fast phylogenetic maximum parsimony tree inference and bootstrap approxi * Run **make** * You will find the executable named **mpboot** once the **make** command is done. +## **PLACEMENT CORE** +### **Parameter** +* **-pp_on**: enable placement. +* **-pp_n**: number of available samples on tree. +* **-pp_k**: number of missing samples. +* **-pp_tree**: tree file. +* **-pp_orig_spr**: run spr without changing origin tree. +* **-pp_test_spr**: enable check correct tree. +* **-pp_origin**: origin tree file. +* **-pp_zip_aln**: alignment's zip file. +* **-pp_zip_tree**: tree's zip file. + +### **Command** +* Add missing samples to existing tree: +
+ ``./mpboot -s -pp_tree -pp_on -pp_n -pp_k `` +
+ *Example:* +
+ ``./mpboot -s data/test5/1/added5.vcf -pp_tree data/test5/1/origin5.fasta.treefile -pp_on -pp_k 5 -pp_n 5`` +* Check if tree unchanged +
+ ``./mpboot -s -pp_tree -pp_origin -pp_on -pp_test_spr -pp_k -pp_orig_spr`` +
+ *Example:* +
+ ``./mpboot -s data/test5/1/added5.vcf -pp_tree addedTree.txt -pp_origin data/test5/1/origin5.fasta.treefile -pp_on -pp_test_spr -pp_k 5 -pp_orig_spr``



diff --git a/phylonode.h b/phylonode.h index ef77b22e..8051a788 100644 --- a/phylonode.h +++ b/phylonode.h @@ -13,6 +13,8 @@ #define PHYLONODE_H #include "node.h" +#include "queue" +#include typedef short int UBYTE; @@ -21,7 +23,8 @@ A neighbor in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNeighbor : public Neighbor { +class PhyloNeighbor : public Neighbor +{ friend class PhyloNode; friend class PhyloTree; friend class IQTree; @@ -37,7 +40,8 @@ class PhyloNeighbor : public Neighbor { @param alength length of branch */ - PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) { + PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) + { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; @@ -50,7 +54,8 @@ class PhyloNeighbor : public Neighbor { @param alength length of branch @param aid branch ID */ - PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) { + PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) + { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; @@ -60,14 +65,16 @@ class PhyloNeighbor : public Neighbor { /** tell that the partial likelihood vector is not computed */ - inline void clearPartialLh() { + inline void clearPartialLh() + { partial_lh_computed = 0; } /** * tell that the partial likelihood vector is computed */ - inline void unclearPartialLh() { + inline void unclearPartialLh() + { partial_lh_computed = 1; } @@ -78,7 +85,6 @@ class PhyloNeighbor : public Neighbor { void clearForwardPartialLh(Node *dad); private: - /** true if the partial likelihood was computed */ @@ -104,6 +110,7 @@ class PhyloNeighbor : public Neighbor { */ UINT *partial_pars; + std::mutex partial_lh_mutex; }; /** @@ -111,7 +118,8 @@ A node in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNode : public Node { +class PhyloNode : public Node +{ friend class PhyloTree; public: @@ -153,8 +161,6 @@ class PhyloNode : public Node { */ virtual void addNeighbor(Node *node, double length, int id = -1); - - /** tell that all partial likelihood vectors below this node are not computed */ @@ -166,11 +172,9 @@ class PhyloNode : public Node { void clearReversePartialLh(PhyloNode *dad); }; - /** Node vector */ -typedef vector PhyloNodeVector; - +typedef vector PhyloNodeVector; #endif diff --git a/phylotree.cpp b/phylotree.cpp index 3eb44929..4f959fd1 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -935,6 +935,200 @@ void PhyloTree::computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *da delete[] bits_entry; } +void PhyloTree::computePartialParsimonyMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad, int startPtn, int endPtn) { + // don't recompute the parsimony + if (dad_branch->partial_lh_computed & 2) + return; + if (startPtn > endPtn) { + return; + } + Node *node = dad_branch->node; + //assert(node->degree() <= 3); + int ptn; + int nstates = aln->num_states; + int pars_size = getBitsBlockSize(); + int entry_size = getBitsEntrySize(); + int nptn = aln->size(); + int ptn_pars_start_id = pars_size - nptn - 1; + + assert(dad_branch->partial_pars); + + if (nstates == 4 && aln->seq_type == SEQ_DNA && (node->isLeaf() || node->degree() == 3)) { + // ULTRAFAST VERSION FOR DNA, assuming that UINT is 32-bit integer + if (node->isLeaf() && dad) { + // external node + for (ptn = startPtn; ptn <= endPtn; ptn+=8) { + UINT states = 0; + int maxi = endPtn - ptn + 1; + if(maxi > 8) maxi = 8; + for (int i = 0; i < maxi; i++) { + UINT bit_state = dna_state_map[(aln->at(ptn+i))[node->id]]; + states |= (bit_state << (i*4)); + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] = 0; + } + dad_branch->partial_pars[ptn/8] = states; + } + } else { + // internal node + memset(dad_branch->partial_pars + ptn_pars_start_id + startPtn, 0, (endPtn - startPtn + 1) * sizeof(int)); + UINT *left = NULL, *right = NULL; + FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) { + computePartialParsimonyMultiThreads((PhyloNeighbor*) (*it), (PhyloNode*) node, startPtn, endPtn); + if (!left) + left = ((PhyloNeighbor*) (*it))->partial_pars; + else + right = ((PhyloNeighbor*) (*it))->partial_pars; + for(int p = startPtn; p <= endPtn; p++) + dad_branch->partial_pars[ptn_pars_start_id + p] += ((PhyloNeighbor*) (*it))->partial_pars[ptn_pars_start_id + p]; + } + int pars_steps = 0; + for (ptn = startPtn; ptn <= endPtn; ptn+=8) { + UINT states_left = left[ptn/8]; + UINT states_right = right[ptn/8]; + UINT states_dad = 0; + int maxi = endPtn - ptn + 1; + if(maxi > 8) maxi = 8; + for (int i = 0; i < maxi; i++) { + UINT state_left = (states_left >> (i*4)) & 15; + UINT state_right = (states_right >> (i*4)) & 15; + UINT state_both = state_left | (state_right << 4); + states_dad |= dna_fitch_result[state_both] << (i*4); + pars_steps += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] += dna_fitch_step[state_both]; + } + dad_branch->partial_pars[ptn/8] = states_dad; + } + dad_branch->partial_lh_mutex.lock(); + dad_branch->partial_pars[pars_size - 1] += pars_steps; + dad_branch->partial_lh_mutex.unlock(); + } + return; + } // END OF DNA VERSION +} + +vector > PhyloTree::initializeComputeParsimonyMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad) +{ + PhyloNode *node = (PhyloNode*) dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); + vector > result; + queue > q; + int pars_size = getBitsBlockSize(); + + q.push(make_pair(dad_branch, dad)); + q.push(make_pair(node_branch, node)); + while(q.size()) { + PhyloNeighbor *cur_dad_branch = q.front().first; + PhyloNode *cur_dad = q.front().second; + q.pop(); + if (cur_dad_branch->partial_lh_computed & 2) + continue; + cur_dad_branch->partial_pars[pars_size - 1] = 0; + result.push_back(make_pair(cur_dad_branch, cur_dad)); + + PhyloNode *cur_node = (PhyloNode*) cur_dad_branch->node; + if (cur_node->isLeaf() && cur_dad) + continue; + FOR_NEIGHBOR_IT(cur_node, cur_dad, it)if ((*it)->node->name != ROOT_NAME) { + q.push(make_pair((PhyloNeighbor*) (*it), cur_node)); + } + } + return result; +} + +void PhyloTree::finalizeComputeParsimonyMultiThreads(vector > topo_sorted_branches) { + int pars_size = getBitsBlockSize(); + for (int i = (int)topo_sorted_branches.size() - 1; i >= 0; i--) { + PhyloNeighbor *dad_branch = topo_sorted_branches[i].first; + PhyloNode *dad = topo_sorted_branches[i].second; + dad_branch->partial_lh_computed |= 2; + + PhyloNode *node = (PhyloNode*) dad_branch->node; + FOR_NEIGHBOR_IT(node, dad, it) if ((*it)->node->name != ROOT_NAME) { + dad_branch->partial_pars[pars_size - 1] += ((PhyloNeighbor*)(*it))->partial_pars[pars_size - 1]; + } + } +} + +int PhyloTree::computeParsimonyBranchMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { + PhyloNode *node = (PhyloNode*) dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); + assert(node_branch); + if (!central_partial_pars) + initializeAllPartialPars(); + // swap node and dad if dad is a leaf + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + //cout << "swapped\n"; + } + + int nptn = aln->size(); + if(!_pattern_pars) _pattern_pars = aligned_alloc(nptn+VCSIZE_USHORT); + memset(_pattern_pars, 0, sizeof(BootValTypePars) * (nptn+VCSIZE_USHORT)); + + vector > topo_sorted_branches = initializeComputeParsimonyMultiThreads(dad_branch, dad); + vector threads; + int ptnPerThread = (aln->size() + params->pp_thread - 1) / params->pp_thread; + if (ptnPerThread % 8 != 0) ptnPerThread += 8 - (ptnPerThread % 8); + // int ptnPerThread = aln->size(); + for (int i = 0; i < params->pp_thread; i++) { + int startPtn = i * ptnPerThread; + int endPtn = min((int)aln->size(), (i + 1) * ptnPerThread) - 1; + if (startPtn > endPtn) continue; + if ((dad_branch->partial_lh_computed & 2) == 0) { + threads.push_back(thread(&PhyloTree::computePartialParsimonyMultiThreads, this, dad_branch, dad, startPtn, endPtn)); + } + if ((node_branch->partial_lh_computed & 2) == 0) { + threads.push_back(thread(&PhyloTree::computePartialParsimonyMultiThreads, this, node_branch, node, startPtn, endPtn)); + } + } + for (int i = 0; i < (int)threads.size(); ++i) { + threads[i].join(); + } + finalizeComputeParsimonyMultiThreads(topo_sorted_branches); + // if ((dad_branch->partial_lh_computed & 2) == 0) + // computePartialParsimony(dad_branch, dad); + // if ((node_branch->partial_lh_computed & 2) == 0) + // computePartialParsimony(node_branch, node); + // // now combine likelihood at the branch + + int pars_size = getBitsBlockSize(); + int entry_size = getBitsEntrySize(); + int ptn_pars_start_id = pars_size - nptn - 1; + //int nstates = aln->num_states; + int i, ptn; + int tree_pars = 0; + + if (aln->num_states == 4 && aln->seq_type == SEQ_DNA) { + // ULTRAFAST VERSION FOR DNA + for (ptn = 0; ptn < aln->size(); ptn+=8) { + UINT states_left = node_branch->partial_pars[ptn/8]; + UINT states_right = dad_branch->partial_pars[ptn/8]; + UINT states_dad = 0; + int maxi = aln->size() - ptn; + if(maxi > 8) maxi = 8; + for (i = 0; i< maxi; i++) { + UINT state_left = (states_left >> (i*4)) & 15; + UINT state_right = (states_right >> (i*4)) & 15; + UINT state_both = state_left | (state_right << 4); + states_dad |= dna_fitch_result[state_both] << (i*4); + tree_pars += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; + _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; + } + } + } + if (branch_subst) + *branch_subst = tree_pars; + + tree_pars += node_branch->partial_pars[pars_size - 1] + dad_branch->partial_pars[pars_size - 1]; + return tree_pars; +} + int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { PhyloNode *node = (PhyloNode*) dad_branch->node; PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); @@ -1057,7 +1251,7 @@ int PhyloTree::computeParsimony() { int nptn = aln->size(); if(_pattern_pars == NULL) _pattern_pars = aligned_alloc(nptn + VCSIZE_USHORT); - return computeParsimonyBranch((PhyloNeighbor*) root->neighbors[0], (PhyloNode*) root); + return computeParsimonyBranchMultiThreads((PhyloNeighbor*) root->neighbors[0], (PhyloNode*) root); } void PhyloTree::printParsimonyStates(PhyloNeighbor *dad_branch, PhyloNode *dad) { diff --git a/phylotree.h b/phylotree.h index 58ec5cb0..4a2cca4c 100644 --- a/phylotree.h +++ b/phylotree.h @@ -31,6 +31,7 @@ #include "optimization.h" #include "model/rateheterogeneity.h" #include "phyloanalysis.h" +#include "thread" const double MIN_BRANCH_LEN = 0.000001; // NEVER TOUCH THIS CONSTANT AGAIN PLEASE! const double MAX_BRANCH_LEN = 100.0; @@ -438,6 +439,14 @@ class PhyloTree : public MTree, public Optimization { */ virtual void computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad); + /** + compute partial parsimony score of the subtree rooted at dad from startPtn to endPtn + @param dad_branch the branch leading to the subtree + @param dad its dad, used to direct the tranversal + @param startPtn start pattern + @param endPtn end pattern + */ + void computePartialParsimonyMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad, int startPtn, int endPtn); /** compute tree parsimony score on a branch @@ -448,6 +457,29 @@ class PhyloTree : public MTree, public Optimization { */ virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + /** + compute tree parsimony score on a branch + @param dad_branch the branch leading to the subtree + @param dad its dad, used to direct the tranversal + @param branch_subst (OUT) if not NULL, the number of substitutions on this branch + @return parsimony score of the tree + */ + int computeParsimonyBranchMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + initialize information for computing parsimony score with multi-threads + @param dad_branch the branch leading to the subtree + @param dad its dad, used to direct the tranversal + @return a vector of pairs of (branch, node) + */ + vector > initializeComputeParsimonyMultiThreads(PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + finalize compute parsimony score with multi-threads + @param topo_sorted_branches a vector of pairs of (branch, node) + */ + void finalizeComputeParsimonyMultiThreads(vector > topo_sorted_branches); + void printParsimonyStates(PhyloNeighbor *dad_branch = NULL, PhyloNode *dad = NULL); diff --git a/tools.cpp b/tools.cpp index d58577a6..c8c8bbd9 100644 --- a/tools.cpp +++ b/tools.cpp @@ -954,6 +954,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { //params.run_mode = BOTH_ALG; continue; } + if (strcmp(argv[cnt], "-pp_thread") == 0) { + cnt++; + if (cnt >= argc) + throw "Invalid thread number"; + params.pp_thread = convert_int(argv[cnt]); + continue; + } if (strcmp(argv[cnt], "-e") == 0) { cnt++; if (cnt >= argc) diff --git a/tools.h b/tools.h index 315db2dd..2926bcf2 100644 --- a/tools.h +++ b/tools.h @@ -411,6 +411,10 @@ extern int NNI_MAX_NR_STEP; program parameters, everything is specified here */ struct Params { + /** + * Number of threads + */ + int pp_thread = 1; /** * Number of starting parsimony trees