From ba3fe97595fb1cf10eeacbcf2d474548e43aac8f Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Mon, 2 Jun 2025 15:21:30 +0700 Subject: [PATCH 01/23] remove build from git --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3751d5ea..ae1bbbdd 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ pll/libpll.a pll_test/ workspace.code-workspace .vscode +build/ From 8cf93ab4e6137cc8777067180011dffd7e8bc159 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Mon, 2 Jun 2025 16:50:00 +0700 Subject: [PATCH 02/23] feat: placement --- CMakeLists.txt | 2 + alignment.cpp | 471 ++++++++++++++++- alignment.h | 75 ++- iqtree.cpp | 150 ++++++ iqtree.h | 39 ++ mutation.cpp | 66 +++ mutation.h | 54 ++ pda.cpp | 5 +- phylonode.cpp | 60 ++- phylonode.h | 98 +++- phylotree.cpp | 1312 +++++++++++++++++++++++++++++++++++++++++++++++- phylotree.h | 115 +++++ placement.cpp | 204 ++++++++ placement.h | 19 + timeutil.h | 11 +- tools.cpp | 99 +++- tools.h | 31 +- 17 files changed, 2742 insertions(+), 69 deletions(-) create mode 100644 mutation.cpp create mode 100644 mutation.h create mode 100644 placement.cpp create mode 100644 placement.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ebb6506f..fdab61cc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -400,6 +400,8 @@ checkpoint.cpp parstree.cpp sprparsimony.cpp test.cpp +mutation.cpp +placement.cpp ) ################################################################## diff --git a/alignment.cpp b/alignment.cpp index 8b1d32bb..229529f7 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) { } } -Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : vector() { +Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int numStartRow) : vector() { num_states = 0; frac_const_sites = 0.0; codon_table = NULL; @@ -420,6 +420,9 @@ Alignment::Alignment(char *filename, char *sequence_type, 
InputType &intype) : v } else if (intype == IN_PHYLIP) { cout << "Phylip format detected" << endl; readPhylip(filename, sequence_type); + } else if (intype == IN_VCF) { + cout << "VCF format detected" << endl; + readVCF(filename, sequence_type, numStartRow); } else { outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format"); } @@ -992,6 +995,56 @@ char Alignment::convertStateBack(char state) { } } +char Alignment::getMutationFromState(char state) +{ + int value = convertState(state, SEQ_DNA); + switch (value) + { + case 0: + return 1; + case 1: + return 2; + case 2: + return 4; + case 3: + return 8; + case 1 + 4 + 3: + return 1 + 4; + case 2 + 8 + 3: + return 2 + 8; + case 1 + 8 + 3: + return 1 + 8; + case 2 + 4 + 3: + return 2 + 4; + case 1 + 2 + 3: + return 1 + 2; + case 4 + 8 + 3: + return 4 + 8; + case 2 + 4 + 8 + 3: + return 2 + 4 + 8; + case 1 + 2 + 8 + 3: + return 1 + 2 + 8; + case 1 + 4 + 8 + 3: + return 1 + 4 + 8; + case 1 + 2 + 4 + 3: + return 1 + 2 + 4; + + default: + return 15; + break; + } +} + +int Alignment::getStateFromMutation(int nuc) +{ + int value; + if ((nuc & (nuc - 1)) == 0) + value = log2(nuc); + else + value = nuc + 3; + return convertStateBack(value); +} + string Alignment::convertStateBackStr(char state) { string str; if (seq_type != SEQ_CODON) { @@ -1235,6 +1288,422 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, return 1; } +void split(const string &s, vector &elems, const string &delim) +{ + elems.clear(); + size_t pos = 0; + size_t len = s.length(); + size_t delim_len = delim.length(); + if (delim_len == 0) + { + elems.push_back(s); + return; + } + while (pos < len) + { + size_t find_pos = s.find(delim, pos); + if (find_pos == string::npos) + { + elems.push_back(s.substr(pos)); + return; + } + elems.push_back(s.substr(pos, find_pos - pos)); + pos = find_pos + delim_len; + } +} + +// Find the permutation of columns after rotation +vector Alignment::findRotatedColumnPermutation() 
+{ + assert(getNSite() == (int)initialColumnState.size()); + vector perm(getNSite(), 0); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + map> patternMap; + // Build pattern map + for (int i = 0; i < getNSite(); ++i) + { + Pattern ptn = getPattern(i); + patternMap[ptn].push_back(i); + } + for (int col = 0; col < getNSite(); ++col) + { + // For each column, build a pattern + // Find initial index of the pattern + Pattern nptn; + for (int i = 0; i < initialColumnState[col].length(); ++i) + { + nptn += char_to_state[(int)initialColumnState[col][i]]; + } + perm[patternMap[nptn].back()] = col; + patternMap[nptn].pop_back(); + } + return perm; +} + +void Alignment::addToAlignmentNewSequence(const string &newName, const string &newSeq, const vector &permCol) +{ + assert(newSeq.size() == getNSite()); + + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + vector newVectorPattern; + vector newSitePattern; + PatternIntMap newPatternIdx; + for (int i = 0; i < getNSite(); ++i) + { + Pattern newPat = getPattern(i); + newPat.push_back(char_to_state[(int)newSeq[permCol[i]]]); + PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); + if (pat_it == newPatternIdx.end()) + { // not found + newPat.frequency = 1; + newPat.computeConst(STATE_UNKNOWN); + newVectorPattern.push_back(newPat); + newPatternIdx[newPat] = newVectorPattern.size() - 1; + newSitePattern.push_back(newVectorPattern.size() - 1); + } + else + { + int index = pat_it->second; + newVectorPattern[index].frequency++; + newSitePattern.push_back(index); + } + } + clear(); + for (vector::iterator it = newVectorPattern.begin(); it != newVectorPattern.end(); ++it) + { + push_back(*it); + } + pattern_index = newPatternIdx; + site_pattern = newSitePattern; + seq_names.push_back(newName); + buildSeqStates(); + // checkSeqName(); + countConstSite(); +} + +void Alignment::addToAlignmentNewSequences(const vector &newName, const 
vector &newSeq, const vector &permCol) +{ + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + vector newVectorPattern; + vector newSitePattern; + PatternIntMap newPatternIdx; + int nSeqSize = newSeq.size(); + for (int i = 0; i < getNSite(); ++i) + { + Pattern newPat = getPattern(i); + for (int j = 0; j < nSeqSize; ++j) + { + newPat.push_back(char_to_state[(int)newSeq[j][permCol[i]]]); + } + PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); + if (pat_it == newPatternIdx.end()) + { // not found + newPat.frequency = 1; + newPat.computeConst(STATE_UNKNOWN); + newVectorPattern.push_back(newPat); + newPatternIdx[newPat] = newVectorPattern.size() - 1; + newSitePattern.push_back(newVectorPattern.size() - 1); + } + else + { + int index = pat_it->second; + newVectorPattern[index].frequency++; + newSitePattern.push_back(index); + } + } + clear(); + for (vector::iterator it = newVectorPattern.begin(); it != newVectorPattern.end(); ++it) + { + push_back(*it); + } + pattern_index = newPatternIdx; + site_pattern = newSitePattern; + seq_names.insert(seq_names.end(), newName.begin(), newName.end()); + buildSeqStates(); + // checkSeqName(); + countConstSite(); +} + +void Alignment::updateAlignmentNewSequences(const vector &newSeq, const vector &permCol) +{ + computeUnknownState(); + char char_to_state[NUM_CHAR]; + buildStateMap(char_to_state, seq_type); + + vector newVectorPattern; + vector newSitePattern; + PatternIntMap newPatternIdx; + int nseq = newSeq.size(); + int nsite = getNSite(); + + for (int site = 0; site < nsite; ++site) + { + Pattern newPat; + for (int seq = 0; seq < nseq; ++seq) + { + newPat.push_back(char_to_state[(int)newSeq[seq][permCol[site]]]); + } + PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); + if (pat_it == newPatternIdx.end()) + { + // If pattern not found, add new pattern + newPat.frequency = 1; + newPat.computeConst(STATE_UNKNOWN); + newVectorPattern.push_back(newPat); + 
newPatternIdx[newPat] = newVectorPattern.size() - 1; + newSitePattern.push_back(newVectorPattern.size() - 1); + } + else + { + // If pattern found, increment frequency + int index = pat_it->second; + newVectorPattern[index].frequency++; + newSitePattern.push_back(index); + } + } + clear(); + for (vector::iterator itr = newVectorPattern.begin(); itr != newVectorPattern.end(); ++itr) + { + push_back(*itr); + } + pattern_index = newPatternIdx; + site_pattern = newSitePattern; + buildSeqStates(); + countConstSite(); +} + +// Read partial VCF file and update alignment +int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &permCol, int numStartRow, int startIndex, int numColumn) +{ + if (in.eof()) + { + return 0; + } + StrVector sequences; + int nseq = getNSeq(); + int nsite = 0; + int seq_id = 0; + string line; + int numProcessedColumn = 0; + + sequences.resize(nseq, ""); + existingSampleMutations.assign(nseq, vector()); + + for (; !in.eof() && numProcessedColumn < numColumn;) + { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words.size() != 9 + nseq + missingSampleMutations.size()) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation cur_mut; + int variant_pos = std::stoi(words[1]); + cur_mut.position = variant_pos; + cur_mut.compressed_position = numProcessedColumn + startIndex; + while ((int)reference_nuc.size() <= cur_mut.position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + cur_mut.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[cur_mut.position] == 0) + reference_nuc[cur_mut.position] = cur_mut.ref_nuc; + for (int i = 9; i < words.size(); ++i) + { + cur_mut.is_missing = false; + if (isdigit(words[i][0])) + { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) + { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < numStartRow) + { + sequences[i - 
9].push_back(allele[0]); + } + cur_mut.mut_nuc = getMutationFromState(allele[0]); + } + else + { + if (i - 9 < numStartRow) + { + sequences[i - 9].push_back(words[3][0]); + } + cur_mut.mut_nuc = getMutationFromState(words[3][0]); + } + } + else + { + if (i - 9 < numStartRow) + { + sequences[i - 9].push_back('-'); + } + cur_mut.mut_nuc = getMutationFromState('N'); + cur_mut.is_missing = true; + } + if (i - 9 >= numStartRow) + { + if (cur_mut.mut_nuc != cur_mut.ref_nuc) + { + cur_mut.par_nuc = cur_mut.ref_nuc; + missingSampleMutations[i - 9 - numStartRow].push_back(cur_mut); + } + } + else + { + existingSampleMutations[i - 9].push_back(cur_mut); + } + } + ++nsite; + ++numProcessedColumn; + } + + // If not enough columns, rebuild pattern and return + if (numProcessedColumn < numColumn) + { + buildPattern(sequences, sequence_type, nseq, nsite); + initialColumnState.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) + { + for (int site = 0; site < nsite; ++site) + initialColumnState[site] += sequences[seq][site]; + } + permCol = findRotatedColumnPermutation(); + return numProcessedColumn; + } + + // Update alignment with new sequences + updateAlignmentNewSequences(sequences, permCol); + return numProcessedColumn; +} + +int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) +{ + StrVector sequences; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(filename); + int nseq = 0; + int nsite = 0; + int seq_id = 0; + string line; + in.exceptions(ios::badbit); + int processedColumn = 0; + + for (; !in.eof();) + { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words[1] == "POS") + { + // Sample names start from the 10th word in the header + for (int i = 9; i < words.size(); i++) + { + if (i - 9 >= numStartRow) + { + missingSampleNames.push_back(words[i]); + } + else + { + seq_names.push_back(words[i]); + nseq++; + } + } + 
sequences.resize(nseq, ""); + missingSampleMutations.resize(missingSampleNames.size()); + existingSampleMutations.resize(nseq); + } + else + { + if (words.size() != 9 + nseq + missingSampleMutations.size()) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation cur_mut; + int variant_pos = std::stoi(words[1]); + cur_mut.position = variant_pos; + cur_mut.compressed_position = processedColumn; + while ((int)reference_nuc.size() <= cur_mut.position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + cur_mut.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[cur_mut.position] == 0) + reference_nuc[cur_mut.position] = cur_mut.ref_nuc; + for (int i = 9; i < words.size(); ++i) + { + cur_mut.is_missing = false; + if (isdigit(words[i][0])) + { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) + { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < numStartRow) + { + sequences[i - 9].push_back(allele[0]); + } + cur_mut.mut_nuc = getMutationFromState(allele[0]); + } + else + { + if (i - 9 < numStartRow) + { + sequences[i - 9].push_back(words[3][0]); + } + cur_mut.mut_nuc = getMutationFromState(words[3][0]); + } + } + else + { + if (i - 9 < numStartRow) + { + sequences[i - 9].push_back('-'); + } + cur_mut.mut_nuc = getMutationFromState('N'); + cur_mut.is_missing = true; + } + if (i - 9 >= numStartRow) + { + if (cur_mut.mut_nuc != cur_mut.ref_nuc) + { + cur_mut.par_nuc = cur_mut.ref_nuc; + missingSampleMutations[i - 9 - numStartRow].push_back(cur_mut); + } + } + else + { + existingSampleMutations[i - 9].push_back(cur_mut); + } + } + ++nsite; + ++processedColumn; + } + } + initialColumnState.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) + { + for (int site = 0; site < nsite; ++site) + initialColumnState[site] += sequences[seq][site]; + } + in.clear(); + in.exceptions(ios::failbit | ios::badbit); + in.close(); + return buildPattern(sequences, sequence_type, nseq, nsite); +} + int 
Alignment::readPhylip(char *filename, char *sequence_type) { StrVector sequences; diff --git a/alignment.h b/alignment.h index 74f43ad7..933fcfdc 100644 --- a/alignment.h +++ b/alignment.h @@ -16,6 +16,7 @@ #include "pattern.h" #include "ncl/ncl.h" #include "tools.h" +#include "mutation.h" // IMPORTANT: refactor STATE_UNKNOWN //const char STATE_UNKNOWN = 126; @@ -58,7 +59,7 @@ class Alignment : public vector { @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL @param intype (OUT) input format of the file */ - Alignment(char *filename, char *sequence_type, InputType &intype); + Alignment(char *filename, char *sequence_type, InputType &intype, int numStartRow = INT_MAX); /** destructor @@ -609,6 +610,78 @@ class Alignment : public vector { int n_informative_patterns; int n_informative_sites; + /** + * Missing sample names + */ + vector missingSampleNames; + + /** + * Missing sample sequences + */ + vector missingSampleSequences; + + /** + * Initial column state + * Using for finding rotated column permutation + */ + vector initialColumnState; + + /** + * Missing sample mutations + */ + vector> missingSampleMutations; + + /** + * Existing sample mutations + */ + vector> existingSampleMutations; + + /** + * Reference nucleotides + */ + vector reference_nuc; + + /** + * Replace current alignment with new sequences + */ + void updateAlignmentNewSequences(const vector &newSeqs, const vector &permCol); + + /** + * Add a new sequence to the alignment + */ + void addToAlignmentNewSequence(const string &newName, const string &newSeq, const vector &permCol); + + /** + * Add new sequences to the alignment + */ + void addToAlignmentNewSequences(const vector &newNames, const vector &newSeqs, const vector &permCol); + + /** + * Get mutation from state + */ + char getMutationFromState(char state); + + /** + * Get state from mutation + */ + int getStateFromMutation(int nuc); + + /** + * Find rotated column permutation + */ + vector 
findRotatedColumnPermutation(); + + /** + * Read partial VCF file + * Using for reducing memory usage + */ + int readPartialVCF(ifstream &in, char *sequence_type, vector &permCol, int numStartRow, int startIndex, int numColumn); + + /** + * Read VCF file + */ + int readVCF(char *filename, char *sequence_type, int numStartRow); + protected: diff --git a/iqtree.cpp b/iqtree.cpp index bad3c8ff..f0e6241c 100644 --- a/iqtree.cpp +++ b/iqtree.cpp @@ -4538,3 +4538,153 @@ void IQTree::reinsertIdenticalSeqs(Alignment *orig_aln, StrVector &removed_seqs, deleteAllPartialLh(); clearAllPartialLH(); } + +void IQTree::getLeafName(vector &leafName) +{ + getLeafName(root, root->neighbors[0]->node, leafName); + getLeafName(root->neighbors[0]->node, root, leafName); +} + +void IQTree::getLeafName(Node *node, Node *dad, vector &leafName) +{ + if (node->isLeaf()) + { + leafName.push_back(node->name); + return; + } + FOR_NEIGHBOR_IT(node, dad, it) + { + getLeafName((*it)->node, node, leafName); + if (node->name == "") + { + node->name = (*it)->node->name; + } + else + { + node->name = min(node->name, (*it)->node->name); + } + } +} + +void IQTree::assignRoot(string &rootName) +{ + if (root->name == rootName) + return; + assignRoot(root->neighbors[0]->node, root, rootName); +} + +bool IQTree::assignRoot(Node *node, Node *dad, string &rootName) +{ + if (node->isLeaf() && node->name == rootName) + { + root = node; + return true; + } + FOR_NEIGHBOR_IT(node, dad, it) + { + if (assignRoot((*it)->node, node, rootName)) + { + return true; + } + } +} + +int IQTree::initInfoNode(vector &leafName) +{ + PhyloNode *node1 = (PhyloNode *)root; + PhyloNode *node2 = (PhyloNode *)root->neighbors[0]->node; + + int lf = initInfoNode(node1, node2, leafName); + int rg = initInfoNode(node2, node1, leafName); + return lf + rg; +} + +int IQTree::initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leafName) +{ + if (node->isLeaf()) + { + int k = lower_bound(leafName.begin(), leafName.end(), node->name) - 
leafName.begin(); + if (k < leafName.size() && leafName[k] == node->name) + { + node->setMissingNode(-1); + return 1; + } + else + { + node->setMissingNode(1); + return 0; + } + } + + int sum = 0; + bool check = true; + FOR_NEIGHBOR_IT(node, dad, it) + { + int tmp = initInfoNode((PhyloNode *)(*it)->node, node, leafName); + if (tmp == 0) + { + check = false; + } + else + { + if (node->name == "") + { + node->name = (*it)->node->name; + } + else + { + node->name = min(node->name, (*it)->node->name); + } + } + sum += tmp; + } + + if (check) + { + node->setMissingNode(-1); + } + else + { + node->setMissingNode(1); + } + return sum; +} + +bool IQTree::compareTree(IQTree *anotherTree) +{ + if (root->name != anotherTree->root->name) + return false; + return compareTree((PhyloNode *)root, NULL, anotherTree->root, NULL); +} + +bool IQTree::compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2) +{ + bool check = true; + FOR_NEIGHBOR_IT(node1, dad1, it1) + { + PhyloNode *child1 = (PhyloNode *)(*it1)->node; + if (!child1->checkMissingNode()) + { + bool found = false; + FOR_NEIGHBOR_IT(node2, dad2, it2) + { + Node *child2 = (*it2)->node; + if (child1->name == child2->name) + { + found = true; + check &= compareTree(child1, node1, child2, node2); + break; + } + } + if (!found) + { + return false; + } + } + else + { + check &= compareTree(child1, node1, node2, dad2); + } + } + return check; +} diff --git a/iqtree.h b/iqtree.h index 3ff0ad90..5562380b 100644 --- a/iqtree.h +++ b/iqtree.h @@ -664,6 +664,45 @@ class IQTree : public PhyloTree { int k_represent; public: + /** + * Get all leaf names of the tree. + */ + void getLeafName(vector &leafName); + + /** + * Get all leaf names of the tree rooted at node. + */ + void getLeafName(Node *node, Node *dad, vector& leafName); + + /** + * Assign root with given name. + */ + void assignRoot(string &rootName); + + /** + * Assign root with given name. 
+ */ + bool assignRoot(Node *node, Node *dad, string &rootName); + + /** + * Init info which node is original node, which node is added node. + */ + int initInfoNode(vector &leafName); + + /** + * Init info which node is original node, which node is added node. + */ + int initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leafName); + + /** + * Compare two trees. + */ + bool compareTree(IQTree *anotherTree); + + /** + * Compare two trees rooted at node1 and node2. + */ + bool compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2); /** * @brief: optimize model parameters on the current tree diff --git a/mutation.cpp b/mutation.cpp new file mode 100644 index 00000000..ea438c8f --- /dev/null +++ b/mutation.cpp @@ -0,0 +1,66 @@ +#include "mutation.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +// Uses one-hot encoding if base is unambiguous +// A:1,C:2,G:4,T:8 + +// Convert nuc_id back to IUPAC base +char get_nuc(int8_t nuc_id) { + char ret = 'N'; + //assert ((nuc_id >= 1) && (nuc_id <= 15)); + switch (nuc_id) { + case 1: + ret = 'A'; + break; + case 2: + ret = 'C'; + break; + case 3: + ret = 'M'; + break; + case 4: + ret = 'G'; + break; + case 5: + ret = 'R'; + break; + case 6: + ret = 'S'; + break; + case 7: + ret = 'V'; + break; + case 8: + ret = 'T'; + break; + case 9: + ret = 'W'; + break; + case 10: + ret = 'Y'; + break; + case 11: + ret = 'H'; + break; + case 12: + ret = 'K'; + break; + case 13: + ret = 'D'; + break; + case 14: + ret = 'B'; + break; + default: + ret = 'N'; + break; + } + return ret; +} \ No newline at end of file diff --git a/mutation.h b/mutation.h new file mode 100644 index 00000000..54f9f19e --- /dev/null +++ b/mutation.h @@ -0,0 +1,54 @@ +#ifndef _MUTATION +#define _MUTATION +#include +#include +#include +#include +#include +#include +#include +#include +#include +char get_nuc(int8_t nuc_id); + +struct Mutation +{ + int position; + int compressed_position; + char ref_nuc; + 
char par_nuc; + char mut_nuc; + bool is_missing; + inline bool operator<(const Mutation &m) const + { + return ((*this).position < m.position); + } + inline Mutation copy() const + { + Mutation m; + m.position = position; + m.ref_nuc = ref_nuc; + m.par_nuc = par_nuc; + m.mut_nuc = mut_nuc; + m.is_missing = is_missing; + m.compressed_position = compressed_position; + return m; + } + Mutation() + { + is_missing = false; + } + inline bool is_masked() const + { + return (position < 0); + } + inline std::string get_string() const { + if (is_masked()) { + return "MASKED"; + } + else { + return get_nuc(par_nuc) + std::to_string(position) + get_nuc(mut_nuc); + } + } +}; +#endif \ No newline at end of file diff --git a/pda.cpp b/pda.cpp index a1f9f3f2..ebded5d1 100644 --- a/pda.cpp +++ b/pda.cpp @@ -67,6 +67,7 @@ #include #include "sprparsimony.h" #include "vectorclass/vectorclass.h" +#include "placement.h" #ifdef _OPENMP #include @@ -2322,7 +2323,9 @@ int main(int argc, char *argv[]) cout.setf(ios::fixed); // call the main function - if (params.tree_gen != NONE) { + if (params.ppon) { + placeNewSamplesOntoExistingTree(params); + } else if (params.tree_gen != NONE) { generateRandomTree(params); } else if (params.do_pars_multistate) { // cout << "Starting the test for computing concensus NOT from file:" << endl; diff --git a/phylonode.cpp b/phylonode.cpp index 3d87686a..06437eaa 100644 --- a/phylonode.cpp +++ b/phylonode.cpp @@ -19,6 +19,52 @@ void PhyloNeighbor::clearForwardPartialLh(Node *dad) { ((PhyloNeighbor*)*it)->clearForwardPartialLh(node); } +void PhyloNeighbor::clear_mutations() +{ + mutations.clear(); +} + +void PhyloNeighbor::add_mutation(Mutation mut) +{ + auto iter = std::lower_bound(mutations.begin(), mutations.end(), mut); + // check if mutation at the same position has occured before + if ((iter != mutations.end()) && (iter->position == mut.position)) + { + // update to new allele + if (iter->par_nuc != mut.mut_nuc) + { + iter->mut_nuc = mut.mut_nuc; + } + 
// reversal mutation + else + { + if (iter->mut_nuc != mut.par_nuc) + { + printf("ERROR: add_mutation: consecutive mutations at same position " + "disagree on nuc -- called out of order?\n"); + exit(1); + } + std::vector tmp; + for (auto m : mutations) + { + if (m.position != iter->position) + { + tmp.emplace_back(m.copy()); + } + } + mutations.clear(); + for (auto m : tmp) + { + mutations.emplace_back(m.copy()); + } + } + } + // new mutation + else + { + mutations.insert(iter, mut); + } +} void PhyloNode::clearReversePartialLh(PhyloNode *dad) { PhyloNeighbor *node_nei = (PhyloNeighbor*)findNeighbor(dad); @@ -62,10 +108,22 @@ PhyloNode::PhyloNode(int aid, const char *aname) : Node(aid, aname) { } void PhyloNode::init() { - //partial_lh = NULL; + missingIndex = -1; } void PhyloNode::addNeighbor(Node *node, double length, int id) { neighbors.push_back(new PhyloNeighbor(node, length, id)); } + +void PhyloNode::setMissingNode(int index) { + missingIndex = index; +} + +bool PhyloNode::checkMissingNode() { + return missingIndex != -1; +} + +int PhyloNode::getMissingIndex() { + return missingIndex; +} \ No newline at end of file diff --git a/phylonode.h b/phylonode.h index ef77b22e..8b562325 100644 --- a/phylonode.h +++ b/phylonode.h @@ -13,6 +13,7 @@ #define PHYLONODE_H #include "node.h" +#include "mutation.h" typedef short int UBYTE; @@ -21,7 +22,8 @@ A neighbor in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNeighbor : public Neighbor { +class PhyloNeighbor : public Neighbor +{ friend class PhyloNode; friend class PhyloTree; friend class IQTree; @@ -37,11 +39,13 @@ class PhyloNeighbor : public Neighbor { @param alength length of branch */ - PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) { + PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) + { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; partial_pars = NULL; + mutations.clear(); } /** @@ -50,24 +54,29 @@ 
class PhyloNeighbor : public Neighbor { @param alength length of branch @param aid branch ID */ - PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) { + PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) + { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; partial_pars = NULL; + mutations.clear(); + canMove = 0; } /** tell that the partial likelihood vector is not computed */ - inline void clearPartialLh() { + inline void clearPartialLh() + { partial_lh_computed = 0; } /** * tell that the partial likelihood vector is computed */ - inline void unclearPartialLh() { + inline void unclearPartialLh() + { partial_lh_computed = 1; } @@ -77,8 +86,32 @@ class PhyloNeighbor : public Neighbor { */ void clearForwardPartialLh(Node *dad); -private: + /** + * All mutations on this branch + */ + std::vector mutations; + + /** + * Number of leaves in the subtree rooted at this node + */ + int num_leaves; + + /** + * Distance to the root + */ + int distance; + + /** + * Clear all mutations on this branch + */ + void clear_mutations(); + + /** + * Add a mutation to this branch + */ + void add_mutation(Mutation mut); +private: /** true if the partial likelihood was computed */ @@ -104,6 +137,10 @@ class PhyloNeighbor : public Neighbor { */ UINT *partial_pars; + /** + * check if this branch can be movedor do SPR + */ + int canMove; }; /** @@ -111,7 +148,8 @@ A node in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNode : public Node { +class PhyloNode : public Node +{ friend class PhyloTree; public: @@ -145,6 +183,12 @@ class PhyloNode : public Node { */ void init(); + void setMissingNode(int index); + + bool checkMissingNode(); + + int getMissingIndex(); + /** add a neighbor @param node the neighbor node @@ -153,8 +197,6 @@ class PhyloNode : public Node { */ virtual void addNeighbor(Node *node, double length, int id = -1); - - /** tell that all partial 
likelihood vectors below this node are not computed */ @@ -164,13 +206,45 @@ class PhyloNode : public Node { tell that all partial likelihood vectors (in reverse direction) below this node are not computed */ void clearReversePartialLh(PhyloNode *dad); -}; + PhyloNode *dad; + + int missingIndex; +}; /** Node vector */ -typedef vector PhyloNodeVector; +typedef vector PhyloNodeVector; +class CandidateNode +{ +public: + PhyloNode *node; + PhyloNeighbor *node_branch; + std::vector *missing_sample_mutations; + + int *best_set_difference; + int *set_difference; + size_t *best_node_num_leaves; + size_t distance; + size_t *best_distance; + size_t index; + size_t *best_index; + size_t *num_best; + PhyloNode *best_node; + PhyloNeighbor *best_node_branch; + + std::vector *node_has_unique; + std::vector *best_j_vec; + + bool *has_unique; + + std::vector *excess_mutations; + + CandidateNode() + { + } +}; -#endif +#endif \ No newline at end of file diff --git a/phylotree.cpp b/phylotree.cpp index a86ba2b9..3bc26ec5 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -1059,33 +1059,47 @@ int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, } } else if (aln->num_states == 4 && aln->seq_type == SEQ_DNA) { // ULTRAFAST VERSION FOR DNA - for (ptn = 0; ptn < aln->size(); ptn+=8) { - UINT states_left = node_branch->partial_pars[ptn/8]; - UINT states_right = dad_branch->partial_pars[ptn/8]; - UINT states_dad = 0; - int maxi = aln->size() - ptn; - if(maxi > 8) maxi = 8; - for (i = 0; i< maxi; i++) { - UINT state_left = (states_left >> (i*4)) & 15; - UINT state_right = (states_right >> (i*4)) & 15; - UINT state_both = state_left | (state_right << 4); - states_dad |= dna_fitch_result[state_both] << (i*4); - tree_pars += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; - _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + - dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; - } + for (ptn = 0; ptn 
< aln->size(); ptn += 8) + { + UINT states_left = node_branch->partial_pars[ptn / 8]; + UINT states_right = dad_branch->partial_pars[ptn / 8]; + UINT states_dad = 0; + int maxi = aln->size() - ptn; + if (maxi > 8) + maxi = 8; + for (i = 0; i < maxi; i++) + { + UINT state_left = (states_left >> (i * 4)) & 15; + UINT state_right = (states_right >> (i * 4)) & 15; + UINT state_both = state_left | (state_right << 4); + // cout << state_left << " " << states_right << " " << state_right << " " << dna_fitch_result[state_both] << endl; + states_dad |= dna_fitch_result[state_both] << (i * 4); + tree_pars += dna_fitch_step[state_both] * aln->at(ptn + i).frequency; + _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; + } + if (add_row) + { + for (int i = 0; i < maxi; ++i) + { + for (int j = 0; j < 4; ++j) + { + if (states_dad & (1 << (i * 4 + j))) + { + for (int k = j + 1; k < 4; ++k) + { + if (states_dad & (1 << (i * 4 + k))) + { + states_dad ^= (1 << (i * 4 + k)); + } + } + break; + } + } + } + root_states[ptn / 8] = states_dad; + } } -// // the remaining bits -// UINT states_left = node_branch->partial_pars[ptn/8]; -// UINT states_right = dad_branch->partial_pars[ptn/8]; -// int maxi = aln->size() - ptn; -// for (i = 0; i< maxi; i++) { -// UINT state_left = (states_left >> (i*4)) & 15; -// UINT state_right = (states_right >> (i*4)) & 15; -// UINT state_both = state_left | (state_right << 4); -// _pattern_pars[ptn + i] += dna_fitch_step[state_both]; -// tree_pars += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; -// } } else if (aln->num_states == 20 && aln->seq_type == SEQ_PROTEIN) { // ULTRAFAST VERSION FOR PROTEIN UINT state_left[8], state_right[8]; @@ -5121,3 +5135,1249 @@ void PhyloTree::printTransMatrices(Node *node, Node *dad) { } FOR_NEIGHBOR_IT(node, dad, it)printTransMatrices((*it)->node, node); } + +void PhyloTree::allocateMutationMemory(int 
num_column)
+{
+    cur_missing_sample_mutations.resize(num_column);
+    cur_ancestral_mutations.resize(num_column);
+    visited_missing_sample_mutations.resize(num_column);
+    visited_ancestral_mutations.resize(num_column);
+    cur_excess_mutations.resize(num_column);
+    visited_excess_mutations.resize(num_column);
+}
+
+/**
+ * Resolve the states of the two children of dad_branch->node against the already
+ * resolved states of that node, record the implied mutations, and recurse.
+ *
+ * States are packed as one 4-bit set per pattern, 8 patterns per UINT. For every
+ * pattern the parent nucleotide is the lowest set bit of states_dad. A child keeps
+ * the parent nucleotide when its own set contains it; otherwise it takes its lowest
+ * set bit and a Mutation is appended to the child's neighbor record pointing back to
+ * this node. The child's partial_pars word is then narrowed IN PLACE to the single
+ * chosen nucleotide, and that array is passed as states_dad to the recursive call.
+ *
+ * @param states_dad           resolved packed states of dad_branch->node (read only here)
+ * @param perm_col             maps pattern index -> Mutation::position
+ * @param compressed_perm_col  maps pattern index -> Mutation::compressed_position
+ * @param dad_branch           neighbor record leading to the node being processed
+ * @param dad                  node we arrived from (excluded from the neighbor loops)
+ */
+void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad)
+{
+    PhyloNode *node = (PhyloNode *)dad_branch->node;
+
+    if (node->isLeaf() && dad)
+    {
+        // Leaf node does not have mutations
+        return;
+    }
+    // Process internal node
+    UINT *left = NULL, *right = NULL;
+    // Initialised so that a node with fewer than two usable children is caught by the
+    // assert below instead of dereferencing indeterminate pointers.
+    PhyloNeighbor *left_branch = NULL, *right_branch = NULL;
+    FOR_NEIGHBOR_IT(node, dad, it)
+    if ((*it)->node->name != ROOT_NAME)
+    {
+        if (!left)
+            left = ((PhyloNeighbor *)(*it))->partial_pars, left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node);
+        else
+            right = ((PhyloNeighbor *)(*it))->partial_pars, right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node);
+    }
+    // The code below unconditionally reads both children.
+    assert(left && right && left_branch && right_branch);
+
+    int p = -1; // running pattern index into perm_col / compressed_perm_col
+    for (int ptn = 0; ptn < aln->size(); ptn += 8)
+    {
+        UINT left_state = left[ptn / 8];
+        UINT right_state = right[ptn / 8];
+        UINT dad_state = states_dad[ptn / 8];
+        int maxi = aln->size() - ptn;
+        if (maxi > 8)
+            maxi = 8;
+        for (int i = 0; i < maxi; i++)
+        {
+            ++p;
+            UINT state_left = (left_state >> (i * 4)) & 15;
+            UINT state_right = (right_state >> (i * 4)) & 15;
+            UINT state_both = (dad_state >> (i * 4)) & 15;
+
+            // Parent nucleotide = lowest set bit of the parent's state set.
+            char dad_nuc = 0;
+            for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc)
+                if (1 & (state_both >> dad_nuc))
+                    break;
+
+            char left_child_nuc;
+            if ((1 & (state_left >> dad_nuc)) == 1)
+            {
+                left_child_nuc = dad_nuc;
+            }
+            else
+            {
+                for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc)
+                    if (1 & (state_left >> left_child_nuc))
+                        break;
+                Mutation left_child_mut;
+                left_child_mut.position = perm_col[p];
+                left_child_mut.compressed_position = compressed_perm_col[p];
+                left_child_mut.mut_nuc = (1 << left_child_nuc);
+                left_child_mut.par_nuc = (1 << dad_nuc);
+                left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.position];
+                left_branch->mutations.push_back(left_child_mut);
+            }
+            // Clear every other candidate bit so the child's nibble holds one nucleotide.
+            // Unsigned shift: i * 4 + nuc reaches 31, which overflows a signed int.
+            for (int nuc = 0; nuc < 4; ++nuc)
+            {
+                if (nuc != left_child_nuc && (1 & (state_left >> nuc)))
+                {
+                    left[ptn / 8] ^= ((UINT)1 << (i * 4 + nuc));
+                }
+            }
+
+            char right_child_nuc;
+            if ((1 & (state_right >> dad_nuc)) == 1)
+            {
+                right_child_nuc = dad_nuc;
+            }
+            else
+            {
+                for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc)
+                    if (1 & (state_right >> right_child_nuc))
+                        break;
+                Mutation mut_r;
+                mut_r.position = perm_col[p];
+                mut_r.compressed_position = compressed_perm_col[p];
+                mut_r.mut_nuc = (1 << right_child_nuc);
+                mut_r.par_nuc = (1 << dad_nuc);
+                mut_r.ref_nuc = aln->reference_nuc[mut_r.position];
+                right_branch->mutations.push_back(mut_r);
+            }
+            for (int nuc = 0; nuc < 4; ++nuc)
+            {
+                if (nuc != right_child_nuc && (1 & (state_right >> nuc)))
+                {
+                    right[ptn / 8] ^= ((UINT)1 << (i * 4 + nuc));
+                }
+            }
+        }
+    }
+
+    // Recurse: the first non-root neighbor continues with `left`, every later one with `right`.
+    bool left_child = true;
+    FOR_NEIGHBOR_IT(node, dad, it)
+    if ((*it)->node->name != ROOT_NAME)
+    {
+        if (left_child)
+        {
+            computePartialMutation(left, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node);
+            left_child = false;
+            continue;
+        }
+        computePartialMutation(right, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node);
+    }
+}
+
+/**
+ * Assign mutations to the two sides of the branch (dad, dad_branch->node) from
+ * root_states, then resolve both subtrees with computePartialMutation().
+ * branch_subst is not used by this function.
+ */
+void PhyloTree::computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst)
+{
+    PhyloNode *node
= (PhyloNode *)dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad);
+    assert(node_branch);
+    // If the far end is a leaf, look at the branch from the other side.
+    if (node->isLeaf())
+    {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+
+    // Resolved states handed to each side: one 4-bit nibble per pattern, 8 per UINT.
+    // Owned by vectors so they are released on return (the raw new[] buffers used
+    // previously were never deleted). Zero-filled because bits are XOR-ed in below.
+    const size_t num_words = (aln->size() + 7) / 8 + 1;
+    std::vector<UINT> left_branch_states_dad(num_words, 0);
+    std::vector<UINT> right_branch_states_dad(num_words, 0);
+
+    int i, ptn, col = -1; // col: running pattern index into perm_col / compressed_perm_col
+    for (ptn = 0; ptn < aln->size(); ptn += 8)
+    {
+        UINT states_left = node_branch->partial_pars[ptn / 8];
+        UINT states_right = dad_branch->partial_pars[ptn / 8];
+        UINT states_dad = root_states[ptn / 8];
+        int maxi = aln->size() - ptn;
+        if (maxi > 8)
+            maxi = 8;
+        for (i = 0; i < maxi; i++)
+        {
+            ++col;
+            UINT state_left = (states_left >> (i * 4)) & 15;
+            UINT state_right = (states_right >> (i * 4)) & 15;
+            UINT state_both = (states_dad >> (i * 4)) & 15;
+
+            // Root nucleotide = lowest set bit of the root state set.
+            char dad_nuc = 0;
+            for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc)
+                if (1 & (state_both >> dad_nuc))
+                    break;
+
+            char left_child_nuc;
+            if ((1 & (state_left >> dad_nuc)) == 1)
+            {
+                left_child_nuc = dad_nuc;
+            }
+            else
+            {
+                for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc)
+                    if (1 & (state_left >> left_child_nuc))
+                        break;
+                Mutation left_child_mut;
+                left_child_mut.position = perm_col[col];
+                left_child_mut.compressed_position = compressed_perm_col[col];
+                left_child_mut.mut_nuc = (1 << left_child_nuc);
+                left_child_mut.par_nuc = (1 << dad_nuc);
+                left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.position];
+                dad_branch->mutations.push_back(left_child_mut);
+            }
+            // Unsigned shift: i * 4 + nuc reaches 31, which overflows a signed int.
+            right_branch_states_dad[ptn / 8] ^= ((UINT)1 << (i * 4 + left_child_nuc));
+
+            char right_child_nuc;
+            if ((1 & (state_right >> dad_nuc)) == 1)
+            {
+                right_child_nuc = dad_nuc;
+            }
+            else
+            {
+                for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc)
+                    if (1 & (state_right >> right_child_nuc))
+                        break;
+                Mutation right_child_mut;
+                right_child_mut.position = perm_col[col];
+                right_child_mut.compressed_position = compressed_perm_col[col];
+                right_child_mut.mut_nuc = (1 << right_child_nuc);
+                right_child_mut.par_nuc = (1 << dad_nuc);
+                right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.position];
+                node_branch->mutations.push_back(right_child_mut);
+            }
+            left_branch_states_dad[ptn / 8] ^= ((UINT)1 << (i * 4 + right_child_nuc));
+        }
+    }
+
+    computePartialMutation(left_branch_states_dad.data(), perm_col, compressed_perm_col, dad_branch, dad);
+    computePartialMutation(right_branch_states_dad.data(), perm_col, compressed_perm_col, node_branch, node);
+}
+
+/**
+ * Build all branch mutations and the root mutations.
+ *
+ * Runs computeParsimony(), traces mutations down from the root branch, then appends
+ * to root_mutations one Mutation for every pattern whose root state set shares no
+ * nucleotide with the reference nucleotide.
+ *
+ * @param perm_col             maps pattern index -> Mutation::position
+ * @param compressed_perm_col  maps pattern index -> Mutation::compressed_position
+ */
+void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm_col)
+{
+    // Compute parsimony is necessary for tracing back the mutations
+    computeParsimony();
+    computeMutationBranch(perm_col, compressed_perm_col, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root);
+
+    int nptn = aln->size();
+    for (int i = 0; i < nptn; ++i)
+    {
+        // 8 patterns per word: word index is i / 8, nibble offset is (i % 8) * 4.
+        // The shift must use the in-word offset; shifting by i * 4 is undefined
+        // behaviour as soon as i >= 8.
+        char root_nuc = ((root_states[i / 8] >> ((i % 8) * 4)) & 15);
+        char ref_nuc = aln->reference_nuc[perm_col[i]];
+        if ((root_nuc & ref_nuc) == 0)
+        {
+            // Lowest set bit of the reference becomes the parent nucleotide ...
+            char dad_nuc = 0;
+            for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc)
+            {
+                if (1 & (ref_nuc >> dad_nuc))
+                    break;
+            }
+
+            // ... and lowest set bit of the root state becomes the mutated nucleotide.
+            char mut_nuc = 0;
+            for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc)
+            {
+                if (1 & (root_nuc >> mut_nuc))
+                    break;
+            }
+
+            Mutation m;
+            m.position = perm_col[i];
+            m.compressed_position = compressed_perm_col[i];
+            m.mut_nuc = (1 << mut_nuc);
+            m.ref_nuc = ref_nuc;
+            m.par_nuc = (1 << dad_nuc);
+            root_mutations.push_back(m);
+        }
+    }
+}
+
+/**
+ * Sum the number of stored mutations over the subtree entered through dad_branch.
+ */
+int PhyloTree::computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad)
+{
+    int par_s = 0;
+    PhyloNode *node = (PhyloNode
*)dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad);
+    // Mutations are read from the neighbor record of `node` that points back to `dad`.
+    par_s += node_branch->mutations.size();
+    // Recurse into the remaining neighbors, skipping any node named ROOT_NAME.
+    FOR_NEIGHBOR_IT(node, dad, it)
+    if ((*it)->node->name != ROOT_NAME)
+    {
+        par_s += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node);
+    }
+    return par_s;
+}
+
+/**
+ * Parsimony score contributed by both sides of the branch (dad, dad_branch->node),
+ * obtained by counting stored mutations. branch_subst is not used here.
+ */
+int PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst)
+{
+    PhyloNode *node = (PhyloNode *)dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad);
+    assert(node_branch);
+    if (!central_partial_pars)
+        initializeAllPartialPars();
+    // If the far end is a leaf, swap the two ends of the branch.
+    if (node->isLeaf())
+    {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+
+    int par_s = 0;
+    par_s += computePartialParsimonyMutation(dad_branch, dad);
+    par_s += computePartialParsimonyMutation(node_branch, node);
+    return par_s;
+}
+
+/**
+ * Total parsimony score from stored mutations: the count over the tree, starting at
+ * the root's first neighbor, plus the number of root_mutations.
+ * Also sets current_it / current_it_back to the root branch. Requires a leaf root.
+ */
+int PhyloTree::computeParsimonyScoreMutation()
+{
+    assert(root->isLeaf());
+    PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]);
+    current_it = nei;
+    assert(current_it);
+    current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root);
+    assert(current_it_back);
+
+    int parsimonyScore = 0;
+    parsimonyScore += computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root);
+    parsimonyScore += root_mutations.size();
+    return parsimonyScore;
+}
+
+/**
+ * Breadth-first traversal starting at the node adjacent to the (leaf) root.
+ *
+ * Each returned pair is (node, neighbor record of node pointing to its parent).
+ * Side effects visible in this function: sets node->dad for every visited node,
+ * writes the BFS depth into PhyloNeighbor::distance (1 for the first node) and
+ * fills PhyloNeighbor::num_leaves bottom-up.
+ */
+vector> PhyloTree::breadth_first_expansion()
+{
+    assert(root->isLeaf());
+    PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]);
+    current_it = nei;
+    assert(current_it);
+    current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root);
+    assert(current_it_back);
+
+    vector> bfs;
+    queue> q;
+    q.push(make_pair((PhyloNode *)nei->node, current_it_back));
+    current_it_back->distance = 1;
+    while (q.size())
+    {
+        PhyloNode *node = q.front().first;
+        PhyloNeighbor *node_branch = q.front().second;
+        node->dad = (PhyloNode
*)node_branch->node;
+        // Sanity check: every stored mutation must name exactly one nucleotide
+        // (mut_nuc has at most one bit set).
+        for (auto mut : node_branch->mutations)
+        {
+            assert((mut.mut_nuc & (mut.mut_nuc - 1)) == 0);
+        }
+        PhyloNode *dad = (PhyloNode *)node_branch->node;
+        q.pop();
+        bfs.push_back(make_pair(node, node_branch));
+        // Enqueue the children together with their back-pointing neighbor record,
+        // one level deeper than the current node.
+        FOR_NEIGHBOR_IT(node, dad, it)
+        {
+            ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->distance = node_branch->distance + 1;
+            q.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node)));
+        }
+    }
+
+    // Reverse BFS order visits children before parents, so num_leaves of every child
+    // is final when the parent sums them up.
+    for (int i = bfs.size() - 1; i >= 0; --i)
+    {
+        PhyloNode *node = bfs[i].first;
+        PhyloNeighbor *node_branch = bfs[i].second;
+        PhyloNode *dad = (PhyloNode *)node_branch->node;
+        node_branch->num_leaves = 0;
+        if (node->isLeaf())
+        {
+            node_branch->num_leaves = 1;
+            continue;
+        }
+        FOR_NEIGHBOR_IT(node, dad, it)
+        {
+            node_branch->num_leaves += ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->num_leaves;
+        }
+    }
+    return bfs;
+}
+
+/**
+ * Score placing a new sample at the candidate node `input.node`.
+ *
+ * Counts (in set_difference) the mutations by which the sample differs from the
+ * candidate's ancestral state, and updates the best-so-far fields reachable through
+ * `input` when this candidate is at least as good.
+ *
+ * @param input                     candidate node plus in/out pointers (best score, best index, ...)
+ * @param compute_parsimony_scores  when true, never return early and store the score in *input.set_difference
+ * @param compute_vecs              when true, also append the differing mutations to *input.excess_mutations
+ */
+void PhyloTree::calculatePlacementMutation(CandidateNode &input, bool compute_parsimony_scores, bool compute_vecs)
+{
+    int set_difference = 0;
+    int best_set_difference = *input.best_set_difference;
+    std::vector anc_positions;
+    std::vector ancestral_mutations;
+    bool has_unique = false;
+    int node_num_mut = 0;
+    int num_common_mut = 0;
+    assert(input.node->dad);
+
+    // A fresh stamp value marks which entries of the visited_* arrays belong to this call.
+    timer_regular--;
+    // Index the sample's mutations by compressed position.
+    for (auto m : (*input.missing_sample_mutations))
+    {
+        visited_missing_sample_mutations[m.compressed_position] = timer_regular;
+        cur_missing_sample_mutations[m.compressed_position] = m;
+    }
+
+    if (!(input.node == root))
+    {
+        // Compare each mutation on the candidate's own branch with the sample.
+        for (auto m1 : input.node_branch->mutations)
+        {
+            node_num_mut++;
+            auto anc_nuc = m1.mut_nuc;
+            // A masked mutation marks the branch as unique and stops the scan.
+            if (m1.is_masked())
+            {
+                has_unique = true;
+                break;
+            }
+            assert(((anc_nuc - 1) & anc_nuc) == 0);
+            bool found = false;
+            bool found_pos = false;
+            if (visited_missing_sample_mutations[m1.compressed_position] == timer_regular)
+            {
+                auto m2 = cur_missing_sample_mutations[m1.compressed_position];
+                if (m1.position == m2.position)
+                {
+                    found_pos = true;
+                    if
(m2.is_missing)
+                    {
+                        // A missing base in the sample is compatible with any branch mutation.
+                        found = true;
+                        num_common_mut++;
+                    }
+                    else
+                    {
+                        auto nuc = m2.mut_nuc;
+                        // The sample's (possibly ambiguous) nucleotide set contains the branch nucleotide.
+                        if ((nuc & anc_nuc) != 0)
+                        {
+                            Mutation m;
+                            m.position = m1.position;
+                            m.compressed_position = m1.compressed_position;
+                            m.ref_nuc = m1.ref_nuc;
+                            m.par_nuc = m1.par_nuc;
+                            m.mut_nuc = anc_nuc;
+
+                            ancestral_mutations.emplace_back(m);
+                            anc_positions.emplace_back(m.compressed_position);
+                            assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+                            if (compute_vecs)
+                            {
+                                (*input.excess_mutations).emplace_back(m);
+                            }
+
+                            found = true;
+                            num_common_mut++;
+                        }
+                    }
+                }
+            }
+            if (!found)
+            {
+                // The sample has no entry at this position and the branch mutates back to
+                // the reference: treated as shared with the sample.
+                if (!found_pos && (anc_nuc == m1.ref_nuc))
+                { // m.mut_nuc = m.par_nuc = m1.ref_nuc
+                    Mutation m;
+                    m.position = m1.position;
+                    m.compressed_position = m1.compressed_position;
+                    m.ref_nuc = m1.ref_nuc;
+                    m.par_nuc = m1.par_nuc;
+                    m.mut_nuc = anc_nuc;
+
+                    ancestral_mutations.emplace_back(m);
+                    anc_positions.emplace_back(m.compressed_position);
+                    assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+                    if (compute_vecs)
+                    {
+                        (*input.excess_mutations).emplace_back(m);
+                    }
+
+                    num_common_mut++;
+                }
+                else
+                {
+                    // The branch carries a mutation the sample does not share.
+                    has_unique = true;
+                }
+            }
+        }
+    }
+    else
+    {
+        // The candidate is never expected to be the root itself; with assertions
+        // disabled the branch mutations are taken over unchanged.
+        assert(false);
+        for (auto m : input.node_branch->mutations)
+        {
+            ancestral_mutations.emplace_back(m);
+            anc_positions.emplace_back(m.compressed_position);
+        }
+    }
+
+    // Stamp what has been collected so far so ancestors cannot override these positions.
+    for (auto m : ancestral_mutations)
+    {
+        visited_ancestral_mutations[m.compressed_position] = timer_regular;
+        cur_ancestral_mutations[m.compressed_position] = m;
+    }
+
+    {
+        // Walk up towards the root. For every position only the first mutation met
+        // (the one closest to the candidate) is kept.
+        PhyloNode *n = input.node;
+        while (n->dad != root)
+        {
+            n = n->dad;
+            PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad);
+            for (auto m : node_branch->mutations)
+            {
+                if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_regular)
+                {
+                    ancestral_mutations.emplace_back(m);
+                    anc_positions.emplace_back(m.compressed_position);
+                    visited_ancestral_mutations[m.compressed_position] = timer_regular;
+                    cur_ancestral_mutations[m.compressed_position] = m;
+                }
+            }
+        }
+        for
(auto m : root_mutations)
+        {
+            // Root mutations are considered last, under the same first-seen-wins rule.
+            if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_regular)
+            {
+                ancestral_mutations.emplace_back(m);
+                anc_positions.emplace_back(m.compressed_position);
+                visited_ancestral_mutations[m.compressed_position] = timer_regular;
+                cur_ancestral_mutations[m.compressed_position] = m;
+            }
+        }
+    }
+
+    // Pass 1: sample mutations that the ancestral state does not explain.
+    for (auto m1 : (*input.missing_sample_mutations))
+    {
+        // Missing bases never count as a difference.
+        if (m1.is_missing)
+        {
+            continue;
+        }
+        bool found_pos = false;
+        bool found = false;
+        bool has_ref = false;
+        // Without an ancestral mutation at this position the ancestor carries the reference.
+        auto anc_nuc = m1.ref_nuc;
+        if ((m1.mut_nuc & m1.ref_nuc) != 0)
+        {
+            has_ref = true;
+        }
+        if (visited_ancestral_mutations[m1.compressed_position] == timer_regular)
+        {
+            auto m2 = cur_ancestral_mutations[m1.compressed_position];
+            if (!m2.is_masked())
+            {
+                found_pos = true;
+                anc_nuc = m2.mut_nuc;
+                if ((m1.mut_nuc & anc_nuc) != 0)
+                {
+                    found = true;
+                }
+            }
+        }
+        if (!found && (found_pos || !has_ref))
+        {
+
+            Mutation m;
+            m.position = m1.position;
+            m.compressed_position = m1.compressed_position;
+            m.ref_nuc = m1.ref_nuc;
+            m.par_nuc = anc_nuc;
+            // Resolve an ambiguous sample base: prefer the reference when the set
+            // contains it, otherwise take the lowest set bit.
+            if (has_ref)
+            {
+                m.mut_nuc = m1.ref_nuc;
+            }
+            else
+            {
+                for (int j = 0; j < 4; j++)
+                {
+                    if (((1 << j) & m1.mut_nuc) != 0)
+                    {
+                        m.mut_nuc = (1 << j);
+                        break;
+                    }
+                }
+            }
+            assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+            if (m.mut_nuc != m.par_nuc)
+            {
+                if (compute_vecs)
+                {
+                    input.excess_mutations->emplace_back(m);
+                }
+                set_difference += 1;
+                // Early exit: this candidate can no longer beat the best one.
+                if (!compute_parsimony_scores && (set_difference > best_set_difference))
+                {
+                    return;
+                }
+            }
+        }
+    }
+
+    // Pass 2: ancestral mutations for which the sample has no entry at that position.
+    for (auto m1 : ancestral_mutations)
+    {
+        bool found = false;
+        bool found_pos = false;
+        auto anc_nuc = m1.mut_nuc;
+        if (visited_missing_sample_mutations[m1.compressed_position] == timer_regular)
+        {
+            if (!m1.is_masked())
+            {
+                auto m2 = cur_missing_sample_mutations[m1.compressed_position];
+                found_pos = true;
+                if (m2.is_missing)
+                {
+                    found = true;
+                }
+                else if ((m2.mut_nuc & anc_nuc) != 0)
+                {
+                    found = true;
+                }
+            }
+        }
+        if (!found &&
!found_pos && (m1.is_masked() || (anc_nuc != m1.ref_nuc)))
+        {
+            // The sample carries the reference here, so it needs a mutation back to it.
+            Mutation m;
+            m.position = m1.position;
+            m.compressed_position = m1.compressed_position;
+            m.ref_nuc = m1.ref_nuc;
+            m.par_nuc = anc_nuc;
+            m.mut_nuc = m1.ref_nuc;
+            assert(m.is_masked() || ((m.mut_nuc & (m.mut_nuc - 1)) == 0));
+            if (m.mut_nuc != m.par_nuc)
+            {
+                set_difference += 1;
+                if (!compute_parsimony_scores && (set_difference > best_set_difference))
+                {
+                    return;
+                }
+                if (compute_vecs)
+                {
+                    (*input.excess_mutations).emplace_back(m);
+                }
+            }
+        }
+    }
+
+    if (compute_parsimony_scores)
+    {
+        *input.set_difference = set_difference;
+    }
+
+    if (set_difference > *input.best_set_difference)
+    {
+        return;
+    }
+    size_t num_leaves = input.node_branch->num_leaves;
+    if (set_difference < *input.best_set_difference)
+    {
+        // Strictly better: this candidate becomes the new best.
+        *input.best_set_difference = set_difference;
+        *input.best_node_num_leaves = num_leaves;
+        *input.best_index = input.index;
+        *input.has_unique = has_unique;
+        *input.best_distance = input.distance;
+        (*input.node_has_unique)[input.index] = has_unique;
+    }
+    else if (set_difference == *input.best_set_difference)
+    {
+        // Tie-break: smaller distance wins; at equal distance more leaves wins;
+        // at equal leaves the larger index wins.
+        if (((input.distance == *input.best_distance) &&
+             ((num_leaves > *input.best_node_num_leaves) ||
+              ((num_leaves == *input.best_node_num_leaves) && (*input.best_index < input.index)))) ||
+            (input.distance < *input.best_distance))
+        {
+            *input.best_set_difference = set_difference;
+            *input.best_node_num_leaves = num_leaves;
+            *input.best_index = input.index;
+            *input.has_unique = has_unique;
+            *input.best_distance = input.distance;
+        }
+        (*input.node_has_unique)[input.index] = has_unique;
+    }
+}
+
+/**
+ * Prepare the lookup tables used by optimizedCalculatePlacementMutation():
+ * advances timer_optimized and indexes the sample's mutations by compressed
+ * position under the new stamp.
+ */
+void PhyloTree::initDataCalculatePlacementMutation(CandidateNode &inp)
+{
+    ++timer_optimized;
+    for (auto m : (*inp.missing_sample_mutations))
+    {
+        visited_missing_sample_mutations[m.compressed_position] = timer_optimized;
+        cur_missing_sample_mutations[m.compressed_position] = m;
+    }
+}
+
+/**
+ * Remove the excess mutation currently registered at m.compressed_position, if any.
+ * The removed entry is appended to erased_excess_mutation and set_difference is
+ * decremented; nothing happens when no entry carries the current stamp.
+ */
+void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation m, int
&set_difference)
+{
+    if (visited_excess_mutations[m.compressed_position] == timer_optimized)
+    {
+        erased_excess_mutation.emplace_back(cur_excess_mutations[m.compressed_position]);
+        // 0 is used as the "not registered" stamp.
+        visited_excess_mutations[m.compressed_position] = 0;
+        --set_difference;
+    }
+}
+
+/**
+ * Register m as the excess mutation at its compressed position, log it in
+ * added_excess_mutation and add diff to set_difference.
+ */
+void PhyloTree::addMutation(vector &added_excess_mutation, Mutation m, int diff, int &set_difference)
+{
+    added_excess_mutation.push_back(m);
+    visited_excess_mutations[m.compressed_position] = timer_optimized;
+    cur_excess_mutations[m.compressed_position] = m;
+    set_difference += diff;
+}
+
+/**
+ * Recursive variant of calculatePlacementMutation() that scores input.node and then
+ * every node below it, passing set_difference down by value and adjusting it per
+ * branch. Changes to the excess-mutation table are logged through eraseMutation() /
+ * addMutation() and reverted before returning.
+ *
+ * Expects initDataCalculatePlacementMutation() to have stamped the sample's
+ * mutations with the current timer_optimized.
+ *
+ * @param input           candidate state; input.node / input.node_branch are overwritten while recursing
+ * @param set_difference  score inherited from the parent candidate
+ * @param firstNode       true only for the top-level call, which also collects the ancestors' mutations
+ */
+void PhyloTree::optimizedCalculatePlacementMutation(CandidateNode &input, int set_difference, bool firstNode)
+{
+    int num_common_mut = 0;
+    int best_set_difference = *input.best_set_difference;
+
+    std::vector anc_positions;
+    std::vector ancestral_mutations;
+    std::vector erased_excess_mutation;
+    std::vector added_excess_mutation;
+    std::vector common_mutations;
+    std::vector diff_mutations;
+
+    bool has_unique = false;
+    int node_num_mut = 0;
+    assert(input.node->dad);
+
+    if (!(input.node == root))
+    {
+        // Compare each mutation on the candidate's own branch with the sample.
+        for (auto m1 : input.node_branch->mutations)
+        {
+            node_num_mut++;
+            auto anc_nuc = m1.mut_nuc;
+            if (m1.is_masked())
+            {
+                has_unique = true;
+                break;
+            }
+            assert(((anc_nuc - 1) & anc_nuc) == 0);
+            bool found = false;
+            bool found_pos = false;
+            if (visited_missing_sample_mutations[m1.compressed_position] == timer_optimized)
+            {
+                auto m2 = cur_missing_sample_mutations[m1.compressed_position];
+                if (m1.position == m2.position)
+                {
+                    found_pos = true;
+                    if (m2.is_missing)
+                    {
+                        ++num_common_mut;
+                        found = true;
+                    }
+                    else
+                    {
+                        auto nuc = m2.mut_nuc;
+                        if ((nuc & anc_nuc) != 0)
+                        {
+                            Mutation m;
+                            m.position = m1.position;
+                            m.compressed_position = m1.compressed_position;
+                            m.ref_nuc = m1.ref_nuc;
+                            m.par_nuc = m1.par_nuc;
+                            m.mut_nuc = anc_nuc;
+
+                            ancestral_mutations.emplace_back(m);
+                            anc_positions.emplace_back(m.compressed_position);
+                            assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+
+                            found = true;
+
eraseMutation(erased_excess_mutation, m, set_difference);
+                            // Re-register with diff 0: the position is shared with the
+                            // sample, so it no longer contributes to the score.
+                            addMutation(added_excess_mutation, m, 0, set_difference);
+                            common_mutations.emplace_back(m);
+                            ++num_common_mut;
+                        }
+                    }
+                }
+            }
+            if (!found)
+            {
+                if (!found_pos && (anc_nuc == m1.ref_nuc))
+                { // m.mut_nuc = m.par_nuc = m1.ref_nuc
+                    Mutation m;
+                    m.position = m1.position;
+                    m.compressed_position = m1.compressed_position;
+                    m.ref_nuc = m1.ref_nuc;
+                    m.par_nuc = m1.par_nuc;
+                    m.mut_nuc = anc_nuc;
+
+                    ancestral_mutations.emplace_back(m);
+                    anc_positions.emplace_back(m.compressed_position);
+                    assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+                    eraseMutation(erased_excess_mutation, m, set_difference);
+                    addMutation(added_excess_mutation, m, 0, set_difference);
+                    common_mutations.emplace_back(m);
+                    ++num_common_mut;
+                }
+                else
+                {
+                    // Not shared with the sample; handled after scoring, before recursing.
+                    has_unique = true;
+                    diff_mutations.emplace_back(m1);
+                }
+            }
+        }
+    }
+
+    // Only the top-level call collects ancestors and scores the sample's own mutations.
+    if (firstNode)
+    {
+        {
+            // Walk up to the root; for every position the first mutation met is kept.
+            PhyloNode *n = input.node;
+            while (n->dad != root)
+            {
+                n = n->dad;
+                PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad);
+                for (auto m : node_branch->mutations)
+                {
+                    if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_optimized)
+                    {
+                        ancestral_mutations.emplace_back(m);
+                        anc_positions.emplace_back(m.compressed_position);
+                        visited_ancestral_mutations[m.compressed_position] = timer_optimized;
+                        cur_ancestral_mutations[m.compressed_position] = m;
+                    }
+                }
+            }
+            for (auto m : root_mutations)
+            {
+                if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_optimized)
+                {
+                    ancestral_mutations.emplace_back(m);
+                    anc_positions.emplace_back(m.compressed_position);
+                    visited_ancestral_mutations[m.compressed_position] = timer_optimized;
+                    cur_ancestral_mutations[m.compressed_position] = m;
+                }
+            }
+        }
+
+        for (auto m1 : (*input.missing_sample_mutations))
+        {
+            // Missing bases (Ns) are ignored
+            if (m1.is_missing)
+            {
+                continue;
+            }
+            bool found_pos = false;
+            bool found = false;
+            bool has_ref = false;
+            auto anc_nuc =
m1.ref_nuc;
+            if ((m1.mut_nuc & m1.ref_nuc) != 0)
+            {
+                has_ref = true;
+            }
+
+            if (visited_ancestral_mutations[m1.compressed_position] == timer_optimized)
+            {
+                auto m2 = cur_ancestral_mutations[m1.compressed_position];
+                if (!m2.is_masked())
+                {
+                    found_pos = true;
+                    anc_nuc = m2.mut_nuc;
+                    if ((m1.mut_nuc & anc_nuc) != 0)
+                    {
+                        found = true;
+                    }
+                }
+            }
+            // NOTE(review): calculatePlacementMutation() tests
+            // `!found && (found_pos || !has_ref)` at this point; here found_pos is not
+            // part of the condition, which also makes the `if (has_ref)` branch below
+            // unreachable. Confirm that the difference is intended.
+            if (!found && !has_ref)
+            {
+                Mutation m;
+                m.position = m1.position;
+                m.compressed_position = m1.compressed_position;
+                m.ref_nuc = m1.ref_nuc;
+                m.par_nuc = anc_nuc;
+                if (has_ref)
+                {
+                    m.mut_nuc = m1.ref_nuc;
+                }
+                else
+                {
+                    // Resolve an ambiguous base to its lowest set bit.
+                    for (int j = 0; j < 4; j++)
+                    {
+                        if (((1 << j) & m1.mut_nuc) != 0)
+                        {
+                            m.mut_nuc = (1 << j);
+                            break;
+                        }
+                    }
+                }
+                assert((m.mut_nuc & (m.mut_nuc - 1)) == 0);
+                if (m.mut_nuc != m.par_nuc)
+                {
+                    addMutation(added_excess_mutation, m, 1, set_difference);
+                }
+            }
+        }
+    }
+
+    // Ancestral mutations (own branch; on the first call also all ancestors) that the
+    // sample does not match.
+    for (auto m1 : ancestral_mutations)
+    {
+        bool found = false;
+        bool found_pos = false;
+        auto anc_nuc = m1.mut_nuc;
+        if (visited_missing_sample_mutations[m1.compressed_position] == timer_optimized)
+        {
+            if (!m1.is_masked())
+            {
+                auto m2 = cur_missing_sample_mutations[m1.compressed_position];
+                found_pos = true;
+                if (m2.is_missing)
+                {
+                    found = true;
+                }
+                else if ((m2.mut_nuc & anc_nuc) != 0)
+                {
+                    found = true;
+                }
+            }
+        }
+        if (!found && (found_pos || m1.is_masked() || (anc_nuc != m1.ref_nuc)))
+        {
+            // Replace whatever is registered at this position by a mutation back to the reference.
+            eraseMutation(erased_excess_mutation, m1, set_difference);
+            Mutation m;
+            m.position = m1.position;
+            m.compressed_position = m1.compressed_position;
+            m.ref_nuc = m1.ref_nuc;
+            m.par_nuc = anc_nuc;
+            m.mut_nuc = m1.ref_nuc;
+            assert(m.is_masked() || ((m.mut_nuc & (m.mut_nuc - 1)) == 0));
+            if (m.mut_nuc != m.par_nuc)
+            {
+                addMutation(added_excess_mutation, m, 1, set_difference);
+            }
+        }
+    }
+
+    size_t num_leaves = input.node_branch->num_leaves;
+    if (set_difference < *input.best_set_difference)
+    {
+        // Strictly better: this candidate becomes the new best.
+        *input.best_set_difference = set_difference;
+        *input.best_node_num_leaves = num_leaves;
+
*input.best_distance = input.distance;
+        input.best_node = input.node;
+        input.best_node_branch = input.node_branch;
+    }
+    else if (set_difference == *input.best_set_difference)
+    {
+        // Tie-break: smaller distance wins; at equal distance a candidate with at
+        // least as many leaves replaces the current best.
+        if (((input.distance == *input.best_distance) &&
+             ((num_leaves >= *input.best_node_num_leaves))) ||
+            (input.distance < *input.best_distance))
+        {
+            *input.best_set_difference = set_difference;
+            *input.best_node_num_leaves = num_leaves;
+            *input.best_distance = input.distance;
+            input.best_node = input.node;
+            input.best_node_branch = input.node_branch;
+        }
+    }
+
+    // Shared mutations are unregistered before descending.
+    for (auto m : common_mutations)
+    {
+        visited_excess_mutations[m.compressed_position] = 0;
+    }
+
+    // For the children this node's unshared mutations are part of the ancestral
+    // state: register the change from this branch's nucleotide to the sample's
+    // nucleotide (or to the reference when the sample has no entry there).
+    for (auto m : diff_mutations)
+    {
+        Mutation m1;
+        m1.ref_nuc = m.ref_nuc;
+        m1.par_nuc = m.mut_nuc;
+        m1.mut_nuc = m.ref_nuc;
+        m1.position = m.position;
+        m1.compressed_position = m.compressed_position;
+        if (visited_missing_sample_mutations[m.compressed_position] == timer_optimized)
+        {
+            m1.mut_nuc = cur_missing_sample_mutations[m.compressed_position].mut_nuc;
+        }
+        eraseMutation(erased_excess_mutation, m1, set_difference);
+        if (m1.mut_nuc != m1.par_nuc)
+        {
+            addMutation(added_excess_mutation, m1, 1, set_difference);
+        }
+    }
+
+    // Descend into the children. input.node / input.node_branch are overwritten and
+    // not restored afterwards.
+    PhyloNode *node = input.node;
+    PhyloNode *dad = node->dad;
+    FOR_NEIGHBOR_IT(node, dad, it)
+    {
+        PhyloNode *childNode = (PhyloNode *)(*it)->node;
+        PhyloNeighbor *childNodeBranch = (PhyloNeighbor *)childNode->findNeighbor(node);
+        input.node = childNode;
+        input.node_branch = childNodeBranch;
+        optimizedCalculatePlacementMutation(input, set_difference, false);
+    }
+
+    // Undo this call's changes to the excess-mutation table: drop everything added ...
+    for (auto m : added_excess_mutation)
+    {
+        visited_excess_mutations[m.compressed_position] = 0;
+    }
+
+    // ... then restore the erased entries in reverse order, so the oldest entry for a
+    // position is the one left in place.
+    for (int i = (int)erased_excess_mutation.size() - 1; i >= 0; --i)
+    {
+        Mutation m = erased_excess_mutation[i];
+        visited_excess_mutations[m.compressed_position] = timer_optimized;
+        cur_excess_mutations[m.compressed_position] = m;
+    }
+}
+
+/**
+ * Attach a new sample leaf next to best_node.
+ *
+ * A new internal node is inserted on the branch between best_node and its parent
+ * (best_node_branch->node). The mutations previously stored on best_node_branch and
+ * node_excess_mutations are split three ways: entries present in both (same position
+ * and mut_nuc) go to the new node's branch, the remaining branch mutations to
+ * best_node's branch, and the remaining excess mutations to the sample's branch.
+ *
+ * @param index        used as aln->getNSeq() + index for the new leaf's id and passed to setMissingNode()
+ * @param sample_name  name of the new leaf
+ */
+void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch,
std::vector node_excess_mutations, int index, std::string sample_name)
+{
+    // new_node: unnamed internal node; sample: the new leaf hanging off it.
+    PhyloNode *new_node = (PhyloNode *)newNode();
+    PhyloNode *sample = (PhyloNode *)newNode(aln->getNSeq() + index, sample_name.c_str());
+    sample->setMissingNode(index);
+    new_node->addNeighbor(sample, -1.0);
+    sample->addNeighbor(new_node, -1.0);
+    PhyloNode *best_dad = (PhyloNode *)best_node_branch->node;
+
+    std::vector common_mut, l1_mut, l2_mut;
+    std::vector curr_l1_mut;
+    // Compute current best node branch mutations
+    for (auto m1 : best_node_branch->mutations)
+    {
+        Mutation m = m1.copy();
+        curr_l1_mut.emplace_back(m);
+    }
+    // Clear mutations on the best node branch which
+    // will be later replaced by l1_mut
+    best_node_branch->clear_mutations();
+    // Compute l1_mut
+    // A fresh stamp; both lookup tables below are keyed by compressed position.
+    --timer_regular;
+    for (auto m1 : curr_l1_mut)
+    {
+        visited_ancestral_mutations[m1.compressed_position] = timer_regular;
+        cur_ancestral_mutations[m1.compressed_position] = m1;
+    }
+    for (auto m2 : node_excess_mutations)
+    {
+        visited_excess_mutations[m2.compressed_position] = timer_regular;
+        cur_excess_mutations[m2.compressed_position] = m2;
+    }
+    // l1_mut: branch mutations without an identical entry among the excess mutations.
+    for (auto m1 : curr_l1_mut)
+    {
+        bool found = false;
+        if (!m1.is_masked())
+        {
+            if (visited_excess_mutations[m1.compressed_position] == timer_regular)
+            {
+                auto m2 = cur_excess_mutations[m1.compressed_position];
+                if (m1.position == m2.position)
+                {
+                    if (m1.mut_nuc == m2.mut_nuc)
+                    {
+                        found = true;
+                    }
+                }
+            }
+        }
+        if (!found)
+        {
+            Mutation m = m1.copy();
+            l1_mut.emplace_back(m);
+        }
+    }
+    // Compute l2_mut
+    // Excess mutations matching a branch mutation are shared (common_mut); the rest
+    // belong to the sample only (l2_mut).
+    for (auto m1 : node_excess_mutations)
+    {
+        bool found = false;
+        if (!m1.is_masked())
+        {
+            if (visited_ancestral_mutations[m1.compressed_position] == timer_regular)
+            {
+                auto m2 = cur_ancestral_mutations[m1.compressed_position];
+                if (m1.position == m2.position)
+                {
+                    if (m1.mut_nuc == m2.mut_nuc)
+                    {
+                        found = true;
+                        Mutation m = m1.copy();
+                        common_mut.emplace_back(m);
+                    }
+                }
+            }
+        }
+        if (!found)
+        {
+            Mutation m = m1.copy();
+
l2_mut.emplace_back(m);
+        }
+    }
+
+    // Splice new_node into the branch between best_node and best_dad.
+    new_node->addNeighbor(best_node, -1.0);
+    new_node->addNeighbor(best_dad, -1.0);
+    best_node->updateNeighbor(best_dad, new_node, -1.0);
+    best_dad->updateNeighbor(best_node, new_node, -1.0);
+    // Add mutations to new node using common_mut
+    PhyloNeighbor *new_node_branch = (PhyloNeighbor *)new_node->findNeighbor(best_dad);
+    for (auto m : common_mut)
+    {
+        new_node_branch->add_mutation(m);
+    }
+
+    // Add mutations to best node using l1_mut
+    PhyloNeighbor *new_best_node_branch = (PhyloNeighbor *)best_node->findNeighbor(new_node);
+    for (auto m : l1_mut)
+    {
+        new_best_node_branch->add_mutation(m);
+    }
+
+    PhyloNeighbor *sample_branch = (PhyloNeighbor *)sample->findNeighbor(new_node);
+    // Add new sample mutations using l2_mut
+    for (auto m : l2_mut)
+    {
+        sample_branch->add_mutation(m);
+    }
+}
+
+/**
+ * Consistency check of the stored mutations for the subtree entered through dad_branch.
+ *
+ * For a leaf, returns its sequence (one character per site) read from the alignment.
+ * For an internal node of degree 3, reconstructs the sequence from each child by
+ * undoing the child's branch mutations (writing par_nuc at pos[position]) and
+ * verifies that both reconstructions are compatible at every site; on a mismatch it
+ * prints a message and terminates the program. Returns the reconstruction obtained
+ * from the first child.
+ *
+ * Only DNA alignments with 4 states and nodes that are leaves or have degree 3 are
+ * supported; anything else terminates with an error message. Previously that path
+ * fell off the end of this string-returning function, which is undefined behaviour.
+ *
+ * @param pos  maps Mutation::position to the index in the per-site strings
+ */
+string PhyloTree::checkPartialMutation(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad)
+{
+    PhyloNode *node = (PhyloNode *)dad_branch->node;
+    int nstates = aln->num_states;
+
+    if (nstates == 4 && aln->seq_type == SEQ_DNA && (node->isLeaf() || node->degree() == 3))
+    {
+        // ULTRAFAST VERSION FOR DNA, assuming that UINT is 32-bit integer
+        if (node->isLeaf() && dad)
+        {
+            string s = "";
+            if (node->id >= aln->getNSeq())
+                cout << node->id << " " << aln->getNSeq() << '\n';
+            assert(node->id < aln->getNSeq());
+            for (int i = 0; i < (int)aln->getNSite(); ++i)
+            {
+                Pattern pat = aln->getPattern(i);
+                s += aln->convertStateBack(pat[node->id]);
+            }
+            return s;
+        }
+        else
+        {
+            // internal node
+            int cur = 0;
+            string left, right;
+            // Initialised so that a node with fewer than two usable children is caught
+            // by the assert below instead of dereferencing indeterminate pointers.
+            PhyloNeighbor *left_branch = NULL, *right_branch = NULL;
+            FOR_NEIGHBOR_IT(node, dad, it)
+            if ((*it)->node->name != ROOT_NAME)
+            {
+                if (cur == 0)
+                    left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node), left = checkPartialMutation(pos, (PhyloNeighbor *)(*it), (PhyloNode *)node), cur = 1;
+                else
+                    right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node), right = checkPartialMutation(pos, (PhyloNeighbor *)(*it), (PhyloNode *)node);
+            }
+            assert(left_branch && right_branch);
+            // Undo each child's branch mutations to obtain this node's sequence.
+            for (auto m : left_branch->mutations)
+            {
+                assert(pos[m.position] < (int)left.length());
+                left[pos[m.position]] = aln->getStateFromMutation(m.par_nuc);
+            }
+            for (auto m : right_branch->mutations)
+            {
+                assert(pos[m.position] < (int)right.length());
+                right[pos[m.position]] = aln->getStateFromMutation(m.par_nuc);
+            }
+
+            if (left != right)
+            {
+                // Differing characters are accepted only when their nucleotide sets overlap.
+                for (int i = 0; i < (int)left.length(); ++i)
+                {
+                    if (left[i] != right[i] && (aln->getMutationFromState(left[i]) & aln->getMutationFromState(right[i])) == 0)
+                    {
+                        cout << "compute mutations wrong";
+                        exit(1);
+                    }
+                }
+            }
+            return left;
+        }
+    } // END OF DNA VERSION
+
+    // Unsupported input: fail in the same way as the other error paths of this
+    // function rather than returning an indeterminate string.
+    cout << "checkPartialMutation: only DNA alignments and nodes of degree 3 are supported";
+    exit(1);
+    return string();
+}
+
+/**
+ * Run checkPartialMutation() on both sides of the branch (dad, dad_branch->node)
+ * and verify that the two reconstructed sequences are compatible; on a mismatch a
+ * message is printed and the program terminates. branch_subst is not used here.
+ */
+void PhyloTree::checkMutationBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst)
+{
+    PhyloNode *node = (PhyloNode *)dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad);
+    assert(node_branch);
+    if (!central_partial_pars)
+        initializeAllPartialPars();
+    // swap node and dad if dad is a leaf
+    if (node->isLeaf())
+    {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+
+    string s = checkPartialMutation(pos, dad_branch, dad);
+    string t = checkPartialMutation(pos, node_branch, node);
+    for (auto m : node_branch->mutations)
+    {
+        s[pos[m.position]] = aln->getStateFromMutation(m.par_nuc);
+    }
+    for (auto m : dad_branch->mutations)
+    {
+        t[pos[m.position]] = aln->getStateFromMutation(m.par_nuc);
+    }
+    if (s != t)
+    {
+        for (int i = 0; i < (int)s.length(); ++i)
+        {
+            // Differing characters are accepted only when their nucleotide sets overlap.
+            if (s[i] != t[i] && (aln->getMutationFromState(s[i]) & aln->getMutationFromState(t[i])) == 0)
+            {
+                cout << "compute mutations wrong at root";
+                exit(1);
+            }
+        }
+    }
+}
+
+/**
+ * Entry point of the mutation consistency check: starts at the root branch and
+ * prints start/end markers. Requires a leaf root; also sets current_it /
+ * current_it_back to the root branch.
+ */
+void PhyloTree::checkMutation(vector &pos)
+{
+    cout << "========== Start checking mutations ==========\n";
+    assert(root->isLeaf());
+    PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]);
+    current_it = nei;
+    assert(current_it);
+    current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root);
+    assert(current_it_back);
+
+    checkMutationBranch(pos, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root);
+    cout << "========== End checking mutations ==========\n";
+}
+
+/**
+ * Depth-first search for a node called `name` in the subtree of `node`, not going
+ * back through `dad`. Returns the first match, or NULL when there is none.
+ */
+PhyloNode *PhyloTree::findNode(PhyloNode *node, PhyloNode *dad, string name)
+{
+    if (node->name == name)
+    {
+        return node;
+    }
+    if (node->isLeaf())
+    {
+        return NULL;
+    }
+    PhyloNode *found = NULL;
+    FOR_NEIGHBOR_IT(node, dad, it)
+    {
+        found = findNode((PhyloNode *)(*it)->node, node, name);
+        if (found)
+        {
+            return found;
+        }
+    }
+    return NULL;
+}
+
+/**
+ * Search the whole tree for a node called `name`, starting at the root's first
+ * neighbor. The root node itself is not examined.
+ */
+PhyloNode *PhyloTree::findNode(string name)
+{
+    PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]);
+    return findNode((PhyloNode *)nei->node, (PhyloNode *)root, name);
+}
\ No newline at end of file
diff --git a/phylotree.h b/phylotree.h
index 58ec5cb0..3177725d 100644
--- a/phylotree.h
+++ b/phylotree.h
@@ -279,6 +279,121 @@ class PhyloTree : public MTree, public Optimization {
      */
     virtual ~PhyloTree();
 
+    /**
+     * Add a row to the tree
+     */
+    bool add_row;
+
+    /**
+     * Packed states of the root node: 4 bits per pattern, 8 patterns per UINT
+     */
+    UINT *root_states;
+
+    /**
+     * Mutations separating the root state from the reference sequence
+     */
+    vector root_mutations;
+
+    /**
+     * Resize the per-column mutation lookup tables to num_column entries
+     */
+    void allocateMutationMemory(int num_column);
+
+    /**
+     * Run parsimony, then fill the branch mutations and root_mutations
+     */
+    void initMutation(vector &perm_col, vector &compressed_perm_col);
+
+    /**
+     * Assign mutations on the given branch and resolve both subtrees below it
+     */
+    void computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad,
int *branch_subst = NULL); + + /** + * Compute partial mutation for a branch + */ + void computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + * Compute parsimony score using mutation + */ + int computeParsimonyScoreMutation(); + + /** + * Compute parsimony score for a branch using mutation + */ + int computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + * Compute partial parsimony score for a branch using mutation + */ + int computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + * Compute the breadth-first expansion of the vertices of the tree + */ + std::vector> breadth_first_expansion(); + + std::vector cur_excess_mutations, cur_missing_sample_mutations, cur_ancestral_mutations; + std::vector visited_missing_sample_mutations, visited_ancestral_mutations; + std::vector visited_excess_mutations; + int timer_optimized, timer_regular; + + /** + * Calculate placement mutation for a candidate node + */ + void calculatePlacementMutation(CandidateNode &input, bool compute_parsimony_scores = false, bool compute_vecs = false); + + /** + * Initialize data for calculatePlacementMutation + */ + void initDataCalculatePlacementMutation(CandidateNode &inp); + + /** + * Erase a mutation from the candidate node + */ + void eraseMutation(vector &erase_excess_mutations, Mutation m, int &set_difference); + + /** + * Add a mutation to the candidate node + */ + void addMutation(vector &added_excess_mutations, Mutation m, int diff, int &set_difference); + + /** + * Optimize the placement mutation for a candidate node + */ + void optimizedCalculatePlacementMutation(CandidateNode &input, int set_difference = 0, bool firstNode = false); + + /** + * Add a new sample to the tree + */ + void addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, std::vector node_excess_mutations, int index, std::string 
name); + + /** + * Check mutations at a given position + */ + void checkMutation(vector &pos); + + /** + * Check mutations on a branch + */ + void checkMutationBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + * Check partial mutation on a branch + */ + string checkPartialMutation(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + * Find a node by name + */ + PhyloNode *findNode(PhyloNode *node, PhyloNode *dad, string name); + + /** + * Find a node by name + */ + PhyloNode *findNode(string name); + /** copy the phylogenetic tree structure into this tree, override to take sequence names in the alignment into account diff --git a/placement.cpp b/placement.cpp new file mode 100644 index 00000000..5e79f3d2 --- /dev/null +++ b/placement.cpp @@ -0,0 +1,204 @@ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "phylotree.h" +#include "alignment.h" +#include "iqtree.h" +#include "mutation.h" +#include "placement.h" + +void checkCorectTree(char *originTreeFile, char *newTreeFile) +{ + cout << "================= Check correct tree ================\n"; + IQTree *originTree = new IQTree; + bool originIsRooted = false; + originTree->readTree(originTreeFile, originIsRooted); + + IQTree *newTree = new IQTree; + bool newIsRooted = false; + newTree->readTree(newTreeFile, newIsRooted); + + vector originLeafName; + originTree->getLeafName(originLeafName); + + newTree->assignRoot(originLeafName[0]); + sort(originLeafName.begin(), originLeafName.end()); + newTree->initInfoNode(originLeafName); + + if (newTree->compareTree(originTree)) + cout << "Correct tree\n"; + else + cout << "Wrong tree\n"; + + delete originTree; + delete newTree; +} + +void configLeafNames(IQTree *tree, Node *node, Node *dad) +{ + if (node->isLeaf()) + { + node->id = tree->aln->getSeqID(node->name); + } + FOR_NEIGHBOR_IT(node, dad, it) + configLeafNames(tree, (*it)->node, node); +} + +void initializeNewColumn(IQTree *tree, Alignment *alignment, 
vector &rotatedPermutationColumn) +{ + int nsite = rotatedPermutationColumn.size(); + vector permCol(nsite); + vector compressedPermCol(nsite); + if (alignment->existingSampleMutations.size()) + { + for (int site = 0; site < nsite; ++site) + { + int col = rotatedPermutationColumn[site]; + compressedPermCol[site] = alignment->existingSampleMutations[0][col].compressed_position; + permCol[site] = alignment->existingSampleMutations[0][col].position; + } + } + alignment->ungroupSitePattern(); + tree->add_row = true; + tree->root_states = new UINT[(alignment->size() + 7) / 8 + 1]; + tree->initMutation(permCol, compressedPermCol); +} + +int readInitialAlignment(ifstream &inFileStream, char *outFileName, int numInitialRow) +{ + ofstream outFile(outFileName); + if (!outFile.is_open()) + { + cout << "Cannot open outputfile :" << outFileName << '\n'; + exit(1); + } + string line; + int currentRow = 0; + while (getline(inFileStream, line)) + { + if (line == "") + { + continue; + } + outFile << line << '\n'; + ++currentRow; + if (currentRow >= numInitialRow) + { + break; + } + } + outFile.close(); + return currentRow; +} + +int readVCFFile(IQTree *tree, Alignment **alignment, Params ¶ms) +{ + char *alnFile = params.aln_file; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(alnFile); + string line; + in.exceptions(ios::badbit); + + // Read first 12 lines and create tree alignment + int totalColumn = readInitialAlignment(in, "temp.vcf", 12) - 1; // Read first 12 lines and write to temp.vcf + *alignment = new Alignment("temp.vcf", params.sequence_type, params.intype, params.num_existing_sample); + (*alignment)->ungroupSitePattern(); + std::remove("temp.vcf"); + tree->setAlignment(*alignment); + tree->aln = *alignment; + + vector rotatedColumnPermutation = (*alignment)->findRotatedColumnPermutation(); + initializeNewColumn(tree, *alignment, rotatedColumnPermutation); + + while (true) + { + int numProcessedColumn = (*alignment)->readPartialVCF(in, 
params.sequence_type, rotatedColumnPermutation, params.num_existing_sample, totalColumn, 8); + if (numProcessedColumn == 0) + { + // Process all columns + break; + } + tree->clearAllPartialLH(); + totalColumn += numProcessedColumn; + initializeNewColumn(tree, *alignment, rotatedColumnPermutation); + } + + in.close(); + return totalColumn; +} + +void placeNewSamplesOntoExistingTree(Params ¶ms) +{ + cout << "\n========== Start initial data structure ==========\n"; + + Alignment *alignment; + IQTree *tree; + tree = new IQTree; + char *fileName = params.mutation_tree_file; + bool isRooted = false; + + tree->readTree(fileName, isRooted); + int numColumn = readVCFFile(tree, &alignment, params) + 1; + // Init new tree's memory + tree->allocateMutationMemory(numColumn); + // free memory + delete[] tree->root_states; + tree->add_row = false; + cout << "Tree parsimony after init mutations: " << tree->computeParsimonyScoreMutation() << '\n'; + + cout << "\n========== Start placement core ==========\n"; + int numSample = min((int)alignment->missingSampleMutations.size(), params.num_missing_sample); + + auto startTime = getCPUTime(); + for (int i = 0; i < numSample; ++i) + { + vector> bfs = tree->breadth_first_expansion(); + int totalNodes = (int)bfs.size(); + + CandidateNode inp; + int bestSetDifference = INF; + size_t bestNodeNumLeaves = INF; + size_t bestDistance = INF; + std::vector excessMutations; + std::vector nodeHasUnique(totalNodes, false); + bool bestNodeHasUnique = false; + size_t bestIndex = 0; + + inp.best_set_difference = &bestSetDifference; + inp.best_node_num_leaves = &bestNodeNumLeaves; + inp.best_distance = &bestDistance; + inp.node = (PhyloNode *)tree->root->neighbors[0]->node; + inp.node_branch = (PhyloNeighbor *)inp.node->findNeighbor(tree->root); + inp.missing_sample_mutations = &alignment->missingSampleMutations[i]; + inp.excess_mutations = &excessMutations; + inp.has_unique = &bestNodeHasUnique; + inp.node_has_unique = &(nodeHasUnique); + inp.best_index 
= &bestIndex; + + tree->initDataCalculatePlacementMutation(inp); + tree->optimizedCalculatePlacementMutation(inp, 0, true); + + for (int j = 0; j < totalNodes; ++j) + { + if (inp.best_node == bfs[j].first) + { + bestIndex = j; + } + } + *inp.best_set_difference = INF; + inp.index = bestIndex; + inp.node = bfs[bestIndex].first; + inp.node_branch = bfs[bestIndex].second; + tree->calculatePlacementMutation(inp, false, true); + tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missingSampleNames[i]); + } + cout << "New tree's parsimony score: " << tree->computeParsimonyScoreMutation() << '\n'; + cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - startTime) << " seconds\n"; + cout << "Memory: " << getMemory() << " KB\n"; + + delete alignment; + alignment = NULL; + delete tree; +} \ No newline at end of file diff --git a/placement.h b/placement.h new file mode 100644 index 00000000..a9e0f727 --- /dev/null +++ b/placement.h @@ -0,0 +1,19 @@ +#ifndef PLACEMENT_H +#define PLACEMENT_H + +#include "tools.h" +#include "fstream" +#include "timeutil.h" + +const int INF = (int)1e9 + 7; + +/** + * Place new samples onto existing tree + */ +void placeNewSamplesOntoExistingTree(Params ¶ms); + +/** + * Check if origin tree doesn't change. 
+ */ +void checkCorrectTree(char *originTreeFile, char *newTreeFile); +#endif diff --git a/timeutil.h b/timeutil.h index 7151d70c..c1931d84 100644 --- a/timeutil.h +++ b/timeutil.h @@ -86,7 +86,16 @@ #endif #endif /* HAVE_GETTIMEOFDAY */ - +/** + * @return CPU memory usage since program was started + */ +__inline uint64_t getMemory() { +#ifdef HAVE_GETRUSAGE + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + return usage.ru_maxrss; +#endif +} /** * @return CPU time in seconds since program was started (corrrect up to micro-seconds) diff --git a/tools.cpp b/tools.cpp index d58577a6..c8e775a7 100644 --- a/tools.cpp +++ b/tools.cpp @@ -552,6 +552,12 @@ void get2RandNumb(const int size, int &first, int &second) { void parseArg(int argc, char *argv[], Params ¶ms) { int cnt; + params.num_existing_sample = INT_MAX; + params.num_missing_sample = 0; + params.mutation_tree_file = NULL; + params.ppon = false; + params.pp_verify_preserved_tree = false; + params.original_tree_file = NULL; verbose_mode = VB_MIN; params.tree_gen = NONE; params.user_file = NULL; @@ -851,6 +857,40 @@ void parseArg(int argc, char *argv[], Params ¶ms) { #endif continue; } + if (strcmp(argv[cnt], "-pp_on") == 0) + { + params.ppon = true; + continue; + } + if (strcmp(argv[cnt], "-pp_origin") == 0) + { + cnt++; + params.original_tree_file = argv[cnt]; + continue; + } + if (strcmp(argv[cnt], "-pp_n") == 0) + { + cnt++; + params.num_existing_sample = convert_int(argv[cnt]); + continue; + } + if (strcmp(argv[cnt], "-pp_k") == 0) + { + cnt++; + params.num_missing_sample = convert_int(argv[cnt]); + continue; + } + if (strcmp(argv[cnt], "-pp_tree") == 0) + { + cnt++; + params.mutation_tree_file = argv[cnt]; + continue; + } + if (strcmp(argv[cnt], "-pp_test_optimize") == 0) + { + params.pp_verify_preserved_tree = true; + continue; + } if (strcmp(argv[cnt], "-ho") == 0 || strcmp(argv[cnt], "-?") == 0) { // usage_iqtree(argv, false); usage_mpboot(argv, false); @@ -3087,31 +3127,40 @@ void 
usage_mpboot(char* argv[], bool full_command) { } InputType detectInputFile(char *input_file) { - - try { - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(input_file); - - unsigned char ch; - int count = 0; - do { - in >> ch; - } while (ch <= 32 && !in.eof() && count++ < 20); - in.close(); - switch (ch) { - case '#': return IN_NEXUS; - case '(': return IN_NEWICK; - case '[': return IN_NEWICK; - case '>': return IN_FASTA; - default: - if (isdigit(ch)) return IN_PHYLIP; - return IN_OTHER; - } - } catch (ios::failure) { - outError("Cannot read file ", input_file); - } - return IN_OTHER; + try { + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(input_file); + + unsigned char ch; + int count = 0; + do { + in >> ch; + } while (ch <= 32 && !in.eof() && count++ < 20); + char tmp = 'N'; + in >> tmp; + in.close(); + switch (ch) { + case '#': + if (tmp == 'N') + return IN_NEXUS; + return IN_VCF; + case '(': + return IN_NEWICK; + case '[': + return IN_NEWICK; + case '>': + return IN_FASTA; + default: + if (isdigit(ch)) + return IN_PHYLIP; + return IN_OTHER; + } + } + catch (ios::failure) { + outError("Cannot read file ", input_file); + } + return IN_OTHER; } bool overwriteFile(char *filename) { diff --git a/tools.h b/tools.h index 315db2dd..f484cc31 100644 --- a/tools.h +++ b/tools.h @@ -314,7 +314,7 @@ const int SW_AVG_PRESENT = 4; // take the split weight average over all trees th input type, tree or splits graph */ enum InputType { - IN_NEWICK, IN_NEXUS, IN_FASTA, IN_PHYLIP, IN_OTHER + IN_NEWICK, IN_NEXUS, IN_FASTA, IN_PHYLIP, IN_VCF, IN_OTHER }; /** @@ -411,6 +411,35 @@ extern int NNI_MAX_NR_STEP; program parameters, everything is specified here */ struct Params { + /** + * Enable placement + */ + bool ppon; + + /** + * Number of starting row + */ + int num_existing_sample; + + /** + * Number of adding row + */ + int num_missing_sample; + + /** + * Tree file name + */ + char *mutation_tree_file; + + /** + * Original tree file name + 
*/ + char *original_tree_file; + + /** + * Checking correct tree + */ + bool pp_verify_preserved_tree; /** * Number of starting parsimony trees From d821e29148628110c1780e7e7eeefe8a6d0d8be6 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Tue, 3 Jun 2025 15:10:20 +0700 Subject: [PATCH 03/23] chore: refactor placement --- placement.cpp | 16 +++++++++------- placement.h | 2 -- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/placement.cpp b/placement.cpp index 5e79f3d2..4898b6c1 100644 --- a/placement.cpp +++ b/placement.cpp @@ -65,7 +65,7 @@ void initializeNewColumn(IQTree *tree, Alignment *alignment, vector &rotate tree->initMutation(permCol, compressedPermCol); } -int readInitialAlignment(ifstream &inFileStream, char *outFileName, int numInitialRow) +int readInitialAlignment(ifstream &INT_MAXileStream, char *outFileName, int numInitialRow) { ofstream outFile(outFileName); if (!outFile.is_open()) @@ -75,7 +75,7 @@ int readInitialAlignment(ifstream &inFileStream, char *outFileName, int numIniti } string line; int currentRow = 0; - while (getline(inFileStream, line)) + while (getline(INT_MAXileStream, line)) { if (line == "") { @@ -148,7 +148,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) tree->add_row = false; cout << "Tree parsimony after init mutations: " << tree->computeParsimonyScoreMutation() << '\n'; - cout << "\n========== Start placement core ==========\n"; + cout << "\n========== Starting placement core ==========\n"; int numSample = min((int)alignment->missingSampleMutations.size(), params.num_missing_sample); auto startTime = getCPUTime(); @@ -158,9 +158,9 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) int totalNodes = (int)bfs.size(); CandidateNode inp; - int bestSetDifference = INF; - size_t bestNodeNumLeaves = INF; - size_t bestDistance = INF; + int bestSetDifference = INT_MAX; + size_t bestNodeNumLeaves = INT_MAX; + size_t bestDistance = INT_MAX; std::vector excessMutations; std::vector nodeHasUnique(totalNodes, false); bool 
bestNodeHasUnique = false; @@ -187,13 +187,15 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) bestIndex = j; } } - *inp.best_set_difference = INF; + *inp.best_set_difference = INT_MAX; inp.index = bestIndex; inp.node = bfs[bestIndex].first; inp.node_branch = bfs[bestIndex].second; tree->calculatePlacementMutation(inp, false, true); tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missingSampleNames[i]); } + + cout << "\n========== Finished placement core ==========\n"; cout << "New tree's parsimony score: " << tree->computeParsimonyScoreMutation() << '\n'; cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - startTime) << " seconds\n"; cout << "Memory: " << getMemory() << " KB\n"; diff --git a/placement.h b/placement.h index a9e0f727..d5970a6d 100644 --- a/placement.h +++ b/placement.h @@ -5,8 +5,6 @@ #include "fstream" #include "timeutil.h" -const int INF = (int)1e9 + 7; - /** * Place new samples onto existing tree */ From 8b2bc1c2897dd1e7fc42cb0ac2479ddebf5ee457 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sun, 8 Jun 2025 09:34:26 +0700 Subject: [PATCH 04/23] wip: refactor alignment --- alignment.cpp | 210 +++++++++++++++++++++++++------------------------- alignment.h | 20 ++--- placement.cpp | 12 +-- 3 files changed, 121 insertions(+), 121 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 229529f7..b4a6f706 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) { } } -Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int numStartRow) : vector() { +Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int num_start_row) : vector() { num_states = 0; frac_const_sites = 0.0; codon_table = NULL; @@ -422,7 +422,7 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int readPhylip(filename, sequence_type); } else if (intype == IN_VCF) { cout << "VCF 
format detected" << endl; - readVCF(filename, sequence_type, numStartRow); + readVCF(filename, sequence_type, num_start_row); } else { outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format"); } @@ -1315,171 +1315,171 @@ void split(const string &s, vector &elems, const string &delim) // Find the permutation of columns after rotation vector Alignment::findRotatedColumnPermutation() { - assert(getNSite() == (int)initialColumnState.size()); + assert(getNSite() == (int)initial_column_state.size()); vector perm(getNSite(), 0); char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); - map> patternMap; + map> pattern_map; // Build pattern map for (int i = 0; i < getNSite(); ++i) { Pattern ptn = getPattern(i); - patternMap[ptn].push_back(i); + pattern_map[ptn].push_back(i); } for (int col = 0; col < getNSite(); ++col) { // For each column, build a pattern // Find initial index of the pattern Pattern nptn; - for (int i = 0; i < initialColumnState[col].length(); ++i) + for (int i = 0; i < initial_column_state[col].length(); ++i) { - nptn += char_to_state[(int)initialColumnState[col][i]]; + nptn += char_to_state[(int)initial_column_state[col][i]]; } - perm[patternMap[nptn].back()] = col; - patternMap[nptn].pop_back(); + perm[pattern_map[nptn].back()] = col; + pattern_map[nptn].pop_back(); } return perm; } -void Alignment::addToAlignmentNewSequence(const string &newName, const string &newSeq, const vector &permCol) +void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq, const vector &perm_col) { - assert(newSeq.size() == getNSite()); + assert(new_seq.size() == getNSite()); char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); - vector newVectorPattern; - vector newSitePattern; - PatternIntMap newPatternIdx; + vector new_vector_pattern; + vector new_site_pattern; + PatternIntMap new_pattern_index; for (int i = 0; i < getNSite(); ++i) { - Pattern 
newPat = getPattern(i); - newPat.push_back(char_to_state[(int)newSeq[permCol[i]]]); - PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); - if (pat_it == newPatternIdx.end()) + Pattern new_pattern = getPattern(i); + new_pattern.push_back(char_to_state[(int)new_seq[perm_col[i]]]); + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { // not found - newPat.frequency = 1; - newPat.computeConst(STATE_UNKNOWN); - newVectorPattern.push_back(newPat); - newPatternIdx[newPat] = newVectorPattern.size() - 1; - newSitePattern.push_back(newVectorPattern.size() - 1); + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_vector_pattern.push_back(new_pattern); + new_pattern_index[new_pattern] = new_vector_pattern.size() - 1; + new_site_pattern.push_back(new_vector_pattern.size() - 1); } else { int index = pat_it->second; - newVectorPattern[index].frequency++; - newSitePattern.push_back(index); + new_vector_pattern[index].frequency++; + new_site_pattern.push_back(index); } } clear(); - for (vector::iterator it = newVectorPattern.begin(); it != newVectorPattern.end(); ++it) + for (vector::iterator it = new_vector_pattern.begin(); it != new_vector_pattern.end(); ++it) { push_back(*it); } - pattern_index = newPatternIdx; - site_pattern = newSitePattern; - seq_names.push_back(newName); + pattern_index = new_pattern_index; + site_pattern = new_site_pattern; + seq_names.push_back(new_name); buildSeqStates(); // checkSeqName(); countConstSite(); } -void Alignment::addToAlignmentNewSequences(const vector &newName, const vector &newSeq, const vector &permCol) +void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences, const vector &perm_col) { char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); - vector newVectorPattern; - vector newSitePattern; - PatternIntMap newPatternIdx; - int nSeqSize = newSeq.size(); - for 
(int i = 0; i < getNSite(); ++i) + vector new_vector_pattern; + vector new_site_pattern; + PatternIntMap new_pattern_index; + int nseq = new_sequences.size(); + for (int site = 0; site < getNSite(); ++site) { - Pattern newPat = getPattern(i); - for (int j = 0; j < nSeqSize; ++j) + Pattern newPat = getPattern(site); + for (int seq = 0; seq < nseq; ++seq) { - newPat.push_back(char_to_state[(int)newSeq[j][permCol[i]]]); + newPat.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); } - PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); - if (pat_it == newPatternIdx.end()) + PatternIntMap::iterator pat_it = new_pattern_index.find(newPat); + if (pat_it == new_pattern_index.end()) { // not found newPat.frequency = 1; newPat.computeConst(STATE_UNKNOWN); - newVectorPattern.push_back(newPat); - newPatternIdx[newPat] = newVectorPattern.size() - 1; - newSitePattern.push_back(newVectorPattern.size() - 1); + new_vector_pattern.push_back(newPat); + new_pattern_index[newPat] = new_vector_pattern.size() - 1; + new_site_pattern.push_back(new_vector_pattern.size() - 1); } else { int index = pat_it->second; - newVectorPattern[index].frequency++; - newSitePattern.push_back(index); + new_vector_pattern[index].frequency++; + new_site_pattern.push_back(index); } } clear(); - for (vector::iterator it = newVectorPattern.begin(); it != newVectorPattern.end(); ++it) + for (vector::iterator it = new_vector_pattern.begin(); it != new_vector_pattern.end(); ++it) { push_back(*it); } - pattern_index = newPatternIdx; - site_pattern = newSitePattern; - seq_names.insert(seq_names.end(), newName.begin(), newName.end()); + pattern_index = new_pattern_index; + site_pattern = new_site_pattern; + seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); buildSeqStates(); // checkSeqName(); countConstSite(); } -void Alignment::updateAlignmentNewSequences(const vector &newSeq, const vector &permCol) +void Alignment::updateAlignmentNewSequences(const vector 
&new_sequences, const vector &perm_col) { computeUnknownState(); char char_to_state[NUM_CHAR]; buildStateMap(char_to_state, seq_type); - vector newVectorPattern; - vector newSitePattern; - PatternIntMap newPatternIdx; - int nseq = newSeq.size(); + vector new_vector_pattern; + vector new_site_pattern; + PatternIntMap new_pattern_index; + int nseq = new_sequences.size(); int nsite = getNSite(); for (int site = 0; site < nsite; ++site) { - Pattern newPat; + Pattern new_pattern; for (int seq = 0; seq < nseq; ++seq) { - newPat.push_back(char_to_state[(int)newSeq[seq][permCol[site]]]); + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); } - PatternIntMap::iterator pat_it = newPatternIdx.find(newPat); - if (pat_it == newPatternIdx.end()) + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { // If pattern not found, add new pattern - newPat.frequency = 1; - newPat.computeConst(STATE_UNKNOWN); - newVectorPattern.push_back(newPat); - newPatternIdx[newPat] = newVectorPattern.size() - 1; - newSitePattern.push_back(newVectorPattern.size() - 1); + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_vector_pattern.push_back(new_pattern); + new_pattern_index[new_pattern] = new_vector_pattern.size() - 1; + new_site_pattern.push_back(new_vector_pattern.size() - 1); } else { // If pattern found, increment frequency int index = pat_it->second; - newVectorPattern[index].frequency++; - newSitePattern.push_back(index); + new_vector_pattern[index].frequency++; + new_site_pattern.push_back(index); } } clear(); - for (vector::iterator itr = newVectorPattern.begin(); itr != newVectorPattern.end(); ++itr) + for (vector::iterator itr = new_vector_pattern.begin(); itr != new_vector_pattern.end(); ++itr) { push_back(*itr); } - pattern_index = newPatternIdx; - site_pattern = newSitePattern; + pattern_index = new_pattern_index; + site_pattern = new_site_pattern; buildSeqStates(); 
countConstSite(); } // Read partial VCF file and update alignment -int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &permCol, int numStartRow, int startIndex, int numColumn) +int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int num_start_row, int start_index, int num_column) { if (in.eof()) { @@ -1490,12 +1490,12 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe int nsite = 0; int seq_id = 0; string line; - int numProcessedColumn = 0; + int num_processed_column = 0; sequences.resize(nseq, ""); - existingSampleMutations.assign(nseq, vector()); + existing_sample_mutations.assign(nseq, vector()); - for (; !in.eof() && numProcessedColumn < numColumn;) + for (; !in.eof() && num_processed_column < num_column;) { getline(in, line); if (line == "") @@ -1504,13 +1504,13 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe split(line, words, "\t"); if (words.size() == 1) continue; - if (words.size() != 9 + nseq + missingSampleMutations.size()) + if (words.size() != 9 + nseq + missing_sample_mutations.size()) throw "Number of columns in VCF file is not consistent"; vector alleles; Mutation cur_mut; int variant_pos = std::stoi(words[1]); cur_mut.position = variant_pos; - cur_mut.compressed_position = numProcessedColumn + startIndex; + cur_mut.compressed_position = num_processed_column + start_index; while ((int)reference_nuc.size() <= cur_mut.position) reference_nuc.push_back(0); split(words[4], alleles, ","); @@ -1526,7 +1526,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 9].push_back(allele[0]); } @@ -1534,7 +1534,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe } else { - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 
9].push_back(words[3][0]); } @@ -1543,50 +1543,50 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe } else { - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 9].push_back('-'); } cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= numStartRow) + if (i - 9 >= num_start_row) { if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; - missingSampleMutations[i - 9 - numStartRow].push_back(cur_mut); + missing_sample_mutations[i - 9 - num_start_row].push_back(cur_mut); } } else { - existingSampleMutations[i - 9].push_back(cur_mut); + existing_sample_mutations[i - 9].push_back(cur_mut); } } ++nsite; - ++numProcessedColumn; + ++num_processed_column; } // If not enough columns, rebuild pattern and return - if (numProcessedColumn < numColumn) + if (num_processed_column < num_column) { buildPattern(sequences, sequence_type, nseq, nsite); - initialColumnState.assign(nsite, ""); + initial_column_state.assign(nsite, ""); for (int seq = 0; seq < nseq; ++seq) { for (int site = 0; site < nsite; ++site) - initialColumnState[site] += sequences[seq][site]; + initial_column_state[site] += sequences[seq][site]; } - permCol = findRotatedColumnPermutation(); - return numProcessedColumn; + perm_col = findRotatedColumnPermutation(); + return num_processed_column; } // Update alignment with new sequences - updateAlignmentNewSequences(sequences, permCol); - return numProcessedColumn; + updateAlignmentNewSequences(sequences, perm_col); + return num_processed_column; } -int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) +int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) { StrVector sequences; ifstream in; @@ -1597,7 +1597,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) int seq_id = 0; string line; in.exceptions(ios::badbit); - int processedColumn = 0; + int num_processed_column = 0; for (; 
!in.eof();) { @@ -1613,9 +1613,9 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) // Sample names start from the 10th word in the header for (int i = 9; i < words.size(); i++) { - if (i - 9 >= numStartRow) + if (i - 9 >= num_start_row) { - missingSampleNames.push_back(words[i]); + missing_sample_names.push_back(words[i]); } else { @@ -1624,18 +1624,18 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) } } sequences.resize(nseq, ""); - missingSampleMutations.resize(missingSampleNames.size()); - existingSampleMutations.resize(nseq); + missing_sample_mutations.resize(missing_sample_names.size()); + existing_sample_mutations.resize(nseq); } else { - if (words.size() != 9 + nseq + missingSampleMutations.size()) + if (words.size() != 9 + nseq + missing_sample_mutations.size()) throw "Number of columns in VCF file is not consistent"; vector alleles; Mutation cur_mut; int variant_pos = std::stoi(words[1]); cur_mut.position = variant_pos; - cur_mut.compressed_position = processedColumn; + cur_mut.compressed_position = num_processed_column; while ((int)reference_nuc.size() <= cur_mut.position) reference_nuc.push_back(0); split(words[4], alleles, ","); @@ -1651,7 +1651,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 9].push_back(allele[0]); } @@ -1659,7 +1659,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) } else { - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 9].push_back(words[3][0]); } @@ -1668,35 +1668,35 @@ int Alignment::readVCF(char *filename, char *sequence_type, int numStartRow) } else { - if (i - 9 < numStartRow) + if (i - 9 < num_start_row) { sequences[i - 9].push_back('-'); } cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= numStartRow) + 
if (i - 9 >= num_start_row) { if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; - missingSampleMutations[i - 9 - numStartRow].push_back(cur_mut); + missing_sample_mutations[i - 9 - num_start_row].push_back(cur_mut); } } else { - existingSampleMutations[i - 9].push_back(cur_mut); + existing_sample_mutations[i - 9].push_back(cur_mut); } } ++nsite; - ++processedColumn; + ++num_processed_column; } } - initialColumnState.assign(nsite, ""); + initial_column_state.assign(nsite, ""); for (int seq = 0; seq < nseq; ++seq) { for (int site = 0; site < nsite; ++site) - initialColumnState[site] += sequences[seq][site]; + initial_column_state[site] += sequences[seq][site]; } in.clear(); in.exceptions(ios::failbit | ios::badbit); diff --git a/alignment.h b/alignment.h index 933fcfdc..c7a01544 100644 --- a/alignment.h +++ b/alignment.h @@ -613,28 +613,28 @@ class Alignment : public vector { /** * Missing sample names */ - vector missingSampleNames; + vector missing_sample_names; /** * Missing sample sequences */ - vector missingSampleSequences; + vector missing_sample_sequences; /** * Initial column state * Using for finding rotated column permutation */ - vector initialColumnState; + vector initial_column_state; /** * Missing sample mutations */ - vector> missingSampleMutations; + vector> missing_sample_mutations; /** * Existing sample mutations */ - vector> existingSampleMutations; + vector> existing_sample_mutations; /** * Reference nucleotides @@ -644,17 +644,17 @@ class Alignment : public vector { /** * Replace current alignment with new sequences */ - void updateAlignmentNewSequences(const vector &newSeqs, const vector &permCol); + void updateAlignmentNewSequences(const vector &new_seqs, const vector &perm_col); /** * Add a new sequence to the alignment */ - void addToAlignmentNewSequence(const string &newName, const string &newSeq, const vector &permCol); + void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq, const 
vector &perm_col); /** * Add new sequences to the alignment */ - void addToAlignmentNewSequences(const vector &newNames, const vector &newSeqs, const vector &permCol); + void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs, const vector &perm_col); /** * Get mutation from state @@ -675,12 +675,12 @@ class Alignment : public vector { * Read partial VCF file * Using for reducing memory usage */ - int readPartialVCF(ifstream &in, char *sequence_type, vector &permCol, int numStartRow, int startIndex, int numColumn); + int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int num_start_row, int start_index, int num_column); /** * Read VCF file */ - int readVCF(char *filename, char *sequence_type, int numStartRow); + int readVCF(char *file_name, char *sequence_type, int num_start_row); protected: diff --git a/placement.cpp b/placement.cpp index 4898b6c1..b3fd1020 100644 --- a/placement.cpp +++ b/placement.cpp @@ -50,13 +50,13 @@ void initializeNewColumn(IQTree *tree, Alignment *alignment, vector &rotate int nsite = rotatedPermutationColumn.size(); vector permCol(nsite); vector compressedPermCol(nsite); - if (alignment->existingSampleMutations.size()) + if (alignment->existing_sample_mutations.size()) { for (int site = 0; site < nsite; ++site) { int col = rotatedPermutationColumn[site]; - compressedPermCol[site] = alignment->existingSampleMutations[0][col].compressed_position; - permCol[site] = alignment->existingSampleMutations[0][col].position; + compressedPermCol[site] = alignment->existing_sample_mutations[0][col].compressed_position; + permCol[site] = alignment->existing_sample_mutations[0][col].position; } } alignment->ungroupSitePattern(); @@ -149,7 +149,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) cout << "Tree parsimony after init mutations: " << tree->computeParsimonyScoreMutation() << '\n'; cout << "\n========== Starting placement core ==========\n"; - int numSample = 
min((int)alignment->missingSampleMutations.size(), params.num_missing_sample); + int numSample = min((int)alignment->missing_sample_mutations.size(), params.num_missing_sample); auto startTime = getCPUTime(); for (int i = 0; i < numSample; ++i) @@ -171,7 +171,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) inp.best_distance = &bestDistance; inp.node = (PhyloNode *)tree->root->neighbors[0]->node; inp.node_branch = (PhyloNeighbor *)inp.node->findNeighbor(tree->root); - inp.missing_sample_mutations = &alignment->missingSampleMutations[i]; + inp.missing_sample_mutations = &alignment->missing_sample_mutations[i]; inp.excess_mutations = &excessMutations; inp.has_unique = &bestNodeHasUnique; inp.node_has_unique = &(nodeHasUnique); @@ -192,7 +192,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) inp.node = bfs[bestIndex].first; inp.node_branch = bfs[bestIndex].second; tree->calculatePlacementMutation(inp, false, true); - tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missingSampleNames[i]); + tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missing_sample_names[i]); } cout << "\n========== Finished placement core ==========\n"; From 2675d53ca38238bc2110dbce2fe8b838d78d7a5f Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sun, 8 Jun 2025 11:52:09 +0700 Subject: [PATCH 05/23] feat: detect long input file --- alignment.cpp | 72 ++++++++++++++++++++++++++++++++------------------- alignment.h | 18 ++++++------- placement.cpp | 27 +++++++++++-------- placement.h | 2 ++ 4 files changed, 73 insertions(+), 46 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index b4a6f706..9eef0b74 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) { } } -Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int num_start_row) : vector() { +Alignment::Alignment(char *filename, char *sequence_type, 
InputType &intype, int existing_sequence) : vector() { num_states = 0; frac_const_sites = 0.0; codon_table = NULL; @@ -422,7 +422,7 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int readPhylip(filename, sequence_type); } else if (intype == IN_VCF) { cout << "VCF format detected" << endl; - readVCF(filename, sequence_type, num_start_row); + readVCF(filename, sequence_type, existing_sequence); } else { outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format"); } @@ -1311,6 +1311,30 @@ void split(const string &s, vector &elems, const string &delim) pos = find_pos + delim_len; } } +int Alignment::getNumberSequence(char *file_name) +{ + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(file_name); + string line; + in.exceptions(ios::badbit); + + for (; !in.eof();) + { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words[1] == "POS") + { + return words.size() - 9; + } + } + throw "Cannot find number of sequence in alignment file\n"; +} // Find the permutation of columns after rotation vector Alignment::findRotatedColumnPermutation() @@ -1479,7 +1503,7 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, } // Read partial VCF file and update alignment -int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int num_start_row, int start_index, int num_column) +int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column) { if (in.eof()) { @@ -1526,7 +1550,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; - if (i - 9 < num_start_row) + if (i - 9 < existing_sequence) { sequences[i - 9].push_back(allele[0]); } @@ -1534,7 +1558,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, 
vector &pe } else { - if (i - 9 < num_start_row) + if (i - 9 < existing_sequence) { sequences[i - 9].push_back(words[3][0]); } @@ -1543,19 +1567,19 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe } else { - if (i - 9 < num_start_row) + if (i - 9 < existing_sequence) { sequences[i - 9].push_back('-'); } cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= num_start_row) + if (i - 9 >= existing_sequence) { if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; - missing_sample_mutations[i - 9 - num_start_row].push_back(cur_mut); + missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); } } else @@ -1586,7 +1610,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe return num_processed_column; } -int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) +int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) { StrVector sequences; ifstream in; @@ -1595,6 +1619,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) int nseq = 0; int nsite = 0; int seq_id = 0; + int num_missing_sequence = 0; string line; in.exceptions(ios::badbit); int num_processed_column = 0; @@ -1613,9 +1638,10 @@ int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) // Sample names start from the 10th word in the header for (int i = 9; i < words.size(); i++) { - if (i - 9 >= num_start_row) + if (i - 9 >= existing_sequence) { - missing_sample_names.push_back(words[i]); + missing_seq_names.push_back(words[i]); + num_missing_sequence++; } else { @@ -1624,12 +1650,12 @@ int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) } } sequences.resize(nseq, ""); - missing_sample_mutations.resize(missing_sample_names.size()); existing_sample_mutations.resize(nseq); + missing_sample_mutations.resize(num_missing_sequence); } else { - if (words.size() != 9 + 
nseq + missing_sample_mutations.size()) + if (words.size() != 9 + nseq + num_missing_sequence) throw "Number of columns in VCF file is not consistent"; vector alleles; Mutation cur_mut; @@ -1651,42 +1677,36 @@ int Alignment::readVCF(char *filename, char *sequence_type, int num_start_row) if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; - if (i - 9 < num_start_row) - { + if (i - 9 < existing_sequence) sequences[i - 9].push_back(allele[0]); - } + cur_mut.mut_nuc = getMutationFromState(allele[0]); } else { - if (i - 9 < num_start_row) - { + if (i - 9 < existing_sequence) sequences[i - 9].push_back(words[3][0]); - } + cur_mut.mut_nuc = getMutationFromState(words[3][0]); } } else { - if (i - 9 < num_start_row) - { + if (i - 9 < existing_sequence) sequences[i - 9].push_back('-'); - } cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= num_start_row) + if (i - 9 >= existing_sequence) { if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; - missing_sample_mutations[i - 9 - num_start_row].push_back(cur_mut); + missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); } } else - { existing_sample_mutations[i - 9].push_back(cur_mut); - } } ++nsite; ++num_processed_column; diff --git a/alignment.h b/alignment.h index c7a01544..19a4508a 100644 --- a/alignment.h +++ b/alignment.h @@ -59,7 +59,7 @@ class Alignment : public vector { @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL @param intype (OUT) input format of the file */ - Alignment(char *filename, char *sequence_type, InputType &intype, int numStartRow = INT_MAX); + Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX); /** destructor @@ -613,12 +613,7 @@ class Alignment : public vector { /** * Missing sample names */ - vector missing_sample_names; - - /** - * Missing sample sequences - */ - vector missing_sample_sequences; + vector missing_seq_names; /** * 
Initial column state @@ -675,12 +670,17 @@ class Alignment : public vector { * Read partial VCF file * Using for reducing memory usage */ - int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int num_start_row, int start_index, int num_column); + int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column); /** * Read VCF file */ - int readVCF(char *file_name, char *sequence_type, int num_start_row); + int readVCF(char *file_name, char *sequence_type, int existing_sequence); + + /** + * Get number of sequence + */ + static int getNumberSequence(char *file_name); protected: diff --git a/placement.cpp b/placement.cpp index b3fd1020..50c5f833 100644 --- a/placement.cpp +++ b/placement.cpp @@ -45,7 +45,7 @@ void configLeafNames(IQTree *tree, Node *node, Node *dad) configLeafNames(tree, (*it)->node, node); } -void initializeNewColumn(IQTree *tree, Alignment *alignment, vector &rotatedPermutationColumn) +void initAlignment(IQTree *tree, Alignment *alignment, vector &rotatedPermutationColumn) { int nsite = rotatedPermutationColumn.size(); vector permCol(nsite); @@ -65,7 +65,7 @@ void initializeNewColumn(IQTree *tree, Alignment *alignment, vector &rotate tree->initMutation(permCol, compressedPermCol); } -int readInitialAlignment(ifstream &INT_MAXileStream, char *outFileName, int numInitialRow) +int readInitialAlignment(ifstream &inFileStream, char *outFileName, int numInitialRow) { ofstream outFile(outFileName); if (!outFile.is_open()) @@ -75,7 +75,7 @@ int readInitialAlignment(ifstream &INT_MAXileStream, char *outFileName, int numI } string line; int currentRow = 0; - while (getline(INT_MAXileStream, line)) + while (getline(inFileStream, line)) { if (line == "") { @@ -94,10 +94,18 @@ int readInitialAlignment(ifstream &INT_MAXileStream, char *outFileName, int numI int readVCFFile(IQTree *tree, Alignment **alignment, Params ¶ms) { - char *alnFile = params.aln_file; + if 
(Alignment::getNumberSequence(params.aln_file) <= MAX_SEQUENCE) { + *alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.num_existing_sample); + tree->setAlignment(*alignment); + tree->aln = *alignment; + vector rotatedColumnPermutation = (*alignment)->findRotatedColumnPermutation(); + initAlignment(tree, *alignment, rotatedColumnPermutation); + return (*alignment)->getNSite(); + } + ifstream in; in.exceptions(ios::failbit | ios::badbit); - in.open(alnFile); + in.open(params.aln_file); string line; in.exceptions(ios::badbit); @@ -110,19 +118,16 @@ int readVCFFile(IQTree *tree, Alignment **alignment, Params ¶ms) tree->aln = *alignment; vector rotatedColumnPermutation = (*alignment)->findRotatedColumnPermutation(); - initializeNewColumn(tree, *alignment, rotatedColumnPermutation); + initAlignment(tree, *alignment, rotatedColumnPermutation); while (true) { int numProcessedColumn = (*alignment)->readPartialVCF(in, params.sequence_type, rotatedColumnPermutation, params.num_existing_sample, totalColumn, 8); if (numProcessedColumn == 0) - { - // Process all columns break; - } tree->clearAllPartialLH(); totalColumn += numProcessedColumn; - initializeNewColumn(tree, *alignment, rotatedColumnPermutation); + initAlignment(tree, *alignment, rotatedColumnPermutation); } in.close(); @@ -192,7 +197,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) inp.node = bfs[bestIndex].first; inp.node_branch = bfs[bestIndex].second; tree->calculatePlacementMutation(inp, false, true); - tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missing_sample_names[i]); + tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missing_seq_names[i]); } cout << "\n========== Finished placement core ==========\n"; diff --git a/placement.h b/placement.h index d5970a6d..8b992290 100644 --- a/placement.h +++ b/placement.h @@ -5,6 +5,8 @@ #include "fstream" #include "timeutil.h" +const 
int MAX_SEQUENCE = 20000; + /** * Place new samples onto existing tree */ From 7105983c8f5c9e989e807da55afca55135a0f152 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Wed, 11 Jun 2025 16:48:33 +0700 Subject: [PATCH 06/23] feat: add new sequences to alignment --- alignment.cpp | 85 +++++++++++++++++++++++++++++---------------------- alignment.h | 11 +++++-- phylotree.cpp | 4 +++ placement.cpp | 8 ++++- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 9eef0b74..19175874 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1340,10 +1340,11 @@ int Alignment::getNumberSequence(char *file_name) vector Alignment::findRotatedColumnPermutation() { assert(getNSite() == (int)initial_column_state.size()); - vector perm(getNSite(), 0); char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); + + vector perm(getNSite(), 0); map> pattern_map; // Build pattern map for (int i = 0; i < getNSite(); ++i) @@ -1366,16 +1367,18 @@ vector Alignment::findRotatedColumnPermutation() return perm; } -void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq, const vector &perm_col) +void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq) { assert(new_seq.size() == getNSite()); - char char_to_state[NUM_CHAR]; computeUnknownState(); + buildStateMap(char_to_state, seq_type); - vector new_vector_pattern; - vector new_site_pattern; + vector new_patterns; PatternIntMap new_pattern_index; + vector new_site_patterns; + vector perm_col = findRotatedColumnPermutation(); + for (int i = 0; i < getNSite(); ++i) { Pattern new_pattern = getPattern(i); @@ -1385,69 +1388,72 @@ void Alignment::addToAlignmentNewSequence(const string &new_name, const string & { // not found new_pattern.frequency = 1; new_pattern.computeConst(STATE_UNKNOWN); - new_vector_pattern.push_back(new_pattern); - new_pattern_index[new_pattern] = new_vector_pattern.size() - 1; - 
new_site_pattern.push_back(new_vector_pattern.size() - 1); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); } else { int index = pat_it->second; - new_vector_pattern[index].frequency++; - new_site_pattern.push_back(index); + new_patterns[index].frequency++; + new_site_patterns.push_back(index); } } clear(); - for (vector::iterator it = new_vector_pattern.begin(); it != new_vector_pattern.end(); ++it) + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { push_back(*it); } pattern_index = new_pattern_index; - site_pattern = new_site_pattern; + site_pattern = new_site_patterns; seq_names.push_back(new_name); buildSeqStates(); // checkSeqName(); countConstSite(); } -void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences, const vector &perm_col) +void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences) { char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); - vector new_vector_pattern; - vector new_site_pattern; + + vector new_patterns; PatternIntMap new_pattern_index; + vector new_site_patterns; int nseq = new_sequences.size(); + vector perm_col = findRotatedColumnPermutation(); + for (int site = 0; site < getNSite(); ++site) { - Pattern newPat = getPattern(site); + Pattern new_pattern = getPattern(site); for (int seq = 0; seq < nseq; ++seq) { - newPat.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); } - PatternIntMap::iterator pat_it = new_pattern_index.find(newPat); + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); if (pat_it == new_pattern_index.end()) { // not found - newPat.frequency = 1; - newPat.computeConst(STATE_UNKNOWN); - new_vector_pattern.push_back(newPat); - 
new_pattern_index[newPat] = new_vector_pattern.size() - 1; - new_site_pattern.push_back(new_vector_pattern.size() - 1); + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); } else { int index = pat_it->second; - new_vector_pattern[index].frequency++; - new_site_pattern.push_back(index); + new_patterns[index].frequency++; + new_site_patterns.push_back(index); } } clear(); - for (vector::iterator it = new_vector_pattern.begin(); it != new_vector_pattern.end(); ++it) + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { push_back(*it); } pattern_index = new_pattern_index; - site_pattern = new_site_pattern; + site_pattern = new_site_patterns; seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); buildSeqStates(); // checkSeqName(); @@ -1460,9 +1466,9 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, char char_to_state[NUM_CHAR]; buildStateMap(char_to_state, seq_type); - vector new_vector_pattern; - vector new_site_pattern; + vector new_patterns; PatternIntMap new_pattern_index; + vector new_site_patterns; int nseq = new_sequences.size(); int nsite = getNSite(); @@ -1479,25 +1485,25 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, // If pattern not found, add new pattern new_pattern.frequency = 1; new_pattern.computeConst(STATE_UNKNOWN); - new_vector_pattern.push_back(new_pattern); - new_pattern_index[new_pattern] = new_vector_pattern.size() - 1; - new_site_pattern.push_back(new_vector_pattern.size() - 1); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); } else { // If pattern found, increment frequency int index = pat_it->second; - new_vector_pattern[index].frequency++; - 
new_site_pattern.push_back(index); + new_patterns[index].frequency++; + new_site_patterns.push_back(index); } } clear(); - for (vector::iterator itr = new_vector_pattern.begin(); itr != new_vector_pattern.end(); ++itr) + for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) { push_back(*itr); } pattern_index = new_pattern_index; - site_pattern = new_site_pattern; + site_pattern = new_site_patterns; buildSeqStates(); countConstSite(); } @@ -1650,6 +1656,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc } } sequences.resize(nseq, ""); + missing_sequences.resize(num_missing_sequence, ""); existing_sample_mutations.resize(nseq); missing_sample_mutations.resize(num_missing_sequence); } @@ -1679,6 +1686,8 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc std::string allele = alleles[allele_id - 1]; if (i - 9 < existing_sequence) sequences[i - 9].push_back(allele[0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(allele[0]); cur_mut.mut_nuc = getMutationFromState(allele[0]); } @@ -1686,6 +1695,8 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc { if (i - 9 < existing_sequence) sequences[i - 9].push_back(words[3][0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]); cur_mut.mut_nuc = getMutationFromState(words[3][0]); } @@ -1694,6 +1705,8 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc { if (i - 9 < existing_sequence) sequences[i - 9].push_back('-'); + else + missing_sequences[i - 9 - existing_sequence].push_back('-'); cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } diff --git a/alignment.h b/alignment.h index 19a4508a..8efd376d 100644 --- a/alignment.h +++ b/alignment.h @@ -611,10 +611,15 @@ class Alignment : public vector { int n_informative_sites; /** - * Missing sample names + * Missing sequence names */ vector missing_seq_names; 
+ /** + * Missing sequences + */ + vector missing_sequences; + /** * Initial column state * Using for finding rotated column permutation @@ -644,12 +649,12 @@ class Alignment : public vector { /** * Add a new sequence to the alignment */ - void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq, const vector &perm_col); + void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq); /** * Add new sequences to the alignment */ - void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs, const vector &perm_col); + void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs); /** * Get mutation from state diff --git a/phylotree.cpp b/phylotree.cpp index 3bc26ec5..d7d485fc 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -5136,6 +5136,10 @@ void PhyloTree::printTransMatrices(Node *node, Node *dad) { FOR_NEIGHBOR_IT(node, dad, it)printTransMatrices((*it)->node, node); } +/**************************************************************************** + Place new samples onto the tree + ****************************************************************************/ + void PhyloTree::allocateMutationMemory(int num_column) { cur_missing_sample_mutations.resize(num_column); diff --git a/placement.cpp b/placement.cpp index 50c5f833..b362e38f 100644 --- a/placement.cpp +++ b/placement.cpp @@ -200,10 +200,16 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missing_seq_names[i]); } + alignment->addToAlignmentNewSequences(alignment->missing_seq_names, alignment->missing_sequences); + cout << "\n========== Finished placement core ==========\n"; - cout << "New tree's parsimony score: " << tree->computeParsimonyScoreMutation() << '\n'; cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - startTime) << " seconds\n"; cout << "Memory: " << getMemory() << " KB\n"; + + cout << "New 
tree's parsimony score computed by mutation: " << tree->computeParsimonyScoreMutation() << '\n'; + tree->initializeAllPartialPars(); + tree->clearAllPartialLH(); + cout << "New tree's parsimony score computed by fitch: " << tree->computeParsimony() << '\n'; delete alignment; alignment = NULL; From 3a838269145665659346a42f3a352da74fe41574 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Wed, 11 Jun 2025 17:01:20 +0700 Subject: [PATCH 07/23] fix: init partialLH data --- placement.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/placement.cpp b/placement.cpp index b362e38f..4a1ba6a3 100644 --- a/placement.cpp +++ b/placement.cpp @@ -207,8 +207,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) cout << "Memory: " << getMemory() << " KB\n"; cout << "New tree's parsimony score computed by mutation: " << tree->computeParsimonyScoreMutation() << '\n'; - tree->initializeAllPartialPars(); - tree->clearAllPartialLH(); + tree->deleteAllPartialLh(); cout << "New tree's parsimony score computed by fitch: " << tree->computeParsimony() << '\n'; delete alignment; From 120780cf4c787c3f0532f300bb57e57841106d6d Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 09:52:58 +0700 Subject: [PATCH 08/23] refactor: placement --- alignment.cpp | 24 ------ alignment.h | 5 -- placement.cpp | 233 +++++++++++++++++++++++--------------------------- tools.cpp | 8 +- tools.h | 4 +- 5 files changed, 114 insertions(+), 160 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 19175874..3efdcea4 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1311,30 +1311,6 @@ void split(const string &s, vector &elems, const string &delim) pos = find_pos + delim_len; } } -int Alignment::getNumberSequence(char *file_name) -{ - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(file_name); - string line; - in.exceptions(ios::badbit); - - for (; !in.eof();) - { - getline(in, line); - if (line == "") - continue; - vector words; - split(line, words, 
"\t"); - if (words.size() == 1) - continue; - if (words[1] == "POS") - { - return words.size() - 9; - } - } - throw "Cannot find number of sequence in alignment file\n"; -} // Find the permutation of columns after rotation vector Alignment::findRotatedColumnPermutation() diff --git a/alignment.h b/alignment.h index 8efd376d..b9c49821 100644 --- a/alignment.h +++ b/alignment.h @@ -682,11 +682,6 @@ class Alignment : public vector { */ int readVCF(char *file_name, char *sequence_type, int existing_sequence); - /** - * Get number of sequence - */ - static int getNumberSequence(char *file_name); - protected: diff --git a/placement.cpp b/placement.cpp index 4a1ba6a3..cc747aae 100644 --- a/placement.cpp +++ b/placement.cpp @@ -8,99 +8,53 @@ #include "mutation.h" #include "placement.h" -void checkCorectTree(char *originTreeFile, char *newTreeFile) -{ - cout << "================= Check correct tree ================\n"; - IQTree *originTree = new IQTree; - bool originIsRooted = false; - originTree->readTree(originTreeFile, originIsRooted); - - IQTree *newTree = new IQTree; - bool newIsRooted = false; - newTree->readTree(newTreeFile, newIsRooted); - - vector originLeafName; - originTree->getLeafName(originLeafName); - - newTree->assignRoot(originLeafName[0]); - sort(originLeafName.begin(), originLeafName.end()); - newTree->initInfoNode(originLeafName); - - if (newTree->compareTree(originTree)) - cout << "Correct tree\n"; - else - cout << "Wrong tree\n"; - - delete originTree; - delete newTree; -} - -void configLeafNames(IQTree *tree, Node *node, Node *dad) -{ - if (node->isLeaf()) - { - node->id = tree->aln->getSeqID(node->name); - } - FOR_NEIGHBOR_IT(node, dad, it) - configLeafNames(tree, (*it)->node, node); -} - -void initAlignment(IQTree *tree, Alignment *alignment, vector &rotatedPermutationColumn) -{ - int nsite = rotatedPermutationColumn.size(); - vector permCol(nsite); - vector compressedPermCol(nsite); - if (alignment->existing_sample_mutations.size()) - { - for (int 
site = 0; site < nsite; ++site) - { - int col = rotatedPermutationColumn[site]; - compressedPermCol[site] = alignment->existing_sample_mutations[0][col].compressed_position; - permCol[site] = alignment->existing_sample_mutations[0][col].position; +void initAlignment(IQTree *tree, Alignment *alignment, vector &rotated_permutation_column) { + int nsite = rotated_permutation_column.size(); + vector perm_col(nsite); + vector compressed_perm_col(nsite); + if (alignment->existing_sample_mutations.size()) { + for (int site = 0; site < nsite; ++site) { + int col = rotated_permutation_column[site]; + compressed_perm_col[site] = alignment->existing_sample_mutations[0][col].compressed_position; + perm_col[site] = alignment->existing_sample_mutations[0][col].position; } } alignment->ungroupSitePattern(); tree->add_row = true; tree->root_states = new UINT[(alignment->size() + 7) / 8 + 1]; - tree->initMutation(permCol, compressedPermCol); + tree->initMutation(perm_col, compressed_perm_col); } -int readInitialAlignment(ifstream &inFileStream, char *outFileName, int numInitialRow) -{ - ofstream outFile(outFileName); - if (!outFile.is_open()) - { - cout << "Cannot open outputfile :" << outFileName << '\n'; +int readInitialAlignment(ifstream &in_file_stream, char *out_file_name, int num_initial_rows) { + ofstream out_file(out_file_name); + if (!out_file.is_open()) { + cout << "Cannot open outputfile :" << out_file_name << '\n'; exit(1); } string line; - int currentRow = 0; - while (getline(inFileStream, line)) - { - if (line == "") - { + int num_processed_rows = 0; + while (getline(in_file_stream, line)) { + if (line == "") { continue; } - outFile << line << '\n'; - ++currentRow; - if (currentRow >= numInitialRow) - { + out_file << line << '\n'; + ++num_processed_rows; + if (num_processed_rows >= num_initial_rows) { break; } } - outFile.close(); - return currentRow; + out_file.close(); + return num_processed_rows; } -int readVCFFile(IQTree *tree, Alignment **alignment, Params ¶ms) 
-{ - if (Alignment::getNumberSequence(params.aln_file) <= MAX_SEQUENCE) { - *alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.num_existing_sample); - tree->setAlignment(*alignment); - tree->aln = *alignment; - vector rotatedColumnPermutation = (*alignment)->findRotatedColumnPermutation(); - initAlignment(tree, *alignment, rotatedColumnPermutation); - return (*alignment)->getNSite(); +int readVCFFile(IQTree *tree, Alignment*& alignment, Params ¶ms) { + if (params.num_existing_sequences + params.num_missing_sequences <= MAX_SEQUENCE) { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.num_existing_sequences); + tree->setAlignment(alignment); + tree->aln = alignment; + vector rotatedColumnPermutation = alignment->findRotatedColumnPermutation(); + initAlignment(tree, alignment, rotatedColumnPermutation); + return (alignment)->getNSite(); } ifstream in; @@ -111,99 +65,92 @@ int readVCFFile(IQTree *tree, Alignment **alignment, Params ¶ms) // Read first 12 lines and create tree alignment int totalColumn = readInitialAlignment(in, "temp.vcf", 12) - 1; // Read first 12 lines and write to temp.vcf - *alignment = new Alignment("temp.vcf", params.sequence_type, params.intype, params.num_existing_sample); - (*alignment)->ungroupSitePattern(); + alignment = new Alignment("temp.vcf", params.sequence_type, params.intype, params.num_existing_sequences); + alignment->ungroupSitePattern(); std::remove("temp.vcf"); - tree->setAlignment(*alignment); - tree->aln = *alignment; + tree->setAlignment(alignment); + tree->aln = alignment; - vector rotatedColumnPermutation = (*alignment)->findRotatedColumnPermutation(); - initAlignment(tree, *alignment, rotatedColumnPermutation); + vector rotatedColumnPermutation = alignment->findRotatedColumnPermutation(); + initAlignment(tree, alignment, rotatedColumnPermutation); - while (true) - { - int numProcessedColumn = (*alignment)->readPartialVCF(in, params.sequence_type, 
rotatedColumnPermutation, params.num_existing_sample, totalColumn, 8); + while (true) { + int numProcessedColumn = (alignment)->readPartialVCF(in, params.sequence_type, rotatedColumnPermutation, params.num_existing_sequences, totalColumn, 8); if (numProcessedColumn == 0) break; tree->clearAllPartialLH(); totalColumn += numProcessedColumn; - initAlignment(tree, *alignment, rotatedColumnPermutation); + initAlignment(tree, alignment, rotatedColumnPermutation); } in.close(); return totalColumn; } -void placeNewSamplesOntoExistingTree(Params ¶ms) -{ +void placeNewSamplesOntoExistingTree(Params ¶ms) { cout << "\n========== Start initial data structure ==========\n"; Alignment *alignment; - IQTree *tree; - tree = new IQTree; - char *fileName = params.mutation_tree_file; - bool isRooted = false; + IQTree *tree = new IQTree; + bool is_rooted = false; - tree->readTree(fileName, isRooted); - int numColumn = readVCFFile(tree, &alignment, params) + 1; + tree->readTree(params.mutation_tree_file, is_rooted); + int sequence_length = readVCFFile(tree, alignment, params) + 1; // Init new tree's memory - tree->allocateMutationMemory(numColumn); + tree->allocateMutationMemory(sequence_length); // free memory delete[] tree->root_states; tree->add_row = false; cout << "Tree parsimony after init mutations: " << tree->computeParsimonyScoreMutation() << '\n'; cout << "\n========== Starting placement core ==========\n"; - int numSample = min((int)alignment->missing_sample_mutations.size(), params.num_missing_sample); + int num_sequences = min((int)alignment->missing_sample_mutations.size(), params.num_missing_sequences); - auto startTime = getCPUTime(); - for (int i = 0; i < numSample; ++i) - { + auto start_time = getCPUTime(); + for (int i = 0; i < num_sequences; ++i) { vector> bfs = tree->breadth_first_expansion(); - int totalNodes = (int)bfs.size(); + int total_nodes = (int)bfs.size(); CandidateNode inp; - int bestSetDifference = INT_MAX; - size_t bestNodeNumLeaves = INT_MAX; - size_t 
bestDistance = INT_MAX; - std::vector excessMutations; - std::vector nodeHasUnique(totalNodes, false); - bool bestNodeHasUnique = false; - size_t bestIndex = 0; - - inp.best_set_difference = &bestSetDifference; - inp.best_node_num_leaves = &bestNodeNumLeaves; - inp.best_distance = &bestDistance; + int best_set_difference = INT_MAX; + size_t best_node_num_leaves = INT_MAX; + size_t best_distance = INT_MAX; + std::vector excess_mutations; + std::vector node_has_unique(total_nodes, false); + bool best_node_has_unique = false; + size_t best_index = 0; + + inp.best_set_difference = &best_set_difference; + inp.best_node_num_leaves = &best_node_num_leaves; + inp.best_distance = &best_distance; inp.node = (PhyloNode *)tree->root->neighbors[0]->node; inp.node_branch = (PhyloNeighbor *)inp.node->findNeighbor(tree->root); inp.missing_sample_mutations = &alignment->missing_sample_mutations[i]; - inp.excess_mutations = &excessMutations; - inp.has_unique = &bestNodeHasUnique; - inp.node_has_unique = &(nodeHasUnique); - inp.best_index = &bestIndex; + inp.excess_mutations = &excess_mutations; + inp.has_unique = &best_node_has_unique; + inp.node_has_unique = &(node_has_unique); + inp.best_index = &best_index; tree->initDataCalculatePlacementMutation(inp); tree->optimizedCalculatePlacementMutation(inp, 0, true); - for (int j = 0; j < totalNodes; ++j) - { - if (inp.best_node == bfs[j].first) - { - bestIndex = j; + for (int j = 0; j < total_nodes; ++j) { + if (inp.best_node == bfs[j].first) { + best_index = j; } } *inp.best_set_difference = INT_MAX; - inp.index = bestIndex; - inp.node = bfs[bestIndex].first; - inp.node_branch = bfs[bestIndex].second; + inp.index = best_index; + inp.node = bfs[best_index].first; + inp.node_branch = bfs[best_index].second; tree->calculatePlacementMutation(inp, false, true); - tree->addNewSample(bfs[bestIndex].first, bfs[bestIndex].second, excessMutations, i, alignment->missing_seq_names[i]); + tree->addNewSample(bfs[best_index].first, 
bfs[best_index].second, excess_mutations, i, alignment->missing_seq_names[i]); } alignment->addToAlignmentNewSequences(alignment->missing_seq_names, alignment->missing_sequences); cout << "\n========== Finished placement core ==========\n"; - cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - startTime) << " seconds\n"; + cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - start_time) << " seconds\n"; cout << "Memory: " << getMemory() << " KB\n"; cout << "New tree's parsimony score computed by mutation: " << tree->computeParsimonyScoreMutation() << '\n'; @@ -213,4 +160,40 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) delete alignment; alignment = NULL; delete tree; +} + +void checkCorectTree(char *origin_tree_file, char *new_tree_file) { + cout << "================= Start checking correct tree ================\n"; + IQTree *origin_tree = new IQTree; + bool origin_tree_is_rooted = false; + origin_tree->readTree(origin_tree_file, origin_tree_is_rooted); + + IQTree *new_tree = new IQTree; + bool new_tree_is_rooted = false; + new_tree->readTree(new_tree_file, new_tree_is_rooted); + + vector origin_tree_leaves_name; + origin_tree->getLeafName(origin_tree_leaves_name); + + new_tree->assignRoot(origin_tree_leaves_name[0]); + sort(origin_tree_leaves_name.begin(), origin_tree_leaves_name.end()); + new_tree->initInfoNode(origin_tree_leaves_name); + + if (new_tree->compareTree(origin_tree)) { + cout << "Finish checking correct tree: Correct tree detected\n"; + } + else { + cout << "Finish checking correct tree: Wrong tree detected\n"; + } + + delete origin_tree; + delete new_tree; +} + +void configLeafNames(IQTree *tree, Node *node, Node *dad) { + if (node->isLeaf()) { + node->id = tree->aln->getSeqID(node->name); + } + FOR_NEIGHBOR_IT(node, dad, it) + configLeafNames(tree, (*it)->node, node); } \ No newline at end of file diff --git a/tools.cpp b/tools.cpp index c8e775a7..be84aac3 100644 --- a/tools.cpp +++ b/tools.cpp @@ -552,8 
+552,8 @@ void get2RandNumb(const int size, int &first, int &second) { void parseArg(int argc, char *argv[], Params ¶ms) { int cnt; - params.num_existing_sample = INT_MAX; - params.num_missing_sample = 0; + params.num_existing_sequences = INT_MAX; + params.num_missing_sequences = 0; params.mutation_tree_file = NULL; params.ppon = false; params.pp_verify_preserved_tree = false; @@ -871,13 +871,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { if (strcmp(argv[cnt], "-pp_n") == 0) { cnt++; - params.num_existing_sample = convert_int(argv[cnt]); + params.num_existing_sequences = convert_int(argv[cnt]); continue; } if (strcmp(argv[cnt], "-pp_k") == 0) { cnt++; - params.num_missing_sample = convert_int(argv[cnt]); + params.num_missing_sequences = convert_int(argv[cnt]); continue; } if (strcmp(argv[cnt], "-pp_tree") == 0) diff --git a/tools.h b/tools.h index f484cc31..4b2fbf79 100644 --- a/tools.h +++ b/tools.h @@ -419,12 +419,12 @@ struct Params { /** * Number of starting row */ - int num_existing_sample; + int num_existing_sequences; /** * Number of adding row */ - int num_missing_sample; + int num_missing_sequences; /** * Tree file name From 24d151e36f6b801ff00de2a7e418009a1d76b30b Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 13:11:37 +0700 Subject: [PATCH 09/23] refactor: place new sample onto phylo tree --- mutation.h | 28 ++- phylonode.h | 18 +- phylotree.cpp | 641 +++++++++++++++----------------------------------- phylotree.h | 6 +- placement.cpp | 55 ++--- 5 files changed, 236 insertions(+), 512 deletions(-) diff --git a/mutation.h b/mutation.h index 54f9f19e..ed1a09cc 100644 --- a/mutation.h +++ b/mutation.h @@ -11,20 +11,23 @@ #include char get_nuc(int8_t nuc_id); -struct Mutation -{ +struct Mutation { int position; int compressed_position; char ref_nuc; char par_nuc; char mut_nuc; bool is_missing; - inline bool operator<(const Mutation &m) const - { + + Mutation() { + is_missing = false; + } + + inline bool operator < (const 
Mutation &m) const { return ((*this).position < m.position); } - inline Mutation copy() const - { + + inline Mutation copy() const { Mutation m; m.position = position; m.ref_nuc = ref_nuc; @@ -34,14 +37,11 @@ struct Mutation m.compressed_position = compressed_position; return m; } - Mutation() - { - is_missing = false; - } - inline bool is_masked() const - { + + inline bool is_masked() const { return (position < 0); } + inline std::string get_string() const { if (is_masked()) { return "MASKED"; @@ -50,5 +50,9 @@ struct Mutation return get_nuc(par_nuc) + std::to_string(position) + get_nuc(mut_nuc); } } + + inline bool has_nuc(int nuc) { + return ((1 << nuc) & mut_nuc) != 0; + } }; #endif \ No newline at end of file diff --git a/phylonode.h b/phylonode.h index 8b562325..2d2ed776 100644 --- a/phylonode.h +++ b/phylonode.h @@ -217,32 +217,20 @@ class PhyloNode : public Node */ typedef vector PhyloNodeVector; -class CandidateNode +class PlacementCandidateNode { public: PhyloNode *node; PhyloNeighbor *node_branch; std::vector *missing_sample_mutations; + std::vector *excess_mutations; int *best_set_difference; - int *set_difference; size_t *best_node_num_leaves; - size_t distance; - size_t *best_distance; - size_t index; - size_t *best_index; - size_t *num_best; PhyloNode *best_node; PhyloNeighbor *best_node_branch; - std::vector *node_has_unique; - std::vector *best_j_vec; - - bool *has_unique; - - std::vector *excess_mutations; - - CandidateNode() + PlacementCandidateNode() { } }; diff --git a/phylotree.cpp b/phylotree.cpp index d7d485fc..1393b66b 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -5514,117 +5514,56 @@ vector> PhyloTree::breadth_first_expansion() return bfs; } -void PhyloTree::calculatePlacementMutation(CandidateNode &input, bool compute_parsimony_scores, bool compute_vecs) +void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) { - int set_difference = 0; - int best_set_difference = *input.best_set_difference; std::vector anc_positions; 
std::vector ancestral_mutations; - bool has_unique = false; - int node_num_mut = 0; - int num_common_mut = 0; - assert(input.node->dad); timer_regular--; - for (auto m : (*input.missing_sample_mutations)) - { - visited_missing_sample_mutations[m.compressed_position] = timer_regular; - cur_missing_sample_mutations[m.compressed_position] = m; + for (auto mutation : (*input.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_regular; + cur_missing_sample_mutations[mutation.compressed_position] = mutation; } - if (!(input.node == root)) - { - for (auto m1 : input.node_branch->mutations) - { - node_num_mut++; - auto anc_nuc = m1.mut_nuc; - if (m1.is_masked()) - { - has_unique = true; + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) break; - } assert(((anc_nuc - 1) & anc_nuc) == 0); bool found = false; bool found_pos = false; - if (visited_missing_sample_mutations[m1.compressed_position] == timer_regular) - { - auto m2 = cur_missing_sample_mutations[m1.compressed_position]; - if (m1.position == m2.position) - { + if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_regular) { + auto missing_sample_mutation = cur_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { found_pos = true; - if (m2.is_missing) - { + if (missing_sample_mutation.is_missing) { found = true; - num_common_mut++; } - else - { - auto nuc = m2.mut_nuc; - if ((nuc & anc_nuc) != 0) - { - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = m1.par_nuc; - m.mut_nuc = anc_nuc; - - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - assert((m.mut_nuc & (m.mut_nuc - 1)) == 0); - if (compute_vecs) - { - 
(*input.excess_mutations).emplace_back(m); - } - + else { + auto nuc = missing_sample_mutation.mut_nuc; + if ((nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); found = true; - num_common_mut++; } } } } - if (!found) - { - if (!found_pos && (anc_nuc == m1.ref_nuc)) - { // m.mut_nuc = m.par_nuc = m1.ref_nuc - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = m1.par_nuc; - m.mut_nuc = anc_nuc; - - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - assert((m.mut_nuc & (m.mut_nuc - 1)) == 0); - if (compute_vecs) - { - (*input.excess_mutations).emplace_back(m); - } + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); - num_common_mut++; - } - else - { - has_unique = true; } } } } - else - { - assert(false); - for (auto m : input.node_branch->mutations) - { - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - } - } - - for (auto m : ancestral_mutations) - { - visited_ancestral_mutations[m.compressed_position] = timer_regular; - cur_ancestral_mutations[m.compressed_position] = m; + for (auto ancestral_mutation : ancestral_mutations) { + visited_ancestral_mutations[ancestral_mutation.compressed_position] = timer_regular; + cur_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; } { @@ -5633,495 +5572,303 @@ void PhyloTree::calculatePlacementMutation(CandidateNode &input, bool compute_pa { n = n->dad; PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad); - for (auto m : node_branch->mutations) - { - if (!m.is_masked() && 
visited_ancestral_mutations[m.compressed_position] != timer_regular) - { - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - visited_ancestral_mutations[m.compressed_position] = timer_regular; - cur_ancestral_mutations[m.compressed_position] = m; + for (auto node_mutation : node_branch->mutations) { + if (!node_mutation.is_masked() && visited_ancestral_mutations[node_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + cur_ancestral_mutations[node_mutation.compressed_position] = node_mutation; } } } - for (auto m : root_mutations) - { - if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_regular) - { - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - visited_ancestral_mutations[m.compressed_position] = timer_regular; - cur_ancestral_mutations[m.compressed_position] = m; + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(root_mutation); + anc_positions.emplace_back(root_mutation.compressed_position); + visited_ancestral_mutations[root_mutation.compressed_position] = timer_regular; + cur_ancestral_mutations[root_mutation.compressed_position] = root_mutation; } } } - for (auto m1 : (*input.missing_sample_mutations)) - { - if (m1.is_missing) - { + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { continue; } + bool found_pos = false; bool found = false; - bool has_ref = false; - auto anc_nuc = m1.ref_nuc; - if ((m1.mut_nuc & m1.ref_nuc) != 0) - { - has_ref = true; + bool nuc = false; + auto anc_nuc = missing_sample_mutation.ref_nuc; + + if 
((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { + nuc = true; } - if (visited_ancestral_mutations[m1.compressed_position] == timer_regular) - { - auto m2 = cur_ancestral_mutations[m1.compressed_position]; - if (!m2.is_masked()) - { + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = cur_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { found_pos = true; - anc_nuc = m2.mut_nuc; - if ((m1.mut_nuc & anc_nuc) != 0) - { + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; } } } - if (!found && (found_pos || !has_ref)) - { - - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = anc_nuc; - if (has_ref) - { - m.mut_nuc = m1.ref_nuc; - } - else - { - for (int j = 0; j < 4; j++) - { - if (((1 << j) & m1.mut_nuc) != 0) - { - m.mut_nuc = (1 << j); - break; - } + if (!found && (found_pos || !nuc)) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) { + mutation.mut_nuc = (1 << nuc); + break; } } - assert((m.mut_nuc & (m.mut_nuc - 1)) == 0); - if (m.mut_nuc != m.par_nuc) - { - if (compute_vecs) - { - input.excess_mutations->emplace_back(m); - } - set_difference += 1; - if (!compute_parsimony_scores && (set_difference > best_set_difference)) - { - return; - } + assert((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0); + if (mutation.mut_nuc != mutation.par_nuc) { + input.excess_mutations->emplace_back(mutation); } } } - for (auto m1 : ancestral_mutations) - { + for (auto ancestral_mutation : 
ancestral_mutations) { bool found = false; bool found_pos = false; - auto anc_nuc = m1.mut_nuc; - if (visited_missing_sample_mutations[m1.compressed_position] == timer_regular) - { - if (!m1.is_masked()) - { - auto m2 = cur_missing_sample_mutations[m1.compressed_position]; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_regular) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = cur_missing_sample_mutations[ancestral_mutation.compressed_position]; found_pos = true; - if (m2.is_missing) - { - found = true; - } - else if ((m2.mut_nuc & anc_nuc) != 0) - { + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; } } } - if (!found && !found_pos && (m1.is_masked() || (anc_nuc != m1.ref_nuc))) - { - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = anc_nuc; - m.mut_nuc = m1.ref_nuc; - assert(m.is_masked() || ((m.mut_nuc & (m.mut_nuc - 1)) == 0)); - if (m.mut_nuc != m.par_nuc) - { - set_difference += 1; - if (!compute_parsimony_scores && (set_difference > best_set_difference)) - { - return; - } - if (compute_vecs) - { - (*input.excess_mutations).emplace_back(m); - } + if (!found && !found_pos && (ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + Mutation mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + assert(mutation.is_masked() || ((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0)); + if (mutation.mut_nuc != mutation.par_nuc) { + (*input.excess_mutations).emplace_back(mutation); } } } - - if (compute_parsimony_scores) - { - *input.set_difference = set_difference; - } - - if (set_difference > 
*input.best_set_difference) - { - return; - } - size_t num_leaves = input.node_branch->num_leaves; - if (set_difference < *input.best_set_difference) - { - *input.best_set_difference = set_difference; - *input.best_node_num_leaves = num_leaves; - *input.best_index = input.index; - *input.has_unique = has_unique; - *input.best_distance = input.distance; - (*input.node_has_unique)[input.index] = has_unique; - } - else if (set_difference == *input.best_set_difference) - { - if (((input.distance == *input.best_distance) && - ((num_leaves > *input.best_node_num_leaves) || - ((num_leaves == *input.best_node_num_leaves) && (*input.best_index < input.index)))) || - (input.distance < *input.best_distance)) - { - *input.best_set_difference = set_difference; - *input.best_node_num_leaves = num_leaves; - *input.best_index = input.index; - *input.has_unique = has_unique; - *input.best_distance = input.distance; - } - (*input.node_has_unique)[input.index] = has_unique; - } } -void PhyloTree::initDataCalculatePlacementMutation(CandidateNode &inp) +void PhyloTree::initDataPlaceNewSample(PlacementCandidateNode &inp) { ++timer_optimized; - for (auto m : (*inp.missing_sample_mutations)) - { - visited_missing_sample_mutations[m.compressed_position] = timer_optimized; - cur_missing_sample_mutations[m.compressed_position] = m; + for (auto mutation : (*inp.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_optimized; + cur_missing_sample_mutations[mutation.compressed_position] = mutation; } } -void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation m, int &set_difference) -{ - if (visited_excess_mutations[m.compressed_position] == timer_optimized) - { - erased_excess_mutation.emplace_back(cur_excess_mutations[m.compressed_position]); - visited_excess_mutations[m.compressed_position] = 0; +void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation mutation, int &set_difference) { + if 
(visited_excess_mutations[mutation.compressed_position] == timer_optimized) { + erased_excess_mutation.emplace_back(cur_excess_mutations[mutation.compressed_position]); + visited_excess_mutations[mutation.compressed_position] = 0; --set_difference; } } -void PhyloTree::addMutation(vector &added_excess_mutation, Mutation m, int diff, int &set_difference) -{ - added_excess_mutation.push_back(m); - visited_excess_mutations[m.compressed_position] = timer_optimized; - cur_excess_mutations[m.compressed_position] = m; +void PhyloTree::addMutation(vector &added_excess_mutation, Mutation mutation, int diff, int &set_difference) { + added_excess_mutation.push_back(mutation); + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + cur_excess_mutations[mutation.compressed_position] = mutation; set_difference += diff; } -void PhyloTree::optimizedCalculatePlacementMutation(CandidateNode &input, int set_difference, bool firstNode) +void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference) { - int num_common_mut = 0; - int best_set_difference = *input.best_set_difference; - - std::vector anc_positions; - std::vector ancestral_mutations; - std::vector erased_excess_mutation; - std::vector added_excess_mutation; - std::vector common_mutations; - std::vector diff_mutations; - - bool has_unique = false; - int node_num_mut = 0; - assert(input.node->dad); - - if (!(input.node == root)) - { - for (auto m1 : input.node_branch->mutations) - { - node_num_mut++; - auto anc_nuc = m1.mut_nuc; - if (m1.is_masked()) - { - has_unique = true; + vector ancentral_positions; + vector ancestral_mutations; + vector erased_excess_mutation; + vector added_excess_mutation; + vector common_mutations; + vector diff_mutations; + + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) { break; } assert(((anc_nuc - 1) & anc_nuc) == 0); 
bool found = false; bool found_pos = false; - if (visited_missing_sample_mutations[m1.compressed_position] == timer_optimized) - { - auto m2 = cur_missing_sample_mutations[m1.compressed_position]; - if (m1.position == m2.position) - { + if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_optimized) { + auto missing_sample_mutation = cur_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { found_pos = true; - if (m2.is_missing) - { - ++num_common_mut; + if (missing_sample_mutation.is_missing) { found = true; } - else - { - auto nuc = m2.mut_nuc; - if ((nuc & anc_nuc) != 0) - { - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = m1.par_nuc; - m.mut_nuc = anc_nuc; - - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - assert((m.mut_nuc & (m.mut_nuc - 1)) == 0); - + else { + auto sample_nuc = missing_sample_mutation.mut_nuc; + if ((sample_nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); found = true; - eraseMutation(erased_excess_mutation, m, set_difference); - addMutation(added_excess_mutation, m, 0, set_difference); - common_mutations.emplace_back(m); - ++num_common_mut; } } } } - if (!found) - { - if (!found_pos && (anc_nuc == m1.ref_nuc)) - { // m.mut_nuc = m.par_nuc = m1.ref_nuc - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = m1.par_nuc; - m.mut_nuc = anc_nuc; - - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - assert((m.mut_nuc & 
(m.mut_nuc - 1)) == 0); - eraseMutation(erased_excess_mutation, m, set_difference); - addMutation(added_excess_mutation, m, 0, set_difference); - common_mutations.emplace_back(m); - ++num_common_mut; + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); } - else - { - has_unique = true; - diff_mutations.emplace_back(m1); + else { + diff_mutations.emplace_back(node_mutation); } } } } - if (firstNode) - { - { - PhyloNode *n = input.node; - while (n->dad != root) - { - n = n->dad; - PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad); - for (auto m : node_branch->mutations) - { - if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_optimized) - { - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - visited_ancestral_mutations[m.compressed_position] = timer_optimized; - cur_ancestral_mutations[m.compressed_position] = m; - } - } - } - for (auto m : root_mutations) - { - if (!m.is_masked() && visited_ancestral_mutations[m.compressed_position] != timer_optimized) - { - ancestral_mutations.emplace_back(m); - anc_positions.emplace_back(m.compressed_position); - visited_ancestral_mutations[m.compressed_position] = timer_optimized; - cur_ancestral_mutations[m.compressed_position] = m; - } + if (input.node->dad == root) { + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_optimized) { + ancestral_mutations.emplace_back(root_mutation); + ancentral_positions.emplace_back(root_mutation.compressed_position); + 
visited_ancestral_mutations[root_mutation.compressed_position] = timer_optimized; + cur_ancestral_mutations[root_mutation.compressed_position] = root_mutation; } } - for (auto m1 : (*input.missing_sample_mutations)) - { - // Missing bases (Ns) are ignored - if (m1.is_missing) - { + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { continue; } - bool found_pos = false; bool found = false; + bool found_pos = false; bool has_ref = false; - auto anc_nuc = m1.ref_nuc; - if ((m1.mut_nuc & m1.ref_nuc) != 0) - { + auto anc_nuc = missing_sample_mutation.ref_nuc; + if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { has_ref = true; } - if (visited_ancestral_mutations[m1.compressed_position] == timer_optimized) - { - auto m2 = cur_ancestral_mutations[m1.compressed_position]; - if (!m2.is_masked()) - { + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_optimized) { + auto ancestral_mutation = cur_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { found_pos = true; - anc_nuc = m2.mut_nuc; - if ((m1.mut_nuc & anc_nuc) != 0) - { + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; } } } - if (!found && !has_ref) - { - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = anc_nuc; - if (has_ref) - { - m.mut_nuc = m1.ref_nuc; - } - else - { - for (int j = 0; j < 4; j++) + if (!found && !has_ref) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) { - if (((1 << j) & m1.mut_nuc) != 0) 
- { - m.mut_nuc = (1 << j); - break; - } + mutation.mut_nuc = (1 << nuc); + break; } } - assert((m.mut_nuc & (m.mut_nuc - 1)) == 0); - if (m.mut_nuc != m.par_nuc) - { - addMutation(added_excess_mutation, m, 1, set_difference); - } + addMutation(added_excess_mutation, mutation, 1, set_difference); } } } - for (auto m1 : ancestral_mutations) - { + for (auto ancestral_mutation : ancestral_mutations) { bool found = false; bool found_pos = false; - auto anc_nuc = m1.mut_nuc; - if (visited_missing_sample_mutations[m1.compressed_position] == timer_optimized) - { - if (!m1.is_masked()) - { - auto m2 = cur_missing_sample_mutations[m1.compressed_position]; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_optimized) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = cur_missing_sample_mutations[ancestral_mutation.compressed_position]; found_pos = true; - if (m2.is_missing) - { - found = true; - } - else if ((m2.mut_nuc & anc_nuc) != 0) - { + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; } } } - if (!found && (found_pos || m1.is_masked() || (anc_nuc != m1.ref_nuc))) - { - eraseMutation(erased_excess_mutation, m1, set_difference); - Mutation m; - m.position = m1.position; - m.compressed_position = m1.compressed_position; - m.ref_nuc = m1.ref_nuc; - m.par_nuc = anc_nuc; - m.mut_nuc = m1.ref_nuc; - assert(m.is_masked() || ((m.mut_nuc & (m.mut_nuc - 1)) == 0)); - if (m.mut_nuc != m.par_nuc) - { - addMutation(added_excess_mutation, m, 1, set_difference); + if (!found && (found_pos || ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + eraseMutation(erased_excess_mutation, ancestral_mutation, set_difference); + Mutation mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = 
ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); } } } size_t num_leaves = input.node_branch->num_leaves; - if (set_difference < *input.best_set_difference) - { + if (set_difference < *input.best_set_difference) { *input.best_set_difference = set_difference; *input.best_node_num_leaves = num_leaves; - *input.best_distance = input.distance; input.best_node = input.node; input.best_node_branch = input.node_branch; } - else if (set_difference == *input.best_set_difference) - { - if (((input.distance == *input.best_distance) && - ((num_leaves >= *input.best_node_num_leaves))) || - (input.distance < *input.best_distance)) - { + else if (set_difference == *input.best_set_difference) { + if (((num_leaves >= *input.best_node_num_leaves))) { *input.best_set_difference = set_difference; *input.best_node_num_leaves = num_leaves; - *input.best_distance = input.distance; input.best_node = input.node; input.best_node_branch = input.node_branch; } } - for (auto m : common_mutations) - { - visited_excess_mutations[m.compressed_position] = 0; + for (auto common_mutation : common_mutations) { + visited_excess_mutations[common_mutation.compressed_position] = 0; } - for (auto m : diff_mutations) - { - Mutation m1; - m1.ref_nuc = m.ref_nuc; - m1.par_nuc = m.mut_nuc; - m1.mut_nuc = m.ref_nuc; - m1.position = m.position; - m1.compressed_position = m.compressed_position; - if (visited_missing_sample_mutations[m.compressed_position] == timer_optimized) - { - m1.mut_nuc = cur_missing_sample_mutations[m.compressed_position].mut_nuc; + for (auto diff_mutation : diff_mutations) { + Mutation mutation; + mutation.ref_nuc = diff_mutation.ref_nuc; + mutation.par_nuc = diff_mutation.mut_nuc; + mutation.mut_nuc = diff_mutation.ref_nuc; + mutation.position = diff_mutation.position; + mutation.compressed_position = 
diff_mutation.compressed_position; + if (visited_missing_sample_mutations[diff_mutation.compressed_position] == timer_optimized) { + mutation.mut_nuc = cur_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; } - eraseMutation(erased_excess_mutation, m1, set_difference); - if (m1.mut_nuc != m1.par_nuc) - { - addMutation(added_excess_mutation, m1, 1, set_difference); + eraseMutation(erased_excess_mutation, mutation, set_difference); + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); } } PhyloNode *node = input.node; PhyloNode *dad = node->dad; - FOR_NEIGHBOR_IT(node, dad, it) - { + FOR_NEIGHBOR_IT(node, dad, it) { PhyloNode *childNode = (PhyloNode *)(*it)->node; PhyloNeighbor *childNodeBranch = (PhyloNeighbor *)childNode->findNeighbor(node); input.node = childNode; input.node_branch = childNodeBranch; - optimizedCalculatePlacementMutation(input, set_difference, false); + optimizedFindPositionPlaceNewSample(input, set_difference); } - for (auto m : added_excess_mutation) - { - visited_excess_mutations[m.compressed_position] = 0; + for (auto mutation : added_excess_mutation) { + visited_excess_mutations[mutation.compressed_position] = 0; } - for (int i = (int)erased_excess_mutation.size() - 1; i >= 0; --i) - { - Mutation m = erased_excess_mutation[i]; - visited_excess_mutations[m.compressed_position] = timer_optimized; - cur_excess_mutations[m.compressed_position] = m; + for (int i = erased_excess_mutation.size() - 1; i >= 0; i--) { + Mutation mutation = erased_excess_mutation[i]; + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + cur_excess_mutations[mutation.compressed_position] = mutation; } } diff --git a/phylotree.h b/phylotree.h index 3177725d..55ecb480 100644 --- a/phylotree.h +++ b/phylotree.h @@ -342,12 +342,12 @@ class PhyloTree : public MTree, public Optimization { /** * Calculate placement mutation for a candidate node */ - void 
calculatePlacementMutation(CandidateNode &input, bool compute_parsimony_scores = false, bool compute_vecs = false); + void computeExcessMutations(PlacementCandidateNode &input); /** * Initialize data for calculatePlacementMutation */ - void initDataCalculatePlacementMutation(CandidateNode &inp); + void initDataPlaceNewSample(PlacementCandidateNode &inp); /** * Erase a mutation from the candidate node @@ -362,7 +362,7 @@ class PhyloTree : public MTree, public Optimization { /** * Optimize the placement mutation for a candidate node */ - void optimizedCalculatePlacementMutation(CandidateNode &input, int set_difference = 0, bool firstNode = false); + void optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference = 0); /** * Add a new sample to the tree diff --git a/placement.cpp b/placement.cpp index cc747aae..2798c6ec 100644 --- a/placement.cpp +++ b/placement.cpp @@ -8,6 +8,8 @@ #include "mutation.h" #include "placement.h" +const int VCF_HEADER_LINES = 12; // Number of header lines in VCF file +const int BATCH_SIZE = 8; // Number of columns to process in each batch void initAlignment(IQTree *tree, Alignment *alignment, vector &rotated_permutation_column) { int nsite = rotated_permutation_column.size(); vector perm_col(nsite); @@ -64,7 +66,7 @@ int readVCFFile(IQTree *tree, Alignment*& alignment, Params &params) { in.exceptions(ios::badbit); // Read first 12 lines and create tree alignment - int totalColumn = readInitialAlignment(in, "temp.vcf", 12) - 1; // Read first 12 lines and write to temp.vcf + int totalColumn = readInitialAlignment(in, "temp.vcf", VCF_HEADER_LINES) - 1; // Read header lines and write to temp.vcf alignment = new Alignment("temp.vcf", params.sequence_type, params.intype, params.num_existing_sequences); alignment->ungroupSitePattern(); std::remove("temp.vcf"); @@ -75,7 +77,8 @@ int readVCFFile(IQTree *tree, Alignment*& alignment, Params &params) { initAlignment(tree, alignment, rotatedColumnPermutation); while (true) { - int
numProcessedColumn = (alignment)->readPartialVCF(in, params.sequence_type, rotatedColumnPermutation, params.num_existing_sequences, totalColumn, 8); + int numProcessedColumn = (alignment)->readPartialVCF(in, params.sequence_type, rotatedColumnPermutation, + params.num_existing_sequences, totalColumn, BATCH_SIZE); if (numProcessedColumn == 0) break; tree->clearAllPartialLH(); @@ -109,42 +112,24 @@ void placeNewSamplesOntoExistingTree(Params &params) { auto start_time = getCPUTime(); for (int i = 0; i < num_sequences; ++i) { vector> bfs = tree->breadth_first_expansion(); - int total_nodes = (int)bfs.size(); - - CandidateNode inp; + PlacementCandidateNode input; int best_set_difference = INT_MAX; size_t best_node_num_leaves = INT_MAX; - size_t best_distance = INT_MAX; std::vector excess_mutations; - std::vector node_has_unique(total_nodes, false); - bool best_node_has_unique = false; - size_t best_index = 0; - - inp.best_set_difference = &best_set_difference; - inp.best_node_num_leaves = &best_node_num_leaves; - inp.best_distance = &best_distance; - inp.node = (PhyloNode *)tree->root->neighbors[0]->node; - inp.node_branch = (PhyloNeighbor *)inp.node->findNeighbor(tree->root); - inp.missing_sample_mutations = &alignment->missing_sample_mutations[i]; - inp.excess_mutations = &excess_mutations; - inp.has_unique = &best_node_has_unique; - inp.node_has_unique = &(node_has_unique); - inp.best_index = &best_index; - - tree->initDataCalculatePlacementMutation(inp); - tree->optimizedCalculatePlacementMutation(inp, 0, true); - - for (int j = 0; j < total_nodes; ++j) { - if (inp.best_node == bfs[j].first) { - best_index = j; - } - } - *inp.best_set_difference = INT_MAX; - inp.index = best_index; - inp.node = bfs[best_index].first; - inp.node_branch = bfs[best_index].second; - tree->calculatePlacementMutation(inp, false, true); - tree->addNewSample(bfs[best_index].first, bfs[best_index].second, excess_mutations, i, alignment->missing_seq_names[i]); + + input.best_set_difference =
&best_set_difference; + input.best_node_num_leaves = &best_node_num_leaves; + input.node = (PhyloNode *)tree->root->neighbors[0]->node; + input.node_branch = (PhyloNeighbor *)input.node->findNeighbor(tree->root); + input.missing_sample_mutations = &alignment->missing_sample_mutations[i]; + input.excess_mutations = &excess_mutations; + + tree->initDataPlaceNewSample(input); + tree->optimizedFindPositionPlaceNewSample(input, 0); + input.node = input.best_node; + input.node_branch = input.best_node_branch; + tree->computeExcessMutations(input); + tree->addNewSample(input.best_node, input.best_node_branch, excess_mutations, i, alignment->missing_seq_names[i]); } alignment->addToAlignmentNewSequences(alignment->missing_seq_names, alignment->missing_sequences); From 4ab8568a76a564721605e23d3dc21bb4fa4f867a Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 15:34:06 +0700 Subject: [PATCH 10/23] refactor: alignment --- alignment.cpp | 192 +++++++++++++++++--------------------------------- 1 file changed, 65 insertions(+), 127 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 3efdcea4..2a761627 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -995,11 +995,9 @@ char Alignment::convertStateBack(char state) { } } -char Alignment::getMutationFromState(char state) -{ +char Alignment::getMutationFromState(char state) { int value = convertState(state, SEQ_DNA); - switch (value) - { + switch (value) { case 0: return 1; case 1: @@ -1035,8 +1033,7 @@ char Alignment::getMutationFromState(char state) } } -int Alignment::getStateFromMutation(int nuc) -{ +int Alignment::getStateFromMutation(int nuc) { int value; if ((nuc & (nuc - 1)) == 0) value = log2(nuc); @@ -1288,22 +1285,18 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, return 1; } -void split(const string &s, vector &elems, const string &delim) -{ +void split(const string &s, vector &elems, const string &delim) { elems.clear(); size_t pos = 0; size_t len = 
s.length(); size_t delim_len = delim.length(); - if (delim_len == 0) - { + if (delim_len == 0) { elems.push_back(s); return; } - while (pos < len) - { + while (pos < len) { size_t find_pos = s.find(delim, pos); - if (find_pos == string::npos) - { + if (find_pos == string::npos) { elems.push_back(s.substr(pos)); return; } @@ -1313,8 +1306,7 @@ void split(const string &s, vector &elems, const string &delim) } // Find the permutation of columns after rotation -vector Alignment::findRotatedColumnPermutation() -{ +vector Alignment::findRotatedColumnPermutation() { assert(getNSite() == (int)initial_column_state.size()); char char_to_state[NUM_CHAR]; computeUnknownState(); @@ -1323,28 +1315,24 @@ vector Alignment::findRotatedColumnPermutation() vector perm(getNSite(), 0); map> pattern_map; // Build pattern map - for (int i = 0; i < getNSite(); ++i) - { - Pattern ptn = getPattern(i); - pattern_map[ptn].push_back(i); + for (int i = 0; i < getNSite(); ++i) { + Pattern pattern = getPattern(i); + pattern_map[pattern].push_back(i); } - for (int col = 0; col < getNSite(); ++col) - { + for (int col = 0; col < getNSite(); ++col) { // For each column, build a pattern // Find initial index of the pattern - Pattern nptn; - for (int i = 0; i < initial_column_state[col].length(); ++i) - { - nptn += char_to_state[(int)initial_column_state[col][i]]; + Pattern pattern; + for (int i = 0; i < initial_column_state[col].length(); ++i) { + pattern += char_to_state[(int)initial_column_state[col][i]]; } - perm[pattern_map[nptn].back()] = col; - pattern_map[nptn].pop_back(); + perm[pattern_map[pattern].back()] = col; + pattern_map[pattern].pop_back(); } return perm; } -void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq) -{ +void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq) { assert(new_seq.size() == getNSite()); char char_to_state[NUM_CHAR]; computeUnknownState(); @@ -1355,41 +1343,35 @@ void 
Alignment::addToAlignmentNewSequence(const string &new_name, const string & vector new_site_patterns; vector perm_col = findRotatedColumnPermutation(); - for (int i = 0; i < getNSite(); ++i) - { + for (int i = 0; i < getNSite(); ++i) { Pattern new_pattern = getPattern(i); new_pattern.push_back(char_to_state[(int)new_seq[perm_col[i]]]); PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) - { // not found + if (pat_it == new_pattern_index.end()) { new_pattern.frequency = 1; new_pattern.computeConst(STATE_UNKNOWN); new_patterns.push_back(new_pattern); new_pattern_index[new_pattern] = new_patterns.size() - 1; new_site_patterns.push_back(new_patterns.size() - 1); } - else - { + else { int index = pat_it->second; new_patterns[index].frequency++; new_site_patterns.push_back(index); } } clear(); - for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) - { + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { push_back(*it); } pattern_index = new_pattern_index; site_pattern = new_site_patterns; seq_names.push_back(new_name); buildSeqStates(); - // checkSeqName(); countConstSite(); } -void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences) -{ +void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences) { char char_to_state[NUM_CHAR]; computeUnknownState(); buildStateMap(char_to_state, seq_type); @@ -1400,44 +1382,37 @@ void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, int nseq = new_sequences.size(); vector perm_col = findRotatedColumnPermutation(); - for (int site = 0; site < getNSite(); ++site) - { + for (int site = 0; site < getNSite(); ++site) { Pattern new_pattern = getPattern(site); - for (int seq = 0; seq < nseq; ++seq) - { + for (int seq = 0; seq < nseq; ++seq) { new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); } 
PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) - { // not found + if (pat_it == new_pattern_index.end()) { new_pattern.frequency = 1; new_pattern.computeConst(STATE_UNKNOWN); new_patterns.push_back(new_pattern); new_pattern_index[new_pattern] = new_patterns.size() - 1; new_site_patterns.push_back(new_patterns.size() - 1); } - else - { + else { int index = pat_it->second; new_patterns[index].frequency++; new_site_patterns.push_back(index); } } clear(); - for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) - { + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { push_back(*it); } pattern_index = new_pattern_index; site_pattern = new_site_patterns; seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); buildSeqStates(); - // checkSeqName(); countConstSite(); } -void Alignment::updateAlignmentNewSequences(const vector &new_sequences, const vector &perm_col) -{ +void Alignment::updateAlignmentNewSequences(const vector &new_sequences, const vector &perm_col) { computeUnknownState(); char char_to_state[NUM_CHAR]; buildStateMap(char_to_state, seq_type); @@ -1448,16 +1423,13 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, int nseq = new_sequences.size(); int nsite = getNSite(); - for (int site = 0; site < nsite; ++site) - { + for (int site = 0; site < nsite; ++site) { Pattern new_pattern; - for (int seq = 0; seq < nseq; ++seq) - { + for (int seq = 0; seq < nseq; ++seq) { new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); } PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) - { + if (pat_it == new_pattern_index.end()) { // If pattern not found, add new pattern new_pattern.frequency = 1; new_pattern.computeConst(STATE_UNKNOWN); @@ -1465,8 +1437,7 @@ void Alignment::updateAlignmentNewSequences(const vector 
&new_sequences, new_pattern_index[new_pattern] = new_patterns.size() - 1; new_site_patterns.push_back(new_patterns.size() - 1); } - else - { + else { // If pattern found, increment frequency int index = pat_it->second; new_patterns[index].frequency++; @@ -1474,8 +1445,7 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, } } clear(); - for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) - { + for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) { push_back(*itr); } pattern_index = new_pattern_index; @@ -1485,10 +1455,8 @@ void Alignment::updateAlignmentNewSequences(const vector &new_sequences, } // Read partial VCF file and update alignment -int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column) -{ - if (in.eof()) - { +int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column) { + if (in.eof()) { return 0; } StrVector sequences; @@ -1501,8 +1469,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe sequences.resize(nseq, ""); existing_sample_mutations.assign(nseq, vector()); - for (; !in.eof() && num_processed_column < num_column;) - { + for (; !in.eof() && num_processed_column < num_column;) { getline(in, line); if (line == "") continue; @@ -1523,49 +1490,38 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe cur_mut.ref_nuc = getMutationFromState(words[3][0]); if (reference_nuc[cur_mut.position] == 0) reference_nuc[cur_mut.position] = cur_mut.ref_nuc; - for (int i = 9; i < words.size(); ++i) - { + for (int i = 9; i < words.size(); ++i) { cur_mut.is_missing = false; - if (isdigit(words[i][0])) - { + if (isdigit(words[i][0])) { int allele_id = std::stoi(words[i]); - if (allele_id > 0) - { + if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; - if (i - 
9 < existing_sequence) - { + if (i - 9 < existing_sequence) { sequences[i - 9].push_back(allele[0]); } cur_mut.mut_nuc = getMutationFromState(allele[0]); } - else - { - if (i - 9 < existing_sequence) - { + else { + if (i - 9 < existing_sequence) { sequences[i - 9].push_back(words[3][0]); } cur_mut.mut_nuc = getMutationFromState(words[3][0]); } } - else - { - if (i - 9 < existing_sequence) - { + else { + if (i - 9 < existing_sequence) { sequences[i - 9].push_back('-'); } cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= existing_sequence) - { - if (cur_mut.mut_nuc != cur_mut.ref_nuc) - { + if (i - 9 >= existing_sequence) { + if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); } } - else - { + else { existing_sample_mutations[i - 9].push_back(cur_mut); } } @@ -1574,12 +1530,10 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe } // If not enough columns, rebuild pattern and return - if (num_processed_column < num_column) - { + if (num_processed_column < num_column) { buildPattern(sequences, sequence_type, nseq, nsite); initial_column_state.assign(nsite, ""); - for (int seq = 0; seq < nseq; ++seq) - { + for (int seq = 0; seq < nseq; ++seq) { for (int site = 0; site < nsite; ++site) initial_column_state[site] += sequences[seq][site]; } @@ -1592,8 +1546,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe return num_processed_column; } -int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) -{ +int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) { StrVector sequences; ifstream in; in.exceptions(ios::failbit | ios::badbit); @@ -1606,8 +1559,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc in.exceptions(ios::badbit); int num_processed_column = 0; - for (; !in.eof();) - { + for (; 
!in.eof();) { getline(in, line); if (line == "") continue; @@ -1615,18 +1567,14 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc split(line, words, "\t"); if (words.size() == 1) continue; - if (words[1] == "POS") - { + if (words[1] == "POS") { // Sample names start from the 10th word in the header - for (int i = 9; i < words.size(); i++) - { - if (i - 9 >= existing_sequence) - { + for (int i = 9; i < words.size(); i++) { + if (i - 9 >= existing_sequence) { missing_seq_names.push_back(words[i]); num_missing_sequence++; } - else - { + else { seq_names.push_back(words[i]); nseq++; } @@ -1636,8 +1584,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc existing_sample_mutations.resize(nseq); missing_sample_mutations.resize(num_missing_sequence); } - else - { + else { if (words.size() != 9 + nseq + num_missing_sequence) throw "Number of columns in VCF file is not consistent"; vector alleles; @@ -1651,14 +1598,11 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc cur_mut.ref_nuc = getMutationFromState(words[3][0]); if (reference_nuc[cur_mut.position] == 0) reference_nuc[cur_mut.position] = cur_mut.ref_nuc; - for (int i = 9; i < words.size(); ++i) - { + for (int i = 9; i < words.size(); ++i) { cur_mut.is_missing = false; - if (isdigit(words[i][0])) - { + if (isdigit(words[i][0])) { int allele_id = std::stoi(words[i]); - if (allele_id > 0) - { + if (allele_id > 0) { std::string allele = alleles[allele_id - 1]; if (i - 9 < existing_sequence) sequences[i - 9].push_back(allele[0]); @@ -1667,8 +1611,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc cur_mut.mut_nuc = getMutationFromState(allele[0]); } - else - { + else { if (i - 9 < existing_sequence) sequences[i - 9].push_back(words[3][0]); else @@ -1677,8 +1620,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc cur_mut.mut_nuc = 
getMutationFromState(words[3][0]); } } - else - { + else { if (i - 9 < existing_sequence) sequences[i - 9].push_back('-'); else @@ -1686,10 +1628,8 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc cur_mut.mut_nuc = getMutationFromState('N'); cur_mut.is_missing = true; } - if (i - 9 >= existing_sequence) - { - if (cur_mut.mut_nuc != cur_mut.ref_nuc) - { + if (i - 9 >= existing_sequence) { + if (cur_mut.mut_nuc != cur_mut.ref_nuc) { cur_mut.par_nuc = cur_mut.ref_nuc; missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); } @@ -1702,8 +1642,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc } } initial_column_state.assign(nsite, ""); - for (int seq = 0; seq < nseq; ++seq) - { + for (int seq = 0; seq < nseq; ++seq) { for (int site = 0; site < nsite; ++site) initial_column_state[site] += sequences[seq][site]; } @@ -1714,7 +1653,6 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc } int Alignment::readPhylip(char *filename, char *sequence_type) { - StrVector sequences; ostringstream err_str; ifstream in; From b8709360d8ab55310be0ff8e52209f8435a0cdcf Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 15:44:33 +0700 Subject: [PATCH 11/23] refactor: update doc for alignment --- alignment.h | 166 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 107 insertions(+), 59 deletions(-) diff --git a/alignment.h b/alignment.h index b9c49821..2538e568 100644 --- a/alignment.h +++ b/alignment.h @@ -610,78 +610,126 @@ class Alignment : public vector { int n_informative_patterns; int n_informative_sites; - /** - * Missing sequence names - */ - vector missing_seq_names; - - /** - * Missing sequences - */ - vector missing_sequences; + /** + * Names of sequences that are missing from the current alignment but present in the VCF file. + * These sequences will be processed separately during VCF file reading. 
+ */ + vector missing_seq_names; - /** - * Initial column state - * Using for finding rotated column permutation - */ - vector initial_column_state; + /** + * Actual sequence data for the missing sequences. + * Each string represents a complete sequence for one missing sample. + */ + vector missing_sequences; - /** - * Missing sample mutations - */ - vector> missing_sample_mutations; + /** + * Initial state of each column in the alignment before rotation + * Used for finding rotated column permutations to optimize memory usage and processing. + */ + vector initial_column_state; - /** - * Existing sample mutations - */ - vector> existing_sample_mutations; + /** + * Mutations found in missing sequences (sequences not in the current alignment). + * Each inner vector contains mutations for one missing sequence. + * Used to track mutations that need to be processed separately. + */ + vector> missing_sample_mutations; - /** - * Reference nucleotides - */ - vector reference_nuc; + /** + * Mutations found in existing sequences (sequences already in the alignment). + * Each inner vector contains mutations for one existing sequence. + * Used to track mutations that have already been incorporated into the alignment. + */ + vector> existing_sample_mutations; - /** - * Replace current alignment with new sequences - */ - void updateAlignmentNewSequences(const vector &new_seqs, const vector &perm_col); + /** + * Reference nucleotides for each position in the alignment. + * Used to track the original nucleotide at each position before mutations. + */ + vector reference_nuc; - /** - * Add a new sequence to the alignment - */ - void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq); + /** + * Replaces the current alignment with new sequences. 
+ * @param new_seqs Vector of new sequences to replace the current alignment + * @param perm_col Vector of column permutations to apply to the new sequences + * + * This function is used when updating the alignment with new sequence data, + * typically after processing a batch of VCF data. + */ + void updateAlignmentNewSequences(const vector &new_seqs, const vector &perm_col); - /** - * Add a new sequence to the alignment - */ - void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq); + /** + * Adds a single new sequence to the current alignment. + * @param new_seq_name Name of the new sequence to add + * @param new_seq The actual sequence data to add + * + * The sequence must be the same length as existing sequences in the alignment. + * This function handles the conversion of sequence characters to internal states + * and updates the pattern information accordingly. + */ + void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq); - /** - * Add new sequences to the alignment - */ - void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs); + /** + * Adds multiple new sequences to the current alignment. + * @param new_seq_names Vector of names for the new sequences + * @param new_seqs Vector of sequence data for the new sequences + * + * All sequences must be the same length as existing sequences in the alignment. + * This function efficiently processes multiple sequences at once by updating + * patterns and site information in a single pass. + */ + void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs); - /** - * Get mutation from state - */ - char getMutationFromState(char state); + /** + * Converts a nucleotide character to its mutation bitmask encoding.
+ * @param state Nucleotide character to convert (e.g., 'A', 'C', 'G', 'T' for DNA) + * @return Bitmask of the nucleotide (A=1, C=2, G=4, T=8; ambiguity codes combine these bits; 15 for any other character) + */ + char getMutationFromState(char state); - /** - * Get state from mutation - */ - int getStateFromMutation(int nuc); + /** + * Converts a mutation bitmask back to its corresponding nucleotide character. + * @param nuc Nucleotide bitmask to convert (as produced by getMutationFromState) + * @return Character for the nucleotide, as returned by convertStateBack + */ + int getStateFromMutation(int nuc); - /** - * Find rotated column permutation - */ - vector findRotatedColumnPermutation(); + /** + * Finds the permutation that maps each site of the current alignment to its original column. + * @return Vector whose i-th entry is the index in initial_column_state of the column stored at site i + */ + vector findRotatedColumnPermutation(); - /** - * Read partial VCF file - * Using for reducing memory usage - */ - int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column); + /** + * Reads a portion of a VCF file to process it in batches. + * @param in Input file stream for the VCF file + * @param sequence_type Type of sequence data (e.g., "DNA", "PROTEIN") + * @param perm_col Vector to store column permutations + * @param existing_sequence Number of sequences already in the alignment + * @param start_index Compressed position (column index) assigned to the first column read in this batch + * @param num_column Number of columns to process in this batch + * @return Number of columns actually processed + * + * This function is used to process large VCF files in chunks to reduce memory usage. + * It updates the alignment with new sequence data and mutation information. + */ + int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, + int existing_sequence, int start_index, int num_column); + /** + * Reads and processes a complete VCF file.
+ * @param file_name Path to the VCF file + * @param sequence_type Type of sequence data (e.g., "DNA", "PROTEIN") + * @param existing_sequence Number of sequences already in the alignment + * @return Number of sites processed + * + * This function reads a VCF file and builds an alignment from it. It handles: + * - Reading sequence names and data + * - Processing mutations and reference nucleotides + * - Building patterns and updating the alignment + * - Tracking mutations for both existing and missing sequences + */ + int readVCF(char *file_name, char *sequence_type, int existing_sequence); protected: From c3edd808d89f5aa6ade9f27f9f45e32b4ba583d9 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 16:08:21 +0700 Subject: [PATCH 12/23] improve: reduce reference size --- alignment.cpp | 64 +++++++++++++++++++++++++-------------------------- phylotree.cpp | 10 ++++---- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 2a761627..e61e7b2a 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1480,18 +1480,18 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe if (words.size() != 9 + nseq + missing_sample_mutations.size()) throw "Number of columns in VCF file is not consistent"; vector alleles; - Mutation cur_mut; + Mutation mutation; int variant_pos = std::stoi(words[1]); - cur_mut.position = variant_pos; - cur_mut.compressed_position = num_processed_column + start_index; - while ((int)reference_nuc.size() <= cur_mut.position) + mutation.position = variant_pos; + mutation.compressed_position = num_processed_column + start_index; + while ((int)reference_nuc.size() <= mutation.compressed_position) reference_nuc.push_back(0); split(words[4], alleles, ","); - cur_mut.ref_nuc = getMutationFromState(words[3][0]); - if (reference_nuc[cur_mut.position] == 0) - reference_nuc[cur_mut.position] = cur_mut.ref_nuc; + mutation.ref_nuc = getMutationFromState(words[3][0]); + if 
(reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; for (int i = 9; i < words.size(); ++i) { - cur_mut.is_missing = false; + mutation.is_missing = false; if (isdigit(words[i][0])) { int allele_id = std::stoi(words[i]); if (allele_id > 0) { @@ -1499,30 +1499,30 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe if (i - 9 < existing_sequence) { sequences[i - 9].push_back(allele[0]); } - cur_mut.mut_nuc = getMutationFromState(allele[0]); + mutation.mut_nuc = getMutationFromState(allele[0]); } else { if (i - 9 < existing_sequence) { sequences[i - 9].push_back(words[3][0]); } - cur_mut.mut_nuc = getMutationFromState(words[3][0]); + mutation.mut_nuc = getMutationFromState(words[3][0]); } } else { if (i - 9 < existing_sequence) { sequences[i - 9].push_back('-'); } - cur_mut.mut_nuc = getMutationFromState('N'); - cur_mut.is_missing = true; + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; } if (i - 9 >= existing_sequence) { - if (cur_mut.mut_nuc != cur_mut.ref_nuc) { - cur_mut.par_nuc = cur_mut.ref_nuc; - missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation); } } else { - existing_sample_mutations[i - 9].push_back(cur_mut); + existing_sample_mutations[i - 9].push_back(mutation); } } ++nsite; @@ -1588,18 +1588,18 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc if (words.size() != 9 + nseq + num_missing_sequence) throw "Number of columns in VCF file is not consistent"; vector alleles; - Mutation cur_mut; + Mutation mutation; int variant_pos = std::stoi(words[1]); - cur_mut.position = variant_pos; - cur_mut.compressed_position = num_processed_column; - while ((int)reference_nuc.size() <= cur_mut.position) + mutation.position = variant_pos; + 
mutation.compressed_position = num_processed_column; + while ((int)reference_nuc.size() <= mutation.compressed_position) reference_nuc.push_back(0); split(words[4], alleles, ","); - cur_mut.ref_nuc = getMutationFromState(words[3][0]); - if (reference_nuc[cur_mut.position] == 0) - reference_nuc[cur_mut.position] = cur_mut.ref_nuc; + mutation.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; for (int i = 9; i < words.size(); ++i) { - cur_mut.is_missing = false; + mutation.is_missing = false; if (isdigit(words[i][0])) { int allele_id = std::stoi(words[i]); if (allele_id > 0) { @@ -1609,7 +1609,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc else missing_sequences[i - 9 - existing_sequence].push_back(allele[0]); - cur_mut.mut_nuc = getMutationFromState(allele[0]); + mutation.mut_nuc = getMutationFromState(allele[0]); } else { if (i - 9 < existing_sequence) @@ -1617,7 +1617,7 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc else missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]); - cur_mut.mut_nuc = getMutationFromState(words[3][0]); + mutation.mut_nuc = getMutationFromState(words[3][0]); } } else { @@ -1625,17 +1625,17 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc sequences[i - 9].push_back('-'); else missing_sequences[i - 9 - existing_sequence].push_back('-'); - cur_mut.mut_nuc = getMutationFromState('N'); - cur_mut.is_missing = true; + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; } if (i - 9 >= existing_sequence) { - if (cur_mut.mut_nuc != cur_mut.ref_nuc) { - cur_mut.par_nuc = cur_mut.ref_nuc; - missing_sample_mutations[i - 9 - existing_sequence].push_back(cur_mut); + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - 
existing_sequence].push_back(mutation); } } else - existing_sample_mutations[i - 9].push_back(cur_mut); + existing_sample_mutations[i - 9].push_back(mutation); } ++nsite; ++num_processed_column; diff --git a/phylotree.cpp b/phylotree.cpp index 1393b66b..d9834095 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -5215,7 +5215,7 @@ void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, left_child_mut.compressed_position = compressed_perm_col[p]; left_child_mut.mut_nuc = (1 << left_child_nuc); left_child_mut.par_nuc = (1 << dad_nuc); - left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.position]; + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; left_branch->mutations.push_back(left_child_mut); left_branch_mutations.push_back(make_pair(p, left_child_nuc)); } @@ -5242,7 +5242,7 @@ void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, mut_r.compressed_position = compressed_perm_col[p]; mut_r.mut_nuc = (1 << right_child_nuc); mut_r.par_nuc = (1 << dad_nuc); - mut_r.ref_nuc = aln->reference_nuc[mut_r.position]; + mut_r.ref_nuc = aln->reference_nuc[mut_r.compressed_position]; right_branch->mutations.push_back(mut_r); right_branch_mutations.push_back(make_pair(p, right_child_nuc)); } @@ -5335,7 +5335,7 @@ void PhyloTree::computeMutationBranch(vector &perm_col, vector &compre left_child_mut.compressed_position = compressed_perm_col[col]; left_child_mut.mut_nuc = (1 << left_child_nuc); left_child_mut.par_nuc = (1 << dad_nuc); - left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.position]; + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; dad_branch->mutations.push_back(left_child_mut); } right_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + left_child_nuc)); @@ -5355,7 +5355,7 @@ void PhyloTree::computeMutationBranch(vector &perm_col, vector &compre right_child_mut.compressed_position = compressed_perm_col[col]; right_child_mut.mut_nuc = (1 << 
right_child_nuc); right_child_mut.par_nuc = (1 << dad_nuc); - right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.position]; + right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.compressed_position]; node_branch->mutations.push_back(right_child_mut); } left_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + right_child_nuc)); @@ -5377,7 +5377,7 @@ void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm for (int i = 0; i < nptn; ++i) { char root_nuc = ((root_states[ptn] >> (i * 4)) & 15); - char ref_nuc = aln->reference_nuc[perm_col[i]]; + char ref_nuc = aln->reference_nuc[compressed_perm_col[i]]; if ((root_nuc & ref_nuc) == 0) { char dad_nuc = 0; From 6e313abe7bc4b324a08a0c951cf0c4a7aec81683 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 16:26:56 +0700 Subject: [PATCH 13/23] refactor: iqtree --- iqtree.cpp | 119 +++++++++++++++++++------------------------------- iqtree.h | 81 ++++++++++++++++++++++++---------- placement.cpp | 4 +- 3 files changed, 105 insertions(+), 99 deletions(-) diff --git a/iqtree.cpp b/iqtree.cpp index f0e6241c..e4349e34 100644 --- a/iqtree.cpp +++ b/iqtree.cpp @@ -4539,150 +4539,121 @@ void IQTree::reinsertIdenticalSeqs(Alignment *orig_aln, StrVector &removed_seqs, clearAllPartialLH(); } -void IQTree::getLeafName(vector &leafName) -{ - getLeafName(root, root->neighbors[0]->node, leafName); - getLeafName(root->neighbors[0]->node, root, leafName); +void IQTree::getLeavesName(vector &leaves_name) { + getLeavesName(root, root->neighbors[0]->node, leaves_name); + getLeavesName(root->neighbors[0]->node, root, leaves_name); } -void IQTree::getLeafName(Node *node, Node *dad, vector &leafName) -{ - if (node->isLeaf()) - { - leafName.push_back(node->name); +void IQTree::getLeavesName(Node *node, Node *dad, vector &leaves_name) { + if (node->isLeaf()) { + leaves_name.push_back(node->name); return; } - FOR_NEIGHBOR_IT(node, dad, it) - { - getLeafName((*it)->node, node, leafName); - if (node->name == "") - 
{ + FOR_NEIGHBOR_IT(node, dad, it) { + getLeavesName((*it)->node, node, leaves_name); + if (node->name == "") { node->name = (*it)->node->name; } - else - { + else { node->name = min(node->name, (*it)->node->name); } } } -void IQTree::assignRoot(string &rootName) +void IQTree::assignRoot(string &root_name) { - if (root->name == rootName) + if (root->name == root_name) return; - assignRoot(root->neighbors[0]->node, root, rootName); + assignRoot(root->neighbors[0]->node, root, root_name); } -bool IQTree::assignRoot(Node *node, Node *dad, string &rootName) +bool IQTree::assignRoot(Node *node, Node *dad, string &root_name) { - if (node->isLeaf() && node->name == rootName) - { + if (node->isLeaf() && node->name == root_name) { root = node; return true; } - FOR_NEIGHBOR_IT(node, dad, it) - { - if (assignRoot((*it)->node, node, rootName)) - { + FOR_NEIGHBOR_IT(node, dad, it) { + if (assignRoot((*it)->node, node, root_name)) { return true; } } } -int IQTree::initInfoNode(vector &leafName) -{ +int IQTree::initNodeData(vector &leaves_name) { PhyloNode *node1 = (PhyloNode *)root; PhyloNode *node2 = (PhyloNode *)root->neighbors[0]->node; - int lf = initInfoNode(node1, node2, leafName); - int rg = initInfoNode(node2, node1, leafName); - return lf + rg; + int left_child_num_missing_sample = initInfoNode(node1, node2, leaves_name); + int right_child_num_missing_sample = initInfoNode(node2, node1, leaves_name); + return left_child_num_missing_sample + right_child_num_missing_sample; } -int IQTree::initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leafName) -{ - if (node->isLeaf()) - { - int k = lower_bound(leafName.begin(), leafName.end(), node->name) - leafName.begin(); - if (k < leafName.size() && leafName[k] == node->name) - { +int IQTree::initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leaves_name) { + if (node->isLeaf()) { + int node_index = lower_bound(leaves_name.begin(), leaves_name.end(), node->name) - leaves_name.begin(); + if (node_index < leaves_name.size() 
&& leaves_name[node_index] == node->name) { node->setMissingNode(-1); return 1; } - else - { + else { node->setMissingNode(1); return 0; } } - int sum = 0; + int total_missing_sample = 0; bool check = true; - FOR_NEIGHBOR_IT(node, dad, it) - { - int tmp = initInfoNode((PhyloNode *)(*it)->node, node, leafName); - if (tmp == 0) - { + FOR_NEIGHBOR_IT(node, dad, it) { + int num_missing_sample = initInfoNode((PhyloNode *)(*it)->node, node, leaves_name); + if (num_missing_sample == 0) { check = false; } - else - { - if (node->name == "") - { + else { + if (node->name == "") { node->name = (*it)->node->name; } - else - { + else { node->name = min(node->name, (*it)->node->name); } } - sum += tmp; + total_missing_sample += num_missing_sample; } - if (check) - { + if (check) { node->setMissingNode(-1); } - else - { + else { node->setMissingNode(1); } - return sum; + return total_missing_sample; } -bool IQTree::compareTree(IQTree *anotherTree) -{ +bool IQTree::compareTree(IQTree *anotherTree) { if (root->name != anotherTree->root->name) return false; return compareTree((PhyloNode *)root, NULL, anotherTree->root, NULL); } -bool IQTree::compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2) -{ +bool IQTree::compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2) { bool check = true; - FOR_NEIGHBOR_IT(node1, dad1, it1) - { + FOR_NEIGHBOR_IT(node1, dad1, it1) { PhyloNode *child1 = (PhyloNode *)(*it1)->node; - if (!child1->checkMissingNode()) - { + if (!child1->checkMissingNode()) { bool found = false; - FOR_NEIGHBOR_IT(node2, dad2, it2) - { + FOR_NEIGHBOR_IT(node2, dad2, it2) { Node *child2 = (*it2)->node; - if (child1->name == child2->name) - { + if (child1->name == child2->name) { found = true; check &= compareTree(child1, node1, child2, node2); break; } } - if (!found) - { + if (!found) { return false; } } - else - { + else { check &= compareTree(child1, node1, node2, dad2); } } diff --git a/iqtree.h b/iqtree.h index 5562380b..b64c1927 100644 
--- a/iqtree.h +++ b/iqtree.h @@ -665,43 +665,78 @@ class IQTree : public PhyloTree { public: /** - * Get all leaf names of the tree. - */ - void getLeafName(vector &leafName); + * Retrieves all leaf node names from the tree and stores them in a vector. + * + * @param leaves_name [out] Vector to store the leaf names. Will be populated with + * all leaf node names from the tree. + */ + void getLeavesName(vector &leaves_name); /** - * Get all leaf names of the tree rooted at node. - */ - void getLeafName(Node *node, Node *dad, vector& leafName); + * Retrieves all leaf node names from a subtree rooted at the specified node. + * + * @param node [in] The root node of the subtree to traverse + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param leaves_name [out] Vector to store the leaf names. Will be populated with + * all leaf node names from the subtree. + */ + void getLeavesName(Node *node, Node *dad, vector& leaves_name); /** - * Assign root with given name. - */ - void assignRoot(string &rootName); + * Assigns the root of the tree to the node with the specified name. + * + * @param root_name [in] The name of the node that should become the root + * @throws std::runtime_error if no node with the given name is found + */ + void assignRoot(string &root_name); /** - * Assign root with given name. - */ - bool assignRoot(Node *node, Node *dad, string &rootName); + * Attempts to assign the root of a subtree to the node with the specified name. + * + * @param node [in] The current node being considered + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param root_name [in] The name of the node that should become the root + * @return true if the root was successfully assigned, false otherwise + */ + bool assignRoot(Node *node, Node *dad, string &root_name); /** - * Init info which node is original node, which node is added node. 
- */ - int initInfoNode(vector &leafName); + * Initializes node data by marking which nodes are original vs added nodes. + * This is done by comparing against a list of known leaf names. + * + * @param leaves_name [in] Vector containing the names of original leaf nodes + * @return The number of nodes that were successfully initialized + */ + int initNodeData(vector &leaves_name); /** - * Init info which node is original node, which node is added node. - */ - int initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leafName); + * Recursively initializes node data for a subtree by marking which nodes are + * original vs added nodes. This is done by comparing against a list of known leaf names. + * + * @param node [in] The current node being considered + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param leaves_name [in] Vector containing the names of original leaf nodes + * @return The number of nodes that were successfully initialized in this subtree + */ + int initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leaves_name); /** - * Compare two trees. - */ - bool compareTree(IQTree *anotherTree); + * Compares the current tree with another tree to check if they have the same topology. + * + * @param another_tree [in] Pointer to the tree to compare against + * @return true if the trees have identical topology, false otherwise + */ + bool compareTree(IQTree *another_tree); /** - * Compare two trees rooted at node1 and node2. - */ + * Recursively compares two subtrees to check if they have the same topology. 
+ * + * @param node1 [in] Root node of the first subtree + * @param dad1 [in] Parent node of node1, used to direct the traversal + * @param node2 [in] Root node of the second subtree + * @param dad2 [in] Parent node of node2, used to direct the traversal + * @return true if the subtrees have identical topology, false otherwise + */ bool compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2); /** diff --git a/placement.cpp b/placement.cpp index 2798c6ec..6bba8e9a 100644 --- a/placement.cpp +++ b/placement.cpp @@ -158,11 +158,11 @@ void checkCorectTree(char *origin_tree_file, char *new_tree_file) { new_tree->readTree(new_tree_file, new_tree_is_rooted); vector origin_tree_leaves_name; - origin_tree->getLeafName(origin_tree_leaves_name); + origin_tree->getLeavesName(origin_tree_leaves_name); new_tree->assignRoot(origin_tree_leaves_name[0]); sort(origin_tree_leaves_name.begin(), origin_tree_leaves_name.end()); - new_tree->initInfoNode(origin_tree_leaves_name); + new_tree->initNodeData(origin_tree_leaves_name); if (new_tree->compareTree(origin_tree)) { cout << "Finish checking correct tree: Correct tree detected\n"; From 5b021a6c9f73519dcda8910084a34224089fc82f Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 18:37:22 +0700 Subject: [PATCH 14/23] refactor: phylo tree --- phylonode.h | 5 - phylotree.cpp | 640 +++++++++++++++++++------------------------------- phylotree.h | 150 +++++++++--- placement.cpp | 4 +- 4 files changed, 353 insertions(+), 446 deletions(-) diff --git a/phylonode.h b/phylonode.h index 2d2ed776..7bfe73fe 100644 --- a/phylonode.h +++ b/phylonode.h @@ -97,11 +97,6 @@ class PhyloNeighbor : public Neighbor */ int num_leaves; - /** - * Distance to the root - */ - int distance; - /** * Clear all mutations on this branch */ diff --git a/phylotree.cpp b/phylotree.cpp index d9834095..5dd6dc5e 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -1078,18 +1078,12 @@ int PhyloTree::computeParsimonyBranch(PhyloNeighbor 
*dad_branch, PhyloNode *dad, _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; } - if (add_row) - { - for (int i = 0; i < maxi; ++i) - { - for (int j = 0; j < 4; ++j) - { - if (states_dad & (1 << (i * 4 + j))) - { - for (int k = j + 1; k < 4; ++k) - { - if (states_dad & (1 << (i * 4 + k))) - { + if (add_row) { + for (int i = 0; i < maxi; ++i) { + for (int j = 0; j < 4; ++j) { + if (states_dad & (1 << (i * 4 + j))) { + for (int k = j + 1; k < 4; ++k) { + if (states_dad & (1 << (i * 4 + k))) { states_dad ^= (1 << (i * 4 + k)); } } @@ -5142,114 +5136,93 @@ void PhyloTree::printTransMatrices(Node *node, Node *dad) { void PhyloTree::allocateMutationMemory(int num_column) { - cur_missing_sample_mutations.resize(num_column); - cur_ancestral_mutations.resize(num_column); + current_missing_sample_mutations.resize(num_column); + current_ancestral_mutations.resize(num_column); visited_missing_sample_mutations.resize(num_column); visited_ancestral_mutations.resize(num_column); - cur_excess_mutations.resize(num_column); + current_excess_mutations.resize(num_column); visited_excess_mutations.resize(num_column); } void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad) { PhyloNode *node = (PhyloNode *)dad_branch->node; - int ptn; - int nstates = aln->num_states; - int pars_size = getBitsBlockSize(); - int entry_size = getBitsEntrySize(); - int nptn = aln->size(); - int ptn_pars_start_id = pars_size - nptn - 1; - - if (node->isLeaf() && dad) - { - // Leaf node does not have mutations + if (node->isLeaf() && dad) { return; } - // Process internal node + + int ptn; + int nptn = aln->size(); UINT *left = NULL, *right = NULL; PhyloNeighbor *left_branch, *right_branch; FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) - { + if ((*it)->node->name != ROOT_NAME) { if 
(!left) left = ((PhyloNeighbor *)(*it))->partial_pars, left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); else right = ((PhyloNeighbor *)(*it))->partial_pars, right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); } - int p = -1; + int col = -1; vector> left_branch_mutations, right_branch_mutations; - for (ptn = 0; ptn < aln->size(); ptn += 8) - { - // cout << dad_branch->partial_pars[pars_size - 1] << ": ***\n"; - UINT left_state = left[ptn / 8]; - UINT right_state = right[ptn / 8]; - UINT dad_state = states_dad[ptn / 8]; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = left[ptn / 8]; + UINT right_states = right[ptn / 8]; + UINT dad_states = states_dad[ptn / 8]; int maxi = aln->size() - ptn; - if (maxi > 8) - maxi = 8; - for (int i = 0; i < maxi; i++) - { - ++p; - UINT state_left = (left_state >> (i * 4)) & 15; - UINT state_right = (right_state >> (i * 4)) & 15; - UINT state_both = (dad_state >> (i * 4)) & 15; - + if (maxi > 8) maxi = 8; + for (int i = 0; i < maxi; i++) { + ++col; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; char dad_nuc = 0; for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) - if (1 & (state_both >> dad_nuc)) + if (1 & (dad_state >> dad_nuc)) break; char left_child_nuc; - if ((1 & (state_left >> dad_nuc)) == 1) - { + if ((1 & (left_state >> dad_nuc))) { left_child_nuc = dad_nuc; } - else - { + else { for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) - if (1 & (state_left >> left_child_nuc)) + if (1 & (left_state >> left_child_nuc)) break; Mutation left_child_mut; - left_child_mut.position = perm_col[p]; - left_child_mut.compressed_position = compressed_perm_col[p]; + left_child_mut.position = perm_col[col]; + left_child_mut.compressed_position = compressed_perm_col[col]; left_child_mut.mut_nuc = (1 << left_child_nuc); left_child_mut.par_nuc = (1 << dad_nuc); left_child_mut.ref_nuc = 
aln->reference_nuc[left_child_mut.compressed_position]; left_branch->mutations.push_back(left_child_mut); - left_branch_mutations.push_back(make_pair(p, left_child_nuc)); + left_branch_mutations.push_back(make_pair(col, left_child_nuc)); } - for (int nuc = 0; nuc < 4; ++nuc) - { - if (nuc != left_child_nuc && (1 & (state_left >> nuc))) - { + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != left_child_nuc && (1 & (left_state >> nuc))) { left[ptn / 8] ^= (1 << (i * 4 + nuc)); } } char right_child_nuc; - if ((1 & (state_right >> dad_nuc)) == 1) - { + if ((1 & (right_state >> dad_nuc))) { right_child_nuc = dad_nuc; } - else - { + else { for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) - if (1 & (state_right >> right_child_nuc)) + if (1 & (right_state >> right_child_nuc)) break; - Mutation mut_r; - mut_r.position = perm_col[p]; - mut_r.compressed_position = compressed_perm_col[p]; - mut_r.mut_nuc = (1 << right_child_nuc); - mut_r.par_nuc = (1 << dad_nuc); - mut_r.ref_nuc = aln->reference_nuc[mut_r.compressed_position]; - right_branch->mutations.push_back(mut_r); - right_branch_mutations.push_back(make_pair(p, right_child_nuc)); + Mutation right_child_mutation; + right_child_mutation.position = perm_col[col]; + right_child_mutation.compressed_position = compressed_perm_col[col]; + right_child_mutation.mut_nuc = (1 << right_child_nuc); + right_child_mutation.par_nuc = (1 << dad_nuc); + right_child_mutation.ref_nuc = aln->reference_nuc[right_child_mutation.compressed_position]; + right_branch->mutations.push_back(right_child_mutation); + right_branch_mutations.push_back(make_pair(col, right_child_nuc)); } - for (int nuc = 0; nuc < 4; ++nuc) - { - if (nuc != right_child_nuc && (1 & (state_right >> nuc))) - { + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != right_child_nuc && (1 & (right_state >> nuc))) { right[ptn / 8] ^= (1 << (i * 4 + nuc)); } } @@ -5258,10 +5231,8 @@ void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, bool 
left_child = true; FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) - { - if (left_child) - { + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { computePartialMutation(left, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); left_child = false; continue; @@ -5270,13 +5241,11 @@ void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, } } -void PhyloTree::computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) -{ +void PhyloTree::computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { PhyloNode *node = (PhyloNode *)dad_branch->node; PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); assert(node_branch); - if (node->isLeaf()) - { + if (node->isLeaf()) { PhyloNode *tmp_node = dad; dad = node; node = tmp_node; @@ -5286,49 +5255,40 @@ void PhyloTree::computeMutationBranch(vector &perm_col, vector &compre } int nptn = aln->size(); - UINT *left_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; - for (int ptn = 0; ptn < aln->size(); ptn += 8) - { + for (int ptn = 0; ptn < aln->size(); ptn += 8) { left_branch_states_dad[ptn / 8] = 0; } - UINT *right_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; - for (int ptn = 0; ptn < aln->size(); ptn += 8) - { + for (int ptn = 0; ptn < aln->size(); ptn += 8) { right_branch_states_dad[ptn / 8] = 0; } int i, ptn, col = -1; - for (ptn = 0; ptn < aln->size(); ptn += 8) - { - UINT states_left = node_branch->partial_pars[ptn / 8]; - UINT states_right = dad_branch->partial_pars[ptn / 8]; - UINT states_dad = root_states[ptn / 8]; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = node_branch->partial_pars[ptn / 8]; + UINT right_states = dad_branch->partial_pars[ptn / 8]; + UINT dad_states = root_states[ptn / 8]; int maxi = aln->size() - ptn; - if (maxi > 8) - maxi = 8; - 
for (i = 0; i < maxi; i++) - { + if (maxi > 8) maxi = 8; + for (i = 0; i < maxi; i++) { ++col; - UINT state_left = (states_left >> (i * 4)) & 15; - UINT state_right = (states_right >> (i * 4)) & 15; - UINT state_both = (states_dad >> (i * 4)) & 15; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; char dad_nuc = 0; for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) - if (1 & (state_both >> dad_nuc)) + if (1 & (dad_state >> dad_nuc)) break; char left_child_nuc; - if ((1 & (state_left >> dad_nuc)) == 1) - { + if ((1 & (left_state >> dad_nuc))) { left_child_nuc = dad_nuc; } - else - { + else { for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) - if (1 & (state_left >> left_child_nuc)) + if (1 & (left_state >> left_child_nuc)) break; Mutation left_child_mut; left_child_mut.position = perm_col[col]; @@ -5341,14 +5301,12 @@ void PhyloTree::computeMutationBranch(vector &perm_col, vector &compre right_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + left_child_nuc)); char right_child_nuc; - if ((1 & (state_right >> dad_nuc)) == 1) - { + if ((1 & (right_state >> dad_nuc))) { right_child_nuc = dad_nuc; } - else - { + else { for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) - if (1 & (state_right >> right_child_nuc)) + if (1 & (right_state >> right_child_nuc)) break; Mutation right_child_mut; right_child_mut.position = perm_col[col]; @@ -5366,74 +5324,62 @@ void PhyloTree::computeMutationBranch(vector &perm_col, vector &compre computePartialMutation(right_branch_states_dad, perm_col, compressed_perm_col, node_branch, node); } -void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm_col) -{ +void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm_col) { // Compute parsimony is necessary for tracing back the mutations computeParsimony(); computeMutationBranch(perm_col, compressed_perm_col, (PhyloNeighbor *)root->neighbors[0], 
(PhyloNode *)root); int ptn = 0, counter = 0; int nptn = aln->size(); - for (int i = 0; i < nptn; ++i) - { + for (int i = 0; i < nptn; ++i) { char root_nuc = ((root_states[ptn] >> (i * 4)) & 15); char ref_nuc = aln->reference_nuc[compressed_perm_col[i]]; - if ((root_nuc & ref_nuc) == 0) - { + if ((root_nuc & ref_nuc) == 0) { char dad_nuc = 0; - for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) - { + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) { if (1 & (ref_nuc >> dad_nuc)) break; } char mut_nuc = 0; - for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc) - { + for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc) { if (1 & (root_nuc >> mut_nuc)) break; } - Mutation m; - m.position = perm_col[i]; - m.compressed_position = compressed_perm_col[i]; - m.mut_nuc = (1 << mut_nuc); - m.ref_nuc = ref_nuc; - m.par_nuc = (1 << dad_nuc); - root_mutations.push_back(m); + Mutation mutation; + mutation.position = perm_col[i]; + mutation.compressed_position = compressed_perm_col[i]; + mutation.mut_nuc = (1 << mut_nuc); + mutation.ref_nuc = ref_nuc; + mutation.par_nuc = (1 << dad_nuc); + root_mutations.push_back(mutation); } ++counter; - if (counter == 8) - { + if (counter == 8) { counter = 0; ++ptn; } } } -int PhyloTree::computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad) -{ - int par_s = 0; +int PhyloTree::computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad) { + int parsimony_score = 0; PhyloNode *node = (PhyloNode *)dad_branch->node; PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - par_s += node_branch->mutations.size(); + parsimony_score += node_branch->mutations.size(); FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) - { - par_s += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node); + if ((*it)->node->name != ROOT_NAME) { + parsimony_score += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node); } - return par_s; + return parsimony_score; } -int 
PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) -{ +int PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { PhyloNode *node = (PhyloNode *)dad_branch->node; PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); assert(node_branch); - if (!central_partial_pars) - initializeAllPartialPars(); - if (node->isLeaf()) - { + if (node->isLeaf()) { PhyloNode *tmp_node = dad; dad = node; node = tmp_node; @@ -5442,29 +5388,19 @@ int PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNo node_branch = tmp_nei; } - int par_s = 0; - par_s += computePartialParsimonyMutation(dad_branch, dad); - par_s += computePartialParsimonyMutation(node_branch, node); - return par_s; + int parsimony_score = 0; + parsimony_score += computePartialParsimonyMutation(dad_branch, dad); + parsimony_score += computePartialParsimonyMutation(node_branch, node); + return parsimony_score; } -int PhyloTree::computeParsimonyScoreMutation() -{ - assert(root->isLeaf()); - PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); - current_it = nei; - assert(current_it); - current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root); - assert(current_it_back); - - int parsimonyScore = 0; - parsimonyScore += computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); - parsimonyScore += root_mutations.size(); - return parsimonyScore; +int PhyloTree::computeParsimonyScoreMutation() { + int parsimony_score = computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + parsimony_score += root_mutations.size(); + return parsimony_score; } -vector> PhyloTree::breadth_first_expansion() -{ +void PhyloTree::initNodeDataPlaceNewSample() { assert(root->isLeaf()); PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); current_it = nei; @@ -5473,56 +5409,43 @@ vector> PhyloTree::breadth_first_expansion() 
assert(current_it_back); vector> bfs; - queue> q; - q.push(make_pair((PhyloNode *)nei->node, current_it_back)); - current_it_back->distance = 1; - while (q.size()) - { - PhyloNode *node = q.front().first; - PhyloNeighbor *node_branch = q.front().second; + queue> node_queue; + node_queue.push(make_pair((PhyloNode *)nei->node, current_it_back)); + while (node_queue.size()) { + PhyloNode *node = node_queue.front().first; + PhyloNeighbor *node_branch = node_queue.front().second; node->dad = (PhyloNode *)node_branch->node; - for (auto mut : node_branch->mutations) - { - assert((mut.mut_nuc & (mut.mut_nuc - 1)) == 0); - } PhyloNode *dad = (PhyloNode *)node_branch->node; - q.pop(); + node_queue.pop(); bfs.push_back(make_pair(node, node_branch)); - FOR_NEIGHBOR_IT(node, dad, it) - { - ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->distance = node_branch->distance + 1; - q.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node))); + FOR_NEIGHBOR_IT(node, dad, it) { + node_queue.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node))); } } - for (int i = bfs.size() - 1; i >= 0; --i) - { + for (int i = bfs.size() - 1; i >= 0; --i) { PhyloNode *node = bfs[i].first; PhyloNeighbor *node_branch = bfs[i].second; PhyloNode *dad = (PhyloNode *)node_branch->node; node_branch->num_leaves = 0; - if (node->isLeaf()) - { + if (node->isLeaf()) { node_branch->num_leaves = 1; continue; } - FOR_NEIGHBOR_IT(node, dad, it) - { + FOR_NEIGHBOR_IT(node, dad, it) { node_branch->num_leaves += ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->num_leaves; } } - return bfs; } -void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) -{ - std::vector anc_positions; - std::vector ancestral_mutations; +void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) { + vector anc_positions; + vector ancestral_mutations; timer_regular--; for (auto mutation : (*input.missing_sample_mutations)) { 
visited_missing_sample_mutations[mutation.compressed_position] = timer_regular; - cur_missing_sample_mutations[mutation.compressed_position] = mutation; + current_missing_sample_mutations[mutation.compressed_position] = mutation; } if (!(input.node == root)) { @@ -5534,7 +5457,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) bool found = false; bool found_pos = false; if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_regular) { - auto missing_sample_mutation = cur_missing_sample_mutations[node_mutation.compressed_position]; + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; if (node_mutation.position == missing_sample_mutation.position) { found_pos = true; if (missing_sample_mutation.is_missing) { @@ -5563,7 +5486,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) } for (auto ancestral_mutation : ancestral_mutations) { visited_ancestral_mutations[ancestral_mutation.compressed_position] = timer_regular; - cur_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; + current_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; } { @@ -5577,7 +5500,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) ancestral_mutations.emplace_back(node_mutation); anc_positions.emplace_back(node_mutation.compressed_position); visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; - cur_ancestral_mutations[node_mutation.compressed_position] = node_mutation; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; } } } @@ -5586,7 +5509,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) ancestral_mutations.emplace_back(root_mutation); anc_positions.emplace_back(root_mutation.compressed_position); visited_ancestral_mutations[root_mutation.compressed_position] = timer_regular; - 
cur_ancestral_mutations[root_mutation.compressed_position] = root_mutation; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; } } } @@ -5605,7 +5528,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) nuc = true; } if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_regular) { - auto ancestral_mutation = cur_ancestral_mutations[missing_sample_mutation.compressed_position]; + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; if (!ancestral_mutation.is_masked()) { found_pos = true; anc_nuc = ancestral_mutation.mut_nuc; @@ -5639,7 +5562,7 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) auto anc_nuc = ancestral_mutation.mut_nuc; if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_regular) { if (!ancestral_mutation.is_masked()) { - auto missing_sample_mutation = cur_missing_sample_mutations[ancestral_mutation.compressed_position]; + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; found_pos = true; if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; @@ -5661,18 +5584,17 @@ void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) } } -void PhyloTree::initDataPlaceNewSample(PlacementCandidateNode &inp) -{ +void PhyloTree::initNewSampleMutations(PlacementCandidateNode &inp) { ++timer_optimized; for (auto mutation : (*inp.missing_sample_mutations)) { visited_missing_sample_mutations[mutation.compressed_position] = timer_optimized; - cur_missing_sample_mutations[mutation.compressed_position] = mutation; + current_missing_sample_mutations[mutation.compressed_position] = mutation; } } void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation mutation, int &set_difference) { if (visited_excess_mutations[mutation.compressed_position] == 
timer_optimized) { - erased_excess_mutation.emplace_back(cur_excess_mutations[mutation.compressed_position]); + erased_excess_mutation.emplace_back(current_excess_mutations[mutation.compressed_position]); visited_excess_mutations[mutation.compressed_position] = 0; --set_difference; } @@ -5681,12 +5603,11 @@ void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation void PhyloTree::addMutation(vector &added_excess_mutation, Mutation mutation, int diff, int &set_difference) { added_excess_mutation.push_back(mutation); visited_excess_mutations[mutation.compressed_position] = timer_optimized; - cur_excess_mutations[mutation.compressed_position] = mutation; + current_excess_mutations[mutation.compressed_position] = mutation; set_difference += diff; } -void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference) -{ +void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference) { vector ancentral_positions; vector ancestral_mutations; vector erased_excess_mutation; @@ -5704,7 +5625,7 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu bool found = false; bool found_pos = false; if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_optimized) { - auto missing_sample_mutation = cur_missing_sample_mutations[node_mutation.compressed_position]; + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; if (node_mutation.position == missing_sample_mutation.position) { found_pos = true; if (missing_sample_mutation.is_missing) { @@ -5744,7 +5665,7 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu ancestral_mutations.emplace_back(root_mutation); ancentral_positions.emplace_back(root_mutation.compressed_position); visited_ancestral_mutations[root_mutation.compressed_position] = timer_optimized; - cur_ancestral_mutations[root_mutation.compressed_position] 
= root_mutation; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; } } @@ -5761,7 +5682,7 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu } if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_optimized) { - auto ancestral_mutation = cur_ancestral_mutations[missing_sample_mutation.compressed_position]; + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; if (!ancestral_mutation.is_masked()) { found_pos = true; anc_nuc = ancestral_mutation.mut_nuc; @@ -5794,7 +5715,7 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu auto anc_nuc = ancestral_mutation.mut_nuc; if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_optimized) { if (!ancestral_mutation.is_masked()) { - auto missing_sample_mutation = cur_missing_sample_mutations[ancestral_mutation.compressed_position]; + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; found_pos = true; if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { found = true; @@ -5843,7 +5764,7 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu mutation.position = diff_mutation.position; mutation.compressed_position = diff_mutation.compressed_position; if (visited_missing_sample_mutations[diff_mutation.compressed_position] == timer_optimized) { - mutation.mut_nuc = cur_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; + mutation.mut_nuc = current_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; } eraseMutation(erased_excess_mutation, mutation, set_difference); if (mutation.mut_nuc != mutation.par_nuc) { @@ -5854,10 +5775,10 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu PhyloNode *node = input.node; PhyloNode *dad = node->dad; 
FOR_NEIGHBOR_IT(node, dad, it) { - PhyloNode *childNode = (PhyloNode *)(*it)->node; - PhyloNeighbor *childNodeBranch = (PhyloNeighbor *)childNode->findNeighbor(node); - input.node = childNode; - input.node_branch = childNodeBranch; + PhyloNode *child_node = (PhyloNode *)(*it)->node; + PhyloNeighbor *child_node_branch = (PhyloNeighbor *)child_node->findNeighbor(node); + input.node = child_node; + input.node_branch = child_node_branch; optimizedFindPositionPlaceNewSample(input, set_difference); } @@ -5868,12 +5789,11 @@ void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &inpu for (int i = erased_excess_mutation.size() - 1; i >= 0; i--) { Mutation mutation = erased_excess_mutation[i]; visited_excess_mutations[mutation.compressed_position] = timer_optimized; - cur_excess_mutations[mutation.compressed_position] = mutation; + current_excess_mutations[mutation.compressed_position] = mutation; } } -void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, std::vector node_excess_mutations, int index, std::string sample_name) -{ +void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, vector node_excess_mutations, int index, string sample_name) { PhyloNode *new_node = (PhyloNode *)newNode(); PhyloNode *sample = (PhyloNode *)newNode(aln->getNSeq() + index, sample_name.c_str()); sample->setMissingNode(index); @@ -5881,76 +5801,56 @@ void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_bran sample->addNeighbor(new_node, -1.0); PhyloNode *best_dad = (PhyloNode *)best_node_branch->node; - std::vector common_mut, l1_mut, l2_mut; - std::vector curr_l1_mut; + vector common_mutations, best_node_mutations, sample_mutations; + vector current_node_mutations; // Compute current best node branch mutations - for (auto m1 : best_node_branch->mutations) - { - Mutation m = m1.copy(); - curr_l1_mut.emplace_back(m); + for (auto node_mutation : best_node_branch->mutations) { + 
current_node_mutations.emplace_back(node_mutation); } - // Clear mutations on the best node branch which - // will be later replaced by l1_mut + best_node_branch->clear_mutations(); - // Compute l1_mut --timer_regular; - for (auto m1 : curr_l1_mut) - { - visited_ancestral_mutations[m1.compressed_position] = timer_regular; - cur_ancestral_mutations[m1.compressed_position] = m1; + for (auto node_mutation : current_node_mutations) { + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; } - for (auto m2 : node_excess_mutations) - { - visited_excess_mutations[m2.compressed_position] = timer_regular; - cur_excess_mutations[m2.compressed_position] = m2; + for (auto excess_mutation : node_excess_mutations) { + visited_excess_mutations[excess_mutation.compressed_position] = timer_regular; + current_excess_mutations[excess_mutation.compressed_position] = excess_mutation; } - for (auto m1 : curr_l1_mut) - { + for (auto node_mutation : current_node_mutations) { bool found = false; - if (!m1.is_masked()) - { - if (visited_excess_mutations[m1.compressed_position] == timer_regular) - { - auto m2 = cur_excess_mutations[m1.compressed_position]; - if (m1.position == m2.position) - { - if (m1.mut_nuc == m2.mut_nuc) - { + if (!node_mutation.is_masked()) { + if (visited_excess_mutations[node_mutation.compressed_position] == timer_regular) { + auto excess_mutation = current_excess_mutations[node_mutation.compressed_position]; + if (node_mutation.position == excess_mutation.position) { + if (node_mutation.mut_nuc == excess_mutation.mut_nuc) { found = true; } } } } - if (!found) - { - Mutation m = m1.copy(); - l1_mut.emplace_back(m); + if (!found) { + best_node_mutations.emplace_back(node_mutation); } } - // Compute l2_mut - for (auto m1 : node_excess_mutations) - { + // Compute sample mutations + for (auto excess_mutation : node_excess_mutations) { bool found = false; - if 
(!m1.is_masked()) - { - if (visited_ancestral_mutations[m1.compressed_position] == timer_regular) - { - auto m2 = cur_ancestral_mutations[m1.compressed_position]; - if (m1.position == m2.position) - { - if (m1.mut_nuc == m2.mut_nuc) - { + if (!excess_mutation.is_masked()) { + if (visited_ancestral_mutations[excess_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = current_ancestral_mutations[excess_mutation.compressed_position]; + if (excess_mutation.position == ancestral_mutation.position) { + if (excess_mutation.mut_nuc == ancestral_mutation.mut_nuc) { found = true; - Mutation m = m1.copy(); - common_mut.emplace_back(m); + Mutation m = excess_mutation.copy(); + common_mutations.emplace_back(m); } } } } - if (!found) - { - Mutation m = m1.copy(); - l2_mut.emplace_back(m); + if (!found) { + sample_mutations.emplace_back(excess_mutation); } } @@ -5960,106 +5860,70 @@ void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_bran best_dad->updateNeighbor(best_node, new_node, -1.0); // Add mutations to new node using common_mut PhyloNeighbor *new_node_branch = (PhyloNeighbor *)new_node->findNeighbor(best_dad); - for (auto m : common_mut) - { - new_node_branch->add_mutation(m); - } + new_node_branch->mutations = common_mutations; - // Add mutations to best node using l1_mut PhyloNeighbor *new_best_node_branch = (PhyloNeighbor *)best_node->findNeighbor(new_node); - for (auto m : l1_mut) - { - new_best_node_branch->add_mutation(m); - } + new_best_node_branch->mutations = best_node_mutations; PhyloNeighbor *sample_branch = (PhyloNeighbor *)sample->findNeighbor(new_node); - // Add new sample mutations using l2_mut - for (auto m : l2_mut) - { - sample_branch->add_mutation(m); - } + sample_branch->mutations = sample_mutations; } -string PhyloTree::checkPartialMutation(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad) -{ +string PhyloTree::verifyPartialMutationCorrectness(vector &position, PhyloNeighbor *dad_branch, PhyloNode 
*dad) { PhyloNode *node = (PhyloNode *)dad_branch->node; - int ptn; - int nstates = aln->num_states; - int pars_size = getBitsBlockSize(); - int entry_size = getBitsEntrySize(); - int nptn = aln->size(); - int ptn_pars_start_id = pars_size - nptn - 1; + int nsite = aln->getNSite(); - if (nstates == 4 && aln->seq_type == SEQ_DNA && (node->isLeaf() || node->degree() == 3)) - { - // ULTRAFAST VERSION FOR DNA, assuming that UINT is 32-bit integer - if (node->isLeaf() && dad) - { - PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - string s = ""; - if (node->id >= aln->getNSeq()) - cout << node->id << " " << aln->getNSeq() << '\n'; - assert(node->id < aln->getNSeq()); - for (int i = 0; i < (int)aln->getNSite(); ++i) - { - Pattern pat = aln->getPattern(i); - s += aln->convertStateBack(pat[node->id]); - } - // for (auto m : node_branch->mutations) cout << m.get_string() << "+" << pos[m.position] << " "; - return s; + if (node->isLeaf() && dad) { + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + string sequence = ""; + assert(node->id < aln->getNSeq()); + for (int i = 0; i < nsite; ++i) { + Pattern pattern = aln->getPattern(i); + sequence += aln->convertStateBack(pattern[node->id]); } - else - { - // internal node - int cur = 0; - string left, right; - PhyloNeighbor *left_branch, *right_branch; - FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) - { - // ((PhyloNeighbor*)(*it))->distance = dad_branch->distance + 1; - if (cur == 0) - left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node), left = checkPartialMutation(pos, (PhyloNeighbor *)(*it), (PhyloNode *)node), cur = 1; - else - right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node), right = checkPartialMutation(pos, (PhyloNeighbor *)(*it), (PhyloNode *)node); - } - for (auto m : left_branch->mutations) - { - assert(pos[m.position] < (int)left.length()); - left[pos[m.position]] = aln->getStateFromMutation(m.par_nuc); - } - for (auto m : 
right_branch->mutations) - { - assert(pos[m.position] < (int)right.length()); - right[pos[m.position]] = aln->getStateFromMutation(m.par_nuc); + return sequence; + } + else { + string left_sequence, right_sequence; + PhyloNeighbor *left_branch, *right_branch; + bool left_child = true; + FOR_NEIGHBOR_IT(node, dad, it) { + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { + left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + left_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); + left_child = false; + continue; + } + right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + right_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); } + } + for (auto mutation : left_branch->mutations) { + assert(position[mutation.compressed_position] < (int)left_sequence.length()); + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + for (auto mutation : right_branch->mutations) { + assert(position[mutation.compressed_position] < (int)right_sequence.length()); + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } - if (left != right) - { - for (int i = 0; i < (int)left.length(); ++i) - { - if (left[i] != right[i] && (aln->getMutationFromState(left[i]) & aln->getMutationFromState(right[i])) == 0) - { - cout << "compute mutations wrong"; - exit(1); - } + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong"; + exit(1); } } - return left; } - } // END OF DNA VERSION + return left_sequence; + } } -void PhyloTree::checkMutationBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) -{ +void 
PhyloTree::verifyMutationCorrectnessBranch(vector &position, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { PhyloNode *node = (PhyloNode *)dad_branch->node; PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - assert(node_branch); - if (!central_partial_pars) - initializeAllPartialPars(); - // swap node and dad if dad is a leaf - if (node->isLeaf()) - { + if (node->isLeaf()) { PhyloNode *tmp_node = dad; dad = node; node = tmp_node; @@ -6068,67 +5932,41 @@ void PhyloTree::checkMutationBranch(vector &pos, PhyloNeighbor *dad_branch, node_branch = tmp_nei; } - string s = checkPartialMutation(pos, dad_branch, dad); - string t = checkPartialMutation(pos, node_branch, node); - for (auto m : node_branch->mutations) - { - s[pos[m.position]] = aln->getStateFromMutation(m.par_nuc); + string left_sequence = verifyPartialMutationCorrectness(position, dad_branch, dad); + string right_sequence = verifyPartialMutationCorrectness(position, node_branch, node); + for (auto mutation : node_branch->mutations) { + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); } - for (auto m : dad_branch->mutations) - { - t[pos[m.position]] = aln->getStateFromMutation(m.par_nuc); + for (auto mutation : dad_branch->mutations) { + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); } - if (s != t) - { - for (int i = 0; i < (int)s.length(); ++i) - { - if (s[i] != t[i] && (aln->getMutationFromState(s[i]) & aln->getMutationFromState(t[i])) == 0) - { - cout << "compute mutations wrong at root"; + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong at root"; exit(1); } } } } -void PhyloTree::checkMutation(vector &pos) -{ +void 
PhyloTree::verifyMutationCorrectness() { cout << "========== Start checking mutations ==========\n"; - assert(root->isLeaf()); - PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); - current_it = nei; - assert(current_it); - current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root); - assert(current_it_back); - - checkMutationBranch(pos, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + vector perm_col = aln->findRotatedColumnPermutation(); + int nsite = aln->getNSite(); + assert(perm_col.size() == nsite); + vector sorted_perm_col(perm_col); + sort(sorted_perm_col.begin(), sorted_perm_col.end()); + vector compressed_perm_col; + for (int col : perm_col) { + int idx = lower_bound(sorted_perm_col.begin(), sorted_perm_col.end(), col) - sorted_perm_col.begin(); + compressed_perm_col.push_back(idx); + } + vector position(nsite); + for (int i = 0; i < nsite; ++i) { + position[compressed_perm_col[i]] = i; + } + verifyMutationCorrectnessBranch(position, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + cout << "Compute mutation correctly\n"; cout << "========== End checking mutations ==========\n"; -} - -PhyloNode *PhyloTree::findNode(PhyloNode *node, PhyloNode *dad, string name) -{ - if (node->name == name) - { - return node; - } - if (node->isLeaf()) - { - return NULL; - } - PhyloNode *found = NULL; - FOR_NEIGHBOR_IT(node, dad, it) - { - found = findNode((PhyloNode *)(*it)->node, node, name); - if (found) - { - return found; - } - } - return NULL; -} - -PhyloNode *PhyloTree::findNode(string name) -{ - PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); - return findNode((PhyloNode *)nei->node, (PhyloNode *)root, name); } \ No newline at end of file diff --git a/phylotree.h b/phylotree.h index 55ecb480..ceee488c 100644 --- a/phylotree.h +++ b/phylotree.h @@ -280,119 +280,193 @@ class PhyloTree : public MTree, public Optimization { virtual ~PhyloTree(); /** - * Add a row to the tree + * Flag indicating whether to add a new row to the 
tree + * Used in tree modification operations */ bool add_row; /** - * Save the states of the root node + * Array storing the states of the root node + * Used for tracking ancestral states in phylogenetic analysis */ UINT *root_states; /** - * Root mutations + * Vector storing mutations at the root node + * Contains information about mutations that occurred at the root of the tree */ vector root_mutations; /** - * Allocate memory for mutation data + * Allocates memory for mutation data structures + * @param num_column Number of columns in the alignment to allocate memory for */ void allocateMutationMemory(int num_column); /** - * Initialize mutation data for MAT + * Initializes mutation data for MAT + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations */ void initMutation(vector &perm_col, vector &compressed_perm_col); /** - * Compute mutation for a branch + * Computes mutations along a specific branch + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions on branch */ void computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); /** - * Compute partial mutation for a branch + * Computes partial mutations along a branch + * @param states_dad Array of parent node states + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node */ void computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad); /** - * Compute parsimony score using mutation + * 
Computes parsimony score using mutation data + * @return The parsimony score based on mutations */ int computeParsimonyScoreMutation(); /** - * Compute parsimony score for a branch using mutation + * Computes parsimony score for a specific branch using mutation data + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions on branch + * @return The parsimony score for the branch */ int computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); /** - * Compute partial parsimony score for a branch using mutation + * Computes partial parsimony score for a branch using mutation data + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @return The partial parsimony score for the branch */ int computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad); /** - * Compute the breadth-first expansion of the vertices of the tree + * Initializes node data for placing new samples + * Computes breadth-first expansion of tree vertices */ - std::vector> breadth_first_expansion(); + void initNodeDataPlaceNewSample(); - std::vector cur_excess_mutations, cur_missing_sample_mutations, cur_ancestral_mutations; - std::vector visited_missing_sample_mutations, visited_ancestral_mutations; - std::vector visited_excess_mutations; - int timer_optimized, timer_regular; + /** + * Vector storing current excess mutations + */ + vector current_excess_mutations; /** - * Calculate placement mutation for a candidate node + * Vector storing mutations from the missing sample */ - void computeExcessMutations(PlacementCandidateNode &input); + vector current_missing_sample_mutations; /** - * Initialize data for calculatePlacementMutation + * Vector storing ancestral mutations */ - void initDataPlaceNewSample(PlacementCandidateNode &inp); + vector current_ancestral_mutations; /** - * 
Erase a mutation from the candidate node + * Vector storing time of visited missing sample mutations */ - void eraseMutation(vector &erase_excess_mutations, Mutation m, int &set_difference); + vector visited_missing_sample_mutations; /** - * Add a mutation to the candidate node + * Vector storing time of visited ancestral mutations */ - void addMutation(vector &added_excess_mutations, Mutation m, int diff, int &set_difference); + vector visited_ancestral_mutations; /** - * Optimize the placement mutation for a candidate node + * Vector storing time of visited excess mutations */ - void optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference = 0); + vector visited_excess_mutations; /** - * Add a new sample to the tree + * Timer for optimized operations */ - void addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, std::vector node_excess_mutations, int index, std::string name); + int timer_optimized; + + /** + * Timer for regular operations + */ + int timer_regular; + + /** + * Calculates placement mutations for a candidate node + * @param input Reference to the placement candidate node + */ + void computeExcessMutations(PlacementCandidateNode &input); /** - * Check mutations at a given position + * Initializes data for calculating placement mutations + * @param inp Reference to the placement candidate node */ - void checkMutation(vector &pos); + void initNewSampleMutations(PlacementCandidateNode &inp); + + /** + * Erases a mutation from the excess mutations set + * @param erase_excess_mutations Vector of mutations to erase + * @param m The mutation to erase + * @param set_difference Reference to track the difference in mutation sets + */ + void eraseMutation(vector &erase_excess_mutations, Mutation m, int &set_difference); + + /** + * Adds a mutation to the excess mutations set + * @param added_excess_mutations Vector of mutations to add + * @param m The mutation to add + * @param diff The difference value + * @param 
set_difference Reference to track the difference in mutation sets + */ + void addMutation(vector &added_excess_mutations, Mutation m, int diff, int &set_difference); + + /** + * Optimizes function for finding the best position to place a new sample + * @param input Reference to the placement candidate node + * @param set_difference Optional difference in mutation sets + */ + void optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference = 0); /** - * Check mutations on a branch + * Adds a new sample to the tree + * @param best_node Pointer to the best node for placement + * @param best_node_branch Pointer to the best branch for placement + * @param node_excess_mutations Vector of excess mutations for the node + * @param index Index of the new sample + * @param name Name of the new sample */ - void checkMutationBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + void addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, vector node_excess_mutations, int index, string name); /** - * Check partial mutation on a branch + * Verifies the correctness of mutations at a given position */ - string checkPartialMutation(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad); + void verifyMutationCorrectness(); /** - * Find a node by name + * Verifies mutations on a specific branch + * @param pos Vector of positions to verify + * @param dad_branch Pointer to the branch being verified + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions */ - PhyloNode *findNode(PhyloNode *node, PhyloNode *dad, string name); + void verifyMutationCorrectnessBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); /** - * Find a node by name + * Verifies partial mutations on a branch + * @param pos Vector of positions to verify + * @param dad_branch Pointer to the branch being verified + * @param dad Pointer to the parent 
node + * @return String containing verification results */ - PhyloNode *findNode(string name); + string verifyPartialMutationCorrectness(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad); /** copy the phylogenetic tree structure into this tree, override to take sequence names diff --git a/placement.cpp b/placement.cpp index 6bba8e9a..b6d107aa 100644 --- a/placement.cpp +++ b/placement.cpp @@ -111,7 +111,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) { auto start_time = getCPUTime(); for (int i = 0; i < num_sequences; ++i) { - vector> bfs = tree->breadth_first_expansion(); + tree->initNodeDataPlaceNewSample(); PlacementCandidateNode input; int best_set_difference = INT_MAX; size_t best_node_num_leaves = INT_MAX; @@ -124,7 +124,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) { input.missing_sample_mutations = &alignment->missing_sample_mutations[i]; input.excess_mutations = &excess_mutations; - tree->initDataPlaceNewSample(input); + tree->initNewSampleMutations(input); tree->optimizedFindPositionPlaceNewSample(input, 0); input.node = input.best_node; input.node_branch = input.best_node_branch; From ecbe9c6454a7346a68ad7b18b6eed645eb367b14 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 18:56:49 +0700 Subject: [PATCH 15/23] refactor: phylonode --- phylonode.cpp | 45 +-------------------------------------------- phylonode.h | 31 ++++++------------------------- phylotree.cpp | 2 +- 3 files changed, 8 insertions(+), 70 deletions(-) diff --git a/phylonode.cpp b/phylonode.cpp index 06437eaa..1f247034 100644 --- a/phylonode.cpp +++ b/phylonode.cpp @@ -19,53 +19,10 @@ void PhyloNeighbor::clearForwardPartialLh(Node *dad) { ((PhyloNeighbor*)*it)->clearForwardPartialLh(node); } -void PhyloNeighbor::clear_mutations() -{ +void PhyloNeighbor::clearMutations() { mutations.clear(); } -void PhyloNeighbor::add_mutation(Mutation mut) -{ - auto iter = std::lower_bound(mutations.begin(), mutations.end(), mut); - // check if mutation at the same 
position has occured before - if ((iter != mutations.end()) && (iter->position == mut.position)) - { - // update to new allele - if (iter->par_nuc != mut.mut_nuc) - { - iter->mut_nuc = mut.mut_nuc; - } - // reversal mutation - else - { - if (iter->mut_nuc != mut.par_nuc) - { - printf("ERROR: add_mutation: consecutive mutations at same position " - "disagree on nuc -- called out of order?\n"); - exit(1); - } - std::vector tmp; - for (auto m : mutations) - { - if (m.position != iter->position) - { - tmp.emplace_back(m.copy()); - } - } - mutations.clear(); - for (auto m : tmp) - { - mutations.emplace_back(m.copy()); - } - } - } - // new mutation - else - { - mutations.insert(iter, mut); - } -} - void PhyloNode::clearReversePartialLh(PhyloNode *dad) { PhyloNeighbor *node_nei = (PhyloNeighbor*)findNeighbor(dad); assert(node_nei); diff --git a/phylonode.h b/phylonode.h index 7bfe73fe..d708b54b 100644 --- a/phylonode.h +++ b/phylonode.h @@ -39,13 +39,11 @@ class PhyloNeighbor : public Neighbor @param alength length of branch */ - PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) - { + PhyloNeighbor(Node *anode, double alength) : Neighbor(anode, alength) { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; partial_pars = NULL; - mutations.clear(); } /** @@ -54,29 +52,24 @@ class PhyloNeighbor : public Neighbor @param alength length of branch @param aid branch ID */ - PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) - { + PhyloNeighbor(Node *anode, double alength, int aid) : Neighbor(anode, alength, aid) { partial_lh = NULL; partial_lh_computed = 0; lh_scale_factor = 0.0; partial_pars = NULL; - mutations.clear(); - canMove = 0; } /** tell that the partial likelihood vector is not computed */ - inline void clearPartialLh() - { + inline void clearPartialLh() { partial_lh_computed = 0; } /** * tell that the partial likelihood vector is computed */ - inline void unclearPartialLh() - { + inline void 
unclearPartialLh() { partial_lh_computed = 1; } @@ -100,12 +93,7 @@ class PhyloNeighbor : public Neighbor /** * Clear all mutations on this branch */ - void clear_mutations(); - - /** - * Add a mutation to this branch - */ - void add_mutation(Mutation mut); + void clearMutations(); private: /** true if the partial likelihood was computed @@ -131,11 +119,6 @@ class PhyloNeighbor : public Neighbor vector containing the partial parsimony scores */ UINT *partial_pars; - - /** - * check if this branch can be movedor do SPR - */ - int canMove; }; /** @@ -225,9 +208,7 @@ class PlacementCandidateNode PhyloNode *best_node; PhyloNeighbor *best_node_branch; - PlacementCandidateNode() - { - } + PlacementCandidateNode() {} }; #endif \ No newline at end of file diff --git a/phylotree.cpp b/phylotree.cpp index 5dd6dc5e..02bce8f8 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -5808,7 +5808,7 @@ void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_bran current_node_mutations.emplace_back(node_mutation); } - best_node_branch->clear_mutations(); + best_node_branch->clearMutations(); --timer_regular; for (auto node_mutation : current_node_mutations) { visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; From d8960d03cf17a4b266613646c036b69f13096f12 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 19:02:41 +0700 Subject: [PATCH 16/23] update spacing --- alignment.cpp | 766 ++++++++++++------------- phylotree.cpp | 1499 ++++++++++++++++++++++++------------------------- 2 files changed, 1132 insertions(+), 1133 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index e61e7b2a..0d7974be 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -996,50 +996,50 @@ char Alignment::convertStateBack(char state) { } char Alignment::getMutationFromState(char state) { - int value = convertState(state, SEQ_DNA); - switch (value) { - case 0: - return 1; - case 1: - return 2; - case 2: - return 4; - case 3: - return 8; - case 1 + 4 
+ 3: - return 1 + 4; - case 2 + 8 + 3: - return 2 + 8; - case 1 + 8 + 3: - return 1 + 8; - case 2 + 4 + 3: - return 2 + 4; - case 1 + 2 + 3: - return 1 + 2; - case 4 + 8 + 3: - return 4 + 8; - case 2 + 4 + 8 + 3: - return 2 + 4 + 8; - case 1 + 2 + 8 + 3: - return 1 + 2 + 8; - case 1 + 4 + 8 + 3: - return 1 + 4 + 8; - case 1 + 2 + 4 + 3: - return 1 + 2 + 4; - - default: - return 15; - break; - } + int value = convertState(state, SEQ_DNA); + switch (value) { + case 0: + return 1; + case 1: + return 2; + case 2: + return 4; + case 3: + return 8; + case 1 + 4 + 3: + return 1 + 4; + case 2 + 8 + 3: + return 2 + 8; + case 1 + 8 + 3: + return 1 + 8; + case 2 + 4 + 3: + return 2 + 4; + case 1 + 2 + 3: + return 1 + 2; + case 4 + 8 + 3: + return 4 + 8; + case 2 + 4 + 8 + 3: + return 2 + 4 + 8; + case 1 + 2 + 8 + 3: + return 1 + 2 + 8; + case 1 + 4 + 8 + 3: + return 1 + 4 + 8; + case 1 + 2 + 4 + 3: + return 1 + 2 + 4; + + default: + return 15; + break; + } } int Alignment::getStateFromMutation(int nuc) { - int value; - if ((nuc & (nuc - 1)) == 0) - value = log2(nuc); - else - value = nuc + 3; - return convertStateBack(value); + int value; + if ((nuc & (nuc - 1)) == 0) + value = log2(nuc); + else + value = nuc + 3; + return convertStateBack(value); } string Alignment::convertStateBackStr(char state) { @@ -1059,8 +1059,8 @@ string Alignment::convertStateBackStr(char state) { } void Alignment::convertStateStr(string &str, SeqType seq_type) { - for (string::iterator it = str.begin(); it != str.end(); it++) - (*it) = convertState(*it, seq_type); + for (string::iterator it = str.begin(); it != str.end(); it++) + (*it) = convertState(*it, seq_type); } void Alignment::initCodon(char *sequence_type) { @@ -1286,370 +1286,370 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, } void split(const string &s, vector &elems, const string &delim) { - elems.clear(); - size_t pos = 0; - size_t len = s.length(); - size_t delim_len = delim.length(); - if 
(delim_len == 0) { - elems.push_back(s); - return; - } - while (pos < len) { - size_t find_pos = s.find(delim, pos); - if (find_pos == string::npos) { - elems.push_back(s.substr(pos)); - return; - } - elems.push_back(s.substr(pos, find_pos - pos)); - pos = find_pos + delim_len; - } + elems.clear(); + size_t pos = 0; + size_t len = s.length(); + size_t delim_len = delim.length(); + if (delim_len == 0) { + elems.push_back(s); + return; + } + while (pos < len) { + size_t find_pos = s.find(delim, pos); + if (find_pos == string::npos) { + elems.push_back(s.substr(pos)); + return; + } + elems.push_back(s.substr(pos, find_pos - pos)); + pos = find_pos + delim_len; + } } // Find the permutation of columns after rotation vector Alignment::findRotatedColumnPermutation() { - assert(getNSite() == (int)initial_column_state.size()); - char char_to_state[NUM_CHAR]; - computeUnknownState(); - buildStateMap(char_to_state, seq_type); - - vector perm(getNSite(), 0); - map> pattern_map; - // Build pattern map - for (int i = 0; i < getNSite(); ++i) { - Pattern pattern = getPattern(i); - pattern_map[pattern].push_back(i); - } - for (int col = 0; col < getNSite(); ++col) { - // For each column, build a pattern - // Find initial index of the pattern - Pattern pattern; - for (int i = 0; i < initial_column_state[col].length(); ++i) { - pattern += char_to_state[(int)initial_column_state[col][i]]; - } - perm[pattern_map[pattern].back()] = col; - pattern_map[pattern].pop_back(); - } - return perm; + assert(getNSite() == (int)initial_column_state.size()); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + + vector perm(getNSite(), 0); + map> pattern_map; + // Build pattern map + for (int i = 0; i < getNSite(); ++i) { + Pattern pattern = getPattern(i); + pattern_map[pattern].push_back(i); + } + for (int col = 0; col < getNSite(); ++col) { + // For each column, build a pattern + // Find initial index of the pattern + Pattern pattern; + for (int i 
= 0; i < initial_column_state[col].length(); ++i) { + pattern += char_to_state[(int)initial_column_state[col][i]]; + } + perm[pattern_map[pattern].back()] = col; + pattern_map[pattern].pop_back(); + } + return perm; } void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq) { - assert(new_seq.size() == getNSite()); - char char_to_state[NUM_CHAR]; - computeUnknownState(); - - buildStateMap(char_to_state, seq_type); - vector new_patterns; - PatternIntMap new_pattern_index; - vector new_site_patterns; - vector perm_col = findRotatedColumnPermutation(); - - for (int i = 0; i < getNSite(); ++i) { - Pattern new_pattern = getPattern(i); - new_pattern.push_back(char_to_state[(int)new_seq[perm_col[i]]]); - PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) { - new_pattern.frequency = 1; - new_pattern.computeConst(STATE_UNKNOWN); - new_patterns.push_back(new_pattern); - new_pattern_index[new_pattern] = new_patterns.size() - 1; - new_site_patterns.push_back(new_patterns.size() - 1); - } - else { - int index = pat_it->second; - new_patterns[index].frequency++; - new_site_patterns.push_back(index); - } - } - clear(); - for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { - push_back(*it); - } - pattern_index = new_pattern_index; - site_pattern = new_site_patterns; - seq_names.push_back(new_name); - buildSeqStates(); - countConstSite(); + assert(new_seq.size() == getNSite()); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + + buildStateMap(char_to_state, seq_type); + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + vector perm_col = findRotatedColumnPermutation(); + + for (int i = 0; i < getNSite(); ++i) { + Pattern new_pattern = getPattern(i); + new_pattern.push_back(char_to_state[(int)new_seq[perm_col[i]]]); + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == 
new_pattern_index.end()) { + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { + push_back(*it); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + seq_names.push_back(new_name); + buildSeqStates(); + countConstSite(); } void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences) { - char char_to_state[NUM_CHAR]; - computeUnknownState(); - buildStateMap(char_to_state, seq_type); - - vector new_patterns; - PatternIntMap new_pattern_index; - vector new_site_patterns; - int nseq = new_sequences.size(); - vector perm_col = findRotatedColumnPermutation(); - - for (int site = 0; site < getNSite(); ++site) { - Pattern new_pattern = getPattern(site); - for (int seq = 0; seq < nseq; ++seq) { - new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); - } - PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) { - new_pattern.frequency = 1; - new_pattern.computeConst(STATE_UNKNOWN); - new_patterns.push_back(new_pattern); - new_pattern_index[new_pattern] = new_patterns.size() - 1; - new_site_patterns.push_back(new_patterns.size() - 1); - } - else { - int index = pat_it->second; - new_patterns[index].frequency++; - new_site_patterns.push_back(index); - } - } - clear(); - for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { - push_back(*it); - } - pattern_index = new_pattern_index; - site_pattern = new_site_patterns; - seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); - buildSeqStates(); 
- countConstSite(); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + int nseq = new_sequences.size(); + vector perm_col = findRotatedColumnPermutation(); + + for (int site = 0; site < getNSite(); ++site) { + Pattern new_pattern = getPattern(site); + for (int seq = 0; seq < nseq; ++seq) { + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); + } + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { + push_back(*it); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); + buildSeqStates(); + countConstSite(); } void Alignment::updateAlignmentNewSequences(const vector &new_sequences, const vector &perm_col) { - computeUnknownState(); - char char_to_state[NUM_CHAR]; - buildStateMap(char_to_state, seq_type); - - vector new_patterns; - PatternIntMap new_pattern_index; - vector new_site_patterns; - int nseq = new_sequences.size(); - int nsite = getNSite(); + computeUnknownState(); + char char_to_state[NUM_CHAR]; + buildStateMap(char_to_state, seq_type); + + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + int nseq = new_sequences.size(); + int nsite = getNSite(); - for (int site = 0; site < nsite; ++site) { - Pattern new_pattern; - for (int seq = 0; seq < nseq; ++seq) { - 
new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); - } - PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); - if (pat_it == new_pattern_index.end()) { - // If pattern not found, add new pattern - new_pattern.frequency = 1; - new_pattern.computeConst(STATE_UNKNOWN); - new_patterns.push_back(new_pattern); - new_pattern_index[new_pattern] = new_patterns.size() - 1; - new_site_patterns.push_back(new_patterns.size() - 1); - } - else { - // If pattern found, increment frequency - int index = pat_it->second; - new_patterns[index].frequency++; - new_site_patterns.push_back(index); - } - } - clear(); - for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) { - push_back(*itr); - } - pattern_index = new_pattern_index; - site_pattern = new_site_patterns; - buildSeqStates(); - countConstSite(); + for (int site = 0; site < nsite; ++site) { + Pattern new_pattern; + for (int seq = 0; seq < nseq; ++seq) { + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); + } + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { + // If pattern not found, add new pattern + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + // If pattern found, increment frequency + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) { + push_back(*itr); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + buildSeqStates(); + countConstSite(); } // Read partial VCF file and update alignment int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int 
existing_sequence, int start_index, int num_column) { - if (in.eof()) { - return 0; - } - StrVector sequences; - int nseq = getNSeq(); - int nsite = 0; - int seq_id = 0; - string line; - int num_processed_column = 0; - - sequences.resize(nseq, ""); - existing_sample_mutations.assign(nseq, vector()); - - for (; !in.eof() && num_processed_column < num_column;) { - getline(in, line); - if (line == "") - continue; - vector words; - split(line, words, "\t"); - if (words.size() == 1) - continue; - if (words.size() != 9 + nseq + missing_sample_mutations.size()) - throw "Number of columns in VCF file is not consistent"; - vector alleles; - Mutation mutation; - int variant_pos = std::stoi(words[1]); - mutation.position = variant_pos; - mutation.compressed_position = num_processed_column + start_index; - while ((int)reference_nuc.size() <= mutation.compressed_position) - reference_nuc.push_back(0); - split(words[4], alleles, ","); - mutation.ref_nuc = getMutationFromState(words[3][0]); - if (reference_nuc[mutation.compressed_position] == 0) - reference_nuc[mutation.compressed_position] = mutation.ref_nuc; - for (int i = 9; i < words.size(); ++i) { - mutation.is_missing = false; - if (isdigit(words[i][0])) { - int allele_id = std::stoi(words[i]); - if (allele_id > 0) { - std::string allele = alleles[allele_id - 1]; - if (i - 9 < existing_sequence) { - sequences[i - 9].push_back(allele[0]); - } - mutation.mut_nuc = getMutationFromState(allele[0]); - } - else { - if (i - 9 < existing_sequence) { - sequences[i - 9].push_back(words[3][0]); - } - mutation.mut_nuc = getMutationFromState(words[3][0]); - } - } - else { - if (i - 9 < existing_sequence) { - sequences[i - 9].push_back('-'); - } - mutation.mut_nuc = getMutationFromState('N'); - mutation.is_missing = true; - } - if (i - 9 >= existing_sequence) { - if (mutation.mut_nuc != mutation.ref_nuc) { - mutation.par_nuc = mutation.ref_nuc; - missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation); - } - } - else { - 
existing_sample_mutations[i - 9].push_back(mutation); - } - } - ++nsite; - ++num_processed_column; - } + if (in.eof()) { + return 0; + } + StrVector sequences; + int nseq = getNSeq(); + int nsite = 0; + int seq_id = 0; + string line; + int num_processed_column = 0; + + sequences.resize(nseq, ""); + existing_sample_mutations.assign(nseq, vector()); + + for (; !in.eof() && num_processed_column < num_column;) { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words.size() != 9 + nseq + missing_sample_mutations.size()) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation mutation; + int variant_pos = std::stoi(words[1]); + mutation.position = variant_pos; + mutation.compressed_position = num_processed_column + start_index; + while ((int)reference_nuc.size() <= mutation.compressed_position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + mutation.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; + for (int i = 9; i < words.size(); ++i) { + mutation.is_missing = false; + if (isdigit(words[i][0])) { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back(allele[0]); + } + mutation.mut_nuc = getMutationFromState(allele[0]); + } + else { + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back(words[3][0]); + } + mutation.mut_nuc = getMutationFromState(words[3][0]); + } + } + else { + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back('-'); + } + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; + } + if (i - 9 >= existing_sequence) { + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - 
existing_sequence].push_back(mutation); + } + } + else { + existing_sample_mutations[i - 9].push_back(mutation); + } + } + ++nsite; + ++num_processed_column; + } - // If not enough columns, rebuild pattern and return - if (num_processed_column < num_column) { - buildPattern(sequences, sequence_type, nseq, nsite); - initial_column_state.assign(nsite, ""); - for (int seq = 0; seq < nseq; ++seq) { - for (int site = 0; site < nsite; ++site) - initial_column_state[site] += sequences[seq][site]; - } - perm_col = findRotatedColumnPermutation(); - return num_processed_column; - } + // If not enough columns, rebuild pattern and return + if (num_processed_column < num_column) { + buildPattern(sequences, sequence_type, nseq, nsite); + initial_column_state.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) { + for (int site = 0; site < nsite; ++site) + initial_column_state[site] += sequences[seq][site]; + } + perm_col = findRotatedColumnPermutation(); + return num_processed_column; + } - // Update alignment with new sequences - updateAlignmentNewSequences(sequences, perm_col); - return num_processed_column; + // Update alignment with new sequences + updateAlignmentNewSequences(sequences, perm_col); + return num_processed_column; } int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) { - StrVector sequences; - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(filename); - int nseq = 0; - int nsite = 0; - int seq_id = 0; - int num_missing_sequence = 0; - string line; - in.exceptions(ios::badbit); - int num_processed_column = 0; - - for (; !in.eof();) { - getline(in, line); - if (line == "") - continue; - vector words; - split(line, words, "\t"); - if (words.size() == 1) - continue; - if (words[1] == "POS") { - // Sample names start from the 10th word in the header - for (int i = 9; i < words.size(); i++) { - if (i - 9 >= existing_sequence) { - missing_seq_names.push_back(words[i]); - num_missing_sequence++; - } - else { - 
seq_names.push_back(words[i]); - nseq++; - } - } - sequences.resize(nseq, ""); - missing_sequences.resize(num_missing_sequence, ""); - existing_sample_mutations.resize(nseq); - missing_sample_mutations.resize(num_missing_sequence); - } - else { - if (words.size() != 9 + nseq + num_missing_sequence) - throw "Number of columns in VCF file is not consistent"; - vector alleles; - Mutation mutation; - int variant_pos = std::stoi(words[1]); - mutation.position = variant_pos; - mutation.compressed_position = num_processed_column; - while ((int)reference_nuc.size() <= mutation.compressed_position) - reference_nuc.push_back(0); - split(words[4], alleles, ","); - mutation.ref_nuc = getMutationFromState(words[3][0]); - if (reference_nuc[mutation.compressed_position] == 0) - reference_nuc[mutation.compressed_position] = mutation.ref_nuc; - for (int i = 9; i < words.size(); ++i) { - mutation.is_missing = false; - if (isdigit(words[i][0])) { - int allele_id = std::stoi(words[i]); - if (allele_id > 0) { - std::string allele = alleles[allele_id - 1]; - if (i - 9 < existing_sequence) - sequences[i - 9].push_back(allele[0]); - else - missing_sequences[i - 9 - existing_sequence].push_back(allele[0]); - - mutation.mut_nuc = getMutationFromState(allele[0]); - } - else { - if (i - 9 < existing_sequence) - sequences[i - 9].push_back(words[3][0]); - else - missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]); + StrVector sequences; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(filename); + int nseq = 0; + int nsite = 0; + int seq_id = 0; + int num_missing_sequence = 0; + string line; + in.exceptions(ios::badbit); + int num_processed_column = 0; + + for (; !in.eof();) { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words[1] == "POS") { + // Sample names start from the 10th word in the header + for (int i = 9; i < words.size(); i++) { + if (i - 9 >= 
existing_sequence) { + missing_seq_names.push_back(words[i]); + num_missing_sequence++; + } + else { + seq_names.push_back(words[i]); + nseq++; + } + } + sequences.resize(nseq, ""); + missing_sequences.resize(num_missing_sequence, ""); + existing_sample_mutations.resize(nseq); + missing_sample_mutations.resize(num_missing_sequence); + } + else { + if (words.size() != 9 + nseq + num_missing_sequence) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation mutation; + int variant_pos = std::stoi(words[1]); + mutation.position = variant_pos; + mutation.compressed_position = num_processed_column; + while ((int)reference_nuc.size() <= mutation.compressed_position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + mutation.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; + for (int i = 9; i < words.size(); ++i) { + mutation.is_missing = false; + if (isdigit(words[i][0])) { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < existing_sequence) + sequences[i - 9].push_back(allele[0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(allele[0]); + + mutation.mut_nuc = getMutationFromState(allele[0]); + } + else { + if (i - 9 < existing_sequence) + sequences[i - 9].push_back(words[3][0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]); - mutation.mut_nuc = getMutationFromState(words[3][0]); - } - } - else { - if (i - 9 < existing_sequence) - sequences[i - 9].push_back('-'); - else - missing_sequences[i - 9 - existing_sequence].push_back('-'); - mutation.mut_nuc = getMutationFromState('N'); - mutation.is_missing = true; - } - if (i - 9 >= existing_sequence) { - if (mutation.mut_nuc != mutation.ref_nuc) { - mutation.par_nuc = mutation.ref_nuc; - missing_sample_mutations[i - 9 - 
existing_sequence].push_back(mutation); - } - } - else - existing_sample_mutations[i - 9].push_back(mutation); - } - ++nsite; - ++num_processed_column; - } - } - initial_column_state.assign(nsite, ""); - for (int seq = 0; seq < nseq; ++seq) { - for (int site = 0; site < nsite; ++site) - initial_column_state[site] += sequences[seq][site]; - } - in.clear(); - in.exceptions(ios::failbit | ios::badbit); - in.close(); - return buildPattern(sequences, sequence_type, nseq, nsite); + mutation.mut_nuc = getMutationFromState(words[3][0]); + } + } + else { + if (i - 9 < existing_sequence) + sequences[i - 9].push_back('-'); + else + missing_sequences[i - 9 - existing_sequence].push_back('-'); + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; + } + if (i - 9 >= existing_sequence) { + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation); + } + } + else + existing_sample_mutations[i - 9].push_back(mutation); + } + ++nsite; + ++num_processed_column; + } + } + initial_column_state.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) { + for (int site = 0; site < nsite; ++site) + initial_column_state[site] += sequences[seq][site]; + } + in.clear(); + in.exceptions(ios::failbit | ios::badbit); + in.close(); + return buildPattern(sequences, sequence_type, nseq, nsite); } int Alignment::readPhylip(char *filename, char *sequence_type) { diff --git a/phylotree.cpp b/phylotree.cpp index 02bce8f8..974b4dda 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -5136,837 +5136,836 @@ void PhyloTree::printTransMatrices(Node *node, Node *dad) { void PhyloTree::allocateMutationMemory(int num_column) { - current_missing_sample_mutations.resize(num_column); - current_ancestral_mutations.resize(num_column); - visited_missing_sample_mutations.resize(num_column); - visited_ancestral_mutations.resize(num_column); - current_excess_mutations.resize(num_column); - 
visited_excess_mutations.resize(num_column); + current_missing_sample_mutations.resize(num_column); + current_ancestral_mutations.resize(num_column); + visited_missing_sample_mutations.resize(num_column); + visited_ancestral_mutations.resize(num_column); + current_excess_mutations.resize(num_column); + visited_excess_mutations.resize(num_column); } -void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad) -{ - PhyloNode *node = (PhyloNode *)dad_branch->node; - if (node->isLeaf() && dad) { - return; - } - - int ptn; - int nptn = aln->size(); - UINT *left = NULL, *right = NULL; - PhyloNeighbor *left_branch, *right_branch; - FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) { - if (!left) - left = ((PhyloNeighbor *)(*it))->partial_pars, left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); - else - right = ((PhyloNeighbor *)(*it))->partial_pars, right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); - } +void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + if (node->isLeaf() && dad) { + return; + } - int col = -1; - vector> left_branch_mutations, right_branch_mutations; - for (ptn = 0; ptn < aln->size(); ptn += 8) { - UINT left_states = left[ptn / 8]; - UINT right_states = right[ptn / 8]; - UINT dad_states = states_dad[ptn / 8]; - int maxi = aln->size() - ptn; - if (maxi > 8) maxi = 8; - for (int i = 0; i < maxi; i++) { - ++col; - UINT left_state = (left_states >> (i * 4)) & 15; - UINT right_state = (right_states >> (i * 4)) & 15; - UINT dad_state = (dad_states >> (i * 4)) & 15; - char dad_nuc = 0; - for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) - if (1 & (dad_state >> dad_nuc)) - break; + int ptn; + int nptn = aln->size(); + UINT *left = NULL, *right = NULL; + PhyloNeighbor *left_branch, 
*right_branch; + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + if (!left) + left = ((PhyloNeighbor *)(*it))->partial_pars, left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + else + right = ((PhyloNeighbor *)(*it))->partial_pars, right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + } - char left_child_nuc; - if ((1 & (left_state >> dad_nuc))) { - left_child_nuc = dad_nuc; - } - else { - for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) - if (1 & (left_state >> left_child_nuc)) - break; - Mutation left_child_mut; - left_child_mut.position = perm_col[col]; - left_child_mut.compressed_position = compressed_perm_col[col]; - left_child_mut.mut_nuc = (1 << left_child_nuc); - left_child_mut.par_nuc = (1 << dad_nuc); - left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; - left_branch->mutations.push_back(left_child_mut); - left_branch_mutations.push_back(make_pair(col, left_child_nuc)); - } - for (int nuc = 0; nuc < 4; ++nuc) { - if (nuc != left_child_nuc && (1 & (left_state >> nuc))) { - left[ptn / 8] ^= (1 << (i * 4 + nuc)); - } - } + int col = -1; + vector> left_branch_mutations, right_branch_mutations; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = left[ptn / 8]; + UINT right_states = right[ptn / 8]; + UINT dad_states = states_dad[ptn / 8]; + int maxi = aln->size() - ptn; + if (maxi > 8) maxi = 8; + for (int i = 0; i < maxi; i++) { + ++col; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; + char dad_nuc = 0; + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) + if (1 & (dad_state >> dad_nuc)) + break; + + char left_child_nuc; + if ((1 & (left_state >> dad_nuc))) { + left_child_nuc = dad_nuc; + } + else { + for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) + if (1 & (left_state >> left_child_nuc)) + break; + Mutation left_child_mut; + 
left_child_mut.position = perm_col[col]; + left_child_mut.compressed_position = compressed_perm_col[col]; + left_child_mut.mut_nuc = (1 << left_child_nuc); + left_child_mut.par_nuc = (1 << dad_nuc); + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; + left_branch->mutations.push_back(left_child_mut); + left_branch_mutations.push_back(make_pair(col, left_child_nuc)); + } + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != left_child_nuc && (1 & (left_state >> nuc))) { + left[ptn / 8] ^= (1 << (i * 4 + nuc)); + } + } - char right_child_nuc; - if ((1 & (right_state >> dad_nuc))) { - right_child_nuc = dad_nuc; - } - else { - for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) - if (1 & (right_state >> right_child_nuc)) - break; - Mutation right_child_mutation; - right_child_mutation.position = perm_col[col]; - right_child_mutation.compressed_position = compressed_perm_col[col]; - right_child_mutation.mut_nuc = (1 << right_child_nuc); - right_child_mutation.par_nuc = (1 << dad_nuc); - right_child_mutation.ref_nuc = aln->reference_nuc[right_child_mutation.compressed_position]; - right_branch->mutations.push_back(right_child_mutation); - right_branch_mutations.push_back(make_pair(col, right_child_nuc)); - } - for (int nuc = 0; nuc < 4; ++nuc) { - if (nuc != right_child_nuc && (1 & (right_state >> nuc))) { - right[ptn / 8] ^= (1 << (i * 4 + nuc)); - } - } - } - } + char right_child_nuc; + if ((1 & (right_state >> dad_nuc))) { + right_child_nuc = dad_nuc; + } + else { + for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) + if (1 & (right_state >> right_child_nuc)) + break; + Mutation right_child_mutation; + right_child_mutation.position = perm_col[col]; + right_child_mutation.compressed_position = compressed_perm_col[col]; + right_child_mutation.mut_nuc = (1 << right_child_nuc); + right_child_mutation.par_nuc = (1 << dad_nuc); + right_child_mutation.ref_nuc = 
aln->reference_nuc[right_child_mutation.compressed_position]; + right_branch->mutations.push_back(right_child_mutation); + right_branch_mutations.push_back(make_pair(col, right_child_nuc)); + } + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != right_child_nuc && (1 & (right_state >> nuc))) { + right[ptn / 8] ^= (1 << (i * 4 + nuc)); + } + } + } + } - bool left_child = true; - FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) { - if (left_child) { - computePartialMutation(left, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); - left_child = false; - continue; - } - computePartialMutation(right, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); - } + bool left_child = true; + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { + computePartialMutation(left, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); + left_child = false; + continue; + } + computePartialMutation(right, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); + } } void PhyloTree::computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { - PhyloNode *node = (PhyloNode *)dad_branch->node; - PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - assert(node_branch); - if (node->isLeaf()) { - PhyloNode *tmp_node = dad; - dad = node; - node = tmp_node; - PhyloNeighbor *tmp_nei = dad_branch; - dad_branch = node_branch; - node_branch = tmp_nei; - } - - int nptn = aln->size(); - UINT *left_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; - for (int ptn = 0; ptn < aln->size(); ptn += 8) { - left_branch_states_dad[ptn / 8] = 0; - } - UINT *right_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; - for (int ptn = 0; ptn < aln->size(); ptn += 8) { - right_branch_states_dad[ptn / 8] = 0; - } + PhyloNode *node = (PhyloNode *)dad_branch->node; + 
PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + assert(node_branch); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } - int i, ptn, col = -1; - for (ptn = 0; ptn < aln->size(); ptn += 8) { - UINT left_states = node_branch->partial_pars[ptn / 8]; - UINT right_states = dad_branch->partial_pars[ptn / 8]; - UINT dad_states = root_states[ptn / 8]; - int maxi = aln->size() - ptn; - if (maxi > 8) maxi = 8; - for (i = 0; i < maxi; i++) { - ++col; - UINT left_state = (left_states >> (i * 4)) & 15; - UINT right_state = (right_states >> (i * 4)) & 15; - UINT dad_state = (dad_states >> (i * 4)) & 15; - - char dad_nuc = 0; - for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) - if (1 & (dad_state >> dad_nuc)) - break; + int nptn = aln->size(); + UINT *left_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; + for (int ptn = 0; ptn < aln->size(); ptn += 8) { + left_branch_states_dad[ptn / 8] = 0; + } + UINT *right_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; + for (int ptn = 0; ptn < aln->size(); ptn += 8) { + right_branch_states_dad[ptn / 8] = 0; + } - char left_child_nuc; - if ((1 & (left_state >> dad_nuc))) { - left_child_nuc = dad_nuc; - } - else { - for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) - if (1 & (left_state >> left_child_nuc)) - break; - Mutation left_child_mut; - left_child_mut.position = perm_col[col]; - left_child_mut.compressed_position = compressed_perm_col[col]; - left_child_mut.mut_nuc = (1 << left_child_nuc); - left_child_mut.par_nuc = (1 << dad_nuc); - left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; - dad_branch->mutations.push_back(left_child_mut); - } - right_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + left_child_nuc)); + int i, ptn, col = -1; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = node_branch->partial_pars[ptn / 
8]; + UINT right_states = dad_branch->partial_pars[ptn / 8]; + UINT dad_states = root_states[ptn / 8]; + int maxi = aln->size() - ptn; + if (maxi > 8) maxi = 8; + for (i = 0; i < maxi; i++) { + ++col; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; + + char dad_nuc = 0; + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) + if (1 & (dad_state >> dad_nuc)) + break; + + char left_child_nuc; + if ((1 & (left_state >> dad_nuc))) { + left_child_nuc = dad_nuc; + } + else { + for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) + if (1 & (left_state >> left_child_nuc)) + break; + Mutation left_child_mut; + left_child_mut.position = perm_col[col]; + left_child_mut.compressed_position = compressed_perm_col[col]; + left_child_mut.mut_nuc = (1 << left_child_nuc); + left_child_mut.par_nuc = (1 << dad_nuc); + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; + dad_branch->mutations.push_back(left_child_mut); + } + right_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + left_child_nuc)); - char right_child_nuc; - if ((1 & (right_state >> dad_nuc))) { - right_child_nuc = dad_nuc; - } - else { - for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) - if (1 & (right_state >> right_child_nuc)) - break; - Mutation right_child_mut; - right_child_mut.position = perm_col[col]; - right_child_mut.compressed_position = compressed_perm_col[col]; - right_child_mut.mut_nuc = (1 << right_child_nuc); - right_child_mut.par_nuc = (1 << dad_nuc); - right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.compressed_position]; - node_branch->mutations.push_back(right_child_mut); - } - left_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + right_child_nuc)); - } - } + char right_child_nuc; + if ((1 & (right_state >> dad_nuc))) { + right_child_nuc = dad_nuc; + } + else { + for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) + if (1 & (right_state 
>> right_child_nuc)) + break; + Mutation right_child_mut; + right_child_mut.position = perm_col[col]; + right_child_mut.compressed_position = compressed_perm_col[col]; + right_child_mut.mut_nuc = (1 << right_child_nuc); + right_child_mut.par_nuc = (1 << dad_nuc); + right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.compressed_position]; + node_branch->mutations.push_back(right_child_mut); + } + left_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + right_child_nuc)); + } + } - computePartialMutation(left_branch_states_dad, perm_col, compressed_perm_col, dad_branch, dad); - computePartialMutation(right_branch_states_dad, perm_col, compressed_perm_col, node_branch, node); + computePartialMutation(left_branch_states_dad, perm_col, compressed_perm_col, dad_branch, dad); + computePartialMutation(right_branch_states_dad, perm_col, compressed_perm_col, node_branch, node); } void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm_col) { - // Compute parsimony is necessary for tracing back the mutations - computeParsimony(); - computeMutationBranch(perm_col, compressed_perm_col, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); - - int ptn = 0, counter = 0; - int nptn = aln->size(); - for (int i = 0; i < nptn; ++i) { - char root_nuc = ((root_states[ptn] >> (i * 4)) & 15); - char ref_nuc = aln->reference_nuc[compressed_perm_col[i]]; - if ((root_nuc & ref_nuc) == 0) { - char dad_nuc = 0; - for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) { - if (1 & (ref_nuc >> dad_nuc)) - break; - } + // Compute parsimony is necessary for tracing back the mutations + computeParsimony(); + computeMutationBranch(perm_col, compressed_perm_col, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + + int ptn = 0, counter = 0; + int nptn = aln->size(); + for (int i = 0; i < nptn; ++i) { + char root_nuc = ((root_states[ptn] >> (i * 4)) & 15); + char ref_nuc = aln->reference_nuc[compressed_perm_col[i]]; + if ((root_nuc & ref_nuc) == 0) { + char dad_nuc = 0; + for (dad_nuc = 
0; dad_nuc < 4; ++dad_nuc) { + if (1 & (ref_nuc >> dad_nuc)) + break; + } - char mut_nuc = 0; - for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc) { - if (1 & (root_nuc >> mut_nuc)) - break; - } + char mut_nuc = 0; + for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc) { + if (1 & (root_nuc >> mut_nuc)) + break; + } - Mutation mutation; - mutation.position = perm_col[i]; - mutation.compressed_position = compressed_perm_col[i]; - mutation.mut_nuc = (1 << mut_nuc); - mutation.ref_nuc = ref_nuc; - mutation.par_nuc = (1 << dad_nuc); - root_mutations.push_back(mutation); - } - ++counter; - if (counter == 8) { - counter = 0; - ++ptn; - } - } + Mutation mutation; + mutation.position = perm_col[i]; + mutation.compressed_position = compressed_perm_col[i]; + mutation.mut_nuc = (1 << mut_nuc); + mutation.ref_nuc = ref_nuc; + mutation.par_nuc = (1 << dad_nuc); + root_mutations.push_back(mutation); + } + ++counter; + if (counter == 8) { + counter = 0; + ++ptn; + } + } } int PhyloTree::computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad) { - int parsimony_score = 0; - PhyloNode *node = (PhyloNode *)dad_branch->node; - PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - parsimony_score += node_branch->mutations.size(); - FOR_NEIGHBOR_IT(node, dad, it) - if ((*it)->node->name != ROOT_NAME) { - parsimony_score += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node); - } - return parsimony_score; + int parsimony_score = 0; + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + parsimony_score += node_branch->mutations.size(); + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + parsimony_score += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node); + } + return parsimony_score; } int PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { - PhyloNode *node = (PhyloNode *)dad_branch->node; - 
PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - assert(node_branch); - if (node->isLeaf()) { - PhyloNode *tmp_node = dad; - dad = node; - node = tmp_node; - PhyloNeighbor *tmp_nei = dad_branch; - dad_branch = node_branch; - node_branch = tmp_nei; - } + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + assert(node_branch); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } - int parsimony_score = 0; - parsimony_score += computePartialParsimonyMutation(dad_branch, dad); - parsimony_score += computePartialParsimonyMutation(node_branch, node); - return parsimony_score; + int parsimony_score = 0; + parsimony_score += computePartialParsimonyMutation(dad_branch, dad); + parsimony_score += computePartialParsimonyMutation(node_branch, node); + return parsimony_score; } int PhyloTree::computeParsimonyScoreMutation() { - int parsimony_score = computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); - parsimony_score += root_mutations.size(); - return parsimony_score; + int parsimony_score = computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + parsimony_score += root_mutations.size(); + return parsimony_score; } void PhyloTree::initNodeDataPlaceNewSample() { - assert(root->isLeaf()); - PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); - current_it = nei; - assert(current_it); - current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root); - assert(current_it_back); - - vector> bfs; - queue> node_queue; - node_queue.push(make_pair((PhyloNode *)nei->node, current_it_back)); - while (node_queue.size()) { - PhyloNode *node = node_queue.front().first; - PhyloNeighbor *node_branch = node_queue.front().second; - node->dad = (PhyloNode *)node_branch->node; - PhyloNode *dad = 
(PhyloNode *)node_branch->node; - node_queue.pop(); - bfs.push_back(make_pair(node, node_branch)); - FOR_NEIGHBOR_IT(node, dad, it) { - node_queue.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node))); - } - } + assert(root->isLeaf()); + PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); + current_it = nei; + assert(current_it); + current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root); + assert(current_it_back); + + vector> bfs; + queue> node_queue; + node_queue.push(make_pair((PhyloNode *)nei->node, current_it_back)); + while (node_queue.size()) { + PhyloNode *node = node_queue.front().first; + PhyloNeighbor *node_branch = node_queue.front().second; + node->dad = (PhyloNode *)node_branch->node; + PhyloNode *dad = (PhyloNode *)node_branch->node; + node_queue.pop(); + bfs.push_back(make_pair(node, node_branch)); + FOR_NEIGHBOR_IT(node, dad, it) { + node_queue.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node))); + } + } - for (int i = bfs.size() - 1; i >= 0; --i) { - PhyloNode *node = bfs[i].first; - PhyloNeighbor *node_branch = bfs[i].second; - PhyloNode *dad = (PhyloNode *)node_branch->node; - node_branch->num_leaves = 0; - if (node->isLeaf()) { - node_branch->num_leaves = 1; - continue; - } - FOR_NEIGHBOR_IT(node, dad, it) { - node_branch->num_leaves += ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->num_leaves; - } - } + for (int i = bfs.size() - 1; i >= 0; --i) { + PhyloNode *node = bfs[i].first; + PhyloNeighbor *node_branch = bfs[i].second; + PhyloNode *dad = (PhyloNode *)node_branch->node; + node_branch->num_leaves = 0; + if (node->isLeaf()) { + node_branch->num_leaves = 1; + continue; + } + FOR_NEIGHBOR_IT(node, dad, it) { + node_branch->num_leaves += ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->num_leaves; + } + } } void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) { - vector anc_positions; - vector ancestral_mutations; + vector 
anc_positions; + vector ancestral_mutations; - timer_regular--; - for (auto mutation : (*input.missing_sample_mutations)) { - visited_missing_sample_mutations[mutation.compressed_position] = timer_regular; - current_missing_sample_mutations[mutation.compressed_position] = mutation; - } + timer_regular--; + for (auto mutation : (*input.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_regular; + current_missing_sample_mutations[mutation.compressed_position] = mutation; + } - if (!(input.node == root)) { - for (auto node_mutation : input.node_branch->mutations) { - auto anc_nuc = node_mutation.mut_nuc; - if (node_mutation.is_masked()) - break; - assert(((anc_nuc - 1) & anc_nuc) == 0); - bool found = false; - bool found_pos = false; - if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_regular) { - auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; - if (node_mutation.position == missing_sample_mutation.position) { - found_pos = true; - if (missing_sample_mutation.is_missing) { - found = true; - } - else { - auto nuc = missing_sample_mutation.mut_nuc; - if ((nuc & anc_nuc) != 0) { - ancestral_mutations.emplace_back(node_mutation); - anc_positions.emplace_back(node_mutation.compressed_position); - (*input.excess_mutations).emplace_back(node_mutation); - found = true; - } - } - } - } - if (!found) { - if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { - ancestral_mutations.emplace_back(node_mutation); - anc_positions.emplace_back(node_mutation.compressed_position); - (*input.excess_mutations).emplace_back(node_mutation); + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) + break; + assert(((anc_nuc - 1) & anc_nuc) == 0); + bool found = false; + bool found_pos = false; + if 
(visited_missing_sample_mutations[node_mutation.compressed_position] == timer_regular) { + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { + found_pos = true; + if (missing_sample_mutation.is_missing) { + found = true; + } + else { + auto nuc = missing_sample_mutation.mut_nuc; + if ((nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); + found = true; + } + } + } + } + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); - } - } - } - } - for (auto ancestral_mutation : ancestral_mutations) { - visited_ancestral_mutations[ancestral_mutation.compressed_position] = timer_regular; - current_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; - } + } + } + } + } + for (auto ancestral_mutation : ancestral_mutations) { + visited_ancestral_mutations[ancestral_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; + } - { - PhyloNode *n = input.node; - while (n->dad != root) - { - n = n->dad; - PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad); - for (auto node_mutation : node_branch->mutations) { - if (!node_mutation.is_masked() && visited_ancestral_mutations[node_mutation.compressed_position] != timer_regular) { - ancestral_mutations.emplace_back(node_mutation); - anc_positions.emplace_back(node_mutation.compressed_position); - visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; - current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; - } - } 
- } - for (auto root_mutation : root_mutations) { - if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_regular) { - ancestral_mutations.emplace_back(root_mutation); - anc_positions.emplace_back(root_mutation.compressed_position); - visited_ancestral_mutations[root_mutation.compressed_position] = timer_regular; - current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; - } - } - } + { + PhyloNode *n = input.node; + while (n->dad != root) + { + n = n->dad; + PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad); + for (auto node_mutation : node_branch->mutations) { + if (!node_mutation.is_masked() && visited_ancestral_mutations[node_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; + } + } + } + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(root_mutation); + anc_positions.emplace_back(root_mutation.compressed_position); + visited_ancestral_mutations[root_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; + } + } + } - for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { - if (missing_sample_mutation.is_missing) { - continue; - } + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { + continue; + } - bool found_pos = false; - bool found = false; - bool nuc = false; - auto anc_nuc = missing_sample_mutation.ref_nuc; + bool found_pos = false; + bool found = false; + bool nuc = false; + auto 
anc_nuc = missing_sample_mutation.ref_nuc; - if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { - nuc = true; - } - if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_regular) { - auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; - if (!ancestral_mutation.is_masked()) { - found_pos = true; - anc_nuc = ancestral_mutation.mut_nuc; - if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { - found = true; - } - } - } - if (!found && (found_pos || !nuc)) { - Mutation mutation; - mutation.position = missing_sample_mutation.position; - mutation.compressed_position = missing_sample_mutation.compressed_position; - mutation.ref_nuc = missing_sample_mutation.ref_nuc; - mutation.par_nuc = anc_nuc; - for (int nuc = 0; nuc < 4; nuc++) { - if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) { - mutation.mut_nuc = (1 << nuc); - break; - } - } - assert((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0); - if (mutation.mut_nuc != mutation.par_nuc) { - input.excess_mutations->emplace_back(mutation); - } - } - } + if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { + nuc = true; + } + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { + found_pos = true; + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && (found_pos || !nuc)) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) { + 
mutation.mut_nuc = (1 << nuc); + break; + } + } + assert((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0); + if (mutation.mut_nuc != mutation.par_nuc) { + input.excess_mutations->emplace_back(mutation); + } + } + } - for (auto ancestral_mutation : ancestral_mutations) { - bool found = false; - bool found_pos = false; - auto anc_nuc = ancestral_mutation.mut_nuc; - if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_regular) { - if (!ancestral_mutation.is_masked()) { - auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; - found_pos = true; - if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { - found = true; - } - } - } - if (!found && !found_pos && (ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { - Mutation mutation; - mutation.position = ancestral_mutation.position; - mutation.compressed_position = ancestral_mutation.compressed_position; - mutation.ref_nuc = ancestral_mutation.ref_nuc; - mutation.par_nuc = anc_nuc; - mutation.mut_nuc = ancestral_mutation.ref_nuc; - assert(mutation.is_masked() || ((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0)); - if (mutation.mut_nuc != mutation.par_nuc) { - (*input.excess_mutations).emplace_back(mutation); - } - } - } + for (auto ancestral_mutation : ancestral_mutations) { + bool found = false; + bool found_pos = false; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_regular) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; + found_pos = true; + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && !found_pos && (ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + Mutation 
mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + assert(mutation.is_masked() || ((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0)); + if (mutation.mut_nuc != mutation.par_nuc) { + (*input.excess_mutations).emplace_back(mutation); + } + } + } } void PhyloTree::initNewSampleMutations(PlacementCandidateNode &inp) { - ++timer_optimized; - for (auto mutation : (*inp.missing_sample_mutations)) { - visited_missing_sample_mutations[mutation.compressed_position] = timer_optimized; - current_missing_sample_mutations[mutation.compressed_position] = mutation; - } + ++timer_optimized; + for (auto mutation : (*inp.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_optimized; + current_missing_sample_mutations[mutation.compressed_position] = mutation; + } } void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation mutation, int &set_difference) { - if (visited_excess_mutations[mutation.compressed_position] == timer_optimized) { - erased_excess_mutation.emplace_back(current_excess_mutations[mutation.compressed_position]); - visited_excess_mutations[mutation.compressed_position] = 0; - --set_difference; - } + if (visited_excess_mutations[mutation.compressed_position] == timer_optimized) { + erased_excess_mutation.emplace_back(current_excess_mutations[mutation.compressed_position]); + visited_excess_mutations[mutation.compressed_position] = 0; + --set_difference; + } } void PhyloTree::addMutation(vector &added_excess_mutation, Mutation mutation, int diff, int &set_difference) { - added_excess_mutation.push_back(mutation); - visited_excess_mutations[mutation.compressed_position] = timer_optimized; - current_excess_mutations[mutation.compressed_position] = mutation; - set_difference += diff; + 
added_excess_mutation.push_back(mutation); + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + current_excess_mutations[mutation.compressed_position] = mutation; + set_difference += diff; } void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference) { - vector ancentral_positions; - vector ancestral_mutations; - vector erased_excess_mutation; - vector added_excess_mutation; - vector common_mutations; - vector diff_mutations; - - if (!(input.node == root)) { - for (auto node_mutation : input.node_branch->mutations) { - auto anc_nuc = node_mutation.mut_nuc; - if (node_mutation.is_masked()) { - break; - } - assert(((anc_nuc - 1) & anc_nuc) == 0); - bool found = false; - bool found_pos = false; - if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_optimized) { - auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; - if (node_mutation.position == missing_sample_mutation.position) { - found_pos = true; - if (missing_sample_mutation.is_missing) { - found = true; - } - else { - auto sample_nuc = missing_sample_mutation.mut_nuc; - if ((sample_nuc & anc_nuc) != 0) { - ancestral_mutations.emplace_back(node_mutation); - ancentral_positions.emplace_back(node_mutation.compressed_position); - eraseMutation(erased_excess_mutation, node_mutation, set_difference); - addMutation(added_excess_mutation, node_mutation, 0, set_difference); - common_mutations.emplace_back(node_mutation); - found = true; - } - } - } - } - if (!found) { - if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { - ancestral_mutations.emplace_back(node_mutation); - ancentral_positions.emplace_back(node_mutation.compressed_position); - eraseMutation(erased_excess_mutation, node_mutation, set_difference); - addMutation(added_excess_mutation, node_mutation, 0, set_difference); - common_mutations.emplace_back(node_mutation); - } - else { - 
diff_mutations.emplace_back(node_mutation); - } - } - } - } + vector ancentral_positions; + vector ancestral_mutations; + vector erased_excess_mutation; + vector added_excess_mutation; + vector common_mutations; + vector diff_mutations; + + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) { + break; + } + assert(((anc_nuc - 1) & anc_nuc) == 0); + bool found = false; + bool found_pos = false; + if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_optimized) { + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { + found_pos = true; + if (missing_sample_mutation.is_missing) { + found = true; + } + else { + auto sample_nuc = missing_sample_mutation.mut_nuc; + if ((sample_nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); + found = true; + } + } + } + } + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); + } + else { + diff_mutations.emplace_back(node_mutation); + } + } + } + } - if (input.node->dad == root) { - for (auto root_mutation : root_mutations) { - if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_optimized) { - 
ancestral_mutations.emplace_back(root_mutation); - ancentral_positions.emplace_back(root_mutation.compressed_position); - visited_ancestral_mutations[root_mutation.compressed_position] = timer_optimized; - current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; - } - } + if (input.node->dad == root) { + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_optimized) { + ancestral_mutations.emplace_back(root_mutation); + ancentral_positions.emplace_back(root_mutation.compressed_position); + visited_ancestral_mutations[root_mutation.compressed_position] = timer_optimized; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; + } + } - for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { - if (missing_sample_mutation.is_missing) { - continue; - } - bool found = false; - bool found_pos = false; - bool has_ref = false; - auto anc_nuc = missing_sample_mutation.ref_nuc; - if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { - has_ref = true; - } + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { + continue; + } + bool found = false; + bool found_pos = false; + bool has_ref = false; + auto anc_nuc = missing_sample_mutation.ref_nuc; + if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { + has_ref = true; + } - if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_optimized) { - auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; - if (!ancestral_mutation.is_masked()) { - found_pos = true; - anc_nuc = ancestral_mutation.mut_nuc; - if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { - found = true; - } - } - } - if (!found && !has_ref) { - Mutation mutation; - mutation.position = 
missing_sample_mutation.position; - mutation.compressed_position = missing_sample_mutation.compressed_position; - mutation.ref_nuc = missing_sample_mutation.ref_nuc; - mutation.par_nuc = anc_nuc; - for (int nuc = 0; nuc < 4; nuc++) { - if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) - { - mutation.mut_nuc = (1 << nuc); - break; - } - } - addMutation(added_excess_mutation, mutation, 1, set_difference); - } - } - } + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_optimized) { + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { + found_pos = true; + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && !has_ref) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) + { + mutation.mut_nuc = (1 << nuc); + break; + } + } + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } + } - for (auto ancestral_mutation : ancestral_mutations) { - bool found = false; - bool found_pos = false; - auto anc_nuc = ancestral_mutation.mut_nuc; - if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_optimized) { - if (!ancestral_mutation.is_masked()) { - auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; - found_pos = true; - if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { - found = true; - } - } - } - if (!found && (found_pos || ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { - 
eraseMutation(erased_excess_mutation, ancestral_mutation, set_difference); - Mutation mutation; - mutation.position = ancestral_mutation.position; - mutation.compressed_position = ancestral_mutation.compressed_position; - mutation.ref_nuc = ancestral_mutation.ref_nuc; - mutation.par_nuc = anc_nuc; - mutation.mut_nuc = ancestral_mutation.ref_nuc; - if (mutation.mut_nuc != mutation.par_nuc) { - addMutation(added_excess_mutation, mutation, 1, set_difference); - } - } - } + for (auto ancestral_mutation : ancestral_mutations) { + bool found = false; + bool found_pos = false; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_optimized) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; + found_pos = true; + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && (found_pos || ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + eraseMutation(erased_excess_mutation, ancestral_mutation, set_difference); + Mutation mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } + } - size_t num_leaves = input.node_branch->num_leaves; - if (set_difference < *input.best_set_difference) { - *input.best_set_difference = set_difference; - *input.best_node_num_leaves = num_leaves; - input.best_node = input.node; - input.best_node_branch = input.node_branch; - } - else if (set_difference == *input.best_set_difference) { - if (((num_leaves >= *input.best_node_num_leaves))) { - 
*input.best_set_difference = set_difference; - *input.best_node_num_leaves = num_leaves; - input.best_node = input.node; - input.best_node_branch = input.node_branch; - } - } + size_t num_leaves = input.node_branch->num_leaves; + if (set_difference < *input.best_set_difference) { + *input.best_set_difference = set_difference; + *input.best_node_num_leaves = num_leaves; + input.best_node = input.node; + input.best_node_branch = input.node_branch; + } + else if (set_difference == *input.best_set_difference) { + if (((num_leaves >= *input.best_node_num_leaves))) { + *input.best_set_difference = set_difference; + *input.best_node_num_leaves = num_leaves; + input.best_node = input.node; + input.best_node_branch = input.node_branch; + } + } - for (auto common_mutation : common_mutations) { - visited_excess_mutations[common_mutation.compressed_position] = 0; - } + for (auto common_mutation : common_mutations) { + visited_excess_mutations[common_mutation.compressed_position] = 0; + } - for (auto diff_mutation : diff_mutations) { - Mutation mutation; - mutation.ref_nuc = diff_mutation.ref_nuc; - mutation.par_nuc = diff_mutation.mut_nuc; - mutation.mut_nuc = diff_mutation.ref_nuc; - mutation.position = diff_mutation.position; - mutation.compressed_position = diff_mutation.compressed_position; - if (visited_missing_sample_mutations[diff_mutation.compressed_position] == timer_optimized) { - mutation.mut_nuc = current_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; - } - eraseMutation(erased_excess_mutation, mutation, set_difference); - if (mutation.mut_nuc != mutation.par_nuc) { - addMutation(added_excess_mutation, mutation, 1, set_difference); - } - } + for (auto diff_mutation : diff_mutations) { + Mutation mutation; + mutation.ref_nuc = diff_mutation.ref_nuc; + mutation.par_nuc = diff_mutation.mut_nuc; + mutation.mut_nuc = diff_mutation.ref_nuc; + mutation.position = diff_mutation.position; + mutation.compressed_position = 
diff_mutation.compressed_position; + if (visited_missing_sample_mutations[diff_mutation.compressed_position] == timer_optimized) { + mutation.mut_nuc = current_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; + } + eraseMutation(erased_excess_mutation, mutation, set_difference); + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } - PhyloNode *node = input.node; - PhyloNode *dad = node->dad; - FOR_NEIGHBOR_IT(node, dad, it) { - PhyloNode *child_node = (PhyloNode *)(*it)->node; - PhyloNeighbor *child_node_branch = (PhyloNeighbor *)child_node->findNeighbor(node); - input.node = child_node; - input.node_branch = child_node_branch; - optimizedFindPositionPlaceNewSample(input, set_difference); - } + PhyloNode *node = input.node; + PhyloNode *dad = node->dad; + FOR_NEIGHBOR_IT(node, dad, it) { + PhyloNode *child_node = (PhyloNode *)(*it)->node; + PhyloNeighbor *child_node_branch = (PhyloNeighbor *)child_node->findNeighbor(node); + input.node = child_node; + input.node_branch = child_node_branch; + optimizedFindPositionPlaceNewSample(input, set_difference); + } - for (auto mutation : added_excess_mutation) { - visited_excess_mutations[mutation.compressed_position] = 0; - } + for (auto mutation : added_excess_mutation) { + visited_excess_mutations[mutation.compressed_position] = 0; + } - for (int i = erased_excess_mutation.size() - 1; i >= 0; i--) { - Mutation mutation = erased_excess_mutation[i]; - visited_excess_mutations[mutation.compressed_position] = timer_optimized; - current_excess_mutations[mutation.compressed_position] = mutation; - } + for (int i = erased_excess_mutation.size() - 1; i >= 0; i--) { + Mutation mutation = erased_excess_mutation[i]; + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + current_excess_mutations[mutation.compressed_position] = mutation; + } } void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, 
vector node_excess_mutations, int index, string sample_name) { - PhyloNode *new_node = (PhyloNode *)newNode(); - PhyloNode *sample = (PhyloNode *)newNode(aln->getNSeq() + index, sample_name.c_str()); - sample->setMissingNode(index); - new_node->addNeighbor(sample, -1.0); - sample->addNeighbor(new_node, -1.0); - PhyloNode *best_dad = (PhyloNode *)best_node_branch->node; - - vector common_mutations, best_node_mutations, sample_mutations; - vector current_node_mutations; - // Compute current best node branch mutations - for (auto node_mutation : best_node_branch->mutations) { - current_node_mutations.emplace_back(node_mutation); - } + PhyloNode *new_node = (PhyloNode *)newNode(); + PhyloNode *sample = (PhyloNode *)newNode(aln->getNSeq() + index, sample_name.c_str()); + sample->setMissingNode(index); + new_node->addNeighbor(sample, -1.0); + sample->addNeighbor(new_node, -1.0); + PhyloNode *best_dad = (PhyloNode *)best_node_branch->node; + + vector common_mutations, best_node_mutations, sample_mutations; + vector current_node_mutations; + // Compute current best node branch mutations + for (auto node_mutation : best_node_branch->mutations) { + current_node_mutations.emplace_back(node_mutation); + } - best_node_branch->clearMutations(); - --timer_regular; - for (auto node_mutation : current_node_mutations) { - visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; - current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; - } - for (auto excess_mutation : node_excess_mutations) { - visited_excess_mutations[excess_mutation.compressed_position] = timer_regular; - current_excess_mutations[excess_mutation.compressed_position] = excess_mutation; - } - for (auto node_mutation : current_node_mutations) { - bool found = false; - if (!node_mutation.is_masked()) { - if (visited_excess_mutations[node_mutation.compressed_position] == timer_regular) { - auto excess_mutation = current_excess_mutations[node_mutation.compressed_position]; 
- if (node_mutation.position == excess_mutation.position) { - if (node_mutation.mut_nuc == excess_mutation.mut_nuc) { - found = true; - } - } - } - } - if (!found) { - best_node_mutations.emplace_back(node_mutation); - } - } - // Compute sample mutations - for (auto excess_mutation : node_excess_mutations) { - bool found = false; - if (!excess_mutation.is_masked()) { - if (visited_ancestral_mutations[excess_mutation.compressed_position] == timer_regular) { - auto ancestral_mutation = current_ancestral_mutations[excess_mutation.compressed_position]; - if (excess_mutation.position == ancestral_mutation.position) { - if (excess_mutation.mut_nuc == ancestral_mutation.mut_nuc) { - found = true; - Mutation m = excess_mutation.copy(); - common_mutations.emplace_back(m); - } - } - } - } - if (!found) { - sample_mutations.emplace_back(excess_mutation); - } - } + best_node_branch->clearMutations(); + --timer_regular; + for (auto node_mutation : current_node_mutations) { + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; + } + for (auto excess_mutation : node_excess_mutations) { + visited_excess_mutations[excess_mutation.compressed_position] = timer_regular; + current_excess_mutations[excess_mutation.compressed_position] = excess_mutation; + } + for (auto node_mutation : current_node_mutations) { + bool found = false; + if (!node_mutation.is_masked()) { + if (visited_excess_mutations[node_mutation.compressed_position] == timer_regular) { + auto excess_mutation = current_excess_mutations[node_mutation.compressed_position]; + if (node_mutation.position == excess_mutation.position) { + if (node_mutation.mut_nuc == excess_mutation.mut_nuc) { + found = true; + } + } + } + } + if (!found) { + best_node_mutations.emplace_back(node_mutation); + } + } + // Compute sample mutations + for (auto excess_mutation : node_excess_mutations) { + bool found = false; + if 
(!excess_mutation.is_masked()) { + if (visited_ancestral_mutations[excess_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = current_ancestral_mutations[excess_mutation.compressed_position]; + if (excess_mutation.position == ancestral_mutation.position) { + if (excess_mutation.mut_nuc == ancestral_mutation.mut_nuc) { + found = true; + Mutation m = excess_mutation.copy(); + common_mutations.emplace_back(m); + } + } + } + } + if (!found) { + sample_mutations.emplace_back(excess_mutation); + } + } - new_node->addNeighbor(best_node, -1.0); - new_node->addNeighbor(best_dad, -1.0); - best_node->updateNeighbor(best_dad, new_node, -1.0); - best_dad->updateNeighbor(best_node, new_node, -1.0); - // Add mutations to new node using common_mut - PhyloNeighbor *new_node_branch = (PhyloNeighbor *)new_node->findNeighbor(best_dad); - new_node_branch->mutations = common_mutations; + new_node->addNeighbor(best_node, -1.0); + new_node->addNeighbor(best_dad, -1.0); + best_node->updateNeighbor(best_dad, new_node, -1.0); + best_dad->updateNeighbor(best_node, new_node, -1.0); + // Add mutations to new node using common_mut + PhyloNeighbor *new_node_branch = (PhyloNeighbor *)new_node->findNeighbor(best_dad); + new_node_branch->mutations = common_mutations; - PhyloNeighbor *new_best_node_branch = (PhyloNeighbor *)best_node->findNeighbor(new_node); - new_best_node_branch->mutations = best_node_mutations; + PhyloNeighbor *new_best_node_branch = (PhyloNeighbor *)best_node->findNeighbor(new_node); + new_best_node_branch->mutations = best_node_mutations; - PhyloNeighbor *sample_branch = (PhyloNeighbor *)sample->findNeighbor(new_node); - sample_branch->mutations = sample_mutations; + PhyloNeighbor *sample_branch = (PhyloNeighbor *)sample->findNeighbor(new_node); + sample_branch->mutations = sample_mutations; } string PhyloTree::verifyPartialMutationCorrectness(vector &position, PhyloNeighbor *dad_branch, PhyloNode *dad) { - PhyloNode *node = (PhyloNode 
*)dad_branch->node; - int nsite = aln->getNSite(); - - if (node->isLeaf() && dad) { - PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - string sequence = ""; - assert(node->id < aln->getNSeq()); - for (int i = 0; i < nsite; ++i) { - Pattern pattern = aln->getPattern(i); - sequence += aln->convertStateBack(pattern[node->id]); - } - return sequence; - } - else { - string left_sequence, right_sequence; - PhyloNeighbor *left_branch, *right_branch; - bool left_child = true; - FOR_NEIGHBOR_IT(node, dad, it) { - if ((*it)->node->name != ROOT_NAME) { - if (left_child) { - left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); - left_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); - left_child = false; - continue; - } - right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); - right_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); - } - } - for (auto mutation : left_branch->mutations) { - assert(position[mutation.compressed_position] < (int)left_sequence.length()); - left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); - } - for (auto mutation : right_branch->mutations) { - assert(position[mutation.compressed_position] < (int)right_sequence.length()); - right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); - } + PhyloNode *node = (PhyloNode *)dad_branch->node; + int nsite = aln->getNSite(); + + if (node->isLeaf() && dad) { + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + string sequence = ""; + assert(node->id < aln->getNSeq()); + for (int i = 0; i < nsite; ++i) { + Pattern pattern = aln->getPattern(i); + sequence += aln->convertStateBack(pattern[node->id]); + } + return sequence; + } + else { + string left_sequence, right_sequence; + PhyloNeighbor *left_branch, *right_branch; + bool left_child = true; + 
FOR_NEIGHBOR_IT(node, dad, it) { + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { + left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + left_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); + left_child = false; + continue; + } + right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + right_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); + } + } + for (auto mutation : left_branch->mutations) { + assert(position[mutation.compressed_position] < (int)left_sequence.length()); + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + for (auto mutation : right_branch->mutations) { + assert(position[mutation.compressed_position] < (int)right_sequence.length()); + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } - if (left_sequence != right_sequence) { - for (int i = 0; i < (int)left_sequence.length(); ++i) { - if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { - cout << "Compute mutations wrong"; - exit(1); - } - } - } - return left_sequence; - } + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong"; + exit(1); + } + } + } + return left_sequence; + } } void PhyloTree::verifyMutationCorrectnessBranch(vector &position, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { - PhyloNode *node = (PhyloNode *)dad_branch->node; - PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); - if (node->isLeaf()) { - PhyloNode *tmp_node = dad; - dad = node; - node = tmp_node; - PhyloNeighbor *tmp_nei = 
dad_branch; - dad_branch = node_branch; - node_branch = tmp_nei; - } + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } - string left_sequence = verifyPartialMutationCorrectness(position, dad_branch, dad); - string right_sequence = verifyPartialMutationCorrectness(position, node_branch, node); - for (auto mutation : node_branch->mutations) { - left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); - } - for (auto mutation : dad_branch->mutations) { - right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); - } - if (left_sequence != right_sequence) { - for (int i = 0; i < (int)left_sequence.length(); ++i) { - if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { - cout << "Compute mutations wrong at root"; - exit(1); - } - } - } + string left_sequence = verifyPartialMutationCorrectness(position, dad_branch, dad); + string right_sequence = verifyPartialMutationCorrectness(position, node_branch, node); + for (auto mutation : node_branch->mutations) { + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + for (auto mutation : dad_branch->mutations) { + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong at root"; + exit(1); + } + } + } } void 
PhyloTree::verifyMutationCorrectness() { - cout << "========== Start checking mutations ==========\n"; - vector perm_col = aln->findRotatedColumnPermutation(); - int nsite = aln->getNSite(); - assert(perm_col.size() == nsite); - vector sorted_perm_col(perm_col); - sort(sorted_perm_col.begin(), sorted_perm_col.end()); - vector compressed_perm_col; - for (int col : perm_col) { - int idx = lower_bound(sorted_perm_col.begin(), sorted_perm_col.end(), col) - sorted_perm_col.begin(); - compressed_perm_col.push_back(idx); - } - vector position(nsite); - for (int i = 0; i < nsite; ++i) { - position[compressed_perm_col[i]] = i; - } - verifyMutationCorrectnessBranch(position, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); - cout << "Compute mutation correctly\n"; - cout << "========== End checking mutations ==========\n"; + cout << "========== Start checking mutations ==========\n"; + vector perm_col = aln->findRotatedColumnPermutation(); + int nsite = aln->getNSite(); + assert(perm_col.size() == nsite); + vector sorted_perm_col(perm_col); + sort(sorted_perm_col.begin(), sorted_perm_col.end()); + vector compressed_perm_col; + for (int col : perm_col) { + int idx = lower_bound(sorted_perm_col.begin(), sorted_perm_col.end(), col) - sorted_perm_col.begin(); + compressed_perm_col.push_back(idx); + } + vector position(nsite); + for (int i = 0; i < nsite; ++i) { + position[compressed_perm_col[i]] = i; + } + verifyMutationCorrectnessBranch(position, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + cout << "Compute mutation correctly\n"; + cout << "========== End checking mutations ==========\n"; } \ No newline at end of file From eae6c6a398c2ebf1f7ad900d69b4c0e840029c5f Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Thu, 12 Jun 2025 19:04:50 +0700 Subject: [PATCH 17/23] update spacing --- phylotree.cpp | 70 +++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/phylotree.cpp b/phylotree.cpp index 
974b4dda..0a81af31 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -1058,41 +1058,41 @@ int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, } } } else if (aln->num_states == 4 && aln->seq_type == SEQ_DNA) { - // ULTRAFAST VERSION FOR DNA - for (ptn = 0; ptn < aln->size(); ptn += 8) - { - UINT states_left = node_branch->partial_pars[ptn / 8]; - UINT states_right = dad_branch->partial_pars[ptn / 8]; - UINT states_dad = 0; - int maxi = aln->size() - ptn; - if (maxi > 8) - maxi = 8; - for (i = 0; i < maxi; i++) - { - UINT state_left = (states_left >> (i * 4)) & 15; - UINT state_right = (states_right >> (i * 4)) & 15; - UINT state_both = state_left | (state_right << 4); - // cout << state_left << " " << states_right << " " << state_right << " " << dna_fitch_result[state_both] << endl; - states_dad |= dna_fitch_result[state_both] << (i * 4); - tree_pars += dna_fitch_step[state_both] * aln->at(ptn + i).frequency; - _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + - dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; - } - if (add_row) { - for (int i = 0; i < maxi; ++i) { - for (int j = 0; j < 4; ++j) { - if (states_dad & (1 << (i * 4 + j))) { - for (int k = j + 1; k < 4; ++k) { - if (states_dad & (1 << (i * 4 + k))) { - states_dad ^= (1 << (i * 4 + k)); - } - } - break; - } - } - } - root_states[ptn / 8] = states_dad; - } + // ULTRAFAST VERSION FOR DNA + for (ptn = 0; ptn < aln->size(); ptn += 8) + { + UINT states_left = node_branch->partial_pars[ptn / 8]; + UINT states_right = dad_branch->partial_pars[ptn / 8]; + UINT states_dad = 0; + int maxi = aln->size() - ptn; + if (maxi > 8) + maxi = 8; + for (i = 0; i < maxi; i++) + { + UINT state_left = (states_left >> (i * 4)) & 15; + UINT state_right = (states_right >> (i * 4)) & 15; + UINT state_both = state_left | (state_right << 4); + // cout << state_left << " " << states_right << " " << state_right << " " << 
dna_fitch_result[state_both] << endl; + states_dad |= dna_fitch_result[state_both] << (i * 4); + tree_pars += dna_fitch_step[state_both] * aln->at(ptn + i).frequency; + _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; + } + if (add_row) { + for (int i = 0; i < maxi; ++i) { + for (int j = 0; j < 4; ++j) { + if (states_dad & (1 << (i * 4 + j))) { + for (int k = j + 1; k < 4; ++k) { + if (states_dad & (1 << (i * 4 + k))) { + states_dad ^= (1 << (i * 4 + k)); + } + } + break; + } + } + } + root_states[ptn / 8] = states_dad; + } } } else if (aln->num_states == 20 && aln->seq_type == SEQ_PROTEIN) { // ULTRAFAST VERSION FOR PROTEIN From f8f71a0980a4262c7bb9af02ff61f850d25d117c Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sat, 28 Jun 2025 11:16:25 +0700 Subject: [PATCH 18/23] stop adding new sequences to alignment --- placement.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/placement.cpp b/placement.cpp index b6d107aa..c0ae0de8 100644 --- a/placement.cpp +++ b/placement.cpp @@ -131,9 +131,6 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) { tree->computeExcessMutations(input); tree->addNewSample(input.best_node, input.best_node_branch, excess_mutations, i, alignment->missing_seq_names[i]); } - - alignment->addToAlignmentNewSequences(alignment->missing_seq_names, alignment->missing_sequences); - cout << "\n========== Finished placement core ==========\n"; cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - start_time) << " seconds\n"; cout << "Memory: " << getMemory() << " KB\n"; From dbb4abf03f1d9039de6c03f44dbf000eeea73abe Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sat, 28 Jun 2025 12:38:02 +0700 Subject: [PATCH 19/23] remove compute parsimony by fitch algo --- placement.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/placement.cpp b/placement.cpp index c0ae0de8..54383931 100644 --- a/placement.cpp +++ 
b/placement.cpp @@ -134,10 +134,7 @@ void placeNewSamplesOntoExistingTree(Params ¶ms) { cout << "\n========== Finished placement core ==========\n"; cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - start_time) << " seconds\n"; cout << "Memory: " << getMemory() << " KB\n"; - cout << "New tree's parsimony score computed by mutation: " << tree->computeParsimonyScoreMutation() << '\n'; - tree->deleteAllPartialLh(); - cout << "New tree's parsimony score computed by fitch: " << tree->computeParsimony() << '\n'; delete alignment; alignment = NULL; From 7f067d510b2ebb52e5306a94b5cadb8c6d3aea02 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Fri, 4 Jul 2025 21:18:31 +0700 Subject: [PATCH 20/23] fix: detect seq type --- alignment.cpp | 6 +++--- alignment.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 0d7974be..707ab622 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1130,7 +1130,7 @@ int getMaxObservedStates(StrVector &sequences) { return 0; } -int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite) { +int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType seq_type) { int seq_id; ostringstream err_str; codon_table = NULL; @@ -1171,7 +1171,7 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, throw err_str.str(); /* now check data type */ - seq_type = detectSequenceType(sequences); + if (seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); switch (seq_type) { case SEQ_BINARY: num_states = 2; @@ -1531,7 +1531,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe // If not enough columns, rebuild pattern and return if (num_processed_column < num_column) { - buildPattern(sequences, sequence_type, nseq, nsite); + buildPattern(sequences, sequence_type, nseq, nsite, SEQ_DNA); initial_column_state.assign(nsite, ""); for (int seq = 0; seq < 
nseq; ++seq) { for (int site = 0; site < nsite; ++site) diff --git a/alignment.h b/alignment.h index 2538e568..28c48278 100644 --- a/alignment.h +++ b/alignment.h @@ -92,7 +92,7 @@ class Alignment : public vector { */ int readNexus(char *filename); - int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite); + int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType seq_type = SEQ_UNKNOWN); /** read the alignment in PHYLIP format From 1008ad8dddf4d806e4fd467e00c3af60f8f67bd2 Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sun, 13 Jul 2025 09:29:37 +0700 Subject: [PATCH 21/23] fix: duplicate variable name --- alignment.cpp | 6 ++++-- alignment.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 707ab622..13b1d791 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1130,7 +1130,7 @@ int getMaxObservedStates(StrVector &sequences) { return 0; } -int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType seq_type) { +int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType _seq_type) { int seq_id; ostringstream err_str; codon_table = NULL; @@ -1171,7 +1171,9 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, throw err_str.str(); /* now check data type */ - if (seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); + if (_seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); + else seq_type = _seq_type; + switch (seq_type) { case SEQ_BINARY: num_states = 2; diff --git a/alignment.h b/alignment.h index 28c48278..8e4082f1 100644 --- a/alignment.h +++ b/alignment.h @@ -92,7 +92,7 @@ class Alignment : public vector { */ int readNexus(char *filename); - int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType seq_type = SEQ_UNKNOWN); + int buildPattern(StrVector &sequences, char *sequence_type, 
int nseq, int nsite, SeqType _seq_type = SEQ_UNKNOWN); /** read the alignment in PHYLIP format From 68dba246e91f735693d1920905042639bb94842e Mon Sep 17 00:00:00 2001 From: trungnotchung Date: Sun, 13 Jul 2025 20:53:13 +0700 Subject: [PATCH 22/23] fix: remove default parameter --- alignment.cpp | 7 +++---- alignment.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 13b1d791..216fc5cc 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -1130,7 +1130,7 @@ int getMaxObservedStates(StrVector &sequences) { return 0; } -int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType _seq_type) { +int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite) { int seq_id; ostringstream err_str; codon_table = NULL; @@ -1171,8 +1171,7 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, throw err_str.str(); /* now check data type */ - if (_seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); - else seq_type = _seq_type; + if (seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); switch (seq_type) { case SEQ_BINARY: @@ -1533,7 +1532,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &pe // If not enough columns, rebuild pattern and return if (num_processed_column < num_column) { - buildPattern(sequences, sequence_type, nseq, nsite, SEQ_DNA); + buildPattern(sequences, sequence_type, nseq, nsite); initial_column_state.assign(nsite, ""); for (int seq = 0; seq < nseq; ++seq) { for (int site = 0; site < nsite; ++site) diff --git a/alignment.h b/alignment.h index 8e4082f1..2538e568 100644 --- a/alignment.h +++ b/alignment.h @@ -92,7 +92,7 @@ class Alignment : public vector { */ int readNexus(char *filename); - int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite, SeqType _seq_type = SEQ_UNKNOWN); + int buildPattern(StrVector &sequences, char 
*sequence_type, int nseq, int nsite); /** read the alignment in PHYLIP format From 8d88af17e50a92d0ae34ea5223e18ba883d60b18 Mon Sep 17 00:00:00 2001 From: trungnotchung <0xtrungnotchung@gmail.com> Date: Sat, 1 Nov 2025 22:38:07 +0700 Subject: [PATCH 23/23] fix: overflow when multiplying big integers --- phylotree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylotree.cpp b/phylotree.cpp index 0a81af31..292aacc1 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -381,7 +381,7 @@ void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode node = (PhyloNode*) root; // allocate the big central partial pars memory if (!central_partial_pars) { - int memsize = (aln->getNSeq() - 1) * 4 * pars_block_size; + size_t memsize = 1LL * (aln->getNSeq() - 1) * 4 * pars_block_size; if (verbose_mode >= VB_MED) cout << "Allocating " << memsize * sizeof(UINT) << " bytes for partial parsimony vectors" << endl; central_partial_pars = new UINT[memsize];