diff --git a/.gitignore b/.gitignore index 3751d5ea..ae1bbbdd 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ pll/libpll.a pll_test/ workspace.code-workspace .vscode +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt index ebb6506f..fdab61cc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -400,6 +400,8 @@ checkpoint.cpp parstree.cpp sprparsimony.cpp test.cpp +mutation.cpp +placement.cpp ) ################################################################## diff --git a/alignment.cpp b/alignment.cpp index 8b1d32bb..216fc5cc 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) { } } -Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : vector() { +Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence) : vector() { num_states = 0; frac_const_sites = 0.0; codon_table = NULL; @@ -420,6 +420,9 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : v } else if (intype == IN_PHYLIP) { cout << "Phylip format detected" << endl; readPhylip(filename, sequence_type); + } else if (intype == IN_VCF) { + cout << "VCF format detected" << endl; + readVCF(filename, sequence_type, existing_sequence); } else { outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format"); } @@ -992,6 +995,53 @@ char Alignment::convertStateBack(char state) { } } +char Alignment::getMutationFromState(char state) { + int value = convertState(state, SEQ_DNA); + switch (value) { + case 0: + return 1; + case 1: + return 2; + case 2: + return 4; + case 3: + return 8; + case 1 + 4 + 3: + return 1 + 4; + case 2 + 8 + 3: + return 2 + 8; + case 1 + 8 + 3: + return 1 + 8; + case 2 + 4 + 3: + return 2 + 4; + case 1 + 2 + 3: + return 1 + 2; + case 4 + 8 + 3: + return 4 + 8; + case 2 + 4 + 8 + 3: + return 2 + 4 + 8; + case 1 + 2 + 8 + 3: + return 1 + 2 + 8; + case 1 + 4 + 8 + 3: + return 1 + 4 + 8; + case 1 + 2 + 4 + 3: + return 1 + 2 + 4; + + default: + return 15; + break; + } +} + +int Alignment::getStateFromMutation(int nuc) { + int value; + if ((nuc & (nuc - 1)) == 0) + value = log2(nuc); + else + value = nuc + 3; + return convertStateBack(value); +} + string Alignment::convertStateBackStr(char state) { string str; if (seq_type != SEQ_CODON) { @@ -1009,8 +1059,8 @@ string Alignment::convertStateBackStr(char state) { } void Alignment::convertStateStr(string &str, SeqType seq_type) { - for (string::iterator it = str.begin(); it != str.end(); it++) - (*it) = convertState(*it, seq_type); + for (string::iterator it = str.begin(); it != str.end(); it++) + (*it) = convertState(*it, seq_type); } void Alignment::initCodon(char *sequence_type) { @@ -1121,7 +1171,8 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, throw err_str.str(); /* now check data type */ - seq_type = detectSequenceType(sequences); + if (seq_type == SEQ_UNKNOWN) seq_type = detectSequenceType(sequences); + switch (seq_type) { case SEQ_BINARY: num_states = 2; @@ -1235,8 +1286,374 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, return 1; } -int Alignment::readPhylip(char *filename, char *sequence_type) { +void split(const string &s, vector &elems, const string &delim) { + elems.clear(); + size_t pos = 0; + size_t len = s.length(); + size_t delim_len = delim.length(); + if (delim_len == 0) { + elems.push_back(s); + return; + } + while (pos < len) { + size_t find_pos = s.find(delim, pos); + if (find_pos == string::npos) { + elems.push_back(s.substr(pos)); + return; + } + elems.push_back(s.substr(pos, find_pos - pos)); + pos = find_pos + delim_len; + } +} + +// Find the permutation of columns after rotation +vector Alignment::findRotatedColumnPermutation() { + assert(getNSite() == (int)initial_column_state.size()); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + + vector perm(getNSite(), 0); + map> pattern_map; + // Build pattern map + for (int i = 0; i < getNSite(); ++i) { + Pattern pattern = getPattern(i); + pattern_map[pattern].push_back(i); + } + for (int col = 0; col < getNSite(); ++col) { + // For each column, build a pattern + // Find initial index of the pattern + Pattern pattern; + for (int i = 0; i < initial_column_state[col].length(); ++i) { + pattern += char_to_state[(int)initial_column_state[col][i]]; + } + perm[pattern_map[pattern].back()] = col; + pattern_map[pattern].pop_back(); + } + return perm; +} + +void Alignment::addToAlignmentNewSequence(const string &new_name, const string &new_seq) { + assert(new_seq.size() == getNSite()); + char char_to_state[NUM_CHAR]; + computeUnknownState(); + + buildStateMap(char_to_state, seq_type); + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + vector perm_col = findRotatedColumnPermutation(); + + for (int i = 0; i < getNSite(); ++i) { + Pattern new_pattern = getPattern(i); + new_pattern.push_back(char_to_state[(int)new_seq[perm_col[i]]]); + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { + push_back(*it); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + seq_names.push_back(new_name); + buildSeqStates(); + countConstSite(); +} + +void Alignment::addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_sequences) { + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + int nseq = new_sequences.size(); + vector perm_col = findRotatedColumnPermutation(); + + for (int site = 0; site < getNSite(); ++site) { + Pattern new_pattern = getPattern(site); + for (int seq = 0; seq < nseq; ++seq) { + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); + } + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator it = new_patterns.begin(); it != new_patterns.end(); ++it) { + push_back(*it); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + seq_names.insert(seq_names.end(), new_seq_names.begin(), new_seq_names.end()); + buildSeqStates(); + countConstSite(); +} + +void Alignment::updateAlignmentNewSequences(const vector &new_sequences, const vector &perm_col) { + computeUnknownState(); + char char_to_state[NUM_CHAR]; + buildStateMap(char_to_state, seq_type); + vector new_patterns; + PatternIntMap new_pattern_index; + vector new_site_patterns; + int nseq = new_sequences.size(); + int nsite = getNSite(); + + for (int site = 0; site < nsite; ++site) { + Pattern new_pattern; + for (int seq = 0; seq < nseq; ++seq) { + new_pattern.push_back(char_to_state[(int)new_sequences[seq][perm_col[site]]]); + } + PatternIntMap::iterator pat_it = new_pattern_index.find(new_pattern); + if (pat_it == new_pattern_index.end()) { + // If pattern not found, add new pattern + new_pattern.frequency = 1; + new_pattern.computeConst(STATE_UNKNOWN); + new_patterns.push_back(new_pattern); + new_pattern_index[new_pattern] = new_patterns.size() - 1; + new_site_patterns.push_back(new_patterns.size() - 1); + } + else { + // If pattern found, increment frequency + int index = pat_it->second; + new_patterns[index].frequency++; + new_site_patterns.push_back(index); + } + } + clear(); + for (vector::iterator itr = new_patterns.begin(); itr != new_patterns.end(); ++itr) { + push_back(*itr); + } + pattern_index = new_pattern_index; + site_pattern = new_site_patterns; + buildSeqStates(); + countConstSite(); +} + +// Read partial VCF file and update alignment +int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, int existing_sequence, int start_index, int num_column) { + if (in.eof()) { + return 0; + } + StrVector sequences; + int nseq = getNSeq(); + int nsite = 0; + int seq_id = 0; + string line; + int num_processed_column = 0; + + sequences.resize(nseq, ""); + existing_sample_mutations.assign(nseq, vector()); + + for (; !in.eof() && num_processed_column < num_column;) { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words.size() != 9 + nseq + missing_sample_mutations.size()) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation mutation; + int variant_pos = std::stoi(words[1]); + mutation.position = variant_pos; + mutation.compressed_position = num_processed_column + start_index; + while ((int)reference_nuc.size() <= mutation.compressed_position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + mutation.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; + for (int i = 9; i < words.size(); ++i) { + mutation.is_missing = false; + if (isdigit(words[i][0])) { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back(allele[0]); + } + mutation.mut_nuc = getMutationFromState(allele[0]); + } + else { + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back(words[3][0]); + } + mutation.mut_nuc = getMutationFromState(words[3][0]); + } + } + else { + if (i - 9 < existing_sequence) { + sequences[i - 9].push_back('-'); + } + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; + } + if (i - 9 >= existing_sequence) { + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation); + } + } + else { + existing_sample_mutations[i - 9].push_back(mutation); + } + } + ++nsite; + ++num_processed_column; + } + + // If not enough columns, rebuild pattern and return + if (num_processed_column < num_column) { + buildPattern(sequences, sequence_type, nseq, nsite); + initial_column_state.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) { + for (int site = 0; site < nsite; ++site) + initial_column_state[site] += sequences[seq][site]; + } + perm_col = findRotatedColumnPermutation(); + return num_processed_column; + } + + // Update alignment with new sequences + updateAlignmentNewSequences(sequences, perm_col); + return num_processed_column; +} + +int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) { + StrVector sequences; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(filename); + int nseq = 0; + int nsite = 0; + int seq_id = 0; + int num_missing_sequence = 0; + string line; + in.exceptions(ios::badbit); + int num_processed_column = 0; + + for (; !in.eof();) { + getline(in, line); + if (line == "") + continue; + vector words; + split(line, words, "\t"); + if (words.size() == 1) + continue; + if (words[1] == "POS") { + // Sample names start from the 10th word in the header + for (int i = 9; i < words.size(); i++) { + if (i - 9 >= existing_sequence) { + missing_seq_names.push_back(words[i]); + num_missing_sequence++; + } + else { + seq_names.push_back(words[i]); + nseq++; + } + } + sequences.resize(nseq, ""); + missing_sequences.resize(num_missing_sequence, ""); + existing_sample_mutations.resize(nseq); + missing_sample_mutations.resize(num_missing_sequence); + } + else { + if (words.size() != 9 + nseq + num_missing_sequence) + throw "Number of columns in VCF file is not consistent"; + vector alleles; + Mutation mutation; + int variant_pos = std::stoi(words[1]); + mutation.position = variant_pos; + mutation.compressed_position = num_processed_column; + while ((int)reference_nuc.size() <= mutation.compressed_position) + reference_nuc.push_back(0); + split(words[4], alleles, ","); + mutation.ref_nuc = getMutationFromState(words[3][0]); + if (reference_nuc[mutation.compressed_position] == 0) + reference_nuc[mutation.compressed_position] = mutation.ref_nuc; + for (int i = 9; i < words.size(); ++i) { + mutation.is_missing = false; + if (isdigit(words[i][0])) { + int allele_id = std::stoi(words[i]); + if (allele_id > 0) { + std::string allele = alleles[allele_id - 1]; + if (i - 9 < existing_sequence) + sequences[i - 9].push_back(allele[0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(allele[0]); + + mutation.mut_nuc = getMutationFromState(allele[0]); + } + else { + if (i - 9 < existing_sequence) + sequences[i - 9].push_back(words[3][0]); + else + missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]); + + mutation.mut_nuc = getMutationFromState(words[3][0]); + } + } + else { + if (i - 9 < existing_sequence) + sequences[i - 9].push_back('-'); + else + missing_sequences[i - 9 - existing_sequence].push_back('-'); + mutation.mut_nuc = getMutationFromState('N'); + mutation.is_missing = true; + } + if (i - 9 >= existing_sequence) { + if (mutation.mut_nuc != mutation.ref_nuc) { + mutation.par_nuc = mutation.ref_nuc; + missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation); + } + } + else + existing_sample_mutations[i - 9].push_back(mutation); + } + ++nsite; + ++num_processed_column; + } + } + initial_column_state.assign(nsite, ""); + for (int seq = 0; seq < nseq; ++seq) { + for (int site = 0; site < nsite; ++site) + initial_column_state[site] += sequences[seq][site]; + } + in.clear(); + in.exceptions(ios::failbit | ios::badbit); + in.close(); + return buildPattern(sequences, sequence_type, nseq, nsite); +} + +int Alignment::readPhylip(char *filename, char *sequence_type) { StrVector sequences; ostringstream err_str; ifstream in; diff --git a/alignment.h b/alignment.h index 74f43ad7..2538e568 100644 --- a/alignment.h +++ b/alignment.h @@ -16,6 +16,7 @@ #include "pattern.h" #include "ncl/ncl.h" #include "tools.h" +#include "mutation.h" // IMPORTANT: refactor STATE_UNKNOWN //const char STATE_UNKNOWN = 126; @@ -58,7 +59,7 @@ class Alignment : public vector { @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL @param intype (OUT) input format of the file */ - Alignment(char *filename, char *sequence_type, InputType &intype); + Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX); /** destructor @@ -609,6 +610,126 @@ class Alignment : public vector { int n_informative_patterns; int n_informative_sites; + /** + * Names of sequences that are missing from the current alignment but present in the VCF file. + * These sequences will be processed separately during VCF file reading. + */ + vector missing_seq_names; + + /** + * Actual sequence data for the missing sequences. + * Each string represents a complete sequence for one missing sample. + */ + vector missing_sequences; + + /** + * Initial state of each column in the alignment before rotation + * Used for finding rotated column permutations to optimize memory usage and processing. + */ + vector initial_column_state; + + /** + * Mutations found in missing sequences (sequences not in the current alignment). + * Each inner vector contains mutations for one missing sequence. + * Used to track mutations that need to be processed separately. + */ + vector> missing_sample_mutations; + + /** + * Mutations found in existing sequences (sequences already in the alignment). + * Each inner vector contains mutations for one existing sequence. + * Used to track mutations that have already been incorporated into the alignment. + */ + vector> existing_sample_mutations; + + /** + * Reference nucleotides for each position in the alignment. + * Used to track the original nucleotide at each position before mutations. + */ + vector reference_nuc; + + /** + * Replaces the current alignment with new sequences. + * @param new_seqs Vector of new sequences to replace the current alignment + * @param perm_col Vector of column permutations to apply to the new sequences + * + * This function is used when updating the alignment with new sequence data, + * typically after processing a batch of VCF data. + */ + void updateAlignmentNewSequences(const vector &new_seqs, const vector &perm_col); + + /** + * Adds a single new sequence to the current alignment. + * @param new_seq_name Name of the new sequence to add + * @param new_seq The actual sequence data to add + * + * The sequence must be the same length as existing sequences in the alignment. + * This function handles the conversion of sequence characters to internal states + * and updates the pattern information accordingly. + */ + void addToAlignmentNewSequence(const string &new_seq_name, const string &new_seq); + + /** + * Adds multiple new sequences to the current alignment. + * @param new_seq_names Vector of names for the new sequences + * @param new_seqs Vector of sequence data for the new sequences + * + * All sequences must be the same length as existing sequences in the alignment. + * This function efficiently processes multiple sequences at once by updating + * patterns and site information in a single pass. + */ + void addToAlignmentNewSequences(const vector &new_seq_names, const vector &new_seqs); + + /** + * Converts an internal state code to its corresponding mutation character. + * @param state Internal state code to convert + * @return Character representing the mutation (e.g., 'A', 'C', 'G', 'T' for DNA) + */ + char getMutationFromState(char state); + + /** + * Converts a mutation character to its corresponding internal state code. + * @param nuc Nucleotide character to convert + * @return Internal state code for the nucleotide + */ + int getStateFromMutation(int nuc); + + /** + * Finds the optimal column permutation. + * @return Vector of integers representing the origin order + */ + vector findRotatedColumnPermutation(); + + /** + * Reads a portion of a VCF file to process it in batches. + * @param in Input file stream for the VCF file + * @param sequence_type Type of sequence data (e.g., "DNA", "PROTEIN") + * @param perm_col Vector to store column permutations + * @param existing_sequence Number of sequences already in the alignment + * @param start_index Starting position in the alignment + * @param num_column Number of columns to process in this batch + * @return Number of columns actually processed + * + * This function is used to process large VCF files in chunks to reduce memory usage. + * It updates the alignment with new sequence data and mutation information. + */ + int readPartialVCF(ifstream &in, char *sequence_type, vector &perm_col, + int existing_sequence, int start_index, int num_column); + + /** + * Reads and processes a complete VCF file. + * @param file_name Path to the VCF file + * @param sequence_type Type of sequence data (e.g., "DNA", "PROTEIN") + * @param existing_sequence Number of sequences already in the alignment + * @return Number of sites processed + * + * This function reads a VCF file and builds an alignment from it. It handles: + * - Reading sequence names and data + * - Processing mutations and reference nucleotides + * - Building patterns and updating the alignment + * - Tracking mutations for both existing and missing sequences + */ + int readVCF(char *file_name, char *sequence_type, int existing_sequence); protected: diff --git a/iqtree.cpp b/iqtree.cpp index bad3c8ff..e4349e34 100644 --- a/iqtree.cpp +++ b/iqtree.cpp @@ -4538,3 +4538,124 @@ void IQTree::reinsertIdenticalSeqs(Alignment *orig_aln, StrVector &removed_seqs, deleteAllPartialLh(); clearAllPartialLH(); } + +void IQTree::getLeavesName(vector &leaves_name) { + getLeavesName(root, root->neighbors[0]->node, leaves_name); + getLeavesName(root->neighbors[0]->node, root, leaves_name); +} + +void IQTree::getLeavesName(Node *node, Node *dad, vector &leaves_name) { + if (node->isLeaf()) { + leaves_name.push_back(node->name); + return; + } + FOR_NEIGHBOR_IT(node, dad, it) { + getLeavesName((*it)->node, node, leaves_name); + if (node->name == "") { + node->name = (*it)->node->name; + } + else { + node->name = min(node->name, (*it)->node->name); + } + } +} + +void IQTree::assignRoot(string &root_name) +{ + if (root->name == root_name) + return; + assignRoot(root->neighbors[0]->node, root, root_name); +} + +bool IQTree::assignRoot(Node *node, Node *dad, string &root_name) +{ + if (node->isLeaf() && node->name == root_name) { + root = node; + return true; + } + FOR_NEIGHBOR_IT(node, dad, it) { + if (assignRoot((*it)->node, node, root_name)) { + return true; + } + } +} + +int IQTree::initNodeData(vector &leaves_name) { + PhyloNode *node1 = (PhyloNode *)root; + PhyloNode *node2 = (PhyloNode *)root->neighbors[0]->node; + + int left_child_num_missing_sample = initInfoNode(node1, node2, leaves_name); + int right_child_num_missing_sample = initInfoNode(node2, node1, leaves_name); + return left_child_num_missing_sample + right_child_num_missing_sample; +} + +int IQTree::initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leaves_name) { + if (node->isLeaf()) { + int node_index = lower_bound(leaves_name.begin(), leaves_name.end(), node->name) - leaves_name.begin(); + if (node_index < leaves_name.size() && leaves_name[node_index] == node->name) { + node->setMissingNode(-1); + return 1; + } + else { + node->setMissingNode(1); + return 0; + } + } + + int total_missing_sample = 0; + bool check = true; + FOR_NEIGHBOR_IT(node, dad, it) { + int num_missing_sample = initInfoNode((PhyloNode *)(*it)->node, node, leaves_name); + if (num_missing_sample == 0) { + check = false; + } + else { + if (node->name == "") { + node->name = (*it)->node->name; + } + else { + node->name = min(node->name, (*it)->node->name); + } + } + total_missing_sample += num_missing_sample; + } + + if (check) { + node->setMissingNode(-1); + } + else { + node->setMissingNode(1); + } + return total_missing_sample; +} + +bool IQTree::compareTree(IQTree *anotherTree) { + if (root->name != anotherTree->root->name) + return false; + return compareTree((PhyloNode *)root, NULL, anotherTree->root, NULL); +} + +bool IQTree::compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2) { + bool check = true; + FOR_NEIGHBOR_IT(node1, dad1, it1) { + PhyloNode *child1 = (PhyloNode *)(*it1)->node; + if (!child1->checkMissingNode()) { + bool found = false; + FOR_NEIGHBOR_IT(node2, dad2, it2) { + Node *child2 = (*it2)->node; + if (child1->name == child2->name) { + found = true; + check &= compareTree(child1, node1, child2, node2); + break; + } + } + if (!found) { + return false; + } + } + else { + check &= compareTree(child1, node1, node2, dad2); + } + } + return check; +} diff --git a/iqtree.h b/iqtree.h index 3ff0ad90..b64c1927 100644 --- a/iqtree.h +++ b/iqtree.h @@ -664,6 +664,80 @@ class IQTree : public PhyloTree { int k_represent; public: + /** + * Retrieves all leaf node names from the tree and stores them in a vector. + * + * @param leaves_name [out] Vector to store the leaf names. Will be populated with + * all leaf node names from the tree. + */ + void getLeavesName(vector &leaves_name); + + /** + * Retrieves all leaf node names from a subtree rooted at the specified node. + * + * @param node [in] The root node of the subtree to traverse + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param leaves_name [out] Vector to store the leaf names. Will be populated with + * all leaf node names from the subtree. + */ + void getLeavesName(Node *node, Node *dad, vector& leaves_name); + + /** + * Assigns the root of the tree to the node with the specified name. + * + * @param root_name [in] The name of the node that should become the root + * @throws std::runtime_error if no node with the given name is found + */ + void assignRoot(string &root_name); + + /** + * Attempts to assign the root of a subtree to the node with the specified name. + * + * @param node [in] The current node being considered + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param root_name [in] The name of the node that should become the root + * @return true if the root was successfully assigned, false otherwise + */ + bool assignRoot(Node *node, Node *dad, string &root_name); + + /** + * Initializes node data by marking which nodes are original vs added nodes. + * This is done by comparing against a list of known leaf names. + * + * @param leaves_name [in] Vector containing the names of original leaf nodes + * @return The number of nodes that were successfully initialized + */ + int initNodeData(vector &leaves_name); + + /** + * Recursively initializes node data for a subtree by marking which nodes are + * original vs added nodes. This is done by comparing against a list of known leaf names. + * + * @param node [in] The current node being considered + * @param dad [in] The parent node of 'node', used to direct the traversal + * @param leaves_name [in] Vector containing the names of original leaf nodes + * @return The number of nodes that were successfully initialized in this subtree + */ + int initInfoNode(PhyloNode *node, PhyloNode *dad, vector &leaves_name); + + /** + * Compares the current tree with another tree to check if they have the same topology. + * + * @param another_tree [in] Pointer to the tree to compare against + * @return true if the trees have identical topology, false otherwise + */ + bool compareTree(IQTree *another_tree); + + /** + * Recursively compares two subtrees to check if they have the same topology. + * + * @param node1 [in] Root node of the first subtree + * @param dad1 [in] Parent node of node1, used to direct the traversal + * @param node2 [in] Root node of the second subtree + * @param dad2 [in] Parent node of node2, used to direct the traversal + * @return true if the subtrees have identical topology, false otherwise + */ + bool compareTree(PhyloNode *node1, PhyloNode *dad1, Node *node2, Node *dad2); /** * @brief: optimize model parameters on the current tree diff --git a/mutation.cpp b/mutation.cpp new file mode 100644 index 00000000..ea438c8f --- /dev/null +++ b/mutation.cpp @@ -0,0 +1,66 @@ +#include "mutation.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +// Uses one-hot encoding if base is unambiguous +// A:1,C:2,G:4,T:8 + +// Convert nuc_id back to IUPAC base +char get_nuc(int8_t nuc_id) { + char ret = 'N'; + //assert ((nuc_id >= 1) && (nuc_id <= 15)); + switch (nuc_id) { + case 1: + ret = 'A'; + break; + case 2: + ret = 'C'; + break; + case 3: + ret = 'M'; + break; + case 4: + ret = 'G'; + break; + case 5: + ret = 'R'; + break; + case 6: + ret = 'S'; + break; + case 7: + ret = 'V'; + break; + case 8: + ret = 'T'; + break; + case 9: + ret = 'W'; + break; + case 10: + ret = 'Y'; + break; + case 11: + ret = 'H'; + break; + case 12: + ret = 'K'; + break; + case 13: + ret = 'D'; + break; + case 14: + ret = 'B'; + break; + default: + ret = 'N'; + break; + } + return ret; +} \ No newline at end of file diff --git a/mutation.h b/mutation.h new file mode 100644 index 00000000..ed1a09cc --- /dev/null +++ b/mutation.h @@ -0,0 +1,58 @@ +#ifndef _MUTATION +#define _MUTATION +#include +#include +#include +#include +#include +#include +#include +#include +#include +char get_nuc(int8_t nuc_id); + +struct Mutation { + int position; + int compressed_position; + char ref_nuc; + char par_nuc; + char mut_nuc; + bool is_missing; + + Mutation() { + is_missing = false; + } + + inline bool operator < (const Mutation &m) const { + return ((*this).position < m.position); + } + + inline Mutation copy() const { + Mutation m; + m.position = position; + m.ref_nuc = ref_nuc; + m.par_nuc = par_nuc; + m.mut_nuc = mut_nuc; + m.is_missing = is_missing; + m.compressed_position = compressed_position; + return m; + } + + inline bool is_masked() const { + return (position < 0); + } + + inline std::string get_string() const { + if (is_masked()) { + return "MASKED"; + } + else { + return get_nuc(par_nuc) + std::to_string(position) + get_nuc(mut_nuc); + } + } + + inline bool has_nuc(int nuc) { + return ((1 << nuc) & mut_nuc) != 0; + } +}; +#endif \ No newline at end of file diff --git a/pda.cpp b/pda.cpp index a1f9f3f2..ebded5d1 100644 --- a/pda.cpp +++ b/pda.cpp @@ -67,6 +67,7 @@ #include #include "sprparsimony.h" #include "vectorclass/vectorclass.h" +#include "placement.h" #ifdef _OPENMP #include @@ -2322,7 +2323,9 @@ int main(int argc, char *argv[]) cout.setf(ios::fixed); // call the main function - if (params.tree_gen != NONE) { + if (params.ppon) { + placeNewSamplesOntoExistingTree(params); + } else if (params.tree_gen != NONE) { generateRandomTree(params); } else if (params.do_pars_multistate) { // cout << "Starting the test for computing concensus NOT from file:" << endl; diff --git a/phylonode.cpp b/phylonode.cpp index 3d87686a..1f247034 100644 --- a/phylonode.cpp +++ b/phylonode.cpp @@ -19,6 +19,9 @@ void PhyloNeighbor::clearForwardPartialLh(Node *dad) { ((PhyloNeighbor*)*it)->clearForwardPartialLh(node); } +void PhyloNeighbor::clearMutations() { + mutations.clear(); +} void PhyloNode::clearReversePartialLh(PhyloNode *dad) { PhyloNeighbor *node_nei = (PhyloNeighbor*)findNeighbor(dad); @@ -62,10 +65,22 @@ PhyloNode::PhyloNode(int aid, const char *aname) : Node(aid, aname) { } void PhyloNode::init() { - //partial_lh = NULL; + missingIndex = -1; } void PhyloNode::addNeighbor(Node *node, double length, int id) { neighbors.push_back(new PhyloNeighbor(node, length, id)); } + +void PhyloNode::setMissingNode(int index) { + missingIndex = index; +} + +bool PhyloNode::checkMissingNode() { + return missingIndex != -1; +} + +int PhyloNode::getMissingIndex() { + return missingIndex; +} \ No newline at end of file diff --git a/phylonode.h b/phylonode.h index ef77b22e..d708b54b 100644 --- a/phylonode.h +++ b/phylonode.h @@ -13,6 +13,7 @@ #define PHYLONODE_H #include "node.h" +#include "mutation.h" typedef short int UBYTE; @@ -21,7 +22,8 @@ A neighbor in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNeighbor : public Neighbor { +class PhyloNeighbor : public Neighbor +{ friend class PhyloNode; friend class PhyloTree; friend class IQTree; @@ -77,8 +79,22 @@ class PhyloNeighbor : public Neighbor { */ void clearForwardPartialLh(Node *dad); -private: + /** + * All mutations on this branch + */ + std::vector mutations; + + /** + * Number of leaves in the subtree rooted at this node + */ + int num_leaves; + + /** + * Clear all mutations on this branch + */ + void clearMutations(); +private: /** true if the partial likelihood was computed */ @@ -103,7 +119,6 @@ class PhyloNeighbor : public Neighbor { vector containing the partial parsimony scores */ UINT *partial_pars; - }; /** @@ -111,7 +126,8 @@ A node in a phylogenetic tree @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler */ -class PhyloNode : public Node { +class PhyloNode : public Node +{ friend class PhyloTree; public: @@ -145,6 +161,12 @@ class PhyloNode : public Node { */ void init(); + void setMissingNode(int index); + + bool checkMissingNode(); + + int getMissingIndex(); + /** add a neighbor @param node the neighbor node @@ -153,8 +175,6 @@ class PhyloNode : public Node { */ virtual void addNeighbor(Node *node, double length, int id = -1); - - /** tell that all partial likelihood vectors below this node are not computed */ @@ -164,13 +184,31 @@ class PhyloNode : public Node { tell that all partial likelihood vectors (in reverse direction) below this node are not computed */ void clearReversePartialLh(PhyloNode *dad); -}; + PhyloNode *dad; + + int missingIndex; +}; /** Node vector */ -typedef vector PhyloNodeVector; +typedef vector PhyloNodeVector; + +class PlacementCandidateNode +{ +public: + PhyloNode *node; + PhyloNeighbor *node_branch; + std::vector *missing_sample_mutations; + std::vector *excess_mutations; + int *best_set_difference; + size_t *best_node_num_leaves; + PhyloNode *best_node; + PhyloNeighbor *best_node_branch; + + PlacementCandidateNode() {} +}; -#endif +#endif \ No newline at end of file diff --git a/phylotree.cpp b/phylotree.cpp index a86ba2b9..292aacc1 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -381,7 +381,7 @@ void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode node = (PhyloNode*) root; // allocate the big central partial pars memory if (!central_partial_pars) { - int memsize = (aln->getNSeq() - 1) * 4 * pars_block_size; + size_t memsize = 1LL * (aln->getNSeq() - 1) * 4 * pars_block_size; if (verbose_mode >= VB_MED) cout << "Allocating " << memsize * sizeof(UINT) << " bytes for partial parsimony vectors" << endl; central_partial_pars = new UINT[memsize]; @@ -1058,34 +1058,42 @@ int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, } } } else if (aln->num_states == 4 && aln->seq_type == SEQ_DNA) { - // ULTRAFAST VERSION FOR DNA - for (ptn = 0; ptn < aln->size(); ptn+=8) { - UINT states_left = node_branch->partial_pars[ptn/8]; - UINT states_right = dad_branch->partial_pars[ptn/8]; - UINT states_dad = 0; - int maxi = aln->size() - ptn; - if(maxi > 8) maxi = 8; - for (i = 0; i< maxi; i++) { - UINT state_left = (states_left >> (i*4)) & 15; - UINT state_right = (states_right >> (i*4)) & 15; + // ULTRAFAST VERSION FOR DNA + for (ptn = 0; ptn < aln->size(); ptn += 8) + { + UINT states_left = node_branch->partial_pars[ptn / 8]; + UINT states_right = dad_branch->partial_pars[ptn / 8]; + UINT states_dad = 0; + int maxi = aln->size() - ptn; + if (maxi > 8) + maxi = 8; + for (i = 0; i < maxi; i++) + { + UINT state_left = (states_left >> (i * 4)) & 15; + UINT state_right = (states_right >> (i * 4)) & 15; UINT state_both = state_left | (state_right << 4); - states_dad |= dna_fitch_result[state_both] << (i*4); - tree_pars += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; + // cout << state_left << " " << states_right << " " << state_right << " " << dna_fitch_result[state_both] << endl; + states_dad |= dna_fitch_result[state_both] << (i * 4); + tree_pars += dna_fitch_step[state_both] * aln->at(ptn + i).frequency; _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn + i] + - dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; + dad_branch->partial_pars[ptn_pars_start_id + ptn + i] + dna_fitch_step[state_both]; + } + if (add_row) { + for (int i = 0; i < maxi; ++i) { + for (int j = 0; j < 4; ++j) { + if (states_dad & (1 << (i * 4 + j))) { + for (int k = j + 1; k < 4; ++k) { + if (states_dad & (1 << (i * 4 + k))) { + states_dad ^= (1 << (i * 4 + k)); + } + } + break; + } + } + } + root_states[ptn / 8] = states_dad; } } -// // the remaining bits -// UINT states_left = node_branch->partial_pars[ptn/8]; -// UINT states_right = dad_branch->partial_pars[ptn/8]; -// int maxi = aln->size() - ptn; -// for (i = 0; i< maxi; i++) { -// UINT state_left = (states_left >> (i*4)) & 15; -// UINT state_right = (states_right >> (i*4)) & 15; -// UINT state_both = state_left | (state_right << 4); -// _pattern_pars[ptn + i] += dna_fitch_step[state_both]; -// tree_pars += dna_fitch_step[state_both] * aln->at(ptn+i).frequency; -// } } else if (aln->num_states == 20 && aln->seq_type == SEQ_PROTEIN) { // ULTRAFAST VERSION FOR PROTEIN UINT state_left[8], state_right[8]; @@ -5121,3 +5129,843 @@ void PhyloTree::printTransMatrices(Node *node, Node *dad) { } FOR_NEIGHBOR_IT(node, dad, it)printTransMatrices((*it)->node, node); } + +/**************************************************************************** + Place new samples onto the tree + ****************************************************************************/ + +void PhyloTree::allocateMutationMemory(int num_column) +{ + current_missing_sample_mutations.resize(num_column); + current_ancestral_mutations.resize(num_column); + visited_missing_sample_mutations.resize(num_column); + visited_ancestral_mutations.resize(num_column); + current_excess_mutations.resize(num_column); + visited_excess_mutations.resize(num_column); +} + +void PhyloTree::computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + if (node->isLeaf() && dad) { + return; + } + + int ptn; + int nptn = aln->size(); + UINT *left = NULL, *right = NULL; + PhyloNeighbor *left_branch, *right_branch; + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + if (!left) + left = ((PhyloNeighbor *)(*it))->partial_pars, left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + else + right = ((PhyloNeighbor *)(*it))->partial_pars, right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + } + + int col = -1; + vector> left_branch_mutations, right_branch_mutations; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = left[ptn / 8]; + UINT right_states = right[ptn / 8]; + UINT dad_states = states_dad[ptn / 8]; + int maxi = aln->size() - ptn; + if (maxi > 8) maxi = 8; + for (int i = 0; i < maxi; i++) { + ++col; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; + char dad_nuc = 0; + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) + if (1 & (dad_state >> dad_nuc)) + break; + + char left_child_nuc; + if ((1 & (left_state >> dad_nuc))) { + left_child_nuc = dad_nuc; + } + else { + for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) + if (1 & (left_state >> left_child_nuc)) + break; + Mutation left_child_mut; + left_child_mut.position = perm_col[col]; + left_child_mut.compressed_position = compressed_perm_col[col]; + left_child_mut.mut_nuc = (1 << left_child_nuc); + left_child_mut.par_nuc = (1 << dad_nuc); + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; + left_branch->mutations.push_back(left_child_mut); + left_branch_mutations.push_back(make_pair(col, left_child_nuc)); + } + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != left_child_nuc && (1 & (left_state >> nuc))) { + left[ptn / 8] ^= (1 << (i * 4 + nuc)); + } + } + + char right_child_nuc; + if ((1 & (right_state >> dad_nuc))) { + right_child_nuc = dad_nuc; + } + else { + for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) + if (1 & (right_state >> right_child_nuc)) + break; + Mutation right_child_mutation; + right_child_mutation.position = perm_col[col]; + right_child_mutation.compressed_position = compressed_perm_col[col]; + right_child_mutation.mut_nuc = (1 << right_child_nuc); + right_child_mutation.par_nuc = (1 << dad_nuc); + right_child_mutation.ref_nuc = aln->reference_nuc[right_child_mutation.compressed_position]; + right_branch->mutations.push_back(right_child_mutation); + right_branch_mutations.push_back(make_pair(col, right_child_nuc)); + } + for (int nuc = 0; nuc < 4; ++nuc) { + if (nuc != right_child_nuc && (1 & (right_state >> nuc))) { + right[ptn / 8] ^= (1 << (i * 4 + nuc)); + } + } + } + } + + bool left_child = true; + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { + computePartialMutation(left, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); + left_child = false; + continue; + } + computePartialMutation(right, perm_col, compressed_perm_col, (PhyloNeighbor *)(*it), (PhyloNode *)node); + } +} + +void PhyloTree::computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + assert(node_branch); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } + + int nptn = aln->size(); + UINT *left_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; + for (int ptn = 0; ptn < aln->size(); ptn += 8) { + left_branch_states_dad[ptn / 8] = 0; + } + UINT *right_branch_states_dad = new UINT[(aln->size() + 7) / 8 + 1]; + for (int ptn = 0; ptn < aln->size(); ptn += 8) { + right_branch_states_dad[ptn / 8] = 0; + } + + int i, ptn, col = -1; + for (ptn = 0; ptn < aln->size(); ptn += 8) { + UINT left_states = node_branch->partial_pars[ptn / 8]; + UINT right_states = dad_branch->partial_pars[ptn / 8]; + UINT dad_states = root_states[ptn / 8]; + int maxi = aln->size() - ptn; + if (maxi > 8) maxi = 8; + for (i = 0; i < maxi; i++) { + ++col; + UINT left_state = (left_states >> (i * 4)) & 15; + UINT right_state = (right_states >> (i * 4)) & 15; + UINT dad_state = (dad_states >> (i * 4)) & 15; + + char dad_nuc = 0; + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) + if (1 & (dad_state >> dad_nuc)) + break; + + char left_child_nuc; + if ((1 & (left_state >> dad_nuc))) { + left_child_nuc = dad_nuc; + } + else { + for (left_child_nuc = 0; left_child_nuc < 4; ++left_child_nuc) + if (1 & (left_state >> left_child_nuc)) + break; + Mutation left_child_mut; + left_child_mut.position = perm_col[col]; + left_child_mut.compressed_position = compressed_perm_col[col]; + left_child_mut.mut_nuc = (1 << left_child_nuc); + left_child_mut.par_nuc = (1 << dad_nuc); + left_child_mut.ref_nuc = aln->reference_nuc[left_child_mut.compressed_position]; + dad_branch->mutations.push_back(left_child_mut); + } + right_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + left_child_nuc)); + + char right_child_nuc; + if ((1 & (right_state >> dad_nuc))) { + right_child_nuc = dad_nuc; + } + else { + for (right_child_nuc = 0; right_child_nuc < 4; ++right_child_nuc) + if (1 & (right_state >> right_child_nuc)) + break; + Mutation right_child_mut; + right_child_mut.position = perm_col[col]; + right_child_mut.compressed_position = compressed_perm_col[col]; + right_child_mut.mut_nuc = (1 << right_child_nuc); + right_child_mut.par_nuc = (1 << dad_nuc); + right_child_mut.ref_nuc = aln->reference_nuc[right_child_mut.compressed_position]; + node_branch->mutations.push_back(right_child_mut); + } + left_branch_states_dad[ptn / 8] ^= (1 << (i * 4 + right_child_nuc)); + } + } + + computePartialMutation(left_branch_states_dad, perm_col, compressed_perm_col, dad_branch, dad); + computePartialMutation(right_branch_states_dad, perm_col, compressed_perm_col, node_branch, node); +} + +void PhyloTree::initMutation(vector &perm_col, vector &compressed_perm_col) { + // Compute parsimony is necessary for tracing back the mutations + computeParsimony(); + computeMutationBranch(perm_col, compressed_perm_col, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + + int ptn = 0, counter = 0; + int nptn = aln->size(); + for (int i = 0; i < nptn; ++i) { + char root_nuc = ((root_states[ptn] >> (i * 4)) & 15); + char ref_nuc = aln->reference_nuc[compressed_perm_col[i]]; + if ((root_nuc & ref_nuc) == 0) { + char dad_nuc = 0; + for (dad_nuc = 0; dad_nuc < 4; ++dad_nuc) { + if (1 & (ref_nuc >> dad_nuc)) + break; + } + + char mut_nuc = 0; + for (mut_nuc = 0; mut_nuc < 4; ++mut_nuc) { + if (1 & (root_nuc >> mut_nuc)) + break; + } + + Mutation mutation; + mutation.position = perm_col[i]; + mutation.compressed_position = compressed_perm_col[i]; + mutation.mut_nuc = (1 << mut_nuc); + mutation.ref_nuc = ref_nuc; + mutation.par_nuc = (1 << dad_nuc); + root_mutations.push_back(mutation); + } + ++counter; + if (counter == 8) { + counter = 0; + ++ptn; + } + } +} + +int PhyloTree::computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad) { + int parsimony_score = 0; + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + parsimony_score += node_branch->mutations.size(); + FOR_NEIGHBOR_IT(node, dad, it) + if ((*it)->node->name != ROOT_NAME) { + parsimony_score += computePartialParsimonyMutation(((PhyloNeighbor *)(*it)), node); + } + return parsimony_score; +} + +int PhyloTree::computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + assert(node_branch); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } + + int parsimony_score = 0; + parsimony_score += computePartialParsimonyMutation(dad_branch, dad); + parsimony_score += computePartialParsimonyMutation(node_branch, node); + return parsimony_score; +} + +int PhyloTree::computeParsimonyScoreMutation() { + int parsimony_score = computeParsimonyBranchMutation((PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + parsimony_score += root_mutations.size(); + return parsimony_score; +} + +void PhyloTree::initNodeDataPlaceNewSample() { + assert(root->isLeaf()); + PhyloNeighbor *nei = ((PhyloNeighbor *)root->neighbors[0]); + current_it = nei; + assert(current_it); + current_it_back = (PhyloNeighbor *)nei->node->findNeighbor(root); + assert(current_it_back); + + vector> bfs; + queue> node_queue; + node_queue.push(make_pair((PhyloNode *)nei->node, current_it_back)); + while (node_queue.size()) { + PhyloNode *node = node_queue.front().first; + PhyloNeighbor *node_branch = node_queue.front().second; + node->dad = (PhyloNode *)node_branch->node; + PhyloNode *dad = (PhyloNode *)node_branch->node; + node_queue.pop(); + bfs.push_back(make_pair(node, node_branch)); + FOR_NEIGHBOR_IT(node, dad, it) { + node_queue.push(make_pair((PhyloNode *)(*it)->node, (PhyloNeighbor *)(*it)->node->findNeighbor(node))); + } + } + + for (int i = bfs.size() - 1; i >= 0; --i) { + PhyloNode *node = bfs[i].first; + PhyloNeighbor *node_branch = bfs[i].second; + PhyloNode *dad = (PhyloNode *)node_branch->node; + node_branch->num_leaves = 0; + if (node->isLeaf()) { + node_branch->num_leaves = 1; + continue; + } + FOR_NEIGHBOR_IT(node, dad, it) { + node_branch->num_leaves += ((PhyloNeighbor *)(*it)->node->findNeighbor(node))->num_leaves; + } + } +} + +void PhyloTree::computeExcessMutations(PlacementCandidateNode &input) { + vector anc_positions; + vector ancestral_mutations; + + timer_regular--; + for (auto mutation : (*input.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_regular; + current_missing_sample_mutations[mutation.compressed_position] = mutation; + } + + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) + break; + assert(((anc_nuc - 1) & anc_nuc) == 0); + bool found = false; + bool found_pos = false; + if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_regular) { + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { + found_pos = true; + if (missing_sample_mutation.is_missing) { + found = true; + } + else { + auto nuc = missing_sample_mutation.mut_nuc; + if ((nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); + found = true; + } + } + } + } + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + (*input.excess_mutations).emplace_back(node_mutation); + + } + } + } + } + for (auto ancestral_mutation : ancestral_mutations) { + visited_ancestral_mutations[ancestral_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[ancestral_mutation.compressed_position] = ancestral_mutation; + } + + { + PhyloNode *n = input.node; + while (n->dad != root) + { + n = n->dad; + PhyloNeighbor *node_branch = (PhyloNeighbor *)n->findNeighbor(n->dad); + for (auto node_mutation : node_branch->mutations) { + if (!node_mutation.is_masked() && visited_ancestral_mutations[node_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(node_mutation); + anc_positions.emplace_back(node_mutation.compressed_position); + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; + } + } + } + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_regular) { + ancestral_mutations.emplace_back(root_mutation); + anc_positions.emplace_back(root_mutation.compressed_position); + visited_ancestral_mutations[root_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; + } + } + } + + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { + continue; + } + + bool found_pos = false; + bool found = false; + bool nuc = false; + auto anc_nuc = missing_sample_mutation.ref_nuc; + + if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { + nuc = true; + } + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { + found_pos = true; + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && (found_pos || !nuc)) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) { + mutation.mut_nuc = (1 << nuc); + break; + } + } + assert((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0); + if (mutation.mut_nuc != mutation.par_nuc) { + input.excess_mutations->emplace_back(mutation); + } + } + } + + for (auto ancestral_mutation : ancestral_mutations) { + bool found = false; + bool found_pos = false; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_regular) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; + found_pos = true; + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && !found_pos && (ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + Mutation mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + assert(mutation.is_masked() || ((mutation.mut_nuc & (mutation.mut_nuc - 1)) == 0)); + if (mutation.mut_nuc != mutation.par_nuc) { + (*input.excess_mutations).emplace_back(mutation); + } + } + } +} + +void PhyloTree::initNewSampleMutations(PlacementCandidateNode &inp) { + ++timer_optimized; + for (auto mutation : (*inp.missing_sample_mutations)) { + visited_missing_sample_mutations[mutation.compressed_position] = timer_optimized; + current_missing_sample_mutations[mutation.compressed_position] = mutation; + } +} + +void PhyloTree::eraseMutation(vector &erased_excess_mutation, Mutation mutation, int &set_difference) { + if (visited_excess_mutations[mutation.compressed_position] == timer_optimized) { + erased_excess_mutation.emplace_back(current_excess_mutations[mutation.compressed_position]); + visited_excess_mutations[mutation.compressed_position] = 0; + --set_difference; + } +} + +void PhyloTree::addMutation(vector &added_excess_mutation, Mutation mutation, int diff, int &set_difference) { + added_excess_mutation.push_back(mutation); + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + current_excess_mutations[mutation.compressed_position] = mutation; + set_difference += diff; +} + +void PhyloTree::optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference) { + vector ancentral_positions; + vector ancestral_mutations; + vector erased_excess_mutation; + vector added_excess_mutation; + vector common_mutations; + vector diff_mutations; + + if (!(input.node == root)) { + for (auto node_mutation : input.node_branch->mutations) { + auto anc_nuc = node_mutation.mut_nuc; + if (node_mutation.is_masked()) { + break; + } + assert(((anc_nuc - 1) & anc_nuc) == 0); + bool found = false; + bool found_pos = false; + if (visited_missing_sample_mutations[node_mutation.compressed_position] == timer_optimized) { + auto missing_sample_mutation = current_missing_sample_mutations[node_mutation.compressed_position]; + if (node_mutation.position == missing_sample_mutation.position) { + found_pos = true; + if (missing_sample_mutation.is_missing) { + found = true; + } + else { + auto sample_nuc = missing_sample_mutation.mut_nuc; + if ((sample_nuc & anc_nuc) != 0) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); + found = true; + } + } + } + } + if (!found) { + if (!found_pos && (anc_nuc == node_mutation.ref_nuc)) { + ancestral_mutations.emplace_back(node_mutation); + ancentral_positions.emplace_back(node_mutation.compressed_position); + eraseMutation(erased_excess_mutation, node_mutation, set_difference); + addMutation(added_excess_mutation, node_mutation, 0, set_difference); + common_mutations.emplace_back(node_mutation); + } + else { + diff_mutations.emplace_back(node_mutation); + } + } + } + } + + if (input.node->dad == root) { + for (auto root_mutation : root_mutations) { + if (!root_mutation.is_masked() && visited_ancestral_mutations[root_mutation.compressed_position] != timer_optimized) { + ancestral_mutations.emplace_back(root_mutation); + ancentral_positions.emplace_back(root_mutation.compressed_position); + visited_ancestral_mutations[root_mutation.compressed_position] = timer_optimized; + current_ancestral_mutations[root_mutation.compressed_position] = root_mutation; + } + } + + for (auto missing_sample_mutation : (*input.missing_sample_mutations)) { + if (missing_sample_mutation.is_missing) { + continue; + } + bool found = false; + bool found_pos = false; + bool has_ref = false; + auto anc_nuc = missing_sample_mutation.ref_nuc; + if ((missing_sample_mutation.mut_nuc & missing_sample_mutation.ref_nuc) != 0) { + has_ref = true; + } + + if (visited_ancestral_mutations[missing_sample_mutation.compressed_position] == timer_optimized) { + auto ancestral_mutation = current_ancestral_mutations[missing_sample_mutation.compressed_position]; + if (!ancestral_mutation.is_masked()) { + found_pos = true; + anc_nuc = ancestral_mutation.mut_nuc; + if ((missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && !has_ref) { + Mutation mutation; + mutation.position = missing_sample_mutation.position; + mutation.compressed_position = missing_sample_mutation.compressed_position; + mutation.ref_nuc = missing_sample_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + for (int nuc = 0; nuc < 4; nuc++) { + if (((1 << nuc) & missing_sample_mutation.mut_nuc) != 0) + { + mutation.mut_nuc = (1 << nuc); + break; + } + } + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } + } + + for (auto ancestral_mutation : ancestral_mutations) { + bool found = false; + bool found_pos = false; + auto anc_nuc = ancestral_mutation.mut_nuc; + if (visited_missing_sample_mutations[ancestral_mutation.compressed_position] == timer_optimized) { + if (!ancestral_mutation.is_masked()) { + auto missing_sample_mutation = current_missing_sample_mutations[ancestral_mutation.compressed_position]; + found_pos = true; + if (missing_sample_mutation.is_missing || (missing_sample_mutation.mut_nuc & anc_nuc) != 0) { + found = true; + } + } + } + if (!found && (found_pos || ancestral_mutation.is_masked() || (anc_nuc != ancestral_mutation.ref_nuc))) { + eraseMutation(erased_excess_mutation, ancestral_mutation, set_difference); + Mutation mutation; + mutation.position = ancestral_mutation.position; + mutation.compressed_position = ancestral_mutation.compressed_position; + mutation.ref_nuc = ancestral_mutation.ref_nuc; + mutation.par_nuc = anc_nuc; + mutation.mut_nuc = ancestral_mutation.ref_nuc; + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } + } + + size_t num_leaves = input.node_branch->num_leaves; + if (set_difference < *input.best_set_difference) { + *input.best_set_difference = set_difference; + *input.best_node_num_leaves = num_leaves; + input.best_node = input.node; + input.best_node_branch = input.node_branch; + } + else if (set_difference == *input.best_set_difference) { + if (((num_leaves >= *input.best_node_num_leaves))) { + *input.best_set_difference = set_difference; + *input.best_node_num_leaves = num_leaves; + input.best_node = input.node; + input.best_node_branch = input.node_branch; + } + } + + for (auto common_mutation : common_mutations) { + visited_excess_mutations[common_mutation.compressed_position] = 0; + } + + for (auto diff_mutation : diff_mutations) { + Mutation mutation; + mutation.ref_nuc = diff_mutation.ref_nuc; + mutation.par_nuc = diff_mutation.mut_nuc; + mutation.mut_nuc = diff_mutation.ref_nuc; + mutation.position = diff_mutation.position; + mutation.compressed_position = diff_mutation.compressed_position; + if (visited_missing_sample_mutations[diff_mutation.compressed_position] == timer_optimized) { + mutation.mut_nuc = current_missing_sample_mutations[diff_mutation.compressed_position].mut_nuc; + } + eraseMutation(erased_excess_mutation, mutation, set_difference); + if (mutation.mut_nuc != mutation.par_nuc) { + addMutation(added_excess_mutation, mutation, 1, set_difference); + } + } + + PhyloNode *node = input.node; + PhyloNode *dad = node->dad; + FOR_NEIGHBOR_IT(node, dad, it) { + PhyloNode *child_node = (PhyloNode *)(*it)->node; + PhyloNeighbor *child_node_branch = (PhyloNeighbor *)child_node->findNeighbor(node); + input.node = child_node; + input.node_branch = child_node_branch; + optimizedFindPositionPlaceNewSample(input, set_difference); + } + + for (auto mutation : added_excess_mutation) { + visited_excess_mutations[mutation.compressed_position] = 0; + } + + for (int i = erased_excess_mutation.size() - 1; i >= 0; i--) { + Mutation mutation = erased_excess_mutation[i]; + visited_excess_mutations[mutation.compressed_position] = timer_optimized; + current_excess_mutations[mutation.compressed_position] = mutation; + } +} + +void PhyloTree::addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, vector node_excess_mutations, int index, string sample_name) { + PhyloNode *new_node = (PhyloNode *)newNode(); + PhyloNode *sample = (PhyloNode *)newNode(aln->getNSeq() + index, sample_name.c_str()); + sample->setMissingNode(index); + new_node->addNeighbor(sample, -1.0); + sample->addNeighbor(new_node, -1.0); + PhyloNode *best_dad = (PhyloNode *)best_node_branch->node; + + vector common_mutations, best_node_mutations, sample_mutations; + vector current_node_mutations; + // Compute current best node branch mutations + for (auto node_mutation : best_node_branch->mutations) { + current_node_mutations.emplace_back(node_mutation); + } + + best_node_branch->clearMutations(); + --timer_regular; + for (auto node_mutation : current_node_mutations) { + visited_ancestral_mutations[node_mutation.compressed_position] = timer_regular; + current_ancestral_mutations[node_mutation.compressed_position] = node_mutation; + } + for (auto excess_mutation : node_excess_mutations) { + visited_excess_mutations[excess_mutation.compressed_position] = timer_regular; + current_excess_mutations[excess_mutation.compressed_position] = excess_mutation; + } + for (auto node_mutation : current_node_mutations) { + bool found = false; + if (!node_mutation.is_masked()) { + if (visited_excess_mutations[node_mutation.compressed_position] == timer_regular) { + auto excess_mutation = current_excess_mutations[node_mutation.compressed_position]; + if (node_mutation.position == excess_mutation.position) { + if (node_mutation.mut_nuc == excess_mutation.mut_nuc) { + found = true; + } + } + } + } + if (!found) { + best_node_mutations.emplace_back(node_mutation); + } + } + // Compute sample mutations + for (auto excess_mutation : node_excess_mutations) { + bool found = false; + if (!excess_mutation.is_masked()) { + if (visited_ancestral_mutations[excess_mutation.compressed_position] == timer_regular) { + auto ancestral_mutation = current_ancestral_mutations[excess_mutation.compressed_position]; + if (excess_mutation.position == ancestral_mutation.position) { + if (excess_mutation.mut_nuc == ancestral_mutation.mut_nuc) { + found = true; + Mutation m = excess_mutation.copy(); + common_mutations.emplace_back(m); + } + } + } + } + if (!found) { + sample_mutations.emplace_back(excess_mutation); + } + } + + new_node->addNeighbor(best_node, -1.0); + new_node->addNeighbor(best_dad, -1.0); + best_node->updateNeighbor(best_dad, new_node, -1.0); + best_dad->updateNeighbor(best_node, new_node, -1.0); + // Add mutations to new node using common_mut + PhyloNeighbor *new_node_branch = (PhyloNeighbor *)new_node->findNeighbor(best_dad); + new_node_branch->mutations = common_mutations; + + PhyloNeighbor *new_best_node_branch = (PhyloNeighbor *)best_node->findNeighbor(new_node); + new_best_node_branch->mutations = best_node_mutations; + + PhyloNeighbor *sample_branch = (PhyloNeighbor *)sample->findNeighbor(new_node); + sample_branch->mutations = sample_mutations; +} + +string PhyloTree::verifyPartialMutationCorrectness(vector &position, PhyloNeighbor *dad_branch, PhyloNode *dad) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + int nsite = aln->getNSite(); + + if (node->isLeaf() && dad) { + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + string sequence = ""; + assert(node->id < aln->getNSeq()); + for (int i = 0; i < nsite; ++i) { + Pattern pattern = aln->getPattern(i); + sequence += aln->convertStateBack(pattern[node->id]); + } + return sequence; + } + else { + string left_sequence, right_sequence; + PhyloNeighbor *left_branch, *right_branch; + bool left_child = true; + FOR_NEIGHBOR_IT(node, dad, it) { + if ((*it)->node->name != ROOT_NAME) { + if (left_child) { + left_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + left_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); + left_child = false; + continue; + } + right_branch = (PhyloNeighbor *)(*it)->node->findNeighbor(node); + right_sequence = verifyPartialMutationCorrectness(position, (PhyloNeighbor *)(*it), (PhyloNode *)node); + } + } + for (auto mutation : left_branch->mutations) { + assert(position[mutation.compressed_position] < (int)left_sequence.length()); + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + for (auto mutation : right_branch->mutations) { + assert(position[mutation.compressed_position] < (int)right_sequence.length()); + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong"; + exit(1); + } + } + } + return left_sequence; + } +} + +void PhyloTree::verifyMutationCorrectnessBranch(vector &position, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) { + PhyloNode *node = (PhyloNode *)dad_branch->node; + PhyloNeighbor *node_branch = (PhyloNeighbor *)node->findNeighbor(dad); + if (node->isLeaf()) { + PhyloNode *tmp_node = dad; + dad = node; + node = tmp_node; + PhyloNeighbor *tmp_nei = dad_branch; + dad_branch = node_branch; + node_branch = tmp_nei; + } + + string left_sequence = verifyPartialMutationCorrectness(position, dad_branch, dad); + string right_sequence = verifyPartialMutationCorrectness(position, node_branch, node); + for (auto mutation : node_branch->mutations) { + left_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + for (auto mutation : dad_branch->mutations) { + right_sequence[position[mutation.compressed_position]] = aln->getStateFromMutation(mutation.par_nuc); + } + if (left_sequence != right_sequence) { + for (int i = 0; i < (int)left_sequence.length(); ++i) { + if (left_sequence[i] != right_sequence[i] && (aln->getMutationFromState(left_sequence[i]) & aln->getMutationFromState(right_sequence[i])) == 0) { + cout << "Compute mutations wrong at root"; + exit(1); + } + } + } +} + +void PhyloTree::verifyMutationCorrectness() { + cout << "========== Start checking mutations ==========\n"; + vector perm_col = aln->findRotatedColumnPermutation(); + int nsite = aln->getNSite(); + assert(perm_col.size() == nsite); + vector sorted_perm_col(perm_col); + sort(sorted_perm_col.begin(), sorted_perm_col.end()); + vector compressed_perm_col; + for (int col : perm_col) { + int idx = lower_bound(sorted_perm_col.begin(), sorted_perm_col.end(), col) - sorted_perm_col.begin(); + compressed_perm_col.push_back(idx); + } + vector position(nsite); + for (int i = 0; i < nsite; ++i) { + position[compressed_perm_col[i]] = i; + } + verifyMutationCorrectnessBranch(position, (PhyloNeighbor *)root->neighbors[0], (PhyloNode *)root); + cout << "Compute mutation correctly\n"; + cout << "========== End checking mutations ==========\n"; +} \ No newline at end of file diff --git a/phylotree.h b/phylotree.h index 58ec5cb0..ceee488c 100644 --- a/phylotree.h +++ b/phylotree.h @@ -279,6 +279,195 @@ class PhyloTree : public MTree, public Optimization { */ virtual ~PhyloTree(); + /** + * Flag indicating whether to add a new row to the tree + * Used in tree modification operations + */ + bool add_row; + + /** + * Array storing the states of the root node + * Used for tracking ancestral states in phylogenetic analysis + */ + UINT *root_states; + + /** + * Vector storing mutations at the root node + * Contains information about mutations that occurred at the root of the tree + */ + vector root_mutations; + + /** + * Allocates memory for mutation data structures + * @param num_column Number of columns in the alignment to allocate memory for + */ + void allocateMutationMemory(int num_column); + + /** + * Initializes mutation data for MAT + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations + */ + void initMutation(vector &perm_col, vector &compressed_perm_col); + + /** + * Computes mutations along a specific branch + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions on branch + */ + void computeMutationBranch(vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + * Computes partial mutations along a branch + * @param states_dad Array of parent node states + * @param perm_col Vector of column permutations + * @param compressed_perm_col Vector of compressed column permutations + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + */ + void computePartialMutation(UINT *states_dad, vector &perm_col, vector &compressed_perm_col, PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + * Computes parsimony score using mutation data + * @return The parsimony score based on mutations + */ + int computeParsimonyScoreMutation(); + + /** + * Computes parsimony score for a specific branch using mutation data + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions on branch + * @return The parsimony score for the branch + */ + int computeParsimonyBranchMutation(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + * Computes partial parsimony score for a branch using mutation data + * @param dad_branch Pointer to the branch being analyzed + * @param dad Pointer to the parent node + * @return The partial parsimony score for the branch + */ + int computePartialParsimonyMutation(PhyloNeighbor *dad_branch, PhyloNode *dad); + + /** + * Initializes node data for placing new samples + * Computes breadth-first expansion of tree vertices + */ + void initNodeDataPlaceNewSample(); + + /** + * Vector storing current excess mutations + */ + vector current_excess_mutations; + + /** + * Vector storing mutations from the missing sample + */ + vector current_missing_sample_mutations; + + /** + * Vector storing ancestral mutations + */ + vector current_ancestral_mutations; + + /** + * Vector storing time of visited missing sample mutations + */ + vector visited_missing_sample_mutations; + + /** + * Vector storing time of visited ancestral mutations + */ + vector visited_ancestral_mutations; + + /** + * Vector storing time of visited excess mutations + */ + vector visited_excess_mutations; + + /** + * Timer for optimized operations + */ + int timer_optimized; + + /** + * Timer for regular operations + */ + int timer_regular; + + /** + * Calculates placement mutations for a candidate node + * @param input Reference to the placement candidate node + */ + void computeExcessMutations(PlacementCandidateNode &input); + + /** + * Initializes data for calculating placement mutations + * @param inp Reference to the placement candidate node + */ + void initNewSampleMutations(PlacementCandidateNode &inp); + + /** + * Erases a mutation from the excess mutations set + * @param erase_excess_mutations Vector of mutations to erase + * @param m The mutation to erase + * @param set_difference Reference to track the difference in mutation sets + */ + void eraseMutation(vector &erase_excess_mutations, Mutation m, int &set_difference); + + /** + * Adds a mutation to the excess mutations set + * @param added_excess_mutations Vector of mutations to add + * @param m The mutation to add + * @param diff The difference value + * @param set_difference Reference to track the difference in mutation sets + */ + void addMutation(vector &added_excess_mutations, Mutation m, int diff, int &set_difference); + + /** + * Optimizes function for finding the best position to place a new sample + * @param input Reference to the placement candidate node + * @param set_difference Optional difference in mutation sets + */ + void optimizedFindPositionPlaceNewSample(PlacementCandidateNode &input, int set_difference = 0); + + /** + * Adds a new sample to the tree + * @param best_node Pointer to the best node for placement + * @param best_node_branch Pointer to the best branch for placement + * @param node_excess_mutations Vector of excess mutations for the node + * @param index Index of the new sample + * @param name Name of the new sample + */ + void addNewSample(PhyloNode *best_node, PhyloNeighbor *best_node_branch, vector node_excess_mutations, int index, string name); + + /** + * Verifies the correctness of mutations at a given position + */ + void verifyMutationCorrectness(); + + /** + * Verifies mutations on a specific branch + * @param pos Vector of positions to verify + * @param dad_branch Pointer to the branch being verified + * @param dad Pointer to the parent node + * @param branch_subst Optional pointer to store number of substitutions + */ + void verifyMutationCorrectnessBranch(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL); + + /** + * Verifies partial mutations on a branch + * @param pos Vector of positions to verify + * @param dad_branch Pointer to the branch being verified + * @param dad Pointer to the parent node + * @return String containing verification results + */ + string verifyPartialMutationCorrectness(vector &pos, PhyloNeighbor *dad_branch, PhyloNode *dad); + /** copy the phylogenetic tree structure into this tree, override to take sequence names in the alignment into account diff --git a/placement.cpp b/placement.cpp new file mode 100644 index 00000000..54383931 --- /dev/null +++ b/placement.cpp @@ -0,0 +1,178 @@ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "phylotree.h" +#include "alignment.h" +#include "iqtree.h" +#include "mutation.h" +#include "placement.h" + +const int VCF_HEADER_LINES = 12; // Number of header lines in VCF file +const int BATCH_SIZE = 8; // Number of columns to process in each batch +void initAlignment(IQTree *tree, Alignment *alignment, vector &rotated_permutation_column) { + int nsite = rotated_permutation_column.size(); + vector perm_col(nsite); + vector compressed_perm_col(nsite); + if (alignment->existing_sample_mutations.size()) { + for (int site = 0; site < nsite; ++site) { + int col = rotated_permutation_column[site]; + compressed_perm_col[site] = alignment->existing_sample_mutations[0][col].compressed_position; + perm_col[site] = alignment->existing_sample_mutations[0][col].position; + } + } + alignment->ungroupSitePattern(); + tree->add_row = true; + tree->root_states = new UINT[(alignment->size() + 7) / 8 + 1]; + tree->initMutation(perm_col, compressed_perm_col); +} + +int readInitialAlignment(ifstream &in_file_stream, char *out_file_name, int num_initial_rows) { + ofstream out_file(out_file_name); + if (!out_file.is_open()) { + cout << "Cannot open outputfile :" << out_file_name << '\n'; + exit(1); + } + string line; + int num_processed_rows = 0; + while (getline(in_file_stream, line)) { + if (line == "") { + continue; + } + out_file << line << '\n'; + ++num_processed_rows; + if (num_processed_rows >= num_initial_rows) { + break; + } + } + out_file.close(); + return num_processed_rows; +} + +int readVCFFile(IQTree *tree, Alignment*& alignment, Params ¶ms) { + if (params.num_existing_sequences + params.num_missing_sequences <= MAX_SEQUENCE) { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.num_existing_sequences); + tree->setAlignment(alignment); + tree->aln = alignment; + vector rotatedColumnPermutation = alignment->findRotatedColumnPermutation(); + initAlignment(tree, alignment, rotatedColumnPermutation); + return (alignment)->getNSite(); + } + + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(params.aln_file); + string line; + in.exceptions(ios::badbit); + + // Read first 12 lines and create tree alignment + int totalColumn = readInitialAlignment(in, "temp.vcf", VCF_HEADER_LINES) - 1; // Read header lines and write to temp.vcf + alignment = new Alignment("temp.vcf", params.sequence_type, params.intype, params.num_existing_sequences); + alignment->ungroupSitePattern(); + std::remove("temp.vcf"); + tree->setAlignment(alignment); + tree->aln = alignment; + + vector rotatedColumnPermutation = alignment->findRotatedColumnPermutation(); + initAlignment(tree, alignment, rotatedColumnPermutation); + + while (true) { + int numProcessedColumn = (alignment)->readPartialVCF(in, params.sequence_type, rotatedColumnPermutation, + params.num_existing_sequences, totalColumn, BATCH_SIZE); + if (numProcessedColumn == 0) + break; + tree->clearAllPartialLH(); + totalColumn += numProcessedColumn; + initAlignment(tree, alignment, rotatedColumnPermutation); + } + + in.close(); + return totalColumn; +} + +void placeNewSamplesOntoExistingTree(Params ¶ms) { + cout << "\n========== Start initial data structure ==========\n"; + + Alignment *alignment; + IQTree *tree = new IQTree; + bool is_rooted = false; + + tree->readTree(params.mutation_tree_file, is_rooted); + int sequence_length = readVCFFile(tree, alignment, params) + 1; + // Init new tree's memory + tree->allocateMutationMemory(sequence_length); + // free memory + delete[] tree->root_states; + tree->add_row = false; + cout << "Tree parsimony after init mutations: " << tree->computeParsimonyScoreMutation() << '\n'; + + cout << "\n========== Starting placement core ==========\n"; + int num_sequences = min((int)alignment->missing_sample_mutations.size(), params.num_missing_sequences); + + auto start_time = getCPUTime(); + for (int i = 0; i < num_sequences; ++i) { + tree->initNodeDataPlaceNewSample(); + PlacementCandidateNode input; + int best_set_difference = INT_MAX; + size_t best_node_num_leaves = INT_MAX; + std::vector excess_mutations; + + input.best_set_difference = &best_set_difference; + input.best_node_num_leaves = &best_node_num_leaves; + input.node = (PhyloNode *)tree->root->neighbors[0]->node; + input.node_branch = (PhyloNeighbor *)input.node->findNeighbor(tree->root); + input.missing_sample_mutations = &alignment->missing_sample_mutations[i]; + input.excess_mutations = &excess_mutations; + + tree->initNewSampleMutations(input); + tree->optimizedFindPositionPlaceNewSample(input, 0); + input.node = input.best_node; + input.node_branch = input.best_node_branch; + tree->computeExcessMutations(input); + tree->addNewSample(input.best_node, input.best_node_branch, excess_mutations, i, alignment->missing_seq_names[i]); + } + cout << "\n========== Finished placement core ==========\n"; + cout << "Time: " << fixed << setprecision(3) << (double)(getCPUTime() - start_time) << " seconds\n"; + cout << "Memory: " << getMemory() << " KB\n"; + cout << "New tree's parsimony score computed by mutation: " << tree->computeParsimonyScoreMutation() << '\n'; + + delete alignment; + alignment = NULL; + delete tree; +} + +void checkCorectTree(char *origin_tree_file, char *new_tree_file) { + cout << "================= Start checking correct tree ================\n"; + IQTree *origin_tree = new IQTree; + bool origin_tree_is_rooted = false; + origin_tree->readTree(origin_tree_file, origin_tree_is_rooted); + + IQTree *new_tree = new IQTree; + bool new_tree_is_rooted = false; + new_tree->readTree(new_tree_file, new_tree_is_rooted); + + vector origin_tree_leaves_name; + origin_tree->getLeavesName(origin_tree_leaves_name); + + new_tree->assignRoot(origin_tree_leaves_name[0]); + sort(origin_tree_leaves_name.begin(), origin_tree_leaves_name.end()); + new_tree->initNodeData(origin_tree_leaves_name); + + if (new_tree->compareTree(origin_tree)) { + cout << "Finish checking correct tree: Correct tree detected\n"; + } + else { + cout << "Finish checking correct tree: Wrong tree detected\n"; + } + + delete origin_tree; + delete new_tree; +} + +void configLeafNames(IQTree *tree, Node *node, Node *dad) { + if (node->isLeaf()) { + node->id = tree->aln->getSeqID(node->name); + } + FOR_NEIGHBOR_IT(node, dad, it) + configLeafNames(tree, (*it)->node, node); +} \ No newline at end of file diff --git a/placement.h b/placement.h new file mode 100644 index 00000000..8b992290 --- /dev/null +++ b/placement.h @@ -0,0 +1,19 @@ +#ifndef PLACEMENT_H +#define PLACEMENT_H + +#include "tools.h" +#include "fstream" +#include "timeutil.h" + +const int MAX_SEQUENCE = 20000; + +/** + * Place new samples onto existing tree + */ +void placeNewSamplesOntoExistingTree(Params ¶ms); + +/** + * Check if origin tree doesn't change. + */ +void checkCorrectTree(char *originTreeFile, char *newTreeFile); +#endif diff --git a/timeutil.h b/timeutil.h index 7151d70c..c1931d84 100644 --- a/timeutil.h +++ b/timeutil.h @@ -86,7 +86,16 @@ #endif #endif /* HAVE_GETTIMEOFDAY */ - +/** + * @return CPU memory usage since program was started + */ +__inline uint64_t getMemory() { +#ifdef HAVE_GETRUSAGE + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + return usage.ru_maxrss; +#endif +} /** * @return CPU time in seconds since program was started (corrrect up to micro-seconds) diff --git a/tools.cpp b/tools.cpp index d58577a6..be84aac3 100644 --- a/tools.cpp +++ b/tools.cpp @@ -552,6 +552,12 @@ void get2RandNumb(const int size, int &first, int &second) { void parseArg(int argc, char *argv[], Params ¶ms) { int cnt; + params.num_existing_sequences = INT_MAX; + params.num_missing_sequences = 0; + params.mutation_tree_file = NULL; + params.ppon = false; + params.pp_verify_preserved_tree = false; + params.original_tree_file = NULL; verbose_mode = VB_MIN; params.tree_gen = NONE; params.user_file = NULL; @@ -851,6 +857,40 @@ void parseArg(int argc, char *argv[], Params ¶ms) { #endif continue; } + if (strcmp(argv[cnt], "-pp_on") == 0) + { + params.ppon = true; + continue; + } + if (strcmp(argv[cnt], "-pp_origin") == 0) + { + cnt++; + params.original_tree_file = argv[cnt]; + continue; + } + if (strcmp(argv[cnt], "-pp_n") == 0) + { + cnt++; + params.num_existing_sequences = convert_int(argv[cnt]); + continue; + } + if (strcmp(argv[cnt], "-pp_k") == 0) + { + cnt++; + params.num_missing_sequences = convert_int(argv[cnt]); + continue; + } + if (strcmp(argv[cnt], "-pp_tree") == 0) + { + cnt++; + params.mutation_tree_file = argv[cnt]; + continue; + } + if (strcmp(argv[cnt], "-pp_test_optimize") == 0) + { + params.pp_verify_preserved_tree = true; + continue; + } if (strcmp(argv[cnt], "-ho") == 0 || strcmp(argv[cnt], "-?") == 0) { // usage_iqtree(argv, false); usage_mpboot(argv, false); @@ -3087,31 +3127,40 @@ void usage_mpboot(char* argv[], bool full_command) { } InputType detectInputFile(char *input_file) { - - try { - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(input_file); - - unsigned char ch; - int count = 0; - do { - in >> ch; - } while (ch <= 32 && !in.eof() && count++ < 20); - in.close(); - switch (ch) { - case '#': return IN_NEXUS; - case '(': return IN_NEWICK; - case '[': return IN_NEWICK; - case '>': return IN_FASTA; - default: - if (isdigit(ch)) return IN_PHYLIP; - return IN_OTHER; - } - } catch (ios::failure) { - outError("Cannot read file ", input_file); - } - return IN_OTHER; + try { + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(input_file); + + unsigned char ch; + int count = 0; + do { + in >> ch; + } while (ch <= 32 && !in.eof() && count++ < 20); + char tmp = 'N'; + in >> tmp; + in.close(); + switch (ch) { + case '#': + if (tmp == 'N') + return IN_NEXUS; + return IN_VCF; + case '(': + return IN_NEWICK; + case '[': + return IN_NEWICK; + case '>': + return IN_FASTA; + default: + if (isdigit(ch)) + return IN_PHYLIP; + return IN_OTHER; + } + } + catch (ios::failure) { + outError("Cannot read file ", input_file); + } + return IN_OTHER; } bool overwriteFile(char *filename) { diff --git a/tools.h b/tools.h index 315db2dd..4b2fbf79 100644 --- a/tools.h +++ b/tools.h @@ -314,7 +314,7 @@ const int SW_AVG_PRESENT = 4; // take the split weight average over all trees th input type, tree or splits graph */ enum InputType { - IN_NEWICK, IN_NEXUS, IN_FASTA, IN_PHYLIP, IN_OTHER + IN_NEWICK, IN_NEXUS, IN_FASTA, IN_PHYLIP, IN_VCF, IN_OTHER }; /** @@ -411,6 +411,35 @@ extern int NNI_MAX_NR_STEP; program parameters, everything is specified here */ struct Params { + /** + * Enable placement + */ + bool ppon; + + /** + * Number of starting row + */ + int num_existing_sequences; + + /** + * Number of adding row + */ + int num_missing_sequences; + + /** + * Tree file name + */ + char *mutation_tree_file; + + /** + * Original tree file name + */ + char *original_tree_file; + + /** + * Checking correct tree + */ + bool pp_verify_preserved_tree; /** * Number of starting parsimony trees