From 14b26dc7670c884536ecc09bd5b14164471e169f Mon Sep 17 00:00:00 2001 From: StefanFlaumberg Date: Sat, 18 Jan 2025 22:39:06 +0300 Subject: [PATCH 1/8] PMSR model implementation --- alignment/alignment.cpp | 337 ++++++++++------ alignment/alignment.h | 80 ++-- alignment/alignmentpairwise.cpp | 11 +- main/phyloanalysis.cpp | 308 +++++++------- main/phylotesting.cpp | 65 ++- main/treetesting.cpp | 162 +++++--- main/treetesting.h | 29 +- model/modeldna.cpp | 5 +- model/modelfactory.cpp | 262 +++++++++--- model/modelfactory.h | 23 +- model/modelmarkov.cpp | 119 +++--- model/modelmarkov.h | 2 +- model/modelmixture.cpp | 159 +------- model/modelmixture.h | 19 +- model/modelset.cpp | 516 ++++++++++++++++-------- model/modelset.h | 168 +++++--- model/modelsubst.h | 15 + simulator/alisimulatorheterogeneity.cpp | 8 +- tree/iqtree.cpp | 3 + tree/phylotree.cpp | 24 +- tree/phylotree.h | 12 +- utils/tools.cpp | 157 ++++--- utils/tools.h | 53 ++- 23 files changed, 1529 insertions(+), 1008 deletions(-) diff --git a/alignment/alignment.cpp b/alignment/alignment.cpp index 7e8fb8313..8bffbcd9a 100644 --- a/alignment/alignment.cpp +++ b/alignment/alignment.cpp @@ -1252,42 +1252,91 @@ void Alignment::ungroupSitePattern() { vector stored_pat = (*this); clear(); + IntVector stored_site_pattern = site_pattern; for (size_t i = 0; i < getNSite(); ++i) { Pattern pat = stored_pat[getPatternID(i)]; pat.frequency = 1; push_back(pat); site_pattern[i] = i; } + ASSERT(getNPattern() == getNSite()); pattern_index.clear(); + // update the pattern_first_site map + pattern_first_site = site_pattern; + // refill the existing pattern-specific parameters + if (isSSF()) { + vector stored_ptn_state_freq = ptn_state_freq; + ptn_state_freq.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_state_freq.push_back(stored_ptn_state_freq[stored_site_pattern[ptn]]); + } + } + if (isSSR()) { + vector stored_ptn_rate_scaler = ptn_rate_scaler; + ptn_rate_scaler.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_rate_scaler.push_back(stored_ptn_rate_scaler[stored_site_pattern[ptn]]); + } + } } -void Alignment::regroupSitePattern(int groups, IntVector& site_group) +void Alignment::regroupSitePattern(int groups, const IntVector &site_group) { + // subdivide patterns based on their assignment to the provided groups vector stored_pat = (*this); - IntVector stored_site_pattern = site_pattern; clear(); + IntVector stored_site_pattern = site_pattern; site_pattern.clear(); site_pattern.resize(stored_site_pattern.size(), -1); size_t count = 0; - for (int g = 0; g < groups; g++) { + for (int g = 0; g < groups; ++g) { pattern_index.clear(); - for (size_t i = 0; i < site_group.size(); ++i) - if (site_group[i] == g) { - count++; - Pattern pat = stored_pat[stored_site_pattern[i]]; - addPattern(pat, i); + for (size_t i = 0; i < getNSite(); ++i) { + if (site_group[i] == g) { + count++; + Pattern pat = stored_pat[stored_site_pattern[i]]; + addPattern(pat, i); + } } } ASSERT(count == stored_site_pattern.size()); + // count pattern frequencies of the new patterns count = 0; - for (iterator it = begin(); it != end(); ++it) + for (iterator it = begin(); it != end(); ++it) { count += it->frequency; + } ASSERT(count == getNSite()); pattern_index.clear(); //printPhylip("/dev/stdout"); + // update the pattern_first_site map + pattern_first_site = IntVector(getNPattern(), -1); + for (size_t i = 0; i < getNSite(); ++i) { + if (pattern_first_site[site_pattern[i]] == -1) { + pattern_first_site[site_pattern[i]] = i; + } + } + // refill the existing pattern-specific parameters + IntVector new_to_old_pattern; + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + int i = pattern_first_site[ptn]; + new_to_old_pattern.push_back(stored_site_pattern[i]); + } + if (isSSF()) { + vector stored_ptn_state_freq = ptn_state_freq; + ptn_state_freq.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_state_freq.push_back(stored_ptn_state_freq[new_to_old_pattern[ptn]]); + } + } + if (isSSR()) { + vector stored_ptn_rate_scaler = ptn_rate_scaler; + ptn_rate_scaler.clear(); + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + ptn_rate_scaler.push_back(stored_ptn_rate_scaler[new_to_old_pattern[ptn]]); + } + } } - /** detect the data type of the input sequences @param sequences vector of strings @@ -3680,12 +3729,17 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq pattern_freq->resize(aln->getNPattern(), 0); } - if (!aln->site_state_freq.empty()) { - // resampling also the per-site state frequency vector - if (aln->site_state_freq.size() != aln->getNPattern() || spec) + if (aln->isSSF()) { + // resampling also the per-site state frequency vectors + if (aln->ptn_state_freq.size() != aln->getNPattern() || spec) outError("Unsupported bootstrap feature, pls contact the developers"); } - + if (aln->isSSR()) { + // resampling also the per-site rate scalers + if (aln->ptn_rate_scaler.size() != aln->getNPattern() || spec) + outError("Unsupported bootstrap feature, pls contact the developers"); + } + if (Params::getInstance().jackknife_prop > 0.0 && spec) { outError((string)"Unsupported jackknife with sampling " + spec); } @@ -3702,11 +3756,16 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq Pattern pat = aln->at(ptn_id); int nptn = getNPattern(); addPattern(pat, added_sites); - if (!aln->site_state_freq.empty() && getNPattern() > nptn) { - // a new pattern is added, copy state frequency vector - double *state_freq = new double[num_states]; - memcpy(state_freq, aln->site_state_freq[ptn_id], num_states*sizeof(double)); - site_state_freq.push_back(state_freq); + if (aln->isSSF() && getNPattern() > nptn) { + // a new pattern is added, copy its state frequency vector + double *state_freqs = new double[num_states]; + memcpy(state_freqs, aln->ptn_state_freq[ptn_id], num_states*sizeof(double)); + ptn_state_freq.push_back(state_freqs); + } + if (aln->isSSR() && getNPattern() > nptn) { + // a new pattern is added, copy its rate scaler + double rate_scaler = aln->ptn_rate_scaler[ptn_id]; + ptn_rate_scaler.push_back(rate_scaler); } if (pattern_freq) ((*pattern_freq)[ptn_id])++; added_sites++; @@ -3783,10 +3842,8 @@ void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq out_site += site_vec[part+1]; } } - if (!aln->site_state_freq.empty()) { - site_model = site_pattern; - ASSERT(site_state_freq.size() == getNPattern()); - } + if (aln->isSSF()) ASSERT(ptn_state_freq.size() == getNPattern()); + if (aln->isSSR()) ASSERT(ptn_rate_scaler.size() == getNPattern()); verbose_mode = save_mode; countConstSite(); // buildSeqStates(); @@ -3940,11 +3997,8 @@ void Alignment::buildFromPatternFreq(Alignment & aln, IntVector new_pattern_freq site_pattern[site++] = size()-1; } } - if (!aln.site_state_freq.empty()) { - site_model = site_pattern; - ASSERT(site_state_freq.size() == getNPattern()); - } - + if (aln.isSSF()) ASSERT(ptn_state_freq.size() == getNPattern()); + if (aln.isSSR()) ASSERT(ptn_rate_scaler.size() == getNPattern()); countConstSite(); // buildSeqStates(); // checkSeqName(); @@ -4221,11 +4275,11 @@ Alignment::~Alignment() non_stop_codon = nullptr; delete [] pars_lower_bound; pars_lower_bound = nullptr; - for (auto it = site_state_freq.rbegin(); it != site_state_freq.rend(); ++it) { - delete [] (*it); + for (auto rit = ptn_state_freq.rbegin(); rit != ptn_state_freq.rend(); ++rit) { + delete [] (*rit); } - site_state_freq.clear(); - site_model.clear(); + ptn_state_freq.clear(); + ptn_rate_scaler.clear(); } double Alignment::computeObsDist(int seq1, int seq2) { @@ -5617,92 +5671,116 @@ double Alignment::multinomialProb (IntVector &pattern_freq) return (fac - sumFac + sumProb); } -bool Alignment::readSiteStateFreq(const char* site_freq_file) +bool Alignment::readSiteParamFile(const char* site_param_file, const string ¶m_type) { - cout << endl << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl; - site_model.resize(getNSite(), -1); - IntVector pattern_to_site; // vector from pattern to the first site - pattern_to_site.resize(getNPattern(), -1); - for (size_t i = 0; i < getNSite(); ++i) { - if (pattern_to_site[getPatternID(i)] == -1) { - pattern_to_site[getPatternID(i)] = i; - } - } - bool aln_changed = false; - + ASSERT((param_type == "freq" && ptn_state_freq.empty()) || + (param_type == "rate" && ptn_rate_scaler.empty())); + string msg = (param_type == "freq") ? "state frequency" : "rate"; + cout << endl << "Reading site-specific " << msg << " file " << site_param_file << " ..." << endl; + bool aln_changed = false; + int specified_sites = 0; + int saturated_sites = 0; + IntVector site_model(getNSite(), -1); // map each site to a model + vector models; // site-specific frequency vectors or rate scalers + // fill the pattern_first_site map + pattern_first_site = IntVector(getNPattern(), -1); + for (size_t i = 0; i < getNSite(); ++i) { + if (pattern_first_site[site_pattern[i]] == -1) { + pattern_first_site[site_pattern[i]] = i; + } + } + // read the input file try { - ifstream in; - in.exceptions(ios::failbit | ios::badbit); - in.open(site_freq_file); - double freq; + ifstream in; + in.exceptions(ios::failbit | ios::badbit); + in.open(site_param_file); + in.exceptions(ios::badbit); + int prev_site = -1; + while (true) { string site_spec; - int specified_sites = 0; - in.exceptions(ios::badbit); - for (int model_id = 0; !in.eof(); model_id++) { - // remove the failbit - in >> site_spec; - if (in.eof()) break; - IntVector site_id; - extractSiteID(this, site_spec.c_str(), site_id); - specified_sites += site_id.size(); - if (site_id.size() == 0) throw "No site ID specified"; - for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) { - if (site_model[*it] != -1) throw "Duplicated site ID"; - site_model[*it] = site_state_freq.size(); - } - double *site_freq_entry = new double[num_states]; - double sum = 0; - for (int i = 0; i < num_states; ++i) { + // remove the failbit + in >> site_spec; + if (in.eof()) break; + // handle the line site ids + IntVector site_ids; // sites specified in a single line + extractSiteID(this, site_spec.c_str(), site_ids); + if (site_ids.size() == 0) throw "No site ID specified"; + if (site_ids.size() > 1) throw "Muptiple site IDs in a single line"; + int site = site_ids[0]; + if (prev_site == site) throw "Duplicated site ID"; + if (prev_site > site) throw "Wrong order of sites"; + prev_site = site; + specified_sites ++; + ASSERT(site_model[site] == -1); + site_model[site] = models.size(); + // handle the line params + double *site_param_entry; + if (param_type == "freq") { + double freq; + double *state_freqs = new double[num_states]; + double sum = 0.0; + for (int x = 0; x < num_states; ++x) { in >> freq; if (freq <= 0.0 || freq >= 1.0) throw "Frequencies must be strictly positive and smaller than 1"; - site_freq_entry[i] = freq; + state_freqs[x] = freq; sum += freq; } - if (fabs(sum-1.0) > 1e-4) { - if (fabs(sum-1.0) > 1e-3) - outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized"); - sum = 1.0/sum; - for (int i = 0; i < num_states; ++i) - site_freq_entry[i] *= sum; - } - convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0) - - // 2016-02-01: now check for equality of sites with same site-pattern and same freq - int prev_site = pattern_to_site[getPatternID(site_id[0])]; - if (site_id.size() == 1 && prev_site < site_id[0] && site_model[prev_site] != -1) { - // compare freq with prev_site - bool matched_freq = true; - double *prev_freq = site_state_freq[site_model[prev_site]]; - for (int i = 0; i < num_states; ++i) { - if (site_freq_entry[i] != prev_freq[i]) { - matched_freq = false; - break; - } - } - if (matched_freq) { - site_model[site_id[0]] = site_model[prev_site]; - } else - aln_changed = true; - } - - if (site_model[site_id[0]] == site_state_freq.size()) - site_state_freq.push_back(site_freq_entry); - else - delete [] site_freq_entry; + if (fabs(sum - 1.0) > 1e-4) { + outWarning("Frequencies of site " + site_spec + " do not sum up to 1 and will be normalized"); + for (int x = 0; x < num_states; ++x) { + state_freqs[x] /= sum; + } + } + convfreq(state_freqs); // regularize frequencies (eg if some freqs are too close to 0) + site_param_entry = state_freqs; + } else { + double rate; + double *rate_scaler = new double; + in >> rate; + if (rate < 0.0) throw "Negative rate not allowed"; + if (rate == 0.0) rate = MIN_SITE_RATE; + if (rate >= MAX_SITE_RATE) { + rate = MAX_SITE_RATE; + saturated_sites ++; + } + *rate_scaler = rate; + site_param_entry = rate_scaler; } - if (specified_sites < site_model.size()) { - aln_changed = true; - // there are some unspecified sites - cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl; - for (size_t i = 0; i < site_model.size(); ++i) - if (site_model[i] == -1) - site_model[i] = site_state_freq.size(); - site_state_freq.push_back(NULL); + // add the model only if it is its first occurence for the pattern of the current site + int first_site = pattern_first_site[site_pattern[site]]; + if (first_site < site && site_model[first_site] != -1) { + // compare the site param with the first_site param + bool matched_param = true; + double *first_site_param_entry = models[site_model[first_site]]; + if (param_type == "freq") { + for (int x = 0; x < num_states; ++x) { + if (site_param_entry[x] != first_site_param_entry[x]) { + matched_param = false; + break; + } + } + } else { + if (*site_param_entry != *first_site_param_entry) { + matched_param = false; + } + } + if (matched_param) { + // the only case when we do not add the model + site_model[site] = site_model[first_site]; + } else { + aln_changed = true; + } + } // else: the current site is effectively the first one of its pattern + if (site_model[site] == models.size()) { + models.push_back(site_param_entry); + } else { + delete [] site_param_entry; } - in.clear(); - // set the failbit again - in.exceptions(ios::failbit | ios::badbit); - in.close(); + } + in.clear(); + // set the failbit again + in.exceptions(ios::failbit | ios::badbit); + in.close(); } catch (const char* str) { outError(str); } catch (string str) { @@ -5710,12 +5788,41 @@ bool Alignment::readSiteStateFreq(const char* site_freq_file) } catch(ios::failure) { outError(ERR_READ_INPUT); } - if (aln_changed) { - cout << "Regrouping alignment sites..." << endl; - regroupSitePattern(site_state_freq.size(), site_model); - } - cout << site_state_freq.size() << " distinct per-site state frequency vectors detected" << endl; - return aln_changed; + // check for unspecified sites + if (specified_sites < getNSite()) { + aln_changed = true; + msg = (param_type == "freq") ? "frequencies" : "rates"; + cout << site_model.size() - specified_sites << " unspecified sites will get default " << msg << endl; + for (size_t site = 0; site < getNSite(); ++site) { + if (site_model[site] == -1) { + site_model[site] = models.size(); + } + } + double *default_param_entry; + if (param_type == "freq") default_param_entry = NULL; else *default_param_entry = 1.0; + models.push_back(default_param_entry); + } + // saturated site warning + if (saturated_sites) { + cout << saturated_sites << "sites show too high rates (>=" << MAX_SITE_RATE << ")" << endl; + } + // if needed, subdivide patterns so that sites in each new pattern have same freqs and rates + if (aln_changed) { + cout << "Regrouping alignment sites..." << endl; + regroupSitePattern(models.size(), site_model); + } + // fill the selected pattern-specific parameter with the contents of the models + for (size_t ptn = 0; ptn < getNPattern(); ++ptn) { + int first_site = pattern_first_site[ptn]; + if (param_type == "freq") { + ptn_state_freq.push_back(models[site_model[first_site]]); + } else { + ptn_rate_scaler.push_back(*(models[site_model[first_site]])); + } + } + msg = (param_type == "freq") ? "state frequency vectors" : "rate scalers"; + cout << models.size() << " distinct per-site " << msg << " detected" << endl; + return aln_changed; } /** diff --git a/alignment/alignment.h b/alignment/alignment.h index 934591a80..36a77ffb7 100644 --- a/alignment/alignment.h +++ b/alignment/alignment.h @@ -1,7 +1,7 @@ // // C++ Interface: alignment // -// Description: +// Description: // // // Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler , (C) 2008 @@ -319,21 +319,20 @@ class Alignment : public vector, public CharSet, public StateSpace { virtual void orderPatternByNumChars(int pat_type); /** - * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns + * un-group site-patterns, i.e. making #sites = #patterns and pattern frequency = 1 for all patterns */ void ungroupSitePattern(); - /** - * re-group site-patterns + * re-group site-patterns so that sites within each pattern fall into the same group * @param groups number of groups * @param site_group group ID (0, 1, ...ngroups-1; must be continuous) of all sites */ - void regroupSitePattern(int groups, IntVector &site_group); + void regroupSitePattern(int groups, const IntVector &site_group); /**************************************************************************** - output alignment + output alignment ****************************************************************************/ SeqType detectSequenceType(StrVector &sequences); @@ -535,9 +534,11 @@ class Alignment : public vector, public CharSet, public StateSpace { */ bool isGapOnlySeq(size_t seq_id); - virtual bool isSuperAlignment() { - return false; - } + bool isSSF() { return !ptn_state_freq.empty(); } + + bool isSSR() { return !ptn_rate_scaler.empty(); } + + virtual bool isSuperAlignment() { return false; } /**************************************************************************** alignment general processing @@ -917,12 +918,12 @@ class Alignment : public vector, public CharSet, public StateSpace { IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present /* for site-specific state frequency model with Huaichun, Edward, Andrew */ - - /* site to model ID map */ - IntVector site_model; - - /** site to state frequency vector */ - vector site_state_freq; + + /** pattern index to state frequency vector map */ + vector ptn_state_freq; + + /** pattern index to rate scaler map */ + vector ptn_rate_scaler; /** * @return true if data type is SEQ_CODON and state is a stop codon @@ -985,14 +986,14 @@ class Alignment : public vector, public CharSet, public StateSpace { void getAppearance(StateType state, StateBitset &state_app); /** - * read site specific state frequency vectors from a file to create corresponding model - * update site_model and site_state_freq variables for this class - * @param aln input alignment - * @param site_freq_file file name - * @return TRUE if alignment needs to be changed, FALSE otherwise + * read site-specific state frequency vectors or site-specific rate (branch length) scalers + * from a file to create a site-specific model + * @param site_param_file input file name + * @param param_type should be set to "freq" or "rate" + * @return TRUE if alignment patterns need to be changed, FALSE otherwise */ - bool readSiteStateFreq(const char* site_freq_file); - + bool readSiteParamFile(const char* site_param_file, const string ¶m_type); + /** * special initialization for codon sequences, e.g., setting #states, genetic_code * @param sequence_type user-defined sequence type @@ -1011,32 +1012,23 @@ class Alignment : public vector, public CharSet, public StateSpace { void extractMapleFile(const std::string& aln_name, const InputType& format); protected: + /** sequence names */ + vector seq_names; + /** expected num_sites */ + int expected_num_sites = -1; - /** - sequence names - */ - vector seq_names; - - /** - expected num_sites - */ - int expected_num_sites = -1; + /** site to pattern index map */ + IntVector site_pattern; - /** - Site to pattern index - */ - IntVector site_pattern; + /** pattern index to first pattern site map */ + IntVector pattern_first_site; - /** - hash map from pattern to index in the vector of patterns (the alignment) - */ - PatternIntMap pattern_index; - - /** - alisim: caching ntfreq if it has already randomly initialized - */ - double* cache_ntfreq = NULL; + /** hash map from pattern to index in the vector of patterns (the alignment) */ + PatternIntMap pattern_index; + + /** alisim: caching ntfreq if it has already randomly initialized */ + double* cache_ntfreq = NULL; private: /** diff --git a/alignment/alignmentpairwise.cpp b/alignment/alignmentpairwise.cpp index 8b39bfeac..8adf30764 100644 --- a/alignment/alignmentpairwise.cpp +++ b/alignment/alignmentpairwise.cpp @@ -202,7 +202,6 @@ double AlignmentPairwise::computeFunction(double value) { auto sequence2 = tree->getConvertedSequenceByNumber(seq_id2); auto frequencies = tree->getConvertedSequenceFrequencies(); size_t sequenceLength = tree->getConvertedSequenceLength(); - if (site_rate->isSiteSpecificRate()) { for (int i = 0; i < sequenceLength; i++) { int state1 = sequence1[i]; @@ -221,7 +220,7 @@ double AlignmentPairwise::computeFunction(double value) { if (state1 >= num_states || state2 >= num_states) { continue; } - double trans = tree->getModelFactory()->computeTrans(value * site_rate->getPtnRate(i), state1, state2); + double trans = tree->getModel()->computeTrans(value, model->getPtnModelID(i), state1, state2); lh -= log(trans) * frequencies[i]; } return lh; @@ -358,10 +357,10 @@ void AlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) { continue; } double freq = frequencies[i]; - double rate_val = site_rate->getPtnRate(i); + double rate_val = 1.0; double rate_sqr = rate_val * rate_val; double derv1, derv2; - double trans = tree->getModel()->computeTrans(value * rate_val,model->getPtnModelID(i), state1, state2, derv1, derv2); + double trans = tree->getModel()->computeTrans(value * rate_val, model->getPtnModelID(i), state1, state2, derv1, derv2); double d1 = derv1 / trans; df -= rate_val * d1 * freq; ddf -= rate_sqr * (derv2/trans - d1*d1) * freq; @@ -376,10 +375,10 @@ void AlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) { if (num_states<=state2) { continue; } - double rate_val = site_rate->getPtnRate(i); + double rate_val = 1.0; double rate_sqr = rate_val * rate_val; double derv1, derv2; - double trans = tree->getModel()->computeTrans(value * rate_val,model->getPtnModelID(i), state1, state2, derv1, derv2); + double trans = tree->getModel()->computeTrans(value * rate_val, model->getPtnModelID(i), state1, state2, derv1, derv2); double d1 = derv1 / trans; double freq = tree->aln->at(i).frequency; df -= rate_val * d1 * freq; diff --git a/main/phyloanalysis.cpp b/main/phyloanalysis.cpp index ac71027f4..42b275243 100644 --- a/main/phyloanalysis.cpp +++ b/main/phyloanalysis.cpp @@ -331,10 +331,7 @@ void reportNexusFile(ostream &out, ModelSubst *m) { void reportLinkSubstMatrix(ostream &out, Alignment *aln, ModelSubst *m) { int i, j, k; double *rate_mat = new double[m->num_states * m->num_states]; - if (!m->isSiteSpecificModel()) - m->getRateMatrix(rate_mat); - else - ((ModelSet*)m)->front()->getRateMatrix(rate_mat); + m->getRateMatrix(rate_mat); if (m->num_states <= 4) { out << "Linked rate parameter R:" << endl << endl; @@ -402,10 +399,7 @@ void reportModel(ostream &out, Alignment *aln, ModelSubst *m) { int i, j, k; ASSERT(aln->num_states == m->num_states); double *rate_mat = new double[m->num_states * m->num_states]; - if (!m->isSiteSpecificModel()) - m->getRateMatrix(rate_mat); - else - ((ModelSet*)m)->front()->getRateMatrix(rate_mat); + m->getRateMatrix(rate_mat); if (m->num_states <= 4) { out << "Rate parameter R:" << endl << endl; @@ -483,9 +477,9 @@ void reportModel(ostream &out, Alignment *aln, ModelSubst *m) { } out << "State frequencies: "; - if (m->isSiteSpecificModel()) - out << "(site specific frequencies)" << endl << endl; - else { + if (m->isSSF()) { + out << "(site-specific frequencies)" << endl << endl; + } else { // 2016-11-03: commented out as this is not correct anymore // if (!m->isReversible()) // out << "(inferred from Q matrix)" << endl; @@ -610,6 +604,11 @@ void reportModel(ostream &out, PhyloTree &tree) { reportLinkSubstMatrix(out, tree.aln, m); } } else { + // Update rate name if site-specific rates are used + if (tree.getModel()->isSSR()) { + tree.getRate()->name = "+SSR"; + tree.getRate()->full_name = "(site-specific rates)"; + } // Update rate name if continuous gamma is used. if (tree.getModelFactory() && tree.getModelFactory()->is_continuous_gamma) { @@ -624,7 +623,6 @@ void reportModel(ostream &out, PhyloTree &tree) { tree.getRate()->full_name = "Continuous Gamma"; } } - out << "Model of substitution: " << tree.getModelName() << endl << endl; reportModel(out, tree.aln, tree.getModel()); } @@ -1077,10 +1075,27 @@ void printOutfilesInfo(Params ¶ms, IQTree &tree) { if (raxml_format_printed) cout << " in RAxML format: " << params.out_prefix << ".best_scheme" << endl; } - if ((tree.getRate()->getGammaShape() > 0 || params.partition_file) && params.print_site_rate) + + if (params.tree_freq_file) + cout << " Site-specific state freqs: " << params.out_prefix << ".sitefreq" + << endl; + + if (params.tree_rate_file) + cout << " Site-specific rates: " << params.out_prefix << ".siterate" + << endl; + + if (params.print_site_state_freq && !params.site_freq_file && !params.tree_freq_file) + cout << " Site-specific state freqs: " << params.out_prefix << ".sitesf" + << endl; + + if ((params.print_site_rate & 1) && !params.site_rate_file && !params.tree_rate_file) cout << " Site-specific rates: " << params.out_prefix << ".rate" << endl; + if ((params.print_site_rate & 2) && !params.site_rate_file && !params.tree_rate_file) + cout << " Site-specific ML rates: " << params.out_prefix << ".mlrate" + << endl; + if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate) cout << " Site-rates by MH model: " << params.out_prefix << ".rate" << endl; @@ -1863,9 +1878,9 @@ void exportAliSimCMD(Params ¶ms, IQTree &tree, ostream &out) string more_info = "For more information on using AliSim, please visit: www.iqtree.org/doc/AliSim"; // skip unsupported models - if (tree.getModel()->isMixture() || tree.getRate()->isHeterotachy() || tree.getModel()->isLieMarkov() || tree.aln->seq_type == SEQ_CODON) + if (tree.getModel()->isSiteSpecificModel() || tree.getModel()->isMixture() || tree.getModel()->isLieMarkov() || tree.getRate()->isHeterotachy() || tree.aln->seq_type == SEQ_CODON) { - out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary, and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; + out << "Currently, we only support exporting AliSim commands automatically from the analysis for common models of DNA, Protein, Binary and Morphological data. To simulate data from other models (mixture, lie-markov, etc), please refer to the User Manual of AliSim. Thanks!" << endl << endl; out << more_info << endl << endl; return; } @@ -2330,46 +2345,6 @@ void runApproximateBranchLengths(Params ¶ms, IQTree &iqtree) { } -void printSiteRates(IQTree &iqtree, const char *rate_file, bool bayes) { - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(rate_file); - out << "# Site-specific subtitution rates determined by "; - if (bayes) - out<< "empirical Bayesian method" << endl; - else - out<< "maximum likelihood" << endl; - out << "# This file can be read in MS Excel or in R with command:" << endl - << "# tab=read.table('" << rate_file << "',header=TRUE)" << endl - << "# Columns are tab-separated with following meaning:" << endl; - if (iqtree.isSuperTree()) { - out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)&iqtree)->front()->aln->name << ", etc)" << endl - << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; - } else - out << "# Site: Alignment site ID" << endl; - - if (bayes) - out << "# Rate: Posterior mean site rate weighted by posterior probability" << endl - << "# Cat: Category with highest posterior (0=invariable, 1=slow, etc)" << endl - << "# C_Rate: Corresponding rate of highest category" << endl; - else - out << "# Rate: Site rate estimated by maximum likelihood" << endl; - if (iqtree.isSuperTree()) - out << "Part\t"; - out << "Site\tRate"; - if (bayes) - out << "\tCat\tC_Rate" << endl; - else - out << endl; - iqtree.writeSiteRates(out, bayes); - out.close(); - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, rate_file); - } - cout << "Site rates printed to " << rate_file << endl; -} - void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { if (params.print_site_lh && !params.pll) { string site_lh_file = params.out_prefix; @@ -2410,15 +2385,20 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { if (params.print_site_prob && !params.pll) { printSiteProbCategory(((string)params.out_prefix + ".siteprob").c_str(), &iqtree, params.print_site_prob); } - + if (params.print_ancestral_sequence) { printAncestralSequences(params.out_prefix, &iqtree, params.print_ancestral_sequence); } - - if (params.print_site_state_freq != WSF_NONE && !params.site_freq_file && !params.tree_freq_file) { - string site_freq_file = params.out_prefix; - site_freq_file += ".sitesf"; - printSiteStateFreq(site_freq_file.c_str(), &iqtree); + + if (params.print_site_state_freq && !params.site_freq_file && !params.tree_freq_file) { + printSiteStateFreq(((string)params.out_prefix + ".sitesf").c_str(), &iqtree); + } + + if (params.print_site_rate && !params.site_rate_file && !params.tree_rate_file) { + if (params.print_site_rate & 1) + printSiteRate(((string)params.out_prefix + ".rate").c_str(), &iqtree, true); + if (params.print_site_rate & 2) + printSiteRate(((string)params.out_prefix + ".mlrate").c_str(), &iqtree, false); } if (params.print_trees_site_posterior) { @@ -2524,18 +2504,6 @@ void printMiscInfo(Params ¶ms, IQTree &iqtree, double *pattern_lh) { } } - if (params.print_site_rate & 1) { - string rate_file = params.out_prefix; - rate_file += ".rate"; - printSiteRates(iqtree, rate_file.c_str(), true); - } - - if (params.print_site_rate & 2) { - string rate_file = params.out_prefix; - rate_file += ".mlrate"; - printSiteRates(iqtree, rate_file.c_str(), false); - } - if (params.fixed_branch_length == BRLEN_SCALE) { string filename = (string)params.out_prefix + ".blscale"; iqtree.printTreeLengthScaling(filename.c_str()); @@ -2647,12 +2615,14 @@ void startTreeReconstruction(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &m params.startCPUTime = getCPUTime(); params.start_real_time = getRealTime(); - string best_subst_name, best_rate_name; - runModelFinder(params, *iqtree, model_info, best_subst_name, best_rate_name); - - optimiseQMixModel(params, iqtree, model_info); + if (!iqtree->isTreeMix()) { + string best_subst_name, best_rate_name; + runModelFinder(params, *iqtree, model_info, best_subst_name, best_rate_name); + + optimiseQMixModel(params, iqtree, model_info); + } } - + /** optimize branch lengths of consensus tree */ @@ -3750,9 +3720,12 @@ void runStandardBootstrap(Params ¶ms, Alignment *alignment, IQTree *tree) { bootstrap_alignment->printAlignment(params.aln_output_format, bootaln_name.c_str(), true); } - if (params.print_boot_site_freq && MPIHelper::getInstance().isMaster()) { - printSiteStateFreq((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment); - bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str()); + if ((params.print_boot_site_freq || params.print_boot_site_rate) && MPIHelper::getInstance().isMaster()) { + if (params.print_boot_site_freq) + printSiteParam((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsitefreq").c_str(), bootstrap_alignment, "freq"); + if (params.print_boot_site_rate) + printSiteParam((((string)params.out_prefix)+"."+convertIntToString(sample)+".bootsiterate").c_str(), bootstrap_alignment, "rate"); + bootstrap_alignment->printAlignment(params.aln_output_format, (((string)params.out_prefix)+"."+convertIntToString(sample)+".bootaln").c_str()); } if (!tree->constraintTree.empty()) { @@ -3955,62 +3928,81 @@ void convertAlignment(Params ¶ms, IQTree *iqtree) { /** 2016-08-04: compute a site frequency model for profile mixture model */ -void computeSiteFrequencyModel(Params ¶ms, Alignment *alignment) { - - cout << endl << "===> COMPUTING SITE FREQUENCY MODEL BASED ON TREE FILE " << params.tree_freq_file << endl; - ASSERT(params.tree_freq_file); - PhyloTree *tree = new PhyloTree(alignment); - tree->setParams(¶ms); - bool myrooted = params.is_rooted; - tree->readTree(params.tree_freq_file, myrooted); - tree->setAlignment(alignment); - tree->setRootNode(params.root); - - ModelsBlock *models_block = readModelsDefinition(params); - tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block)); - delete models_block; - tree->setModel(tree->getModelFactory()->model); - tree->setRate(tree->getModelFactory()->site_rate); - tree->setLikelihoodKernel(params.SSE); - tree->setNumThreads(params.num_threads); - - if (!tree->getModel()->isMixture()) - outError("No mixture model was specified!"); - uint64_t mem_size = tree->getMemoryRequired(); - uint64_t total_mem = getMemorySize(); - cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl; - if (mem_size >= total_mem) { - outError("Memory required exceeds your computer RAM size!"); - } +void computeSiteSpecificModel(Params ¶ms, Alignment *alignment, const string ¶m_type) +{ + ASSERT((param_type == "freq" && params.tree_freq_file) || + (param_type == "rate" && params.tree_rate_file)); + string msg = (param_type == "freq") ? "FREQUENCY" : "RATE"; + char *filename = (param_type == "freq") ? params.tree_freq_file : params.tree_rate_file; + cout << endl << "===> COMPUTING SITE " << msg << " MODEL BASED ON TREE FILE " << filename << endl; + // init auxiliary tree, model, etc. + PhyloTree *tree = new PhyloTree(alignment); + tree->setParams(¶ms); + tree->setLikelihoodKernel(params.SSE); + tree->setNumThreads(params.num_threads); + bool myrooted = params.is_rooted; + tree->readTree(filename, myrooted); + tree->setRootNode(params.root); + tree->setAlignment(alignment); + ModelsBlock *models_block = readModelsDefinition(params); + tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block)); + delete models_block; + tree->setModel(tree->getModelFactory()->model); + tree->setRate(tree->getModelFactory()->site_rate); + if (!tree->getModel()->isReversible()) + outError("Non-reversible models are incompatible with site-specific models"); + if (tree->getModel()->isMixture() && !tree->getModel()->isMixtureSameQ()) + outError("Matrix mixture models are incompatible with site-specific models. Use -wsf or -wsr options to estimate site-specific parameters"); + if (tree->getModel()->isFused()) + outError("Unlinked rate mixture models are incompatible with site-specific models. Use -wsf or -wsr options to estimate site-specific parameters"); + if (param_type == "freq" && !tree->getModel()->isMixture()) + outError("No frequency mixture model was specified!"); + if (param_type == "rate" && tree->getRate()->getNRate() == 1) + outError("No rate mixture model was specified!"); + uint64_t mem_size = tree->getMemoryRequired(); + uint64_t total_mem = getMemorySize(); + cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl; + if (mem_size >= total_mem) + outError("Memory required exceeds your computer RAM size!"); #ifdef BINARY32 - if (mem_size >= 2000000000) { - outError("Memory required exceeds 2GB limit of 32-bit executable"); - } + if (mem_size >= 2000000000) + outError("Memory required exceeds 2GB limit of 32-bit executable"); #endif - - tree->ensureNumberOfThreadsIsSet(nullptr); - - tree->initializeAllPartialLh(); - // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation - tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, params.modelEps*10); - - size_t nptn = alignment->getNPattern(), nstates = alignment->num_states; - double *ptn_state_freq = new double[nptn*nstates]; - tree->computePatternStateFreq(ptn_state_freq); - alignment->site_state_freq.resize(nptn); - for (size_t ptn = 0; ptn < nptn; ptn++) { - double *f = new double[nstates]; - memcpy(f, ptn_state_freq+ptn*nstates, sizeof(double)*nstates); - alignment->site_state_freq[ptn] = f; - } - alignment->getSitePatternIndex(alignment->site_model); - printSiteStateFreq(((string)params.out_prefix+".sitefreq").c_str(), tree, ptn_state_freq); - params.print_site_state_freq = WSF_NONE; - - delete [] ptn_state_freq; - delete tree; - - cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE FREQUENCY MODEL" << endl; + tree->ensureNumberOfThreadsIsSet(nullptr); + tree->initializeAllPartialLh(); + // 2017-12-07: Increase espilon ten times (0.01 -> 0.1) to speedup PMSF computation + double modelEpsilon = (param_type == "freq") ? params.modelEps * 10.0 : params.modelEps; + cout << "Estimate initial model parameters (epsilon = " << modelEpsilon << ")" << endl; + tree->getModelFactory()->optimizeParameters(params.fixed_branch_length, true, modelEpsilon); + // compute state freqs or rate scalers for all the alignment patterns + size_t nptn = alignment->getNPattern(); + if (param_type == "freq") { + size_t nstates = alignment->num_states; + double *all_ptn_state_freq = new double[nptn*nstates]; + tree->computePatternStateFreq(all_ptn_state_freq); + for (size_t ptn = 0; ptn < nptn; ptn++) { + double *state_freqs = new double[nstates]; + memcpy(state_freqs, all_ptn_state_freq + ptn*nstates, sizeof(double)*nstates); + alignment->ptn_state_freq.push_back(state_freqs); + } + ASSERT(alignment->ptn_state_freq.size() == nptn); + delete [] all_ptn_state_freq; + params.site_state_freq_type = WSF_NONE; + } else { + DoubleVector ptn_rate; + tree->computePatternRate(ptn_rate); + for (size_t ptn = 0; ptn < nptn; ptn++) { + alignment->ptn_rate_scaler.push_back(ptn_rate[ptn]); + } + ASSERT(alignment->ptn_rate_scaler.size() == nptn); + params.site_rate_type = WSR_NONE; + } + delete tree; + // print the computed site-specific params into a file + string out_suffix = (param_type == "freq") ? ".sitefreq" : ".siterate"; + printSiteParam(((string)params.out_prefix + out_suffix).c_str(), alignment, param_type); + // continue analysis using the alignment with the computed site-specific params + cout << endl << "===> CONTINUE ANALYSIS USING THE INFERRED SITE " << msg << " MODEL" << endl; } @@ -4344,34 +4336,45 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali alignment = new SuperAlignment(params); } else { alignment = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name); - if (params.freq_const_patterns) { int orig_nsite = alignment->getNSite(); alignment->addConstPatterns(params.freq_const_patterns); cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl; } - // Initialize site-frequency model if (params.tree_freq_file) { if (checkpoint->getBool("finishedSiteFreqFile")) { - alignment->readSiteStateFreq(((string)params.out_prefix + ".sitefreq").c_str()); - params.print_site_state_freq = WSF_NONE; + alignment->readSiteParamFile(((string)params.out_prefix + ".sitefreq").c_str(), "freq"); + params.site_state_freq_type = WSF_NONE; cout << "CHECKPOINT: Site frequency model restored" << endl; } else { - computeSiteFrequencyModel(params, alignment); + computeSiteSpecificModel(params, alignment, "freq"); checkpoint->putBool("finishedSiteFreqFile", true); checkpoint->dump(); } } if (params.site_freq_file) { - alignment->readSiteStateFreq(params.site_freq_file); + alignment->readSiteParamFile(params.site_freq_file, "freq"); + } + // Initialize site-rate model + if (params.tree_rate_file) { + if (checkpoint->getBool("finishedSiteRateFile")) { + alignment->readSiteParamFile(((string)params.out_prefix + ".siterate").c_str(), "rate"); + params.site_rate_type = WSR_NONE; + cout << "CHECKPOINT: Site rate model restored" << endl; + } else { + computeSiteSpecificModel(params, alignment, "rate"); + checkpoint->putBool("finishedSiteRateFile", true); + checkpoint->dump(); + } + } + if (params.site_rate_file) { + alignment->readSiteParamFile(params.site_rate_file, "rate"); } } - if (params.symtest) { doSymTest(alignment, params); } - if (params.print_aln_info) { string site_info_file = string(params.out_prefix) + ".alninfo"; alignment->printSiteInfo(site_info_file.c_str()); @@ -4380,20 +4383,18 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali /*************** initialize tree ********************/ bool isTreeMix = isTreeMixture(params); - - if (params.optimize_params_use_hmm && !isTreeMix) { - outError("option '-hmmster' is only available for tree mixture model"); - } - + if (isTreeMix) { + cout << endl; if (params.optimize_params_use_hmm) cout << "HMMSTER "; // tree-mixture model cout << "Tree-mixture model" << endl; - if (params.compute_ml_tree_only) { + if (params.compute_ml_tree_only) outError("option compute_ml_tree_only cannot be set for tree-mixture model"); - } + if (params.user_file == NULL) + outError("To use tree-mixture model, use an option: -te "); // the minimum gamma shape should be greater than MIN_GAMMA_SHAPE_TREEMIX for tree mixture model if (params.min_gamma_shape < MIN_GAMMA_SHAPE_TREEMIX) { @@ -4402,10 +4403,6 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali params.min_gamma_shape = MIN_GAMMA_SHAPE_TREEMIX; } - if (params.user_file == NULL) { - outError("To use tree-mixture model, use an option: -te "); - } - // get the number after "+T" for tree-mixture model int treeNum = getTreeMixNum(params); if (treeNum == 0) { @@ -4415,6 +4412,9 @@ void runPhyloAnalysis(Params ¶ms, Checkpoint *checkpoint, IQTree *&tree, Ali } } else { + if (params.optimize_params_use_hmm) + outError("option '-hmmster' is only available for tree mixture model"); + tree = newIQTree(params, alignment); } diff --git a/main/phylotesting.cpp b/main/phylotesting.cpp index 154ce378c..a354e9533 100644 --- a/main/phylotesting.cpp +++ b/main/phylotesting.cpp @@ -781,6 +781,7 @@ void transferModelFinderParameters(IQTree *iqtree, Checkpoint *target) { void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, string &best_subst_name, string &best_rate_name) { +/* if (params.model_name.find("+T") != string::npos) { // tree mixture return; @@ -808,6 +809,39 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, // Model already specifed, nothing to do here if (!empty_model_found && params.model_name.substr(0, 4) != "TEST" && params.model_name.substr(0, 2) != "MF") return; +*/ + if (params.model_name == "MIX+MFP" || params.model_name == "MIX+MF") return; + bool empty_model_found = params.model_name.empty() && !iqtree.isSuperTree(); + if (params.model_name.empty() && iqtree.isSuperTree()) { + // check whether any partition has empty model_name + PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree; + for (auto it = stree->begin(); it != stree->end(); it++) { + if ((*it)->aln->model_name.empty()) { + empty_model_found = true; + break; + } + } + } + if (params.model_joint) { + empty_model_found = false; + } + bool test = empty_model_found || + (params.model_name == "TEST") || (params.model_name == "TESTONLY") || + (params.model_name == "MFP") || (params.model_name == "MF"); + bool testmerge = (params.model_name == "TESTMERGE") || (params.model_name == "TESTMERGEONLY") || + (params.model_name == "MFP+MERGE") || (params.model_name == "MF+MERGE"); + if (testmerge && !iqtree.isSuperTree()) + outError("No partition file specified along with the option -m " + params.model_name); + bool test_only = (params.model_name == "TESTONLY") || (params.model_name == "MF") || + (params.model_name == "TESTMERGEONLY") || (params.model_name == "MF+MERGE"); + cout << endl; + if (!test && !testmerge) { + cout << "Skip ModelFinder, model specified: " << params.model_name << endl; + return; + } + cout << "Running ModelFinder:" << endl; + if (params.site_freq_file || params.site_rate_file) + outError("ModelFinder does not work with site-specific models"); if (MPIHelper::getInstance().getNumProcesses() > 1) outError("Please use only 1 MPI process! We are currently working on the MPI parallelization of model selection."); // TODO: check if necessary @@ -823,7 +857,7 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, ok_model_file = model_info.load(); } - cout << endl; + //cout << endl; ok_model_file &= model_info.size() > 0; if (ok_model_file) @@ -925,14 +959,14 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, } delete models_block; - + // force to dump all checkpointing information model_info.dump(true); - + // transfer models parameters transferModelFinderParameters(&iqtree, orig_checkpoint); iqtree.setCheckpoint(orig_checkpoint); - + params.startCPUTime = cpu_time; params.start_real_time = real_time; cpu_time = getCPUTime() - cpu_time; @@ -941,7 +975,9 @@ void runModelFinder(Params ¶ms, IQTree &iqtree, ModelCheckpoint &model_info, cout << "All model information printed to " << model_info.getFileName() << endl; cout << "CPU time for ModelFinder: " << cpu_time << " seconds (" << convert_time(cpu_time) << ")" << endl; cout << "Wall-clock time for ModelFinder: " << real_time << " seconds (" << convert_time(real_time) << ")" << endl; - + + cout << endl; + cout << "ModelFinder finished, model selected: " << iqtree.aln->model_name << endl; // alignment = iqtree.aln; if (test_only) { params.min_iterations = 0; @@ -3699,28 +3735,27 @@ void optimiseQMixModel_method_update(Params ¶ms, IQTree* &iqtree, ModelCheck // Optimisation of Q-Mixture model, including estimation of best number of classes in the mixture void optimiseQMixModel(Params ¶ms, IQTree* &iqtree, ModelCheckpoint &model_info) { - IQTree* new_iqtree; - string model_str; - - if (params.model_name.substr(0,6) != "MIX+MF") - return; - + bool test = (params.model_name == "MIX+MFP") || (params.model_name == "MIX+MF"); bool test_only = (params.model_name == "MIX+MF"); - params.model_name = ""; - + if (!test) { + return; + } if (MPIHelper::getInstance().getNumProcesses() > 1) outError("Error! The option -m '" + params.model_name + "' does not support MPI parallelization"); - if (iqtree->isSuperTree()) outError("Error! The option -m '" + params.model_name + "' cannot work on data set with partitions"); - if (iqtree->aln->seq_type != SEQ_DNA) outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set"); + cout << endl; cout << "--------------------------------------------------------------------" << endl; cout << "| Optimizing Q-mixture model |" << endl; cout << "--------------------------------------------------------------------" << endl; + IQTree* new_iqtree; + string model_str; + + params.model_name = ""; // disable the bootstrapping int orig_gbo_replicates = params.gbo_replicates; ConsensusType orig_consensus_type = params.consensus_type; diff --git a/main/treetesting.cpp b/main/treetesting.cpp index 5158d9cb4..7fba40a6c 100644 --- a/main/treetesting.cpp +++ b/main/treetesting.cpp @@ -405,69 +405,111 @@ void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType ws } +void printSiteStateFreq(const char *filename, PhyloTree *tree) { + if (!tree->getModel()->isMixture()) + outError("No frequency mixture model was specified!"); + size_t nsites = tree->getAlnNSite(); + size_t nstates = tree->aln->num_states; + double *all_ptn_state_freq; + all_ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates]; + tree->computePatternStateFreq(all_ptn_state_freq); + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + IntVector site_pattern; + tree->aln->getSitePatternIndex(site_pattern); + for (size_t site = 0; site < nsites; ++site) { + out.width(6); + out << left << site + 1 << " "; + double *state_freqs = &all_ptn_state_freq[site_pattern[site]*nstates]; + for (size_t x = 0; x < nstates; ++x) { + out.width(15); + out << state_freqs[x] << " "; + } + out << endl; + } + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + delete [] all_ptn_state_freq; + cout << "Site state frequencies printed to " << filename << endl; +} -void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs) { - size_t nsites = tree->getAlnNSite(); - size_t nstates = tree->aln->num_states; - double *ptn_state_freq; - if (state_freqs) { - ptn_state_freq = state_freqs; - } else { - ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates]; - tree->computePatternStateFreq(ptn_state_freq); - } - - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(filename); - IntVector pattern_index; - tree->aln->getSitePatternIndex(pattern_index); - for (size_t i = 0; i < nsites; ++i) { - out.width(6); - out << left << i+1 << " "; - double *state_freq = &ptn_state_freq[pattern_index[i]*nstates]; - for (size_t j = 0; j < nstates; ++j) { - out.width(15); - out << state_freq[j] << " "; - } - out << endl; - } - out.close(); - cout << "Site state frequency vectors printed to " << filename << endl; - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, filename); - } - if (!state_freqs) - delete [] ptn_state_freq; +void printSiteRate(const char *filename, PhyloTree *tree, bool bayes) { + if (tree->getRate()->getNRate() == 1 && !tree->isSuperTree()) + outError("No rate mixture model or partition model was specified!"); + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + // write the comment header + string msg = (bayes) ? "empirical Bayesian method" : "maximum likelihood"; + out << "# Site-specific subtitution rates determined by " << msg << endl; + out << "# This file can be read in MS Excel or in R with command:" << endl + << "# tab=read.table('" << filename << "',header=TRUE)" << endl + << "# Columns are tab-separated with following meaning:" << endl; + if (tree->isSuperTree()) + out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)tree)->front()->aln->name << ", etc)" << endl + << "# Site: Site ID within partition (starting from 1 for each partition)" << endl; + else + out << "# Site: Alignment site ID" << endl; + if (bayes) + out << "# Rate: Posterior mean site rate weighted by posterior probability" << endl + << "# Cat: Category with highest posterior (0=invariable, 1=slow, etc)" << endl + << "# C_Rate: Corresponding rate of highest category" << endl; + else + out << "# Rate: Site rate estimated by maximum likelihood" << endl; + // write the main header + msg = ""; + if (tree->isSuperTree()) msg += "Part\t"; + msg += "Site\tRate"; + if (bayes) msg += "\tCat\tC_Rate"; + out << msg << endl; + // infer and write the site rates + tree->writeSiteRates(out, bayes); + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + cout << "Site rates printed to " << filename << endl; } -void printSiteStateFreq(const char* filename, Alignment *aln) { - if (aln->site_state_freq.empty()) - return; - size_t nsites = aln->getNSite(); - int nstates = aln->num_states; - try { - ofstream out; - out.exceptions(ios::failbit | ios::badbit); - out.open(filename); - IntVector pattern_index; - aln->getSitePatternIndex(pattern_index); - for (size_t i = 0; i < nsites; ++i) { - out.width(6); - out << left << i+1 << " "; - double *state_freq = aln->site_state_freq[pattern_index[i]]; - for (size_t j = 0; j < nstates; ++j) { - out.width(15); - out << state_freq[j] << " "; - } - out << endl; - } - out.close(); - cout << "Site state frequency vectors printed to " << filename << endl; - } catch (ios::failure) { - outError(ERR_WRITE_OUTPUT, filename); - } +void printSiteParam(const char* filename, Alignment *aln, const string ¶m_type) { + ASSERT(param_type == "freq" || param_type == "rate"); + if (param_type == "freq" && !aln->isSSF()) return; + if (param_type == "rate" && !aln->isSSR()) return; + size_t nsites = aln->getNSite(); + int nstates = aln->num_states; + try { + ofstream out; + out.exceptions(ios::failbit | ios::badbit); + out.open(filename); + IntVector site_pattern; + aln->getSitePatternIndex(site_pattern); + for (size_t site = 0; site < nsites; ++site) { + out.width(6); + out << left << site + 1 << " "; + if (param_type == "freq") { + double *state_freqs = aln->ptn_state_freq[site_pattern[site]]; + for (size_t x = 0; x < nstates; ++x) { + out.width(15); + out << state_freqs[x] << " "; + } + } else { + double rate = aln->ptn_rate_scaler[site_pattern[site]]; + out.width(15); + out << rate << " "; + } + out << endl; + } + out.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, filename); + } + string msg = (param_type == "freq") ? "state frequency vectors" : "rate scalers"; + cout << "Site " << msg << " printed to " << filename << endl; } int countDistinctTrees(istream &in, bool rooted, IQTree *tree, IntVector &distinct_ids, bool exclude_duplicate) { diff --git a/main/treetesting.h b/main/treetesting.h index 2d1ae6851..9db338006 100644 --- a/main/treetesting.h +++ b/main/treetesting.h @@ -25,12 +25,12 @@ struct TreeInfo { double wkh_pvalue; // p-value by weighted Kishino-Hasegawa test double elw_value; // ELW - expected likelihood weights test bool elw_confident; // to represent confidence set of ELW test - double au_pvalue; // p-value by approximately unbiased (AU) test + double au_pvalue; // p-value by approximately unbiased (AU) test }; /** - * print site log likelihoods to a fileExists + * print site log likelihoods to a file * @param filename output file name * @param tree phylogenetic tree * @param ptn_lh pattern log-likelihoods, will be computed if NULL @@ -41,14 +41,14 @@ void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL, bool append = false, const char *linename = NULL); /** - * print HMM results to a fileExists + * print HMM results to a file * @param filename output file name * @param tree IQTreeMixHmm */ void printHMMResult(const char*filename, PhyloTree *tree, int cat_assign_method = 0); /** - * print marginal probabilities to a fileExists + * print marginal probabilities to a file * @param filename output file name * @param tree IQTreeMixHmm */ @@ -80,18 +80,27 @@ void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl); /** - * print site state frequency vectors (for Huaichun) + * print site state frequency vectors (for -wsf option) * @param filename output file name * @param tree phylogenetic tree -*/ -void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs = NULL); + */ +void printSiteStateFreq(const char*filename, PhyloTree *tree); /** - * print site state frequency vectors (for Huaichun) + * print site rates (for -wsr and --mlrate options) + * @param filename output file name + * @param tree phylogenetic tree + * @param bayes print mean posterior or ML rates + */ +void printSiteRate(const char *filename, PhyloTree *tree, bool bayes); + +/** + * print site state frequency vectors or rate scalers (for site-specific models) * @param filename output file name * @param aln alignment -*/ -void printSiteStateFreq(const char* filename, Alignment *aln); + * @param param_type should be set to "freq" or "rate" + */ +void printSiteParam(const char* filename, Alignment *aln, const string ¶m_type); /** print ancestral sequences diff --git a/model/modeldna.cpp b/model/modeldna.cpp index c10317c2f..398666a5d 100644 --- a/model/modeldna.cpp +++ b/model/modeldna.cpp @@ -202,8 +202,9 @@ void ModelDNA::init(const char *model_name, string model_params, StateFreqType f if (model_params != "") { readRates(model_params); } - - if (freq == FREQ_UNKNOWN || def_freq == FREQ_EQUAL) freq = def_freq; + if (def_freq == FREQ_EQUAL && phylo_tree->aln->isSSF()) + outError("DNA models with equal state frequencies are not suitable for site-specific frequencies"); + if (freq == FREQ_UNKNOWN || def_freq == FREQ_EQUAL) freq = def_freq; ModelMarkov::init(freq); // model_parameters = new double [getNDim()+1]; // see setVariables for explaination of +1 // setVariables(model_parameters); diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 5530e4ae2..9c5b75dc5 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -24,11 +24,13 @@ #include "modelmarkov.h" #include "modelliemarkov.h" #include "modeldna.h" +#include "modeldnaerror.h" #include "modelprotein.h" #include "modelbin.h" #include "modelcodon.h" #include "modelmorphology.h" #include "modelpomo.h" +#include "modelpomomixture.h" #include "modelset.h" #include "modelmixture.h" #include "ratemeyerhaeseler.h" @@ -114,19 +116,167 @@ ModelsBlock *readModelsDefinition(Params ¶ms) { } if (params.model_def_file) { - cout << "Reading model definition file " << params.model_def_file << " ... "; + cout << "Reading model definition file " << params.model_def_file << " ..." << endl; MyReader nexus(params.model_def_file); nexus.Add(models_block); MyToken token(nexus.inf); nexus.Execute(token); int num_model = 0, num_freq = 0; - for (ModelsBlock::iterator it = models_block->begin(); it != models_block->end(); it++) + for (ModelsBlock::iterator it = models_block->begin(); it != models_block->end(); it++) { if (it->second.flag & NM_FREQ) num_freq++; else num_model++; + } cout << num_model << " models and " << num_freq << " frequency vectors loaded" << endl; } return models_block; } +ModelSubst* createModel(string model_str, ModelsBlock *models_block, + StateFreqType freq_type, string freq_params, + PhyloTree* tree) +{ + ModelSubst *model = NULL; + //cout << "Numstates: " << tree->aln->num_states << endl; + string model_params; + NxsModel *nxsmodel = models_block->findModel(model_str); + if (nxsmodel) model_params = nxsmodel->description; + + // PoMo. + bool pomo = false; + string pomo_rate_str = ""; + string pomo_heterozygosity = ""; + string::size_type p_pos = posPOMO(model_str); + pomo = (p_pos != string::npos); + + if (pomo) { + if (model_str[p_pos+2] == '{') { + string::size_type close_bracket = model_str.find("}", p_pos); + if (close_bracket == string::npos) { + cout << "Model string: " << model_str << endl; + outError("No closing bracket in PoMo parameters."); + } + else { + pomo_heterozygosity = model_str.substr(p_pos+3,close_bracket-p_pos-3); + model_str = model_str.substr(0, p_pos) + model_str.substr(close_bracket+1); + } + } + else { + model_str = model_str.substr(0, p_pos) + model_str.substr(p_pos + 2); + } + + size_t pomo_rate_start_pos; + if ((pomo_rate_start_pos = model_str.find("+G")) != string::npos) { + size_t pomo_rate_end_pos = model_str.find_first_of("+*", pomo_rate_start_pos+1); + if (pomo_rate_end_pos == string::npos) { + pomo_rate_str = model_str.substr(pomo_rate_start_pos, model_str.length() - pomo_rate_start_pos); + model_str = model_str.substr(0, pomo_rate_start_pos); + } else { + pomo_rate_str = model_str.substr(pomo_rate_start_pos, pomo_rate_end_pos - pomo_rate_start_pos); + model_str = model_str.substr(0, pomo_rate_start_pos) + model_str.substr(pomo_rate_end_pos); + } + } + } + + // sequencing error model + string seqerr = ""; + string::size_type spec_pos; + while ((spec_pos = model_str.find("+E")) != string::npos) { + string::size_type end_pos = model_str.find_first_of("+*", spec_pos+1); + if (end_pos == string::npos) { + seqerr = model_str.substr(spec_pos); + model_str = model_str.substr(0, spec_pos); + } else { + seqerr = model_str.substr(spec_pos, end_pos - spec_pos); + model_str = model_str.substr(0, spec_pos) + model_str.substr(end_pos); + } + } + + if (!seqerr.empty() && tree->aln->seq_type != SEQ_DNA) { + outError("Sequencing error model " + seqerr + " is only supported for DNA"); + } + // Now that PoMo stuff has been removed, check for model parameters. + size_t pos = model_str.find(OPEN_BRACKET); + if (pos != string::npos) { + if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) + outError("Close bracket not found at the end of ", model_str); + string tmp_str = model_str; + // extract model_name + model_str = model_str.substr(0, pos); + + // extract model params + size_t end_pos = tmp_str.find(CLOSE_BRACKET); + + // handle cases that user doesn't specify model parameters but supply state frequencies + size_t pos_plus = model_str.find('+'); + if (pos_plus != string::npos) + { + model_params = ""; + model_str = model_str.substr(0, pos_plus); + } + else + model_params = tmp_str.substr(pos+1, end_pos-pos-1); + + // extract freqs (if specified) + pos = tmp_str.find("+FQ"); + if (pos != string::npos) + freq_type = FREQ_EQUAL; + pos = tmp_str.find("+F{"); + if (pos != string::npos) + { + freq_type = FREQ_USER_DEFINED; + tmp_str = tmp_str.substr(pos+3, tmp_str.length()-pos-3); + end_pos = tmp_str.find(CLOSE_BRACKET); + freq_params = tmp_str.substr(0, end_pos); + } + } + + /* + if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || + (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || + (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || + (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || + (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) + { + model = new ModelSubst(tree->aln->num_states); + } else */ + + if ((pomo) || (tree->aln->seq_type == SEQ_POMO)) { + if (pomo_rate_str == "") + model = new ModelPoMo(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity); + else + model = new ModelPoMoMixture(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity, pomo_rate_str); + if (model->isMixture() && verbose_mode >= VB_MED) + cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; +// else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || +// (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || +// (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { +// model = new ModelGTR(tree, count_rates); +// if (freq_params != "") +// ((ModelGTR*)model)->readStateFreq(freq_params); +// if (model_params != "") +// ((ModelGTR*)model)->readRates(model_params); +// ((ModelGTR*)model)->init(freq_type); +// } else + } else if (ModelMarkov::validModelName(model_str)) { + model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); + } else if (tree->aln->seq_type == SEQ_BINARY) { + model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_DNA) { + if (seqerr.empty()) + model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); + else + model = new ModelDNAError(model_str.c_str(), model_params, freq_type, freq_params, seqerr, tree); + } else if (tree->aln->seq_type == SEQ_PROTEIN) { + model = new ModelProtein(model_str.c_str(), model_params, freq_type, freq_params, tree, models_block); + } else if (tree->aln->seq_type == SEQ_CODON) { + model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_MORPH) { + model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else { + outError("Unsupported model type"); + } + return model; +} + ModelFactory::ModelFactory() : CheckpointFactory() { model = NULL; site_rate = NULL; @@ -198,7 +348,8 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, mix_pos = next_mix_pos; } if (new_model_str != model_str) - cout << "Model " << model_str << " is alias for " << new_model_str << endl; + if (verbose_mode >= VB_MIN) + cout << "Model " << model_str << " is alias for " << new_model_str << endl; model_str = new_model_str; // nxsmodel = models_block->findModel(model_str); @@ -397,7 +548,6 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, case SEQ_PROTEIN: break; // let ModelProtein decide by itself case SEQ_MORPH: freq_type = FREQ_EQUAL; break; case SEQ_CODON: freq_type = FREQ_UNKNOWN; break; - break; default: freq_type = FREQ_EMPIRICAL; break; // default for DNA, PoMo and others: counted frequencies from alignment } } @@ -567,12 +717,12 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, /******************** initialize model ****************************/ - if (tree->aln->site_state_freq.empty()) { - if (model_str.length() >= 4 && model_str.substr(0, 4) == "MIX{") { - string model_list; + if (!tree->aln->isSSF() && !tree->aln->isSSR()) { + // simple or mixture model + if (model_str.substr(0, 4) == "MIX{") { if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) outError("Close bracket not found at the end of ", model_str); - model_list = model_str.substr(4, model_str.length()-5); + string model_list = model_str.substr(4, model_str.length()-5); model_str = model_str.substr(0, 3); model = new ModelMixture(model_name, model_str, model_list, models_block, freq_type, freq_params, tree, optimize_mixmodel_weight); } else if (freq_type == FREQ_MIXTURE) { @@ -587,40 +737,20 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // fused_mix_rate &= model->isMixture() && site_rate->getNRate() > 1; } else { // site-specific model - if (model_str == "JC" || model_str == "POISSON") - outError("JC is not suitable for site-specific model"); - model = new ModelSet(model_str.c_str(), tree); - ModelSet *models = (ModelSet*)model; // assign pointer for convenience - models->init((params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL); - models->pattern_model_map.resize(tree->aln->getNPattern(), -1); - for (size_t i = 0; i < tree->aln->getNSite(); ++i) { - models->pattern_model_map[tree->aln->getPatternID(i)] = tree->aln->site_model[i]; - //cout << "site " << i << " ptn " << tree->aln->getPatternID(i) << " -> model " << site_model[i] << endl; - } - double *state_freq = new double[model->num_states]; - double *rates = new double[model->getNumRateEntries()]; - for (size_t i = 0; i < tree->aln->site_state_freq.size(); ++i) { - ModelMarkov *modeli; - if (i == 0) { - modeli = (ModelMarkov*)createModel(model_str, models_block, (params.freq_type != FREQ_UNKNOWN) ? params.freq_type : FREQ_EMPIRICAL, "", tree); - modeli->getStateFrequency(state_freq); - modeli->getRateMatrix(rates); - } else { - modeli = (ModelMarkov*)createModel(model_str, models_block, FREQ_EQUAL, "", tree); - modeli->setStateFrequency(state_freq); - modeli->setRateMatrix(rates); - } - if (tree->aln->site_state_freq[i]) - modeli->setStateFrequency (tree->aln->site_state_freq[i]); - - modeli->init(FREQ_USER_DEFINED); - models->push_back(modeli); + if (model_str.substr(0, 4) == "MIX{") + outError("Matrix mixture models are incompatible with site-specific models"); + if (freq_type == FREQ_MIXTURE && !params.tree_freq_file) + outError("State frequency mixture models are incompatible with site-specific models"); + if (params.tree_freq_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Switching frequency mixture model to SSF model" << endl; + freq_type = FREQ_EQUAL; + freq_params = ""; + } else if (params.site_freq_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Using SSF model" << endl; } - delete [] rates; - delete [] state_freq; - - models->joinEigenMemory(); - models->decomposeRateMatrix(); + model = new ModelSet(model_str, models_block, freq_type, freq_params, tree); // delete information of the old alignment // tree->aln->ordered_pattern.clear(); @@ -721,14 +851,16 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // choose discrete/continuous gamma if (posG != string::npos && posGC != string::npos) { - cout << "NOTE: both +G and +GC were specified, continue with " - << ((posG < posGC)? rate_str.substr(posG,2) : rate_str.substr(posGC,3)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +G and +GC were specified, continue with " + << ((posG < posGC)? rate_str.substr(posG,2) : rate_str.substr(posGC,3)) << endl; is_continuous_gamma = posGC < posG; } if (posG2 != string::npos && posGC != string::npos) { - cout << "NOTE: both *G and +GC were specified, continue with " - << ((posG2 < posGC)? rate_str.substr(posG2,2) : rate_str.substr(posGC,3)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both *G and +GC were specified, continue with " + << ((posG2 < posGC)? rate_str.substr(posG2,2) : rate_str.substr(posGC,3)) << endl; is_continuous_gamma = posGC < posG2; } @@ -746,8 +878,9 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, else { if (posG != string::npos && posG2 != string::npos) { - cout << "NOTE: both +G and *G were specified, continue with " - << ((posG < posG2)? rate_str.substr(posG,2) : rate_str.substr(posG2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +G and *G were specified, continue with " + << ((posG < posG2)? rate_str.substr(posG,2) : rate_str.substr(posG2,2)) << endl; } if (posG2 != string::npos && posG2 < posG) { posG = posG2; @@ -759,14 +892,16 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, string::size_type posR2 = rate_str.find("*R"); // FreeRate model if (posG != string::npos && (posR != string::npos || posR2 != string::npos)) { - outWarning("Both Gamma and FreeRate models were specified, continue with FreeRate model"); + if (verbose_mode >= VB_MIN) + outWarning("Both Gamma and FreeRate models were specified, continue with FreeRate model"); posG = string::npos; fused_mix_rate = false; } if (posR != string::npos && posR2 != string::npos) { - cout << "NOTE: both +R and *R were specified, continue with " - << ((posR < posR2)? rate_str.substr(posR,2) : rate_str.substr(posR2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +R and *R were specified, continue with " + << ((posR < posR2)? rate_str.substr(posR,2) : rate_str.substr(posR2,2)) << endl; } if (posR2 != string::npos && posR2 < posR) { @@ -778,20 +913,23 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, string::size_type posH2 = rate_str.find("*H"); // heterotachy model if (posG != string::npos && (posH != string::npos || posH2 != string::npos)) { - outWarning("Both Gamma and heterotachy models were specified, continue with heterotachy model"); + if (verbose_mode >= VB_MIN) + outWarning("Both Gamma and heterotachy models were specified, continue with heterotachy model"); posG = string::npos; fused_mix_rate = false; } if (posR != string::npos && (posH != string::npos || posH2 != string::npos)) { - outWarning("Both FreeRate and heterotachy models were specified, continue with heterotachy model"); + if (verbose_mode >= VB_MIN) + outWarning("Both FreeRate and heterotachy models were specified, continue with heterotachy model"); posR = string::npos; fused_mix_rate = false; } if (posH != string::npos && posH2 != string::npos) { - cout << "NOTE: both +H and *H were specified, continue with " - << ((posH < posH2)? rate_str.substr(posH,2) : rate_str.substr(posH2,2)) << endl; + if (verbose_mode >= VB_MIN) + cout << "NOTE: both +H and *H were specified, continue with " + << ((posH < posH2)? rate_str.substr(posH,2) : rate_str.substr(posH2,2)) << endl; } if (posH2 != string::npos && posH2 < posH) { posH = posH2; @@ -882,8 +1020,11 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, outError("Wrong model name ", rate_str); } - + bool rate_mixture = false; if (rate_str.find('+') != string::npos || rate_str.find('*') != string::npos) { + rate_mixture = true; + } + if (!tree->aln->isSSR() && rate_mixture) { //string rate_str = model_str.substr(pos); if (posI != string::npos && posH != string::npos) { site_rate = new RateHeterotachyInvar(num_rate_cats, heterotachy_params, p_invar_sites, tree); @@ -955,11 +1096,22 @@ ModelFactory::ModelFactory(Params ¶ms, string &model_name, PhyloTree *tree, // else // model_str = model_str.substr(0, model_str.find('*')); } else { + if (rate_mixture && !params.tree_rate_file) + outError("Rate mixture models are incompatible with site-specific rates"); + if (params.tree_rate_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Switching rate mixture model to SSR model" << endl; + } else if (params.site_rate_file) { + if (verbose_mode >= VB_MIN) + cout << "NOTE: Using SSR model" << endl; + } site_rate = new RateHeterogeneity(); site_rate->setTree(tree); } if (fused_mix_rate) { + if (tree->aln->isSSF()) + outError("Unlinked rate mixture models are incompatible with site-specific frequencies"); if (!model->isMixture()) { if (verbose_mode >= VB_MED) cout << endl << "NOTE: Using mixture model with unlinked " << model_str << " parameters" << endl; diff --git a/model/modelfactory.h b/model/modelfactory.h index 1bb598b0c..a77c85281 100644 --- a/model/modelfactory.h +++ b/model/modelfactory.h @@ -54,7 +54,21 @@ string::size_type posRateFree(string &model_name); string::size_type posPOMO(string &model_name); /** -Store the transition matrix corresponding to evolutionary time so that one must not compute again. + * create a substitution model + * @param model_str model name + * @param models_block + * @param freq_type state frequency type + * @param freq_params frequency parameters + * @param tree associated phylo tree + * @return substitution model created + */ +ModelSubst *createModel(string model_str, ModelsBlock *models_block, + StateFreqType freq_type, string freq_params, + PhyloTree *tree); + + +/** +Store the transition matrix corresponding to evolutionary time so that one must not compute again. For efficiency purpose esp. for protein (20x20) or codon (61x61). The values of the map contain 3 matricies consecutively: transition matrix, 1st, and 2nd derivative @@ -210,7 +224,6 @@ class ModelFactory : public unordered_map, public Optimization, pu */ ModelSubst *model; - /** pointer to the site-rate heterogeneity, will not be deleted when deleting ModelFactory object */ @@ -279,7 +292,6 @@ class ModelFactory : public unordered_map, public Optimization, pu protected: - /** this function is served for the multi-dimension optimization. It should pack the model parameters into a vector that is index from 1 (NOTE: not from 0) @@ -295,8 +307,9 @@ class ModelFactory : public unordered_map, public Optimization, pu */ virtual bool getVariables(double *variables); - vector optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon, - double initPInv, double initAlpha, DoubleVector &lenvec, Checkpoint *model_ckp); + vector optimizeGammaInvWithInitValue(int fixed_len, double logl_epsilon, double gradient_epsilon, + double initPInv, double initAlpha, DoubleVector &lenvec, Checkpoint *model_ckp); + }; #endif diff --git a/model/modelmarkov.cpp b/model/modelmarkov.cpp index 067c19922..7605cb60d 100644 --- a/model/modelmarkov.cpp +++ b/model/modelmarkov.cpp @@ -1138,29 +1138,32 @@ bool ModelMarkov::isUnstableParameters() { void ModelMarkov::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) { // ASSERT(is_reversible && "setBounds should only be called on subclass of ModelMarkov"); - - int i, ndim = getNDim(); - - for (i = 1; i <= ndim; i++) { - //cout << variables[i] << endl; - lower_bound[i] = MIN_RATE; - upper_bound[i] = MAX_RATE; - bound_check[i] = false; - } - + int ndim = getNDim(); + for (int i = 1; i <= ndim; i++) { + //cout << variables[i] << endl; + lower_bound[i] = MIN_RATE; + upper_bound[i] = MAX_RATE; + bound_check[i] = false; + } if (is_reversible && freq_type == FREQ_ESTIMATE) { - for (i = num_params+1; i <= num_params+num_states-1; i++) { -// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state]; + // find the highest state freq + for (int i = 0; i < num_states; i++) { + if (state_freq[i] > state_freq[highest_freq_state]) { + highest_freq_state = i; + } + } + for (int i = num_params + 1; i <= num_params + num_states - 1; i++) { +// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state]; // upper_bound[i] = state_freq[highest_freq_state]/MIN_FREQUENCY; - lower_bound[i] = Params::getInstance().min_state_freq; -// upper_bound[i] = 100.0; - upper_bound[i] = 1.0; - bound_check[i] = false; - } + lower_bound[i] = Params::getInstance().min_state_freq; +// upper_bound[i] = 100.0; + upper_bound[i] = 1.0; + bound_check[i] = false; + } } else if (phylo_tree->aln->seq_type == SEQ_DNA) { - setBoundsForFreqType(&lower_bound[num_params+1], &upper_bound[num_params+1], - &bound_check[num_params+1], Params::getInstance().min_state_freq, freq_type); - } + setBoundsForFreqType(&lower_bound[num_params+1], &upper_bound[num_params+1], + &bound_check[num_params+1], Params::getInstance().min_state_freq, freq_type); + } } double ModelMarkov::optimizeParameters(double gradient_epsilon) { @@ -1185,11 +1188,12 @@ double ModelMarkov::optimizeParameters(double gradient_epsilon) { bool *bound_check = new bool[ndim+1]; double score; - for (int i = 0; i < num_states; i++) { - if (state_freq[i] > state_freq[highest_freq_state]) { - highest_freq_state = i; - } - } +// Transferred to ModelMarkov::setBounds for ModelSet compatibility +// for (int i = 0; i < num_states; i++) { +// if (state_freq[i] > state_freq[highest_freq_state]) { +// highest_freq_state = i; +// } +// } // by BFGS algorithm setVariables(variables); @@ -1737,37 +1741,33 @@ void ModelMarkov::readRates(string str) noexcept(false) { } void ModelMarkov::readStateFreq(istream &in) noexcept(false) { - int i; - for (i = 0; i < num_states; i++) { - string tmp_value; - in >> tmp_value; - if (tmp_value.length() == 0) - throw "State frequencies could not be read"; - state_freq[i] = convert_double_with_distribution(tmp_value.c_str(), true); + for (int i = 0; i < num_states; i++) { + string tmp_value; + in >> tmp_value; + if (tmp_value.length() == 0) + throw "State frequencies could not be read"; + state_freq[i] = convert_double_with_distribution(tmp_value.c_str(), true); if (state_freq[i] < 0.0) throw "Negative state frequencies found"; } double sum = 0.0; - for (i = 0; i < num_states; i++) sum += state_freq[i]; - if (fabs(sum-1.0) >= 1e-7) - { - outWarning("Normalizing state frequencies so that sum of them equals to 1"); - sum = 1.0/sum; - for (i = 0; i < num_states; i++) - state_freq[i] *= sum; - } + for (int i = 0; i < num_states; i++) sum += state_freq[i]; + if (fabs(sum) <= 1e-5) + outError("Sum of all state frequencies must be greater than zero!"); + if (fabs(sum-1.0) >= 1e-7) { + if (verbose_mode >= VB_MIN) + outWarning("Normalizing state frequencies so that sum of them equals to 1"); + for (int i = 0; i < num_states; i++) state_freq[i] /= sum; + } } void ModelMarkov::readStateFreq(string str) noexcept(false) { int i; int end_pos = 0; - - // detect the seperator - char separator = ','; - if (str.find('/') != std::string::npos) - separator = '/'; - - for (i = 0; i < num_states; i++) { + // detect the seperator + char separator = ','; + if (str.find('/') != std::string::npos) separator = '/'; + for (int i = 0; i < num_states; i++) { int new_end_pos; state_freq[i] = convert_double_with_distribution(str.substr(end_pos).c_str(), new_end_pos, true, separator); end_pos += new_end_pos; @@ -1779,27 +1779,26 @@ void ModelMarkov::readStateFreq(string str) noexcept(false) { if (end_pos < str.length() && str[end_pos] != ',' && str[end_pos] != ' ' && str[end_pos] != '/') outError("Comma/Space/Forward slash to separate state frequencies not found in ", str); end_pos++; - if (i < num_states - 1 && end_pos >= str.length()) - outError("The number of frequencies ("+convertIntToString(i+1)+") is less than the number of states ("+convertIntToString(num_states)+"). Please check and try again."); + if (i < num_states - 1 && end_pos >= str.length()) + outError("The number of frequencies ("+convertIntToString(i+1)+") is less than the number of states ("+convertIntToString(num_states)+"). Please check and try again."); } double sum = 0.0; - for (i = 0; i < num_states; i++) sum += state_freq[i]; - if (fabs(sum) <= 1e-5) - outError("Sum of all state frequencies must be greater than zero!"); - if (fabs(sum-1.0) >= 1e-7) - { - outWarning("Normalizing State frequencies so that sum of them equals to 1"); - sum = 1.0/sum; - for (i = 0; i < num_states; i++) - state_freq[i] *= sum; - } + for (int i = 0; i < num_states; i++) sum += state_freq[i]; + if (fabs(sum) <= 1e-5) + outError("Sum of all state frequencies must be greater than zero!"); + if (fabs(sum-1.0) >= 1e-7) { + if (verbose_mode >= VB_MIN) + outWarning("Normalizing state frequencies so that sum of them equals to 1"); + for (int i = 0; i < num_states; i++) state_freq[i] /= sum; + } } void ModelMarkov::readParameters(const char *file_name, bool adapt_tree) { if (!fileExists(file_name)) outError("File not found ", file_name); - cout << "Reading model parameters from file " << file_name << endl; + if (verbose_mode >= VB_MIN) + cout << "Reading model parameters from file " << file_name << endl; // if detect if reading full matrix or half matrix by the first entry try { diff --git a/model/modelmarkov.h b/model/modelmarkov.h index 823ffe66b..8ca48fc05 100644 --- a/model/modelmarkov.h +++ b/model/modelmarkov.h @@ -263,7 +263,7 @@ class ModelMarkov : public ModelSubst, public EigenDecomposition rescale the state frequencies @param sum_one TRUE to make frequencies sum to 1, FALSE to make last entry equal to 1 */ - void scaleStateFreq(bool sum_one); + virtual void scaleStateFreq(bool sum_one); /** get frequency type diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp index de8772e39..796f95911 100644 --- a/model/modelmixture.cpp +++ b/model/modelmixture.cpp @@ -5,18 +5,9 @@ * Author: minh */ -#include "modelmarkov.h" -#include "modeldna.h" -#include "modeldnaerror.h" -#include "modelprotein.h" -#include "modelbin.h" -#include "modelcodon.h" -#include "modelmorphology.h" -#include "modelset.h" #include "modelmixture.h" -#include "modelpomo.h" +#include "modelfactory.h" //#include "phylokernelmixture.h" -#include "modelpomomixture.h" using namespace std; @@ -1036,154 +1027,6 @@ const double MAX_MIXTURE_PROP = 1000.0; //const double MIN_MIXTURE_RATE = 0.01; //const double MAX_MIXTURE_RATE = 100.0; -ModelSubst* createModel(string model_str, ModelsBlock *models_block, - StateFreqType freq_type, string freq_params, - PhyloTree* tree) -{ - ModelSubst *model = NULL; - //cout << "Numstates: " << tree->aln->num_states << endl; - string model_params; - NxsModel *nxsmodel = models_block->findModel(model_str); - if (nxsmodel) model_params = nxsmodel->description; - - // PoMo. - bool pomo = false; - string pomo_rate_str = ""; - string pomo_heterozygosity = ""; - string::size_type p_pos = posPOMO(model_str); - pomo = (p_pos != string::npos); - - if (pomo) { - if (model_str[p_pos+2] == '{') { - string::size_type close_bracket = model_str.find("}", p_pos); - if (close_bracket == string::npos) { - cout << "Model string: " << model_str << endl; - outError("No closing bracket in PoMo parameters."); - } - else { - pomo_heterozygosity = model_str.substr(p_pos+3,close_bracket-p_pos-3); - model_str = model_str.substr(0, p_pos) + model_str.substr(close_bracket+1); - } - } - else { - model_str = model_str.substr(0, p_pos) + model_str.substr(p_pos + 2); - } - - size_t pomo_rate_start_pos; - if ((pomo_rate_start_pos = model_str.find("+G")) != string::npos) { - size_t pomo_rate_end_pos = model_str.find_first_of("+*", pomo_rate_start_pos+1); - if (pomo_rate_end_pos == string::npos) { - pomo_rate_str = model_str.substr(pomo_rate_start_pos, model_str.length() - pomo_rate_start_pos); - model_str = model_str.substr(0, pomo_rate_start_pos); - } else { - pomo_rate_str = model_str.substr(pomo_rate_start_pos, pomo_rate_end_pos - pomo_rate_start_pos); - model_str = model_str.substr(0, pomo_rate_start_pos) + model_str.substr(pomo_rate_end_pos); - } - } - } - - // sequencing error model - string seqerr = ""; - string::size_type spec_pos; - while ((spec_pos = model_str.find("+E")) != string::npos) { - string::size_type end_pos = model_str.find_first_of("+*", spec_pos+1); - if (end_pos == string::npos) { - seqerr = model_str.substr(spec_pos); - model_str = model_str.substr(0, spec_pos); - } else { - seqerr = model_str.substr(spec_pos, end_pos - spec_pos); - model_str = model_str.substr(0, spec_pos) + model_str.substr(end_pos); - } - } - - if (!seqerr.empty() && tree->aln->seq_type != SEQ_DNA) { - outError("Sequencing error model " + seqerr + " is only supported for DNA"); - } - // Now that PoMo stuff has been removed, check for model parameters. - size_t pos = model_str.find(OPEN_BRACKET); - if (pos != string::npos) { - if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) - outError("Close bracket not found at the end of ", model_str); - string tmp_str = model_str; - // extract model_name - model_str = model_str.substr(0, pos); - - // extract model params - size_t end_pos = tmp_str.find(CLOSE_BRACKET); - - // handle cases that user doesn't specify model parameters but supply state frequencies - size_t pos_plus = model_str.find('+'); - if (pos_plus != string::npos) - { - model_params = ""; - model_str = model_str.substr(0, pos_plus); - } - else - model_params = tmp_str.substr(pos+1, end_pos-pos-1); - - // extract freqs (if specified) - pos = tmp_str.find("+FQ"); - if (pos != string::npos) - freq_type = FREQ_EQUAL; - pos = tmp_str.find("+F{"); - if (pos != string::npos) - { - freq_type = FREQ_USER_DEFINED; - tmp_str = tmp_str.substr(pos+3, tmp_str.length()-pos-3); - end_pos = tmp_str.find(CLOSE_BRACKET); - freq_params = tmp_str.substr(0, end_pos); - } - } - - /* - if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) || - (model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) || - (model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) || - (model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) || - (model_str == "MK" && tree->aln->seq_type == SEQ_MORPH)) - { - model = new ModelSubst(tree->aln->num_states); - } else */ - - if ((pomo) || (tree->aln->seq_type == SEQ_POMO)) { - if (pomo_rate_str == "") - model = new ModelPoMo(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity); - else - model = new ModelPoMoMixture(model_str.c_str(), model_params, freq_type, freq_params, tree, pomo_heterozygosity, pomo_rate_str); - if (model->isMixture() && verbose_mode >= VB_MED) - cout << "PoMo mixture model for Gamma rate heterogeneity." << endl; -// else if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || -// (model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) || -// (model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) { -// model = new ModelGTR(tree, count_rates); -// if (freq_params != "") -// ((ModelGTR*)model)->readStateFreq(freq_params); -// if (model_params != "") -// ((ModelGTR*)model)->readRates(model_params); -// ((ModelGTR*)model)->init(freq_type); -// } else - } else if (ModelMarkov::validModelName(model_str)) { - model = ModelMarkov::getModelByName(model_str, tree, model_params, freq_type, freq_params); - } else if (tree->aln->seq_type == SEQ_BINARY) { - model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_DNA) { - if (seqerr.empty()) - model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree); - else - model = new ModelDNAError(model_str.c_str(), model_params, freq_type, freq_params, seqerr, tree); - } else if (tree->aln->seq_type == SEQ_PROTEIN) { - model = new ModelProtein(model_str.c_str(), model_params, freq_type, freq_params, tree, models_block); - } else if (tree->aln->seq_type == SEQ_CODON) { - model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else if (tree->aln->seq_type == SEQ_MORPH) { - model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); - } else { - outError("Unsupported model type"); - } - - return model; -} - /** constructor @param tree associated tree for the model diff --git a/model/modelmixture.h b/model/modelmixture.h index d564d146e..a9982640d 100644 --- a/model/modelmixture.h +++ b/model/modelmixture.h @@ -8,35 +8,18 @@ #ifndef MODELMIXTURE_H_ #define MODELMIXTURE_H_ -#include "tree/phylotree.h" -#include "modelsubst.h" #include "modelmarkov.h" #include "nclextra/modelsblock.h" - +#include "tree/phylotree.h" extern const char* builtin_mixmodels_definition; -/** - * create a substitution model - * @param model_str model nme - * @param freq_type state frequency type - * @param freq_params frequency parameters - * @param seqerr sequencing error model - * @param tree associated phylo tree - * @param count_rates TRUE to assign rates counted from alignment, FALSE to not initialize rates - * @return substitution model created - */ -ModelSubst *createModel(string model_str, ModelsBlock *models_block, - StateFreqType freq_type, string freq_params, - PhyloTree *tree); - /** * mixture model */ class ModelMixture: virtual public ModelMarkov, public vector { public: - /** constructor @param model_name model name, e.g., JC, HKY. diff --git a/model/modelset.cpp b/model/modelset.cpp index 74778e258..fdb3d4f55 100644 --- a/model/modelset.cpp +++ b/model/modelset.cpp @@ -18,12 +18,235 @@ #include "modelset.h" +#include "modelfactory.h" -ModelSet::ModelSet(const char *model_name, PhyloTree *tree) : ModelMarkov(tree) +ModelSet::ModelSet(const string model_name, ModelsBlock *models_block, + StateFreqType freq, string freq_params, PhyloTree *tree) + : ModelMarkov(tree) { name = full_name = model_name; - name += "+SSF"; - full_name += "+site-specific state-frequency model (unpublished)"; + full_name += "+site-specific state-frequency or rate model (unpublished)"; + // init the wrapper model to use its eigen + ModelMarkov::init(FREQ_EMPIRICAL); // +F is used here to calculate +I under SSF + ModelMarkov::fixParameters(true); // yet otherwise the wrapper model parameters remain unused + // init submodels + ASSERT(freq != FREQ_MIXTURE); + if (isSSF()) { // default freqs for unspecified sites under SSF + freq = FREQ_EQUAL; + freq_params = ""; + } + double *state_freqs = new double[num_states]; + double *rate_mat = new double[getNumRateEntries()]; + for (size_t ptn = 0; ptn < phylo_tree->aln->getNPattern(); ptn++) { + ModelMarkov *submodel; + if (ptn == 0) { // the front submodel, other submodels will take after it + submodel = (ModelMarkov*)createModel(model_name, models_block, freq, freq_params, tree); + if (!submodel->isReversible()) + outError("Non-reversible models are incompatible with site-specific models"); + freq = submodel->getFreqType(); + submodel->getStateFrequency(state_freqs); + submodel->getRateMatrix(rate_mat); + } else { + submodel = (ModelMarkov*)createModel(model_name, models_block, FREQ_EQUAL, "", tree); + submodel->setFreqType(freq); + submodel->setStateFrequency(state_freqs); + submodel->setRateMatrix(rate_mat); + } + // set site-specific freqs (if any) and re-init + if (isSSF()) { + if (phylo_tree->aln->ptn_state_freq[ptn]) { + submodel->setStateFrequency(phylo_tree->aln->ptn_state_freq[ptn]); + } // else: unspecified site, continue with the default equal freqs + submodel->init(FREQ_USER_DEFINED); + } + push_back(submodel); + } + delete [] state_freqs; + delete [] rate_mat; + // generate the site-specific eigen of the wrapper model + joinEigenMemory(); + decomposeRateMatrix(); +} + +ModelSet::~ModelSet() +{ + for (reverse_iterator rit = rbegin(); rit != rend(); rit++) { + (*rit)->eigenvalues = nullptr; + (*rit)->eigenvectors = nullptr; + (*rit)->inv_eigenvectors = nullptr; + (*rit)->inv_eigenvectors_transposed = nullptr; + delete (*rit); + } +} + +string ModelSet::getName() +{ + if (isSSF()) { + return name + "+SSF"; + } else { + return front()->getName(); + } +} + +string ModelSet::getNameParams(bool show_fixed_params) +{ + ostringstream retname; + retname << name; + if (!front()->fixed_parameters && front()->num_params > 0) { + double *rate_mat = front()->rates; + retname << '{'; + int nrates = getNumRateEntries(); + for (int i = 0; i < nrates; i++) { + if (i > 0) retname << ','; + retname << rate_mat[i]; + } + retname << '}'; + } + if (isSSF()) { + retname << "+SSF"; + } else { + front()->getNameParamsFreq(retname); + } + return retname.str(); +} + +void ModelSet::writeInfo(ostream& out) +{ + if (empty()) return; + if (verbose_mode >= VB_DEBUG) { + out << "Wrapper model:" << endl; + ModelMarkov::writeInfo(out); + int i = 1; + for (iterator it = begin(); it != end(); it++, i++) { + out << "Partition " << i << ":" << endl; + (*it)->writeInfo(out); + } + } else { + front()->writeInfo(out); + } +} + +void ModelSet::getRateMatrix(double *rate_mat) { + front()->getRateMatrix(rate_mat); +} + +void ModelSet::getStateFrequency(double *state_freqs, int mixture) +{ + if (isSSF()) { // get the +F freqs under SSF + ModelMarkov::getStateFrequency(state_freqs); + } else { + front()->getStateFrequency(state_freqs); + } +} + +void ModelSet::getQMatrix(double *q_mat, int mixture) { + if (isSSF()) { // get the +F derived QMatrix under SSF + ModelMarkov::getQMatrix(q_mat); + } else { + front()->getQMatrix(q_mat); + } +} + +StateFreqType ModelSet::getFreqType() +{ + if (isSSF()) { // get the +F type under SSF + return ModelMarkov::getFreqType(); + } else { + return front()->getFreqType(); + } +} + +int ModelSet::getNDim() +{ + return front()->getNDim(); +} + +int ModelSet::getNDimFreq() +{ + return front()->getNDimFreq(); +} + +bool ModelSet::isUnstableParameters() +{ + return front()->isUnstableParameters(); +} + +void ModelSet::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) +{ + front()->setBounds(lower_bound, upper_bound, bound_check); +} + +void ModelSet::setVariables(double* variables) +{ + front()->setVariables(variables); +} + +bool ModelSet::getVariables(double* variables) +{ + bool changed = false; + for (iterator it = begin(); it != end(); it++) { + changed |= (*it)->getVariables(variables); + } + return changed; +} + +void ModelSet::scaleStateFreq(bool sum_one) +{ + for (iterator it = begin(); it != end(); it++) { + (*it)->scaleStateFreq(sum_one); + } +} + +double ModelSet::optimizeParameters(double gradient_epsilon) +{ + if (verbose_mode >= VB_MAX) + cout << "Optimizing " << getName() << " model parameters..." << endl; + int ndim = getNDim(); + if (ndim == 0) return 0.0; // return if nothing to be optimized + double score; + // optimize with the BFGS algorithm + double *variables = new double[ndim+1]; + double *upper_bound = new double[ndim+1]; + double *lower_bound = new double[ndim+1]; + bool *bound_check = new bool[ndim+1]; + setVariables(variables); + setBounds(lower_bound, upper_bound, bound_check); + score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_RATE)); + bool changed = getVariables(variables); + delete [] bound_check; + delete [] lower_bound; + delete [] upper_bound; + delete [] variables; + // BQM 2015-09-07: normalize state_freq + if (is_reversible && getFreqType() == FREQ_ESTIMATE) { + scaleStateFreq(true); + changed = true; + } + if (changed || score == -1.0e+30) { + decomposeRateMatrix(); + phylo_tree->clearAllPartialLH(); + score = phylo_tree->computeLikelihood(); + } + return score; +} + +double ModelSet::targetFunk(double x[]) +{ + bool changed = getVariables(x); + if (changed) { + decomposeRateMatrix(); + ASSERT(phylo_tree); + phylo_tree->clearAllPartialLH(); + } + // avoid numerical issue if state_freq is too small + double *state_freqs = (isSSF()) ? this->state_freq : front()->state_freq; + for (int x = 0; x < num_states; x++) { + if (state_freqs[x] < 0 || (state_freqs[x] >= 0 && + state_freqs[x] < Params::getInstance().min_state_freq)) { + return 1.0e+30; + } + } + return -phylo_tree->computeLikelihood(); } void ModelSet::computeTransMatrix(double time, double* trans_matrix, int mixture, int selected_row) @@ -50,51 +273,49 @@ void ModelSet::computeTransDerv(double time, double* trans_matrix, double* trans int ModelSet::getPtnModelID(int ptn) { - ASSERT(ptn >= 0 && ptn < pattern_model_map.size()); - ASSERT(pattern_model_map[ptn] >= 0 && pattern_model_map[ptn] < size()); - return pattern_model_map[ptn]; + ASSERT(ptn >= 0 && ptn < size()); + return ptn; } - -double ModelSet::computeTrans(double time, int model_id, int state1, int state2) { - if (phylo_tree->vector_size == 1) { - return at(model_id)->computeTrans(time, state1, state2); - } +double ModelSet::computeTrans(double time, int model_id, int state1, int state2) +{ + if (phylo_tree->vector_size == 1) { + return at(model_id)->computeTrans(time, state1, state2); + } // temporary fix problem with vectorized eigenvectors - int i; - int vsize = phylo_tree->vector_size; - int states_vsize = num_states*vsize; - int model_vec_id = model_id % vsize; - int start_ptn = model_id - model_vec_id; - double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; - double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; - double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; + int vsize = phylo_tree->vector_size; + int states_vsize = num_states*vsize; + int model_vec_id = model_id % vsize; + int start_ptn = model_id - model_vec_id; + double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; + double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; + double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; double trans_prob = 0.0; - for (i = 0; i < states_vsize; i+=vsize) { - double val = eval[i]; + for (int i = 0; i < states_vsize; i+=vsize) { + double val = eval[i]; double trans = evec[i] * inv_evec[i*num_states] * exp(time * val); trans_prob += trans; } return trans_prob; } -double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) { - if (phylo_tree->vector_size == 1) { - return at(model_id)->computeTrans(time, state1, state2, derv1, derv2); - } +double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) +{ + if (phylo_tree->vector_size == 1) { + return at(model_id)->computeTrans(time, state1, state2, derv1, derv2); + } // temporary fix problem with vectorized eigenvectors - int i; - int vsize = phylo_tree->vector_size; - int states_vsize = num_states*vsize; - int model_vec_id = model_id % vsize; - int start_ptn = model_id - model_vec_id; - double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; - double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; - double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; + int vsize = phylo_tree->vector_size; + int states_vsize = num_states*vsize; + int model_vec_id = model_id % vsize; + int start_ptn = model_id - model_vec_id; + double *evec = &eigenvectors[start_ptn*num_states*num_states + model_vec_id + state1*num_states*vsize]; + double *inv_evec = &inv_eigenvectors[start_ptn*num_states*num_states + model_vec_id + state2*vsize]; + double *eval = &eigenvalues[start_ptn*num_states + model_vec_id]; double trans_prob = 0.0; derv1 = derv2 = 0.0; - for (i = 0; i < states_vsize; i+=vsize) { - double val = eval[i]; + for (int i = 0; i < states_vsize; i+=vsize) { + double val = eval[i]; double trans = evec[i] * inv_evec[i*num_states] * exp(time * val); double trans2 = trans * val; trans_prob += trans; @@ -104,146 +325,111 @@ double ModelSet::computeTrans(double time, int model_id, int state1, int state2, return trans_prob; } -int ModelSet::getNDim() +uint64_t ModelSet::getMemoryRequired() { - ASSERT(size()); - return front()->getNDim(); -} - -void ModelSet::writeInfo(ostream& out) -{ - if (empty()) { - return; - } - if (verbose_mode >= VB_DEBUG) { - int i = 1; - for (iterator it = begin(); it != end(); it++, i++) { - out << "Partition " << i << ":" << endl; - (*it)->writeInfo(out); - } - } else { - front()->writeInfo(out); + uint64_t mem = ModelMarkov::getMemoryRequired(); + for (iterator it = begin(); it != end(); it++) { + mem += (*it)->getMemoryRequired(); } + return mem; } void ModelSet::decomposeRateMatrix() { - if (empty()) { - return; - } - for (iterator it = begin(); it != end(); it++) { - (*it)->decomposeRateMatrix(); - } - if (phylo_tree->vector_size == 1) + if (empty()) return; + // decompose for each submodel + for (iterator it = begin(); it != end(); it++) { + (*it)->decomposeRateMatrix(); + } + // set site-specific rates (if any) + if (isSSR()) { // multiply eigenvalues of each submodel by the submodel rate scaler + for (size_t ptn = 0; ptn < size(); ptn++) { + ASSERT(phylo_tree->aln->ptn_rate_scaler[ptn]); + double rate_scaler = phylo_tree->aln->ptn_rate_scaler[ptn]; + double *eval_ptr = &eigenvalues[ptn*num_states]; + for (size_t x = 0; x < num_states; x++) { + eval_ptr[x] *= rate_scaler; + } + } + } + if (phylo_tree->vector_size == 1) { return; - // rearrange eigen to obey vector_size + } + // else rearrange eigen to obey vector_size size_t vsize = phylo_tree->vector_size; size_t states2 = num_states*num_states; - - size_t max_size = get_safe_upper_limit(size()); - - // copy dummy values - for (size_t m = size(); m < max_size; m++) { - memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); - memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); - } - - double new_eval[num_states*vsize]; - double new_evec[states2*vsize]; - double new_inv_evec[states2*vsize]; - - for (size_t ptn = 0; ptn < size(); ptn += vsize) { - double *eval_ptr = &eigenvalues[ptn*num_states]; - double *evec_ptr = &eigenvectors[ptn*states2]; - double *inv_evec_ptr = &inv_eigenvectors[ptn*states2]; - for (size_t i = 0; i < vsize; i++) { - for (size_t x = 0; x < num_states; x++) - new_eval[x*vsize+i] = eval_ptr[x]; - for (size_t x = 0; x < states2; x++) { - new_evec[x*vsize+i] = evec_ptr[x]; - new_inv_evec[x*vsize+i] = inv_evec_ptr[x]; - } - eval_ptr += num_states; - evec_ptr += states2; - inv_evec_ptr += states2; - } - // copy new values - memcpy(&eigenvalues[ptn*num_states], new_eval, sizeof(double)*num_states*vsize); - memcpy(&eigenvectors[ptn*states2], new_evec, sizeof(double)*states2*vsize); - memcpy(&inv_eigenvectors[ptn*states2], new_inv_evec, sizeof(double)*states2*vsize); - calculateSquareMatrixTranspose(new_inv_evec, num_states - , &inv_eigenvectors_transposed[ptn*states2]); - } -} - -bool ModelSet::getVariables(double* variables) -{ - ASSERT(size()); - bool changed = false; - for (iterator it = begin(); it != end(); it++) - changed |= (*it)->getVariables(variables); - return changed; -} - -void ModelSet::setVariables(double* variables) -{ - ASSERT(size()); - front()->setVariables(variables); + // copy dummy values + size_t max_size = get_safe_upper_limit(size()); + for (size_t m = size(); m < max_size; m++) { + memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); + memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); + } + // make rearranged eigen for each submodel + double new_eval[num_states*vsize]; + double new_evec[states2*vsize]; + double new_inv_evec[states2*vsize]; + for (size_t ptn = 0; ptn < size(); ptn += vsize) { + double *eval_ptr = &eigenvalues[ptn*num_states]; + double *evec_ptr = &eigenvectors[ptn*states2]; + double *inv_evec_ptr = &inv_eigenvectors[ptn*states2]; + for (size_t i = 0; i < vsize; i++) { + for (size_t x = 0; x < num_states; x++) { + new_eval[x*vsize+i] = eval_ptr[x]; + } + for (size_t x = 0; x < states2; x++) { + new_evec[x*vsize+i] = evec_ptr[x]; + new_inv_evec[x*vsize+i] = inv_evec_ptr[x]; + } + eval_ptr += num_states; + evec_ptr += states2; + inv_evec_ptr += states2; + } + // copy new values + memcpy(&eigenvalues[ptn*num_states], new_eval, sizeof(double)*num_states*vsize); + memcpy(&eigenvectors[ptn*states2], new_evec, sizeof(double)*states2*vsize); + memcpy(&inv_eigenvectors[ptn*states2], new_inv_evec, sizeof(double)*states2*vsize); + calculateSquareMatrixTranspose(new_inv_evec, num_states, &inv_eigenvectors_transposed[ptn*states2]); + } } -ModelSet::~ModelSet() +void ModelSet::joinEigenMemory() { - for (reverse_iterator rit = rbegin(); rit != rend(); rit++) { - (*rit)->eigenvalues = nullptr; - (*rit)->eigenvectors = nullptr; - (*rit)->inv_eigenvectors = nullptr; - (*rit)->inv_eigenvectors_transposed = nullptr; - delete (*rit); - } -} - -void ModelSet::joinEigenMemory() { - size_t nmixtures = get_safe_upper_limit(size()); - aligned_free(eigenvalues); - aligned_free(eigenvectors); - aligned_free(inv_eigenvectors); - aligned_free(inv_eigenvectors_transposed); - - size_t states2 = num_states*num_states; - - eigenvalues = aligned_alloc(num_states*nmixtures); - eigenvectors = aligned_alloc(states2*nmixtures); - inv_eigenvectors = aligned_alloc(states2*nmixtures); - inv_eigenvectors_transposed = aligned_alloc(states2*nmixtures); - - // assigning memory for individual models - size_t m = 0; - for (iterator it = begin(); it != end(); it++, m++) { - // first copy memory for eigen stuffs - memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double)); - memcpy(&eigenvectors[m*states2], (*it)->eigenvectors, states2*sizeof(double)); - memcpy(&inv_eigenvectors[m*states2], (*it)->inv_eigenvectors, states2*sizeof(double)); - memcpy(&inv_eigenvectors_transposed[m*states2], (*it)->inv_eigenvectors_transposed, states2*sizeof(double)); - // then delete - aligned_free((*it)->eigenvalues); - aligned_free((*it)->eigenvectors); - aligned_free((*it)->inv_eigenvectors); - aligned_free((*it)->inv_eigenvectors_transposed); - - // and assign new memory - (*it)->eigenvalues = &eigenvalues[m*num_states]; - (*it)->eigenvectors = &eigenvectors[m*states2]; - (*it)->inv_eigenvectors = &inv_eigenvectors[m*states2]; - (*it)->inv_eigenvectors_transposed = &inv_eigenvectors_transposed[m*states2]; - } - - // copy dummy values - for (m = size(); m < nmixtures; m++) { - memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); - memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); - memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); - } + size_t states2 = num_states*num_states; + size_t nmixtures = get_safe_upper_limit(size()); + aligned_free(eigenvalues); + aligned_free(eigenvectors); + aligned_free(inv_eigenvectors); + aligned_free(inv_eigenvectors_transposed); + eigenvalues = aligned_alloc(num_states*nmixtures); + eigenvectors = aligned_alloc(states2*nmixtures); + inv_eigenvectors = aligned_alloc(states2*nmixtures); + inv_eigenvectors_transposed = aligned_alloc(states2*nmixtures); + // assigning memory for individual models + size_t m = 0; + for (iterator it = begin(); it != end(); it++, m++) { + // first copy memory for eigen stuffs + memcpy(&eigenvalues[m*num_states], (*it)->eigenvalues, num_states*sizeof(double)); + memcpy(&eigenvectors[m*states2], (*it)->eigenvectors, states2*sizeof(double)); + memcpy(&inv_eigenvectors[m*states2], (*it)->inv_eigenvectors, states2*sizeof(double)); + memcpy(&inv_eigenvectors_transposed[m*states2], (*it)->inv_eigenvectors_transposed, states2*sizeof(double)); + // then delete + aligned_free((*it)->eigenvalues); + aligned_free((*it)->eigenvectors); + aligned_free((*it)->inv_eigenvectors); + aligned_free((*it)->inv_eigenvectors_transposed); + // and assign new memory + (*it)->eigenvalues = &eigenvalues[m*num_states]; + (*it)->eigenvectors = &eigenvectors[m*states2]; + (*it)->inv_eigenvectors = &inv_eigenvectors[m*states2]; + (*it)->inv_eigenvectors_transposed = &inv_eigenvectors_transposed[m*states2]; + } + // copy dummy values + for (size_t m = size(); m < nmixtures; m++) { + memcpy(&eigenvalues[m*num_states], &eigenvalues[(m-1)*num_states], sizeof(double)*num_states); + memcpy(&eigenvectors[m*states2], &eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors[m*states2], &inv_eigenvectors[(m-1)*states2], sizeof(double)*states2); + memcpy(&inv_eigenvectors_transposed[m*states2], &inv_eigenvectors_transposed[(m-1)*states2], sizeof(double)*states2); + } } diff --git a/model/modelset.h b/model/modelset.h index bc45d0408..83cb56cb6 100644 --- a/model/modelset.h +++ b/model/modelset.h @@ -23,47 +23,141 @@ #include "modelmarkov.h" /** - * a set of substitution models, used eg for site-specific state frequency model or - * partition model with joint branch lengths + * A set of substitution models to implement site-specific models */ class ModelSet : public ModelMarkov, public vector { public: - ModelSet(const char *model_name, PhyloTree *tree); + ModelSet(const string model_name, ModelsBlock *models_block, + StateFreqType freq, string freq_params, PhyloTree *tree); + + virtual ~ModelSet(); + /** * @return TRUE if this is a site-specific model, FALSE otherwise */ virtual bool isSiteSpecificModel() { return true; } + /** + * @return TRUE if the model has site-specific frequencies, FALSE otherwise + */ + virtual bool isSSF() { return phylo_tree->aln->isSSF(); } + + /** + * @return TRUE if the model has site-specific rates, FALSE otherwise + */ + virtual bool isSSR() { return phylo_tree->aln->isSSR(); } + /** * get the size of transition matrix, default is num_states*num_states. * can be changed for e.g. site-specific model */ virtual int getTransMatrixSize() { return num_states * num_states * size(); } + /** + * @return model name + */ + virtual string getName(); + + /** + * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f} + */ + virtual string getNameParams(bool show_fixed_params = false); + + /** + write information + @param out output stream + */ + virtual void writeInfo(ostream &out); + + /** + get the rate matrix. + @param rate_mat (OUT) upper-triagle rate matrix. + Assume rate_mat has size of num_states*(num_states-1)/2 + */ + virtual void getRateMatrix(double *rate_mat); + + /** + compute the state frequency vector. One should override this function when defining new model. + The default is equal state sequency, valid for all kind of data. + @param mixture (optional) class for mixture model + @param[out] state_freqs state frequency vector. Assume state_freqs has size of num_states + */ + virtual void getStateFrequency(double *state_freqs, int mixture = 0); + + /** + compute Q matrix + @param q_mat (OUT) Q matrix, assuming of size num_states * num_states + */ + virtual void getQMatrix(double *q_mat, int mixture = 0); + + /** + get frequency type + @return frequency type + */ + virtual StateFreqType getFreqType(); + + /** + @return the number of dimensions + */ + virtual int getNDim(); + + /** + @return the number of dimensions corresponding to state frequencies, which is + not counted in getNDim(). This serves e.g. for computing AIC, BIC score + */ + virtual int getNDimFreq(); + + /** + @return TRUE if parameters are at the boundary that may cause numerical unstability + */ + virtual bool isUnstableParameters(); + + /** + setup the bounds for joint optimization with BFGS + */ + virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check); + + /** + rescale the state frequencies + @param sum_one TRUE to make frequencies sum to 1, FALSE to make last entry equal to 1 + */ + virtual void scaleStateFreq(bool sum_one); + + /** + optimize model parameters + @return the best likelihood + */ + virtual double optimizeParameters(double gradient_epsilon); + + /** + the target function which needs to be optimized + @param x the input vector x + @return the function value at x + */ + virtual double targetFunk(double x[]); /** compute the transition probability matrix. @param time time between two events - @param mixture (optional) class for mixture model - @param selected_row (optional) only compute the entries of one selected row. By default, compute all rows + @param mixture (optional) class for mixture model + @param selected_row (optional) only compute the entries of one selected row. By default, compute all rows @param trans_matrix (OUT) the transition matrix between all pairs of states. - Assume trans_matrix has size of num_states * num_states. + Assume trans_matrix has size of num_states * num_states. */ virtual void computeTransMatrix(double time, double *trans_matrix, int mixture = 0, int selected_row = -1); - /** compute the transition probability matrix.and the derivative 1 and 2 @param time time between two events - @param mixture (optional) class for mixture model + @param mixture (optional) class for mixture model @param trans_matrix (OUT) the transition matrix between all pairs of states. - Assume trans_matrix has size of num_states * num_states. - @param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. - @param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. + @param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. + @param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. + Assume trans_matrix has size of num_states * num_states. */ - virtual void computeTransDerv(double time, double *trans_matrix, + virtual void computeTransDerv(double time, double *trans_matrix, double *trans_derv1, double *trans_derv2, int mixture = 0); /** @@ -86,10 +180,8 @@ class ModelSet : public ModelMarkov, public vector */ virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2) { return 0; } - - /** - compute the transition probability between two states at a specific site + compute the transition probability between two states at a specific site One should override this function when defining new model. The default is the Juke-Cantor model, valid for all kind of data (DNA, AA, Codon, etc) @param time time between two events @@ -117,65 +209,39 @@ class ModelSet : public ModelMarkov, public vector * @param ptn pattern ID of the alignment */ virtual int getPtnModelID(int ptn); - - /** - return the number of dimensions - */ - virtual int getNDim(); - /** - write information - @param out output stream + compute the memory size for the model, can be large for site-specific models + @return memory size required in bytes */ - virtual void writeInfo(ostream &out); + virtual uint64_t getMemoryRequired(); /** decompose the rate matrix into eigenvalues and eigenvectors */ virtual void decomposeRateMatrix(); - virtual ~ModelSet(); - - /** - * compute the memory size for the model, can be large for site-specific models - * @return memory size required in bytes - */ - virtual uint64_t getMemoryRequired() { - uint64_t mem = ModelMarkov::getMemoryRequired(); - for (iterator it = begin(); it != end(); it++) - mem += (*it)->getMemoryRequired(); - return mem; - } - - /** map from pattern ID to model ID */ - IntVector pattern_model_map; - - /** - join memory for eigen into one chunk - */ - void joinEigenMemory(); - protected: - - - /** - this function is served for the multi-dimension optimization. It should pack the model parameters + this function is served for the multi-dimension optimization. It should pack the model parameters into a vector that is index from 1 (NOTE: not from 0) @param variables (OUT) vector of variables, indexed from 1 */ virtual void setVariables(double *variables); /** - this function is served for the multi-dimension optimization. It should assign the model parameters + this function is served for the multi-dimension optimization. It should assign the model parameters from a vector of variables that is index from 1 (NOTE: not from 0) @param variables vector of variables, indexed from 1 @return TRUE if parameters are changed, FALSE otherwise (2015-10-20) */ virtual bool getVariables(double *variables); - + /** + join memory for eigen into one chunk + */ + void joinEigenMemory(); + }; #endif // MODELSET_H diff --git a/model/modelsubst.h b/model/modelsubst.h index 37835a8f2..d44f2c8d2 100644 --- a/model/modelsubst.h +++ b/model/modelsubst.h @@ -90,6 +90,16 @@ class ModelSubst: public Optimization, public CheckpointFactory */ virtual bool isSiteSpecificModel() { return false; } + /** + * @return TRUE if the model has site-specific frequencies, FALSE otherwise + */ + virtual bool isSSF() { return false; } + + /** + * @return TRUE if the model has site-specific rates, FALSE otherwise + */ + virtual bool isSSR() { return false; } + /** * @return TRUE if this is a mixture model, FALSE otherwise */ @@ -282,6 +292,11 @@ class ModelSubst: public Optimization, public CheckpointFactory */ virtual StateFreqType getFreqType() { return FREQ_EQUAL; } + /** + set frequency type + */ + virtual void setFreqType(StateFreqType freq) { freq_type = freq; } + /** set the associated tree @param tree the associated tree diff --git a/simulator/alisimulatorheterogeneity.cpp b/simulator/alisimulatorheterogeneity.cpp index 7bd3022cb..cb2bd188a 100644 --- a/simulator/alisimulatorheterogeneity.cpp +++ b/simulator/alisimulatorheterogeneity.cpp @@ -203,16 +203,14 @@ void AliSimulatorHeterogeneity::extractPatternPosteriorFreqsAndModelProb() if (!ptn_state_freq) { ptn_state_freq = new double[nptn * max_num_states]; - - SiteFreqType tmp_site_freq_type = tree->params->print_site_state_freq; - tree->params->print_site_state_freq = WSF_POSTERIOR_MEAN; + SiteFreqType tmp_site_freq_type = tree->params->site_state_freq_type; + tree->params->site_state_freq_type = WSF_POSTERIOR_MEAN; tree->computePatternStateFreq(ptn_state_freq); // get pattern-specific posterior model probability int nptn_times_nmixture = nptn * nmixture; ptn_model_dis = new double[nptn_times_nmixture]; memcpy(ptn_model_dis, tree->getPatternLhCatPointer(), nptn_times_nmixture * sizeof(double)); - tree->params->print_site_state_freq = tmp_site_freq_type; - + tree->params->site_state_freq_type = tmp_site_freq_type; // convert ptn_model_dis to accummulated matrix convertProMatrixIntoAccumulatedProMatrix(ptn_model_dis, nptn, nmixture); } diff --git a/tree/iqtree.cpp b/tree/iqtree.cpp index 7757a146a..85f35c560 100644 --- a/tree/iqtree.cpp +++ b/tree/iqtree.cpp @@ -2734,7 +2734,10 @@ void IQTree::refineBootTrees() { // copy model // BQM 2019-05-31: bug fix with -bsam option + VerboseMode saved_mode = verbose_mode; + if (verbose_mode < VB_MAX) verbose_mode = VB_QUIET; boot_tree->initializeModel(*params, aln->model_name, models_block); + verbose_mode = saved_mode; boot_tree->getModelFactory()->setCheckpoint(getCheckpoint()); if (isSuperTree()) ((PartitionModel*)boot_tree->getModelFactory())->PartitionModel::restoreCheckpoint(); diff --git a/tree/phylotree.cpp b/tree/phylotree.cpp index b6bfcb595..335cfa3d2 100644 --- a/tree/phylotree.cpp +++ b/tree/phylotree.cpp @@ -1404,12 +1404,10 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { double *ptn_freq = ptn_state_freq; size_t nstates = aln->num_states; // ModelMixture *models = (ModelMixture*)model; - - if (params->print_site_state_freq == WSF_POSTERIOR_MEAN) { - cout << "Computing posterior mean site frequencies...." << endl; + if (params->site_state_freq_type == WSF_POSTERIOR_MEAN) { + cout << "Computing posterior mean site frequencies..." << endl; // loop over all site-patterns for (size_t ptn = 0; ptn < nptn; ++ptn) { - // first compute posterior for each mixture component double sum_lh = 0.0; for (size_t m = 0; m < nmixture; ++m) { @@ -1419,7 +1417,6 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { for (size_t m = 0; m < nmixture; ++m) { lh_cat[m] *= sum_lh; } - // now compute state frequencies for (size_t state = 0; state < nstates; ++state) { double freq = 0; @@ -1427,13 +1424,12 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { freq += model->getMixtureClass(m)->state_freq[state] * lh_cat[m]; ptn_freq[state] = freq; } - // increase the pointers lh_cat += nmixture; ptn_freq += nstates; } - } else if (params->print_site_state_freq == WSF_POSTERIOR_MAX) { - cout << "Computing posterior max site frequencies...." << endl; + } else if (params->site_state_freq_type == WSF_POSTERIOR_MAX) { + cout << "Computing posterior max site frequencies..." << endl; // loop over all site-patterns for (size_t ptn = 0; ptn < nptn; ++ptn) { // first compute posterior for each mixture component @@ -1442,10 +1438,8 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { if (lh_cat[m] > lh_cat[max_comp]) { max_comp = m; } - // now compute state frequencies memcpy(ptn_freq, model->getMixtureClass(max_comp)->state_freq, nstates*sizeof(double)); - // increase the pointers lh_cat += nmixture; ptn_freq += nstates; @@ -1453,7 +1447,15 @@ void PhyloTree::computePatternStateFreq(double *ptn_state_freq) { } } - +void PhyloTree::computePatternRate(DoubleVector &ptn_rate) { + ASSERT(getRate()->getNRate() > 1); + if (params->site_rate_type == WSR_POSTERIOR_MEAN) { + cout << "Computing posterior mean site rates..." << endl; + int ncat; + IntVector ptn_cat; + ncat = site_rate->computePatternRates(ptn_rate, ptn_cat); + } +} void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat, SiteLoglType wsl) { /* if (!dad_branch) { diff --git a/tree/phylotree.h b/tree/phylotree.h index b8a19f986..2fdf7b5c4 100644 --- a/tree/phylotree.h +++ b/tree/phylotree.h @@ -1059,12 +1059,18 @@ class PhyloTree : public MTree, public Optimization, public CheckpointFactory { virtual double computePatternLhCat(SiteLoglType wsl); /** - compute state frequency for each pattern (for Huaichun) - @param[out] ptn_state_freq state frequency vector per pattern, - should be pre-allocated with size of num_patterns * num_states + compute state frequencies for each pattern (for site-specific models) + @param[out] ptn_state_freq state frequencies per pattern, + should be pre-allocated with the size of num_patterns * num_states */ void computePatternStateFreq(double *ptn_state_freq); + /** + compute rates for each pattern (for site-specific models) + @param[out] ptn_rate rate scaler per pattern + */ + void computePatternRate(DoubleVector &ptn_rate); + /**************************************************************************** ancestral sequence reconstruction ****************************************************************************/ diff --git a/utils/tools.cpp b/utils/tools.cpp index a7e91dc2f..0d56640d7 100644 --- a/utils/tools.cpp +++ b/utils/tools.cpp @@ -1265,8 +1265,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.print_partition_lh = false; params.print_marginal_prob = false; params.print_site_prob = WSL_NONE; - params.print_site_state_freq = WSF_NONE; + params.print_site_state_freq = 0; params.print_site_rate = 0; + params.site_state_freq_type = WSF_NONE; + params.site_rate_type = WSR_NONE; params.print_trees_site_posterior = 0; params.print_ancestral_sequence = AST_NONE; params.min_ancestral_prob = 0.0; @@ -1366,7 +1368,9 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.bootlh_test = 0; params.bootlh_partitions = NULL; params.site_freq_file = NULL; + params.site_rate_file = NULL; params.tree_freq_file = NULL; + params.tree_rate_file = NULL; params.num_threads = 1; params.num_threads_max = 10000; params.openmp_by_model = false; @@ -1376,6 +1380,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.root_state = NULL; params.print_bootaln = false; params.print_boot_site_freq = false; + params.print_boot_site_rate = false; params.print_subaln = false; params.print_partition_info = false; params.print_conaln = false; @@ -1863,14 +1868,15 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.tree_gen = YULE_HARDING; continue; } - if (strcmp(argv[cnt], "-rs") == 0) { - cnt++; - if (cnt >= argc) - throw "Use -rs "; - params.tree_gen = YULE_HARDING; - params.aln_file = argv[cnt]; - continue; - } + // OBSOLETE: alignment should be passed with the -s option + //if (strcmp(argv[cnt], "-rs") == 0) { + // cnt++; + // if (cnt >= argc) + // throw "Use -rs "; + // params.tree_gen = YULE_HARDING; + // params.aln_file = argv[cnt]; + // continue; + //} if (strcmp(argv[cnt], "-rstar") == 0) { cnt++; if (cnt >= argc) @@ -2867,15 +2873,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { throw " must be positive!"; continue; } - if (strcmp(argv[cnt], "--site-rate") == 0) { + if (params.alisim_active && strcmp(argv[cnt], "--site-rate") == 0) { cnt++; if (cnt >= argc) throw "Use --site-rate MEAN/SAMPLING/MODEL"; - // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); - if (option.compare("MEAN") == 0) params.alisim_rate_heterogeneity = POSTERIOR_MEAN; else if (option.compare("SAMPLING") == 0) @@ -2886,15 +2890,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { throw "Use --site-rate MEAN/SAMPLING/MODEL"; continue; } - if (strcmp(argv[cnt], "--site-freq") == 0) { + if (params.alisim_active && strcmp(argv[cnt], "--site-freq") == 0) { cnt++; if (cnt >= argc) throw "Use --site-freq MEAN/SAMPLING/MODEL"; - // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); - if (option.compare("MEAN") == 0) params.alisim_stationarity_heterogeneity = POSTERIOR_MEAN; else if (option.compare("SAMPLING") == 0) @@ -3477,27 +3479,66 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } - if (strcmp(argv[cnt], "-fs") == 0 || strcmp(argv[cnt], "--site-freq") == 0) { - if (params.tree_freq_file) - throw "Specifying both -fs and -ft not allowed"; - cnt++; - if (cnt >= argc) - throw "Use -fs "; - params.site_freq_file = argv[cnt]; -// params.SSE = LK_EIGEN; - continue; - } - if (strcmp(argv[cnt], "-ft") == 0 || strcmp(argv[cnt], "--tree-freq") == 0) { - if (params.site_freq_file) - throw "Specifying both -fs and -ft not allowed"; - cnt++; - if (cnt >= argc) - throw "Use -ft "; - params.tree_freq_file = argv[cnt]; - if (params.print_site_state_freq == WSF_NONE) - params.print_site_state_freq = WSF_POSTERIOR_MEAN; - continue; - } + if (strcmp(argv[cnt], "-fs") == 0 || strcmp(argv[cnt], "--site-freq") == 0) { + if (params.tree_freq_file) + throw "Specifying both -fs and -ft or -frt not allowed"; + cnt++; + if (cnt >= argc) + throw "Use -fs "; + params.site_freq_file = argv[cnt]; +// params.SSE = LK_EIGEN; + continue; + } + if (strcmp(argv[cnt], "-rs") == 0 || strcmp(argv[cnt], "--site-rate") == 0) { + if (params.tree_rate_file || params.tree_freq_file) + throw "Specifying both -rs and -rt or -ft or -frt not allowed"; + cnt++; + if (cnt >= argc) + throw "Use -rs "; + params.site_rate_file = argv[cnt]; + continue; + } + if (strcmp(argv[cnt], "-ft") == 0 || strcmp(argv[cnt], "--tree-freq") == 0) { + if (params.site_freq_file) + throw "Specifying both -fs and -ft not allowed"; + if (params.tree_rate_file) + throw "Specifying both -ft and -rt not allowed, try using -frt option"; + cnt++; + if (cnt >= argc) + throw "Use -ft "; + params.tree_freq_file = argv[cnt]; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; + continue; + } + if (strcmp(argv[cnt], "-rt") == 0 || strcmp(argv[cnt], "--tree-rate") == 0) { + if (params.site_rate_file) + throw "Specifying both -rs and -rt not allowed"; + if (params.tree_freq_file) + throw "Specifying both -ft and -rt not allowed, try using -frt option"; + cnt++; + if (cnt >= argc) + throw "Use -rt "; + params.tree_rate_file = argv[cnt]; + if (params.site_rate_type == WSR_NONE) + params.site_rate_type = WSR_POSTERIOR_MEAN; + continue; + } + if (strcmp(argv[cnt], "-frt") == 0 || strcmp(argv[cnt], "--tree-freq-rate") == 0) { + if (params.site_freq_file || params.site_rate_file) + throw "Specifying both -fs or -rs and -frt not allowed"; + if (params.tree_freq_file || params.tree_rate_file) + throw "Specifying both -ft or -rt and -frt not allowed"; + cnt++; + if (cnt >= argc) + throw "Use -frt "; + params.tree_freq_file = params.tree_rate_file = argv[cnt]; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; + if (params.site_rate_type == WSR_NONE) + params.site_rate_type = WSR_POSTERIOR_MEAN; + continue; + } if (strcmp(argv[cnt], "-fconst") == 0) { cnt++; @@ -4062,7 +4103,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { if (strcmp(argv[cnt], "-asr-joint") == 0) { params.print_ancestral_sequence = AST_JOINT; - params.ignore_identical_seqs = false; + params.ignore_identical_seqs = false; continue; } @@ -4071,21 +4112,23 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } - if (strcmp(argv[cnt], "--mlrate") == 0) { - params.print_site_rate |= 2; - continue; - } + if (strcmp(argv[cnt], "--mlrate") == 0) { + params.print_site_rate |= 2; + continue; + } if (strcmp(argv[cnt], "-wsptrees") == 0) { params.print_trees_site_posterior = 1; continue; } if (strcmp(argv[cnt], "-wsf") == 0) { - params.print_site_state_freq = WSF_POSTERIOR_MEAN; + params.print_site_state_freq = 1; + if (params.site_state_freq_type == WSF_NONE) + params.site_state_freq_type = WSF_POSTERIOR_MEAN; continue; } if (strcmp(argv[cnt], "--freq-max") == 0 || strcmp(argv[cnt], "-fmax") == 0) { - params.print_site_state_freq = WSF_POSTERIOR_MAX; + params.site_state_freq_type = WSF_POSTERIOR_MAX; continue; } if (strcmp(argv[cnt], "-wba") == 0) { @@ -4096,6 +4139,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.print_boot_site_freq = true; continue; } + if (strcmp(argv[cnt], "-wbsr") == 0) { + params.print_boot_site_rate = true; + continue; + } if (strcmp(argv[cnt], "-wsa") == 0) { params.print_subaln = true; continue; @@ -5804,11 +5851,12 @@ void parseArg(int argc, char *argv[], Params ¶ms) { if (params.alisim_active && !params.aln_file && !params.user_file && !params.partition_file && params.tree_gen == NONE) outError("A tree filepath is a mandatory input to execute AliSim when neither Inference mode nor Random mode (generating a random tree) is inactive. Use -t ; or Activate the inference mode by -s ; or Activate Random mode by -t RANDOM{,} where is yh, u, cat, bal, bd{,} stands for Yule-Harding, Uniform, Caterpillar, Balanced, Birth-Death model respectively."); - // terminate if using AliSim with -ft or -fs site-specific model (ModelSet) + // terminate if using AliSim with site-specific model (ModelSet) // computeTransMatix has not yet implemented for ModelSet if (params.alisim_active && (params.tree_freq_file || params.site_freq_file)) - outError("Sorry! `-ft` (--site-freq) and `-fs` (--tree-freq) options are not fully supported in AliSim. However, AliSim can estimate posterior mean frequencies from the alignment. Please try again without `-ft` and `-fs` options!"); - + outError("Sorry! `-ft` (--tree-freq) and `-fs` (--site-freq) options are not fully supported in AliSim. However, AliSim can estimate posterior mean frequencies from the alignment. Please try again without `-ft` and `-fs` options!"); + if (params.alisim_active && (params.tree_rate_file || params.site_rate_file)) + outError("Sorry! `-rt` (--tree-rate) and `-rs` (--site-rate) options are not fully supported in AliSim. However, AliSim can estimate posterior mean rates from the alignment. Please try again without `-rt` and `-rs` options!"); // set default filename for the random tree if AliSim is running in Random mode if (params.alisim_active && !params.user_file && params.tree_gen != NONE) { @@ -6168,13 +6216,16 @@ void usage_iqtree(char* argv[], bool full_command) { // TODO DS: Maybe change default to +WH. << endl << "COMPLEX MODELS:" << endl - << " -m \"MIX{m1,...,mK}\" Mixture model with K components" << endl - << " -m \"FMIX{f1,...fK}\" Frequency mixture model with K components" << endl - << " --mix-opt Optimize mixture weights (default: detect)" << endl - << " -m ...+ASC Ascertainment bias correction" << endl - << " --tree-freq FILE Input tree to infer site frequency model" << endl - << " --site-freq FILE Input site frequency model file" << endl - << " --freq-max Posterior maximum instead of mean approximation" << endl + << " -m \"MIX{m1,...,mK}\" Mixture model with K components" << endl + << " -m \"FMIX{f1,...fK}\" Frequency mixture model with K components" << endl + << " --mix-opt Optimize mixture weights (default: detect)" << endl + << " -m ...+ASC Ascertainment bias correction" << endl + << " --tree-freq FILE Input tree to infer site frequency model" << endl + << " --tree-rate FILE Input tree to infer site rate model" << endl + << " --tree-freq-rate FILE Input tree to infer site frequency and rate model" << endl + << " --freq-max Posterior maximum instead of mean approximation" << endl + << " --site-freq FILE Input site frequency model file" << endl + << " --site-rate FILE Input site rate model file" << endl << endl << "TREE TOPOLOGY TEST:" << endl << " --trees FILE Set of trees to evaluate log-likelihoods" << endl diff --git a/utils/tools.h b/utils/tools.h index b6f24978a..e68c1068d 100644 --- a/utils/tools.h +++ b/utils/tools.h @@ -560,10 +560,14 @@ enum SiteFreqType { WSF_NONE, WSF_POSTERIOR_MEAN, WSF_POSTERIOR_MAX }; -enum MatrixExpTechnique { - MET_SCALING_SQUARING, +enum SiteRateType { + WSR_NONE, WSR_POSTERIOR_MEAN +}; + +enum MatrixExpTechnique { + MET_SCALING_SQUARING, MET_EIGEN3LIB_DECOMPOSITION, - MET_EIGEN_DECOMPOSITION, + MET_EIGEN_DECOMPOSITION, MET_LIE_MARKOV_DECOMPOSITION }; @@ -2010,17 +2014,21 @@ class Params { /** 0: print nothing - 1: print site state frequency vectors + 1: print site-specific state frequency vectors */ - SiteFreqType print_site_state_freq; + int print_site_state_freq; /** - 0 (default): do not print .rate file - 1: print site-specific rates by empirical Bayes - 2: site-specific rates by maximum-likelihood + 0: do not print .rate or .mlrate files + 1: print site-specific rates by empirical Bayes + 2: site-specific rates by maximum-likelihood */ int print_site_rate; + SiteFreqType site_state_freq_type; + + SiteRateType site_rate_type; + /* 1: print site posterior probability for many trees during tree search */ int print_trees_site_posterior; @@ -2309,24 +2317,32 @@ class Params { /** precision when printing out for floating-point number */ int numeric_precision; - /** file containing state-frequencies per site for site-specific state frequency model - * each line has n+1 entries (n=number of states): + /** file containing state frequencies per site for site-specific state frequency model + * each line has n+1 entries (n = number of states): * site_ID state1_freq state2_freq ... staten_freq - * where site_ID is from 1 to m (m=number of sites) + * where site_ID is from 1 to m (m = number of sites) */ char *site_freq_file; - /** - user tree file used to estimate site-specific state frequency model - */ + /** file containing rate scalers per site for site-specific rate model + * each line has 1+1 entries: + * site_ID rate_scaler + * where site_ID is from 1 to m (m = number of sites) + */ + char *site_rate_file; + + /** user tree file used to estimate site-specific state frequency model */ char *tree_freq_file; + /** user tree file used to estimate site-specific rate model */ + char *tree_rate_file; + /** number of threads for OpenMP version */ int num_threads; - + /** maximum number of threads, default: #CPU scores */ int num_threads_max; - + /** true to parallel ModelFinder by models instead of sites */ bool openmp_by_model; @@ -2347,9 +2363,12 @@ class Params { */ bool print_bootaln; - /** TRUE to print bootstrapped site frequency for e.g. PMSF */ + /** TRUE to print bootstrapped site frequency for PMSF */ bool print_boot_site_freq; + /** TRUE to print bootstrapped site rate for PMSR */ + bool print_boot_site_rate; + /** true to print sub alignments of super alignment, default: false */ bool print_subaln; From 952f77edf425bf0ea564dbe6bcd7ebf38ac7bbc2 Mon Sep 17 00:00:00 2001 From: StefanFlaumberg Date: Tue, 11 Feb 2025 20:17:55 +0300 Subject: [PATCH 2/8] Fix ModelSet checkpointing --- model/modelset.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++- model/modelset.h | 47 +++++++++++++++++++++++++++++++++------------- 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/model/modelset.cpp b/model/modelset.cpp index fdb3d4f55..1d979c70d 100644 --- a/model/modelset.cpp +++ b/model/modelset.cpp @@ -25,7 +25,7 @@ ModelSet::ModelSet(const string model_name, ModelsBlock *models_block, : ModelMarkov(tree) { name = full_name = model_name; - full_name += "+site-specific state-frequency or rate model (unpublished)"; + full_name += "+site-specific state frequency or rate model (unpublished)"; // init the wrapper model to use its eigen ModelMarkov::init(FREQ_EMPIRICAL); // +F is used here to calculate +I under SSF ModelMarkov::fixParameters(true); // yet otherwise the wrapper model parameters remain unused @@ -79,6 +79,51 @@ ModelSet::~ModelSet() } } +void ModelSet::setCheckpoint(Checkpoint *checkpoint) +{ + CheckpointFactory::setCheckpoint(checkpoint); + front()->setCheckpoint(checkpoint); +} + +void ModelSet::startCheckpoint() +{ + checkpoint->startStruct("ModelSet"); +} + +void ModelSet::saveCheckpoint() +{ + startCheckpoint(); + checkpoint->startStruct("frontModel"); + front()->saveCheckpoint(); + checkpoint->endStruct(); + endCheckpoint(); +} + +void ModelSet::restoreCheckpoint() +{ + startCheckpoint(); + checkpoint->startStruct("frontModel"); + front()->restoreCheckpoint(); + checkpoint->endStruct(); + endCheckpoint(); + if (getNDim() > 0) { + double *state_freqs = new double[num_states]; + double *rate_mat = new double[getNumRateEntries()]; + getStateFrequency(state_freqs); + getRateMatrix(rate_mat); + for (iterator it = begin(); it != end(); it++) { + if (getFreqType() == FREQ_ESTIMATE) + (*it)->setStateFrequency(state_freqs); + (*it)->setRateMatrix(rate_mat); + } + delete [] state_freqs; + delete [] rate_mat; + } + decomposeRateMatrix(); + if (phylo_tree) + phylo_tree->clearAllPartialLH(); +} + string ModelSet::getName() { if (isSSF()) { diff --git a/model/modelset.h b/model/modelset.h index 83cb56cb6..e82d53eeb 100644 --- a/model/modelset.h +++ b/model/modelset.h @@ -34,20 +34,41 @@ class ModelSet : public ModelMarkov, public vector virtual ~ModelSet(); + /** + set checkpoint object + @param checkpoint + */ + virtual void setCheckpoint(Checkpoint *checkpoint); + + /** + start structure for checkpointing + */ + virtual void startCheckpoint(); + + /** + save object into the checkpoint + */ + virtual void saveCheckpoint(); + + /** + restore object from the checkpoint + */ + virtual void restoreCheckpoint(); + /** * @return TRUE if this is a site-specific model, FALSE otherwise */ virtual bool isSiteSpecificModel() { return true; } - /** - * @return TRUE if the model has site-specific frequencies, FALSE otherwise - */ - virtual bool isSSF() { return phylo_tree->aln->isSSF(); } + /** + * @return TRUE if the model has site-specific frequencies, FALSE otherwise + */ + virtual bool isSSF() { return phylo_tree->aln->isSSF(); } - /** - * @return TRUE if the model has site-specific rates, FALSE otherwise - */ - virtual bool isSSR() { return phylo_tree->aln->isSSR(); } + /** + * @return TRUE if the model has site-specific rates, FALSE otherwise + */ + virtual bool isSSR() { return phylo_tree->aln->isSSR(); } /** * get the size of transition matrix, default is num_states*num_states. @@ -78,18 +99,18 @@ class ModelSet : public ModelMarkov, public vector */ virtual void getRateMatrix(double *rate_mat); - /** + /** compute the state frequency vector. One should override this function when defining new model. The default is equal state sequency, valid for all kind of data. @param mixture (optional) class for mixture model @param[out] state_freqs state frequency vector. Assume state_freqs has size of num_states - */ + */ virtual void getStateFrequency(double *state_freqs, int mixture = 0); /** - compute Q matrix - @param q_mat (OUT) Q matrix, assuming of size num_states * num_states - */ + compute Q matrix + @param q_mat (OUT) Q matrix, assuming of size num_states * num_states + */ virtual void getQMatrix(double *q_mat, int mixture = 0); /** From 4d23d071729fd00b0d9f56a66e032044a66c56ee Mon Sep 17 00:00:00 2001 From: StefanFlaumberg Date: Wed, 19 Mar 2025 03:59:31 +0300 Subject: [PATCH 3/8] Fix restrictions for --mlrate --- main/treetesting.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/treetesting.cpp b/main/treetesting.cpp index 7fba40a6c..ec983896c 100644 --- a/main/treetesting.cpp +++ b/main/treetesting.cpp @@ -438,8 +438,8 @@ void printSiteStateFreq(const char *filename, PhyloTree *tree) { } void printSiteRate(const char *filename, PhyloTree *tree, bool bayes) { - if (tree->getRate()->getNRate() == 1 && !tree->isSuperTree()) - outError("No rate mixture model or partition model was specified!"); + if (bayes && tree->getRate()->getNRate() == 1) + outError("No rate mixture model was specified!"); try { ofstream out; out.exceptions(ios::failbit | ios::badbit); From b4a86d7b81e2a3a6cf3a8622dadfeb5167dbf458 Mon Sep 17 00:00:00 2001 From: StefanFlaumberg Date: Sat, 22 Mar 2025 17:51:20 +0300 Subject: [PATCH 4/8] Add site-specific rate normalization and fix a BNNI bug --- alignment/alignment.cpp | 86 +++++++++++++++++++++++++++++++---------- alignment/alignment.h | 8 +++- main/phyloanalysis.cpp | 9 +++-- model/modelset.cpp | 8 ++++ tree/iqtree.cpp | 60 +++++++++++++++------------- 5 files changed, 119 insertions(+), 52 deletions(-) diff --git a/alignment/alignment.cpp b/alignment/alignment.cpp index 8bffbcd9a..3ff67c1a3 100644 --- a/alignment/alignment.cpp +++ b/alignment/alignment.cpp @@ -1250,6 +1250,7 @@ void Alignment::orderPatternByNumChars(int pat_type) { void Alignment::ungroupSitePattern() { + // assign an individual pattern to each site vector stored_pat = (*this); clear(); IntVector stored_site_pattern = site_pattern; @@ -1288,24 +1289,24 @@ void Alignment::regroupSitePattern(int groups, const IntVector &site_group) IntVector stored_site_pattern = site_pattern; site_pattern.clear(); site_pattern.resize(stored_site_pattern.size(), -1); - size_t count = 0; + size_t cnt = 0; for (int g = 0; g < groups; ++g) { pattern_index.clear(); for (size_t i = 0; i < getNSite(); ++i) { if (site_group[i] == g) { - count++; + cnt ++; Pattern pat = stored_pat[stored_site_pattern[i]]; addPattern(pat, i); } } } - ASSERT(count == stored_site_pattern.size()); - // count pattern frequencies of the new patterns - count = 0; + ASSERT(cnt == getNSite()); + // check pattern frequencies of the new patterns + cnt = 0; for (iterator it = begin(); it != end(); ++it) { - count += it->frequency; + cnt += it->frequency; } - ASSERT(count == getNSite()); + ASSERT(cnt == getNSite()); pattern_index.clear(); //printPhylip("/dev/stdout"); // update the pattern_first_site map @@ -5679,7 +5680,6 @@ bool Alignment::readSiteParamFile(const char* site_param_file, const string &par cout << endl << "Reading site-specific " << msg << " file " << site_param_file << " ..." << endl; bool aln_changed = false; int specified_sites = 0; - int saturated_sites = 0; IntVector site_model(getNSite(), -1); // map each site to a model vector models; // site-specific frequency vectors or rate scalers // fill the pattern_first_site map @@ -5731,18 +5731,14 @@ bool Alignment::readSiteParamFile(const char* site_param_file, const string &par state_freqs[x] /= sum; } } - convfreq(state_freqs); // regularize frequencies (eg if some freqs are too close to 0) + convfreq(state_freqs); // regularize freqs (if some freqs are too close to 0) site_param_entry = state_freqs; } else { double rate; double *rate_scaler = new double; in >> rate; - if (rate < 0.0) throw "Negative rate not allowed"; - if (rate == 0.0) rate = MIN_SITE_RATE; - if (rate >= MAX_SITE_RATE) { - rate = MAX_SITE_RATE; - saturated_sites ++; - } + if (rate < 0.0 || rate > 100.0) throw "Rates must be non-negative and not higher than 100"; + rate = max(rate, MIN_SITE_RATE); // regularize rate (if it is too close to 0) *rate_scaler = rate; site_param_entry = rate_scaler; } @@ -5799,13 +5795,9 @@ bool Alignment::readSiteParamFile(const char* site_param_file, const string &par } } double *default_param_entry; - if (param_type == "freq") default_param_entry = NULL; else *default_param_entry = 1.0; + if (param_type == "freq") default_param_entry = NULL; else *default_param_entry = -1.0; models.push_back(default_param_entry); } - // saturated site warning - if (saturated_sites) { - cout << saturated_sites << "sites show too high rates (>=" << MAX_SITE_RATE << ")" << endl; - } // if needed, subdivide patterns so that sites in each new pattern have same freqs and rates if (aln_changed) { cout << "Regrouping alignment sites..." << endl; @@ -5825,6 +5817,60 @@ bool Alignment::readSiteParamFile(const char* site_param_file, const string &par return aln_changed; } +double Alignment::normalizePtnRateScaler() +{ + // the goal is to end up with mean rate == 1.0, + // some rates may become > MAX_SITE_RATE, but + // all rates must stay >= MIN_SITE_RATE + ASSERT(ptn_rate_scaler.size()); + int ptnf; + double rate; + size_t nptn = getNPattern(); + // calculate the mean rate value + size_t cnt = 0; + double sum = 0.0; + for (size_t ptn = 0; ptn < nptn; ++ptn) { + ptnf = at(ptn).frequency; + rate = ptn_rate_scaler[ptn]; + if (rate != -1.0) { + cnt += ptnf; + sum += rate * ptnf; + } + } + double mean = sum / cnt; + if (fabs(mean - 1.0) <= 1e-4) return 1.0; + // normalization: divide rates by their mean value + bool regularize = false; + size_t cnt_fast = 0; + double sum_slow = 0.0, sum_fast = 0.0; + for (size_t ptn = 0; ptn < nptn; ++ptn) { + ptnf = at(ptn).frequency; + rate = ptn_rate_scaler[ptn]; + rate = (rate != -1.0) ? (rate / mean) : 1.0; + ptn_rate_scaler[ptn] = rate; + // data for regularization + if (rate > 1.0) { + cnt_fast += ptnf; + sum_fast += rate * ptnf; + } else if (rate < MIN_SITE_RATE) { + regularize = true; + sum_slow += MIN_SITE_RATE * ptnf; + } else { + sum_slow += rate * ptnf; + } + } + if (!regularize) return mean; + // regularization: bound min rates and scale down the above-one parts of the fast rates + double coef = (getNSite() - cnt_fast - sum_slow) / (sum_fast - cnt_fast); + for (size_t ptn = 0; ptn < nptn; ++ptn) { + rate = ptn_rate_scaler[ptn]; + if (rate < MIN_SITE_RATE) rate = MIN_SITE_RATE; + if (rate > 1.0) rate = rate * coef + 1.0 - coef; + ptn_rate_scaler[ptn] = rate; + } + return mean; +} + /** * set the expected_num_sites (for alisim) * @param the expected_num_sites diff --git a/alignment/alignment.h b/alignment/alignment.h index 36a77ffb7..ee02ce481 100644 --- a/alignment/alignment.h +++ b/alignment/alignment.h @@ -917,7 +917,7 @@ class Alignment : public vector, public CharSet, public StateSpace { vector pomo_sampled_states; IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present - /* for site-specific state frequency model with Huaichun, Edward, Andrew */ + /* for site-specific models with Huaichun, Edward, Andrew */ /** pattern index to state frequency vector map */ vector ptn_state_freq; @@ -994,6 +994,12 @@ class Alignment : public vector, public CharSet, public StateSpace { */ bool readSiteParamFile(const char* site_param_file, const string ¶m_type); + /** + * normalize site-specific rates by their mean value + * @return mean rate before normalization + */ + double normalizePtnRateScaler(); + /** * special initialization for codon sequences, e.g., setting #states, genetic_code * @param sequence_type user-defined sequence type diff --git a/main/phyloanalysis.cpp b/main/phyloanalysis.cpp index 42b275243..81578afe6 100644 --- a/main/phyloanalysis.cpp +++ b/main/phyloanalysis.cpp @@ -3947,8 +3947,6 @@ void computeSiteSpecificModel(Params ¶ms, Alignment *alignment, const string ModelsBlock *models_block = readModelsDefinition(params); tree->setModelFactory(new ModelFactory(params, alignment->model_name, tree, models_block)); delete models_block; - tree->setModel(tree->getModelFactory()->model); - tree->setRate(tree->getModelFactory()->site_rate); if (!tree->getModel()->isReversible()) outError("Non-reversible models are incompatible with site-specific models"); if (tree->getModel()->isMixture() && !tree->getModel()->isMixtureSameQ()) @@ -3983,16 +3981,19 @@ void computeSiteSpecificModel(Params ¶ms, Alignment *alignment, const string for (size_t ptn = 0; ptn < nptn; ptn++) { double *state_freqs = new double[nstates]; memcpy(state_freqs, all_ptn_state_freq + ptn*nstates, sizeof(double)*nstates); + alignment->convfreq(state_freqs); // regularize freqs alignment->ptn_state_freq.push_back(state_freqs); } - ASSERT(alignment->ptn_state_freq.size() == nptn); delete [] all_ptn_state_freq; + ASSERT(alignment->ptn_state_freq.size() == nptn); params.site_state_freq_type = WSF_NONE; } else { DoubleVector ptn_rate; tree->computePatternRate(ptn_rate); for (size_t ptn = 0; ptn < nptn; ptn++) { - alignment->ptn_rate_scaler.push_back(ptn_rate[ptn]); + double rate = ptn_rate[ptn]; + rate = min(max(rate, MIN_SITE_RATE), MAX_SITE_RATE); // regularize rate + alignment->ptn_rate_scaler.push_back(rate); } ASSERT(alignment->ptn_rate_scaler.size() == nptn); params.site_rate_type = WSR_NONE; diff --git a/model/modelset.cpp b/model/modelset.cpp index 1d979c70d..8ad2b5859 100644 --- a/model/modelset.cpp +++ b/model/modelset.cpp @@ -63,6 +63,14 @@ ModelSet::ModelSet(const string model_name, ModelsBlock *models_block, } delete [] state_freqs; delete [] rate_mat; + // normalize site-specific rates (if any) and rescale the tree + if (isSSR()) { + double mean_rate = phylo_tree->aln->normalizePtnRateScaler(); + if (mean_rate != 1.0) { + phylo_tree->scaleLength(mean_rate); + phylo_tree->clearAllPartialLH(); + } + } // generate the site-specific eigen of the wrapper model joinEigenMemory(); decomposeRateMatrix(); diff --git a/tree/iqtree.cpp b/tree/iqtree.cpp index 85f35c560..848ecc1ed 100644 --- a/tree/iqtree.cpp +++ b/tree/iqtree.cpp @@ -2679,20 +2679,22 @@ void IQTree::refineBootTrees() { if (CKP_RESTORE(refined_samples)) cout << "CHECKPOINT: " << refined_samples << " refined samples restored" << endl; checkpoint->endStruct(); - + // 2018-08-17: delete duplicated memory deleteAllPartialLh(); ModelsBlock *models_block = readModelsDefinition(*params); - - // do bootstrap analysis - for (int sample = refined_samples; sample < boot_trees.size(); sample++) { + + // do bootstrap analysis + for (int sample = refined_samples; sample < boot_trees.size(); sample++) { + // create bootstrap alignment Alignment* bootstrap_alignment; - if (aln->isSuperAlignment()) + if (aln->isSuperAlignment()) { bootstrap_alignment = new SuperAlignment; - else + } else { bootstrap_alignment = new Alignment; + } bootstrap_alignment->createBootstrapAlignment(aln, NULL, params->bootstrap_spec); // create bootstrap tree @@ -2706,31 +2708,41 @@ void IQTree::refineBootTrees() { } else { // allocate heterotachy tree if neccessary int pos = posRateHeterotachy(aln->model_name); - if (params->num_mixlen > 1) { boot_tree = new PhyloTreeMixlen(bootstrap_alignment, params->num_mixlen); } else if (pos != string::npos) { boot_tree = new PhyloTreeMixlen(bootstrap_alignment, 0); - } else + } else { boot_tree = new IQTree(bootstrap_alignment); + } } boot_tree->on_refine_btree = true; boot_tree->save_all_trees = 0; - // initialize constraint tree - if (!constraintTree.empty()) { - boot_tree->constraintTree.readConstraint(constraintTree); - } - // set likelihood kernel boot_tree->setParams(params); boot_tree->setLikelihoodKernel(sse); boot_tree->setNumThreads(num_threads); // 2019-06-03: bug fix setting part_info properly - if (boot_tree->isSuperTree()) + if (boot_tree->isSuperTree()) { ((PhyloSuperTree*)boot_tree)->setPartInfo((PhyloSuperTree*)this); + } + + // initialize constraint tree + if (!constraintTree.empty()) { + boot_tree->constraintTree.readConstraint(constraintTree); + } + + // load the current ufboot tree + // 2019-02-06: fix crash with -sp and -bnni + // 2025-03-20: fix crash with -rt and -bnni (tree should be read before model) + if (isSuperTree()) { + boot_tree->PhyloTree::readTreeString(boot_trees[sample]); + } else { + boot_tree->readTreeString(boot_trees[sample]); + } // copy model // BQM 2019-05-31: bug fix with -bsam option @@ -2739,25 +2751,19 @@ void IQTree::refineBootTrees() { boot_tree->initializeModel(*params, aln->model_name, models_block); verbose_mode = saved_mode; boot_tree->getModelFactory()->setCheckpoint(getCheckpoint()); - if (isSuperTree()) + if (isSuperTree()) { ((PartitionModel*)boot_tree->getModelFactory())->PartitionModel::restoreCheckpoint(); - else + } else { boot_tree->getModelFactory()->restoreCheckpoint(); + } - // load the current ufboot tree - // 2019-02-06: fix crash with -sp and -bnni - if (isSuperTree()) - boot_tree->PhyloTree::readTreeString(boot_trees[sample]); - else - boot_tree->readTreeString(boot_trees[sample]); - + // re-initialize branch lengths for unlinked model if (boot_tree->isSuperTree() && params->partition_type == BRLEN_OPTIMIZE) { if (((PhyloSuperTree*)boot_tree)->size() > 1) { - // re-initialize branch lengths for unlinked model boot_tree->wrapperFixNegativeBranch(true); } } - + // TODO: check if this resolves the crash in reorientPartialLh() boot_tree->initializeAllPartialLh(); @@ -2809,8 +2815,8 @@ void IQTree::refineBootTrees() { checkpoint->dump(); - } - + } + delete models_block; cout << "Total " << refined_trees << " ufboot trees refined" << endl; From 3e13b78b70bb0f8b580cc2371e71e55346c3373e Mon Sep 17 00:00:00 2001 From: StefanFlaumberg Date: Wed, 7 May 2025 19:51:14 +0300 Subject: [PATCH 5/8] Fix site-specific options and improve help --- utils/tools.cpp | 261 +++++++++++++++++++++++++----------------------- 1 file changed, 134 insertions(+), 127 deletions(-) diff --git a/utils/tools.cpp b/utils/tools.cpp index 0d56640d7..9408d717e 100644 --- a/utils/tools.cpp +++ b/utils/tools.cpp @@ -2873,10 +2873,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { throw " must be positive!"; continue; } - if (params.alisim_active && strcmp(argv[cnt], "--site-rate") == 0) { + if (strcmp(argv[cnt], "--site-rate-type") == 0) { cnt++; if (cnt >= argc) - throw "Use --site-rate MEAN/SAMPLING/MODEL"; + throw "Use --site-rate-type MEAN/SAMPLING/MODEL"; // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); @@ -2887,13 +2887,13 @@ void parseArg(int argc, char *argv[], Params ¶ms) { else if (option.compare("MODEL") == 0) params.alisim_rate_heterogeneity = UNSPECIFIED; else - throw "Use --site-rate MEAN/SAMPLING/MODEL"; + throw "Use --site-rate-type MEAN/SAMPLING/MODEL"; continue; } - if (params.alisim_active && strcmp(argv[cnt], "--site-freq") == 0) { + if (strcmp(argv[cnt], "--site-freq-type") == 0) { cnt++; if (cnt >= argc) - throw "Use --site-freq MEAN/SAMPLING/MODEL"; + throw "Use --site-freq-type MEAN/SAMPLING/MODEL"; // convert option to uppercase string option = argv[cnt]; transform(option.begin(), option.end(), option.begin(), ::toupper); @@ -2904,7 +2904,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { else if (option.compare("MODEL") == 0) params.alisim_stationarity_heterogeneity = UNSPECIFIED; else - throw "Use --site-freq MEAN/SAMPLING/MODEL"; + throw "Use --site-freq-type MEAN/SAMPLING/MODEL"; continue; } if (strcmp(argv[cnt], "--indel") == 0) { @@ -3284,7 +3284,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.model_subset = argv[cnt]; continue; } - if (strcmp(argv[cnt], "-mfreq") == 0 || strcmp(argv[cnt], "--freqs") == 0) { + if (strcmp(argv[cnt], "-mfreq") == 0 || strcmp(argv[cnt], "--mfreq") == 0 || strcmp(argv[cnt], "--freqs") == 0) { cnt++; if (cnt >= argc) throw "Use -mfreq "; @@ -3490,8 +3490,8 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } if (strcmp(argv[cnt], "-rs") == 0 || strcmp(argv[cnt], "--site-rate") == 0) { - if (params.tree_rate_file || params.tree_freq_file) - throw "Specifying both -rs and -rt or -ft or -frt not allowed"; + if (params.tree_freq_file || params.tree_rate_file) + throw "Specifying both -rs and -ft or -rt or -frt not allowed"; cnt++; if (cnt >= argc) throw "Use -rs "; @@ -3499,8 +3499,8 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } if (strcmp(argv[cnt], "-ft") == 0 || strcmp(argv[cnt], "--tree-freq") == 0) { - if (params.site_freq_file) - throw "Specifying both -fs and -ft not allowed"; + if (params.site_freq_file || params.site_rate_file) + throw "Specifying both -fs or -rs and -ft not allowed"; if (params.tree_rate_file) throw "Specifying both -ft and -rt not allowed, try using -frt option"; cnt++; @@ -3540,7 +3540,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { continue; } - if (strcmp(argv[cnt], "-fconst") == 0) { + if (strcmp(argv[cnt], "-fconst") == 0 || strcmp(argv[cnt], "--fconst") == 0) { cnt++; if (cnt >= argc) throw "Use -fconst "; @@ -5960,9 +5960,9 @@ void usage(char* argv[]) { cout << endl; cout << "OPTIONS FOR GENOMIC EPIDEMIOLOGICAL ANALYSES:" << endl; cout << " --pathogen Apply CMAPLE tree search algorithm if sequence" << endl; - cout << " divergence is low, otherwise, apply IQ-TREE algorithm." << endl; + cout << " divergence is low, otherwise apply IQ-TREE algorithm" << endl; cout << " --pathogen-force Apply CMAPLE tree search algorithm regardless" << endl; - cout << " of sequence divergence." << endl; + cout << " of sequence divergence" << endl; cout << endl; //cout << "HIDDEN OPTIONS: see the source code file pda.cpp::parseArg()" << endl; @@ -5970,51 +5970,53 @@ void usage(char* argv[]) { } void usage_alisim(){ - cout << endl << "ALISIM: ALIGNMENT SIMULATOR" << endl + cout + << endl << "ALISIM: ALIGNMENT SIMULATOR" << endl << endl << "Usage: iqtree --alisim [-m MODEL] [-t TREE] ..." << endl << endl - << " --alisim OUTPUT_ALIGNMENT Activate AliSim and specify the output alignment filename"<< endl - << " -t TREE_FILE Set the input tree file name" << endl - << " --length LENGTH Set the length of the root sequence" << endl - << " --num-alignments NUMBER Set the number of output datasets" << endl - << " --seqtype STRING BIN, DNA, AA, CODON, MORPH{NUM_STATES} (default: auto-detect)" << endl - << " For morphological data, 0, Set the insertion and deletion rate of the indel model,"<< endl - << " relative to the substitution rate"<< endl - << " --indel-size , Set the insertion and deletion size distributions" << endl - << " --sub-level-mixture Enable the feature to simulate substitution-level mixture model"<< endl - << " --no-unaligned Disable outputing a file of unaligned sequences "<< endl - << " when using indel models"<< endl - << " --root-seq FILE,SEQ_NAME Specify the root sequence from an alignment" << endl - << " -s FILE Specify the input sequence alignment" << endl - << " --no-copy-gaps Disable copying gaps from input alignment (default: false)" << endl - << " --site-freq