From 9e9a13ba66176bb2a21fe7a3942060a9ed60f6ec Mon Sep 17 00:00:00 2001 From: Noah Wahl Date: Wed, 5 Jun 2024 23:18:20 +0300 Subject: [PATCH 1/4] fix(undatedDTL): fix survival normalization --- src/undated.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/undated.cpp b/src/undated.cpp index bec7a91..b63e19a 100644 --- a/src/undated.cpp +++ b/src/undated.cpp @@ -728,7 +728,6 @@ scalar_type exODT_model::pun(approx_posterior *ale, bool verbose,bool no_T) } survive=0; root_sum=0; - O_norm=0; bool single_O=false; for (int e=0;e Date: Wed, 5 Jun 2024 23:30:34 +0300 Subject: [PATCH 2/4] chore(format): add clang-format and format everything --- .clang-format | 7 + CMakeLists.txt | 6 + misc/ALE_compareTreeDistributions.cpp | 165 +- misc/ALEml-verbose.cpp | 320 +-- misc/scALE/scALE.cpp | 593 +++-- misc/scALE/scALE.h | 115 +- misc/times_undated.cpp | 121 +- misc/undated_fast.cpp | 1672 ++++++------ src/ALE.cpp | 2762 ++++++++++---------- src/ALE.h | 380 ++- src/ALE_tutorial.cpp | 184 +- src/ALE_util.cpp | 658 +++-- src/ALE_util.h | 25 +- src/ALEadd.cpp | 128 +- src/ALEcount.cpp | 17 +- src/ALEevaluate_undated.cpp | 266 +- src/ALEmcmc_undated.cpp | 823 +++--- src/ALEml.cpp | 521 ++-- src/ALEml_scaled.cpp | 239 +- src/ALEml_undated.cpp | 941 +++---- src/ALEobserve.cpp | 60 +- src/ALEprune.cpp | 745 +++--- src/ALEsample.cpp | 354 +-- src/CCPscore.cpp | 15 +- src/computeALEcomplexity.cpp | 120 +- src/exODT.cpp | 844 +++--- src/exODT.h | 495 ++-- src/exODT_sim.cpp | 927 +++---- src/exODT_sim.h | 103 +- src/fractionMissing.cpp | 51 +- src/fractionMissing.h | 4 +- src/ls_leaves.cpp | 42 +- src/mlresampler.cpp | 235 +- src/mlresampler_undated.cpp | 221 +- src/mlsampler.cpp | 362 +-- src/model.cpp | 1497 +++++------ src/model_omp.cpp | 3402 +++++++++++++------------ src/model_qvec.cpp | 1590 ++++++------ src/model_scaled.cpp | 1383 +++++----- src/mpi_S_ml.cpp | 452 ++-- src/mpi_ml-bw_undated.cpp | 498 ++-- src/mpi_ml.cpp | 187 +- src/mpi_ml_undated.cpp | 310 +-- src/mpi_tree.cpp | 1765 ++++++------- src/mpi_tree.h | 222 +- src/omp_test.cpp | 99 +- src/pairHasher.h | 27 +- src/parse_maxtic.cpp | 57 +- src/sample.cpp | 1913 +++++++------- src/sample_qvec.cpp | 1901 +++++++------- src/sample_scaled.cpp | 2260 ++++++++-------- src/simulateSpAndGeneTrees.cpp | 194 +- src/simulation.cpp | 1009 ++++---- src/test.cpp | 158 +- src/test_simpleML.cpp | 85 +- src/times.cpp | 198 +- src/times_undated.cpp | 121 +- src/traceback.cpp | 2145 ++++++++-------- src/traceback_qvec.cpp | 1356 +++++----- src/traceback_scaled.cpp | 1568 ++++++------ src/undated.cpp | 1823 +++++++------ src/wALE_ml_sample.cpp | 429 ++-- src/wALE_ml_sample_undated.cpp | 388 +-- src/wol_host.cpp | 103 +- src/wol_paras.cpp | 104 +- src/wol_tree.cpp | 104 +- 66 files changed, 21142 insertions(+), 20727 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..9630e47 --- /dev/null +++ b/.clang-format @@ -0,0 +1,7 @@ +# see https://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- +BasedOnStyle: LLVM +Language: Cpp +Standard: Cpp11 + +UseTab: Never diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e0670d..241c663 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,12 @@ PROJECT(ale CXX) SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) add_definitions(-std=c++11) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# ignore BOOST deprecated headers +add_definitions("-DBOOST_TIMER_ENABLE_DEPRECATED") +# add_definitions("-DBOOST_BIND_GLOBAL_PLACEHOLDERS") + IF(NOT NO_VIRTUAL_COV) SET(NO_VIRTUAL_COV FALSE CACHE BOOL "Disable covariant return type with virtual inheritance, for compilers that do not support it." diff --git a/misc/ALE_compareTreeDistributions.cpp b/misc/ALE_compareTreeDistributions.cpp index dc40eb4..9707fd1 100644 --- a/misc/ALE_compareTreeDistributions.cpp +++ b/misc/ALE_compareTreeDistributions.cpp @@ -2,97 +2,104 @@ #include "ALE_util.h" using namespace std; -//Compilation: g++ -o ALE_compareTreeDistributions ALE_compareTreeDistributions.cpp ALE.h ALE.cpp ALE_util.h ALE_util.cpp exODT.h exODT.cpp model_omp.cpp -std=c++0x -I/usr/local/include -L. -L/usr/local/lib -lbpp-core -lbpp-seq -lbpp-phyl -// g++ -g -o ALE_compareTreeDistributions ALE_compareTreeDistributions.cpp ALE.h ALE.cpp ALE_util.h ALE_util.cpp exODT.h exODT.cpp model_omp.cpp -std=c++0x -I/usr/include -I/home/ssolo/newest_bpp/include -L. -L/home/ssolo/newest_bpp/lib -L/usr/lib -lbpp-core -lbpp-seq -lbpp-phyl +// Compilation: g++ -o ALE_compareTreeDistributions +// ALE_compareTreeDistributions.cpp ALE.h ALE.cpp ALE_util.h ALE_util.cpp +// exODT.h exODT.cpp model_omp.cpp -std=c++0x -I/usr/local/include -L. +// -L/usr/local/lib -lbpp-core -lbpp-seq -lbpp-phyl +// g++ -g -o ALE_compareTreeDistributions ALE_compareTreeDistributions.cpp +// ALE.h ALE.cpp ALE_util.h ALE_util.cpp exODT.h exODT.cpp model_omp.cpp +// -std=c++0x -I/usr/include -I/home/ssolo/newest_bpp/include -L. +// -L/home/ssolo/newest_bpp/lib -L/usr/lib -lbpp-core -lbpp-seq -lbpp-phyl -int main(int argc, char ** argv) -{ - if (argc == 1 ) { - std::cout << "\tUsage: ALE_compareTreeDistributions treeDist1.trees burnin1 treeDist2.trees burnin2\n" < burnin; +int main(int argc, char **argv) { + if (argc == 1) { + std::cout << "\tUsage: ALE_compareTreeDistributions treeDist1.trees " + "burnin1 treeDist2.trees burnin2\n" + << std::endl; + } else { + // First tree distribution + string ale_file = argv[1]; + string ale_name = ale_file + ".ale"; + approx_posterior *ale; + vector burnin; - if (argc>2) - burnin.push_back(atoi(argv[2])); - vector every ; -if (argc>3) - every.push_back(atoi(argv[3])); + if (argc > 2) + burnin.push_back(atoi(argv[2])); + vector every; + if (argc > 3) + every.push_back(atoi(argv[3])); - ale=observe_ALE_from_file(ale_file, burnin[0], every[0]); + ale = observe_ALE_from_file(ale_file, burnin[0], every[0]); - cout << "# observe "<< ale->observations << "trees from: " << argv[1] << endl; - ale->save_state(ale_name); - cout << "# saved in "<< ale_name<observations << "trees from: " << argv[1] + << endl; + ale->save_state(ale_name); + cout << "# saved in " << ale_name << endl; - //Second tree distribution - string ale_file2=argv[4]; - string ale_name2=ale_file+".ale"; - approx_posterior * ale2; + // Second tree distribution + string ale_file2 = argv[4]; + string ale_name2 = ale_file + ".ale"; + approx_posterior *ale2; - if (argc>4) - burnin.push_back(atoi(argv[5])); - int every2 = 0; - if (argc>5) - every.push_back(atoi(argv[6])); + if (argc > 4) + burnin.push_back(atoi(argv[5])); + int every2 = 0; + if (argc > 5) + every.push_back(atoi(argv[6])); - ale2=observe_ALE_from_file(ale_file2, burnin[1], every[1]); - cout << "# observe "<< ale2->observations << "trees from: " << argv[4] << endl; - ale2->save_state(ale_name2); - cout << "# saved in "<< ale_name2<observations << "trees from: " << argv[4] + << endl; + ale2->save_state(ale_name2); + cout << "# saved in " << ale_name2 << endl; - //Now we want to compute the probabilities of all trees included in the files, according to the two ales. - //First, we get all trees and put them in a single vector + // Now we want to compute the probabilities of all trees included in the + // files, according to the two ales. First, we get all trees and put them in + // a single vector - vector trees; - std::vector< string > fnames; - fnames.push_back(argv[1]); - fnames.push_back(argv[4]); - size_t fileId = 0; - for (vector::iterator it=fnames.begin();it!=fnames.end();it++) - { - string fname=(*it); - ifstream file_stream (fname.c_str()); - int tree_i=0; - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - if (line.find("(")!=line.npos ) - { - tree_i++; - if (tree_i%every[fileId]==0 ) trees.push_back(line); - } - } - } - fileId++; + vector trees; + std::vector fnames; + fnames.push_back(argv[1]); + fnames.push_back(argv[4]); + size_t fileId = 0; + for (vector::iterator it = fnames.begin(); it != fnames.end(); + it++) { + string fname = (*it); + ifstream file_stream(fname.c_str()); + int tree_i = 0; + if (file_stream.is_open()) // ########## read trees ############ + { + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + if (line.find("(") != line.npos) { + tree_i++; + if (tree_i % every[fileId] == 0) + trees.push_back(line); + } + } + } + fileId++; } - string outFile; - if (argc>4) - outFile=atoi(argv[5]); - else - outFile = "outFile.txt"; + string outFile; + if (argc > 4) + outFile = atoi(argv[5]); + else + outFile = "outFile.txt"; + + ofstream myfile; + myfile.open(outFile); + myfile << "ALE1_proba\tALE2_proba" << std::endl; - ofstream myfile; - myfile.open (outFile); - myfile << "ALE1_proba\tALE2_proba"<p(trees[i]) << "\t" << ale2->p(trees[i]) << std::endl; - cout <p(trees[i]) << "\t" << ale2->p(trees[i]) << endl; + for (size_t i = 0; i < trees.size(); ++i) { + myfile << ale->p(trees[i]) << "\t" << ale2->p(trees[i]) << std::endl; + cout << ale->p(trees[i]) << "\t" << ale2->p(trees[i]) << endl; + } + myfile.flush(); + myfile.close(); + delete ale; + delete ale2; } - myfile.flush(); - myfile.close(); - delete ale; - delete ale2; -} return 1; - } diff --git a/misc/ALEml-verbose.cpp b/misc/ALEml-verbose.cpp index e46af11..bafcc13 100644 --- a/misc/ALEml-verbose.cpp +++ b/misc/ALEml-verbose.cpp @@ -1,206 +1,214 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; - -class p_fun: - public virtual Function, - public AbstractParametrizable -{ + +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - exODT_model* model_pointer; - approx_posterior* ale_pointer; -public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1//,double sigma_hat_start=1. -) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - //addParameter_( new Parameter("sigma_hat", sigma_hat_start, constraint) ) ; + exODT_model *model_pointer; + approx_posterior *ale_pointer; +public: + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, + double lambda_start = 0.1 //,double sigma_hat_start=1. + ) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); + // addParameter_( new Parameter("sigma_hat", sigma_hat_start, constraint) ) + // ; } - - p_fun* clone() const { return new p_fun(*this); } - + + p_fun *clone() const { return new p_fun(*this); } + public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); - } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - //double sigma_hat = getParameterValue("sigma_hat"); - - model_pointer->set_model_parameter("delta",delta); - model_pointer->set_model_parameter("tau",tau); - model_pointer->set_model_parameter("lambda",lambda); - //model_pointer->set_model_parameter("sigma_hat",sigma_hat); - model_pointer->calculate_EGb(); - double y=-log(model_pointer->p(ale_pointer)); - cout <set_model_parameter("delta", delta); + model_pointer->set_model_parameter("tau", tau); + model_pointer->set_model_parameter("lambda", lambda); + // model_pointer->set_model_parameter("sigma_hat",sigma_hat); + model_pointer->calculate_EGb(); + double y = -log(model_pointer->p(ale_pointer)); + cout << endl + << "delta=" << delta << "\t tau=" << tau + << "\t lambda=" << lambda //<< "\t lambda="<observations<<" trees from: " << ale_file <<".."<3) + ifstream file_stream_S(argv[1]); + getline(file_stream_S, Sstring); + cout << "Read species tree from: " << argv[1] << ".." << endl; + // we need an .ale file containing observed conditional clade probabilities + // cf. ALEobserve + string ale_file = argv[2]; + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + cout << "Read summary of tree sample for " << ale->observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + int D = 3; + if (argc > 3) model->set_model_parameter("gene_name_separators", argv[3]); - - model->set_model_parameter("min_D",D); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", D); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; - if (argc>6) - delta=atof(argv[4]),tau=atof(argv[5]),lambda=atof(argv[6]); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); + + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; + if (argc > 6) + delta = atof(argv[4]), tau = atof(argv[5]), lambda = atof(argv[6]); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_EGb(); - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); optimizer->setMessageHandler(0); optimizer->setVerbose(2); - + optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - - - - // FunctionStopCondition stop(optimizer, 1);//1e-1); - // optimizer->setStopCondition(stop); - //TEMP - //optimizer->setMaximumNumberOfEvaluations( 10 ); - + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. + + // FunctionStopCondition stop(optimizer, 1);//1e-1); + // optimizer->setStopCondition(stop); + // TEMP + // optimizer->setMaximumNumberOfEvaluations( 10 ); + optimizer->optimize(); - - //optimizer->getParameters().printParameters(cout); - delta=optimizer->getParameterValue("delta"); - tau=optimizer->getParameterValue("tau"); - lambda=optimizer->getParameterValue("lambda"); - //scalar_type sigma_hat=optimizer->getParameterValue("sigma_hat"); - - scalar_type mlll=-optimizer->getFunctionValue(); - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="<getParameters().printParameters(cout); + delta = optimizer->getParameterValue("delta"); + tau = optimizer->getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + // scalar_type sigma_hat=optimizer->getParameterValue("sigma_hat"); + + scalar_type mlll = -optimizer->getFunctionValue(); + cout << endl + << "ML rates: " + << " delta=" << delta << "; tau=" << tau + << "; lambda=" << lambda //<<"; sigma="< res = model->p_MLRec(ale); - //and output it.. - string outname=ale_file+".ml_rec"; - ofstream fout( outname.c_str() ); - fout << "#ALEml using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] < res = model->p_MLRec(ale); + // and output it.. + string outname = ale_file + ".ml_rec"; + ofstream fout(outname.c_str()); + fout << "#ALEml using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl + << endl; + fout << "S:\t" << model->string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Input ale from:\t"<MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; fout << endl; - fout << "# of\t Duplications\tTransfers\tLosses\tgene copies" <counts_string(); - + cout << "Results in: " << outname << endl; - string voutname=ale_file+".vstrings"; - ofstream vout( voutname.c_str() ); - - for (std::map >::iterator it=model->gid_branches.begin();it!=model->gid_branches.end();it++) - { - long int g_id=(*it).first; - vout << g_id << " " << model->vertical_string(g_id) << endl; - /* - for (int i = 0; i < (int)(*it).second.size(); i++) - { - int branch = model->gid_branches[g_id][i]; - stringstream named_branch; - if (branch==model->alpha) - named_branch<<-1; - else if (model->id_ranks[branch]==0) - { - named_branch<extant_species[branch]; - } - else - named_branch<id_ranks[branch]; - - vout << "\t" << model->gid_events[g_id][i] << "\t" << model->gid_times[g_id][i]<< "\t" << named_branch.str() << "\t" << model->gid_gidp[g_id][i] << "\t" << model->gid_gidpp[g_id][i] << "\t" <>::iterator it = + model->gid_branches.begin(); + it != model->gid_branches.end(); it++) { + long int g_id = (*it).first; + vout << g_id << " " << model->vertical_string(g_id) << endl; + /* + for (int i = 0; i < (int)(*it).second.size(); i++) + { + int branch = model->gid_branches[g_id][i]; + stringstream named_branch; + if (branch==model->alpha) + named_branch<<-1; + else if (model->id_ranks[branch]==0) + { + named_branch<extant_species[branch]; + } + else + named_branch<id_ranks[branch]; + + vout << "\t" << model->gid_events[g_id][i] << "\t" << + model->gid_times[g_id][i]<< "\t" << named_branch.str() << "\t" << + model->gid_gidp[g_id][i] << "\t" << model->gid_gidpp[g_id][i] << "\t" + < #include "scALE.h" +#include -scALE::scALE() -{ - //some default parameters - string_parameter["gene_name_separators"]="_@"; - scalar_parameter["species_field"]=0; - scalar_parameter["event_node"]=0; - scalar_parameter["min_bip_count"]=-1; - scalar_parameter["min_branch_lenghts"]=0; - // length of "stem" branch above root - scalar_parameter["stem_length"]=1; - //number of subdiscretizations for ODE calculations - //Corresponds to maximum number of coalescences on a given branch of the species tree. - scalar_parameter["DD"]=10; +scALE::scALE() { + // some default parameters + string_parameter["gene_name_separators"] = "_@"; + scalar_parameter["species_field"] = 0; + scalar_parameter["event_node"] = 0; + scalar_parameter["min_bip_count"] = -1; + scalar_parameter["min_branch_lenghts"] = 0; + // length of "stem" branch above root + scalar_parameter["stem_length"] = 1; + // number of subdiscretizations for ODE calculations + // Corresponds to maximum number of coalescences on a given branch of the + // species tree. + scalar_parameter["DD"] = 10; } - -void scALE::construct( approx_posterior *sale ) -{ - sale_pointer = sale; - speciesNames = sale_pointer->getLeafNames(); - +void scALE::construct(approx_posterior *sale) { + sale_pointer = sale; + speciesNames = sale_pointer->getLeafNames(); } +scalar_type scALE::p(approx_posterior *gale) { + gale_pointer = gale; + approx_posterior *sale = sale_pointer; + // directed partitions and their sizes, for the gene and the species tree + // distributions + vector + g_ids; // del-loc. Vector of leaf set (=clade) ids for the + // gene tree distribution, ordered by their size, small to large. + vector g_id_sizes; // del-loc. Numbers of leaves in the above sets. + vector s_ids; // del-loc. Vector of leaf set (=clade) ids for the + // species tree distribution, ordered by their size, + // small to large. + vector s_id_sizes; // del-loc. Numbers of leaves in the above sets. + gale->computeOrderedVectorOfClades(g_ids, g_id_sizes); + sale->computeOrderedVectorOfClades(s_ids, s_id_sizes); + size_t numSpeciesClades = s_ids.size(); + size_t numGeneClades = g_ids.size(); + size_t numberOfSlicesPerBranch = scalar_parameter["DD"]; + // Need to empty q + for (std::vector>> q::iterator + it = q.begin(); + it != q.end(); ++it) { + for (size_t slice = 0; slice < numberOfSlicesPerBranch; ++slice) { + (*it).second[i] = 0.0; + } + } + if (q.size() != numSpeciesClades) + q.resize(numSpeciesClades); -scalar_type scALE::p(approx_posterior *gale) { - gale_pointer = gale; - approx_posterior *sale = sale_pointer; - //directed partitions and their sizes, for the gene and the species tree distributions - vector g_ids; //del-loc. Vector of leaf set (=clade) ids for the - //gene tree distribution, ordered by their size, small to large. - vector g_id_sizes; //del-loc. Numbers of leaves in the above sets. - - vector s_ids; //del-loc. Vector of leaf set (=clade) ids for the - //species tree distribution, ordered by their size, small to large. - vector s_id_sizes; //del-loc. Numbers of leaves in the above sets. - - gale->computeOrderedVectorOfClades ( g_ids, g_id_sizes ); - sale->computeOrderedVectorOfClades ( s_ids, s_id_sizes ); - size_t numSpeciesClades = s_ids.size(); - size_t numGeneClades = g_ids.size(); - size_t numberOfSlicesPerBranch = scalar_parameter["DD"]; - - - //Need to empty q - for ( std::vector < std::pair < long int, std::vector < scalar_type > > > q::iterator it=q.begin() ; it!=q.end() ; ++it ) - { - for (size_t slice=0; slice < numberOfSlicesPerBranch ; ++slice) { - (*it).second[i] = 0.0; - } - } - if ( q.size () != numSpeciesClades ) - q.resize ( numSpeciesClades ); - - - // gene<->species mapping - for ( int i=0; i<(int)g_ids.size(); i++ ) //Going through each clade of the gene approx_posterior + // gene<->species mapping + for (int i = 0; i < (int)g_ids.size(); + i++) // Going through each clade of the gene approx_posterior + { + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) // a leaf, mapping is by name { - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) //a leaf, mapping is by name - { - string gene_name = gale->id_leaves[(* (gale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - geneCladeIdToSpecies[g_id]=species_name; - } - else { - break; - } + string gene_name = gale->id_leaves[(*(gale->id_sets[g_id].begin()))]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + geneCladeIdToSpecies[g_id] = species_name; + } else { + break; } - + } - //Now, the main loop, iterating over the clades of the species tree distribution - long int spCladeId, gCladeId; - map< set,scalar_type> speciesCladeResolutions ; - map< set,scalar_type> geneCladeResolutions ; - for ( size_t i = 0 ; i < numSpeciesClades ; ++i ) - { - spCladeId = s_ids[i]; - speciesCladeResolutions = sale->Dip_counts[spCladeId]; - //Second loop, over the resolutions of the species tree clade spCladeId. - for (map< set,scalar_type> :: iterator spResolution = speciesCladeResolutions.begin(); spResolution != speciesCladeResolutions.end(); ++spResolution) //Going through all resolutions of the clade spCladeId - { - //Third loop, over the clades of the gene tree distribution. - for ( size_t j = 0 ; j < numGeneClades ; ++j ) - { - gCladeId = g_ids[j]; - geneCladeResolutions = gale->Dip_counts[gCladeId]; - //Fourth loop, over the resolutions of the gene tree clade gCladeId. - for (map< set,scalar_type> :: iterator gResolution = geneCladeResolutions.begin(); gResolution != geneCladeResolutions.end(); ++gResolution) //Going through all resolutions of the clade gCladeId - { - if (s_id_sizes[i] == 1) - {//The species tree clade is a leaf - std::string currentSpeciesId = sale->id_leaves[(* (sale->id_sets[spCladeId].begin()) )]; - if (g_id_sizes[i] == 1) - {//The gene tree clade is a leaf - if (sale->id_leaves[(* (sale->id_sets[spCladeId].begin()) )] == geneCladeIdToSpecies[gCladeId] ) - {//Gene corresponds to the species - q[ *(spResolution)->first ][gCladeId][0] = 1.0; - computeProbabilityOfCladeInSpeciesTreeBranch (gCladeId, *(spResolution)->first,numberOfSlicesPerBranch, q ); - } - else - { - for (size_t slice=0; slice < numberOfSlicesPerBranch ; ++slice) { - q[ *(spResolution)->first ][gCladeId][slice] = 0.0; - } - } - }//End the gene tree clade is a leaf - else - {//The gene tree clade is not a leaf - //Check that all leaves in the gene tree clade correspond to the species - std::set geneLeafIds = gale->id_sets[gCladeId]; - bool allGenesFromCurrentSpecies = true; - for (std::set::iterator it = geneLeafIds.begin(); it!= geneLeafIds.end() ++it ) { - if (geneCladeIdToSpecies [*it] != currentSpeciesId ) { - allGenesFromCurrentSpecies=false; - break; - } - } - if (allGenesFromCurrentSpecies) { - //need to compute the probability of observing the given clade in this species tree branch - q[ *(spResolution)->first ][gCladeId][0] = 0.0; //We can't have an entire clade already coalesced at sampling time! - computeProbabilityOfCladeInSpeciesTreeBranch (gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q ); - } - else { - for (size_t slice=0; slice < numberOfSlicesPerBranch ; ++slice) { - q[ *(spResolution)->first ][gCladeId][slice] = 0.0; - } - } - }//End the gene tree clade is not a leaf - }//End the species tree clade is a leaf - else - { //The species tree clade is not a leaf - std::set< std::string > speciesInClade; - std::set< int > speciesLeafIds = sale->id_sets[spCladeId]; - for (std::set::iterator it = speciesLeafIds.begin(); it!= speciesLeafIds.end() ++it ) { - speciesInClade.insert ( sale->id_leaves[(*it)] ); - } - if (g_id_sizes[i] == 1) - {//The gene tree clade is a leaf - computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch (gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q ); - if (q[ *(spResolution)->first ][gCladeId][0] == 0) { //The current gene comes from a species not in this clade - for (size_t slice = 1; slice < numberOfSlicesPerBranch ; ++slice) { - q[ *(spResolution)->first ][gCladeId][slice] = 0.0; - } - } - else { - computeProbabilityOfCladeInSpeciesTreeBranch (gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q ); - } - }//End the gene tree clade is a leaf - else - {//The gene tree clade is not a leaf - //Check that all leaves in the gene tree clade correspond to the species in the species tree clade - std::set geneLeafIds = gale->id_sets[gCladeId]; - bool allGenesFromCurrentSpecies = true; - for (std::set::iterator it = geneLeafIds.begin(); it!= geneLeafIds.end() ++it ) { - if (speciesInClade.find ( geneCladeIdToSpecies [*it] ) == speciesInClade.end() ) { - allGenesFromCurrentSpecies=false; - break; - } - } - if (allGenesFromCurrentSpecies) { - computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch (gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q ); - computeProbabilityOfCladeInSpeciesTreeBranch (gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q ); - } - else { - for (size_t slice=0; slice < numberOfSlicesPerBranch ; ++slice) { - q[ *(spResolution)->first ][gCladeId][slice] = 0.0; - } - } - }//End the gene tree clade is not a leaf - }//End the species tree clade is not a leaf - }//End loop over gene tree clade resolutions - }//End loop over gene tree clades - }//End loop over species tree clade resolutions - }//End loop over species tree clades - - //del-locs - g_ids.clear(); - g_id_sizes.clear(); - - return root_sum; + // Now, the main loop, iterating over the clades of the species tree + // distribution + long int spCladeId, gCladeId; + map, scalar_type> speciesCladeResolutions; + map, scalar_type> geneCladeResolutions; + for (size_t i = 0; i < numSpeciesClades; ++i) { + spCladeId = s_ids[i]; + speciesCladeResolutions = sale->Dip_counts[spCladeId]; + // Second loop, over the resolutions of the species tree clade spCladeId. + for (map, scalar_type>::iterator spResolution = + speciesCladeResolutions.begin(); + spResolution != speciesCladeResolutions.end(); + ++spResolution) // Going through all resolutions of the clade spCladeId + { + // Third loop, over the clades of the gene tree distribution. + for (size_t j = 0; j < numGeneClades; ++j) { + gCladeId = g_ids[j]; + geneCladeResolutions = gale->Dip_counts[gCladeId]; + // Fourth loop, over the resolutions of the gene tree clade gCladeId. + for (map, scalar_type>::iterator gResolution = + geneCladeResolutions.begin(); + gResolution != geneCladeResolutions.end(); + ++gResolution) // Going through all resolutions of the clade + // gCladeId + { + if (s_id_sizes[i] == 1) { // The species tree clade is a leaf + std::string currentSpeciesId = + sale->id_leaves[(*(sale->id_sets[spCladeId].begin()))]; + if (g_id_sizes[i] == 1) { // The gene tree clade is a leaf + if (sale->id_leaves[(*(sale->id_sets[spCladeId].begin()))] == + geneCladeIdToSpecies[gCladeId]) { // Gene corresponds to the + // species + q[*(spResolution)->first][gCladeId][0] = 1.0; + computeProbabilityOfCladeInSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, + q); + } else { + for (size_t slice = 0; slice < numberOfSlicesPerBranch; + ++slice) { + q[*(spResolution)->first][gCladeId][slice] = 0.0; + } + } + } // End the gene tree clade is a leaf + else { // The gene tree clade is not a leaf + // Check that all leaves in the gene tree clade correspond to the + // species + std::set geneLeafIds = gale->id_sets[gCladeId]; + bool allGenesFromCurrentSpecies = true; + for (std::set::iterator it = geneLeafIds.begin(); + it != geneLeafIds.end()++ it) { + if (geneCladeIdToSpecies[*it] != currentSpeciesId) { + allGenesFromCurrentSpecies = false; + break; + } + } + if (allGenesFromCurrentSpecies) { + // need to compute the probability of observing the given clade + // in this species tree branch + q[*(spResolution)->first][gCladeId][0] = + 0.0; // We can't have an entire clade already coalesced at + // sampling time! + computeProbabilityOfCladeInSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, + q); + } else { + for (size_t slice = 0; slice < numberOfSlicesPerBranch; + ++slice) { + q[*(spResolution)->first][gCladeId][slice] = 0.0; + } + } + } // End the gene tree clade is not a leaf + } // End the species tree clade is a leaf + else { // The species tree clade is not a leaf + std::set speciesInClade; + std::set speciesLeafIds = sale->id_sets[spCladeId]; + for (std::set::iterator it = speciesLeafIds.begin(); + it != speciesLeafIds.end()++ it) { + speciesInClade.insert(sale->id_leaves[(*it)]); + } + if (g_id_sizes[i] == 1) { // The gene tree clade is a leaf + computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, q); + if (q[*(spResolution)->first][gCladeId][0] == + 0) { // The current gene comes from a species not in this + // clade + for (size_t slice = 1; slice < numberOfSlicesPerBranch; + ++slice) { + q[*(spResolution)->first][gCladeId][slice] = 0.0; + } + } else { + computeProbabilityOfCladeInSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, + q); + } + } // End the gene tree clade is a leaf + else { // The gene tree clade is not a leaf + // Check that all leaves in the gene tree clade correspond to the + // species in the species tree clade + std::set geneLeafIds = gale->id_sets[gCladeId]; + bool allGenesFromCurrentSpecies = true; + for (std::set::iterator it = geneLeafIds.begin(); + it != geneLeafIds.end()++ it) { + if (speciesInClade.find(geneCladeIdToSpecies[*it]) == + speciesInClade.end()) { + allGenesFromCurrentSpecies = false; + break; + } + } + if (allGenesFromCurrentSpecies) { + computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, + q); + computeProbabilityOfCladeInSpeciesTreeBranch( + gCladeId, *(spResolution)->first, numberOfSlicesPerBranch, + q); + } else { + for (size_t slice = 0; slice < numberOfSlicesPerBranch; + ++slice) { + q[*(spResolution)->first][gCladeId][slice] = 0.0; + } + } + } // End the gene tree clade is not a leaf + } // End the species tree clade is not a leaf + } // End loop over gene tree clade resolutions + } // End loop over gene tree clades + } // End loop over species tree clade resolutions + } // End loop over species tree clades + + // del-locs + g_ids.clear(); + g_id_sizes.clear(); + + return root_sum; } +void scALE::computeProbabilityOfCladeInSpeciesTreeBranch( + int gCladeId, std::set speciesTreeResolution, + int numberOfSlicesPerBranch) { + // Length of a time slice + double timeSliceWidth = 1.0 / numberOfSlicesPerBranch; + map, scalar_type> geneTreeCladeResolutions; + geneTreeCladeResolutions = gale_pointer->Dip_counts[gCladeId]; + double thetaS = timeSliceWidth * NeTs[speciesTreeResolution]; + long int speciesClade1 = *(speciesTreeResolution.begin()); + long int speciesClade2 = + *(speciesTreeResolution + .end()); // Assuming there are only two clades (binary tree) + long int geneTreeCladeDaughter1; + long int geneTreeCladeDaughter2; + speciesClade1Resolutions = sale_pointer->Dip_counts[speciesClade1]; + speciesClade2Resolutions = sale_pointer->Dip_counts[speciesClade2]; + double speciesClade1ResolutionProbability; + double speciesClade2ResolutionProbability; -void scALE::computeProbabilityOfCladeInSpeciesTreeBranch (int gCladeId, - std::set < long int > speciesTreeResolution, - int numberOfSlicesPerBranch) { - //Length of a time slice - double timeSliceWidth = 1.0 / numberOfSlicesPerBranch; - map< set,scalar_type> geneTreeCladeResolutions; - geneTreeCladeResolutions = gale_pointer->Dip_counts[gCladeId]; - double thetaS = timeSliceWidth * NeTs[speciesTreeResolution]; - long int speciesClade1 = *(speciesTreeResolution.begin()); - long int speciesClade2 = *(speciesTreeResolution.end()); //Assuming there are only two clades (binary tree) - long int geneTreeCladeDaughter1; - long int geneTreeCladeDaughter2; - speciesClade1Resolutions = sale_pointer->Dip_counts[speciesClade1]; - speciesClade2Resolutions = sale_pointer->Dip_counts[speciesClade2]; - - double speciesClade1ResolutionProbability ; - double speciesClade2ResolutionProbability ; - - for (size_t slice=1; slice < numberOfSlicesPerBranch ; ++slice) - { //Going through all slices - //TODO ! - q[ speciesTreeResolution ][gCladeId][slice] = q[ speciesTreeResolution ][gCladeId][slice - 1]; - //First, we substract the probability of the current gene tree clade coalescing with another sister clade - //Here we assume that we have in gale a map cladeToSisterClades between gCladeId and a vector of sister clades. - std::vector < int > sisterClades = gale_pointer->cladeToSisterClades[gCladeId]; - for (std::vector < int >::iterator sisterClade = sisterClades.begin() ; sisterClade != sisterClades.end() ; ++sisterClade) - { //Going through all sister clades - sisterCladeResolutions = gale->Dip_counts[*(sisterClade)]; - for (map< set,scalar_type> :: iterator sisterCladeResolution = sisterCladeResolutions.begin(); sisterCladeResolution != sisterCladeResolutions.end(); ++sisterCladeResolution) //Going through all resolutions of the clade sisterClade - { - for (map< set,scalar_type> :: iterator speciesClade1Resolution = speciesClade1Resolutions.begin(); speciesClade1Resolution != speciesClade1Resolutions.end(); ++speciesClade1Resolution) //Going through all resolutions of the clade speciesClade1 - { - speciesClade1ResolutionProbability = *(speciesClade1Resolution)->second ; - for (map< set,scalar_type> :: iterator speciesClade2Resolution = speciesClade2Resolutions.begin(); speciesClade2Resolution != speciesClade2Resolutions.end(); ++speciesClade2Resolution) //Going through all resolutions of the clade speciesClade2 - { - speciesClade2ResolutionProbability = *(speciesClade2Resolution)->second ; - //Version written on the board in Lyon: q[ speciesTreeResolution ][gCladeId][slice] -= thetaS * *(sisterCladeResolution)->second * speciesClade1ResolutionProbability * speciesClade2ResolutionProbability * q[ speciesClade1Resolution ][sisterClade][slice-1] * q[ speciesClade2Resolution ][sisterClade][slice-1]; - //Corrected version: - q[ speciesTreeResolution ][gCladeId][slice] -= thetaS * *(sisterCladeResolution)->second * ( speciesClade1ResolutionProbability * q[ speciesClade1Resolution ][sisterClade][slice-1] +speciesClade2ResolutionProbability * q[ speciesClade2Resolution ][sisterClade][slice-1] ); + for (size_t slice = 1; slice < numberOfSlicesPerBranch; + ++slice) { // Going through all slices + // TODO ! + q[speciesTreeResolution][gCladeId][slice] = + q[speciesTreeResolution][gCladeId][slice - 1]; + // First, we substract the probability of the current gene tree clade + // coalescing with another sister clade Here we assume that we have in gale + // a map cladeToSisterClades between gCladeId and a vector of sister clades. + std::vector sisterClades = gale_pointer->cladeToSisterClades[gCladeId]; + for (std::vector::iterator sisterClade = sisterClades.begin(); + sisterClade != sisterClades.end(); + ++sisterClade) { // Going through all sister clades + sisterCladeResolutions = gale->Dip_counts[*(sisterClade)]; + for (map, scalar_type>::iterator sisterCladeResolution = + sisterCladeResolutions.begin(); + sisterCladeResolution != sisterCladeResolutions.end(); + ++sisterCladeResolution) // Going through all resolutions of the + // clade sisterClade + { + for (map, scalar_type>::iterator speciesClade1Resolution = + speciesClade1Resolutions.begin(); + speciesClade1Resolution != speciesClade1Resolutions.end(); + ++speciesClade1Resolution) // Going through all resolutions of the + // clade speciesClade1 + { + speciesClade1ResolutionProbability = + *(speciesClade1Resolution)->second; + for (map, scalar_type>::iterator + speciesClade2Resolution = speciesClade2Resolutions.begin(); + speciesClade2Resolution != speciesClade2Resolutions.end(); + ++speciesClade2Resolution) // Going through all resolutions of + // the clade speciesClade2 + { + speciesClade2ResolutionProbability = + *(speciesClade2Resolution)->second; + // Version written on the board in Lyon: q[ speciesTreeResolution + // ][gCladeId][slice] -= thetaS * *(sisterCladeResolution)->second * + // speciesClade1ResolutionProbability * + // speciesClade2ResolutionProbability * q[ speciesClade1Resolution + // ][sisterClade][slice-1] * q[ speciesClade2Resolution + // ][sisterClade][slice-1]; Corrected version: + q[speciesTreeResolution][gCladeId][slice] -= + thetaS * *(sisterCladeResolution)->second * + (speciesClade1ResolutionProbability * + q[speciesClade1Resolution][sisterClade][slice - 1] + + speciesClade2ResolutionProbability * + q[speciesClade2Resolution][sisterClade][slice - 1]); - } //End loop over resolutions of speciesClade2 - } //End loop over resolutions of speciesClade1 - } //End loop over resolutions of sisterClade - } //End loop over all sister clades - - //Second, we add the probability that daughter clades of the current gene tree clade coalesce into it. - //First, we sum over all resolution of gCladeId - for (map< set,scalar_type> :: iterator geneTreeCladeResolution = geneTreeCladeResolutions.begin(); geneTreeCladeResolution != geneTreeCladeResolutions.end(); ++geneTreeCladeResolution) //Going through all resolutions of the clade gCladeId - { - geneTreeCladeDaughter1 = *(geneTreeCladeResolutions).begin(); - geneTreeCladeDaughter2 = *(geneTreeCladeResolutions).end(); - for (map< set,scalar_type> :: iterator speciesClade1Resolution = speciesClade1Resolutions.begin(); speciesClade1Resolution != speciesClade1Resolutions.end(); ++speciesClade1Resolution) //Going through all resolutions of the clade speciesClade1 - { - speciesClade1ResolutionProbability = *(speciesClade1Resolution)->second ; - for (map< set,scalar_type> :: iterator speciesClade2Resolution = speciesClade2Resolutions.begin(); speciesClade2Resolution != speciesClade2Resolutions.end(); ++speciesClade2Resolution) //Going through all resolutions of the clade speciesClade2 - { - speciesClade2ResolutionProbability = *(speciesClade2Resolution)->second ; - q[ speciesTreeResolution ][gCladeId][slice] += *(geneTreeCladeResolution)->second * speciesClade1ResolutionProbability * speciesClade2ResolutionProbability * ( q[ speciesClade1Resolution ][geneTreeCladeDaughter1][slice-1] * q[ speciesClade2Resolution ][geneTreeCladeDaughter2][slice-1] + q[ speciesClade1Resolution ][geneTreeCladeDaughter2][slice-1] * q[ speciesClade2Resolution ][geneTreeCladeDaughter1][slice-1] ) - - } //End loop over resolutions of speciesClade2 - } //End loop over resolutions of speciesClade1 - } //End loop over resolutions of gCladeId - } //End loop over time slices - return; -} + } // End loop over resolutions of speciesClade2 + } // End loop over resolutions of speciesClade1 + } // End loop over resolutions of sisterClade + } // End loop over all sister clades + // Second, we add the probability that daughter clades of the current gene + // tree clade coalesce into it. First, we sum over all resolution of + // gCladeId + for (map, scalar_type>::iterator geneTreeCladeResolution = + geneTreeCladeResolutions.begin(); + geneTreeCladeResolution != geneTreeCladeResolutions.end(); + ++geneTreeCladeResolution) // Going through all resolutions of the + // clade gCladeId + { + geneTreeCladeDaughter1 = *(geneTreeCladeResolutions).begin(); + geneTreeCladeDaughter2 = *(geneTreeCladeResolutions).end(); + for (map, scalar_type>::iterator speciesClade1Resolution = + speciesClade1Resolutions.begin(); + speciesClade1Resolution != speciesClade1Resolutions.end(); + ++speciesClade1Resolution) // Going through all resolutions of the + // clade speciesClade1 + { + speciesClade1ResolutionProbability = *(speciesClade1Resolution)->second; + for (map, scalar_type>::iterator speciesClade2Resolution = + speciesClade2Resolutions.begin(); + speciesClade2Resolution != speciesClade2Resolutions.end(); + ++speciesClade2Resolution) // Going through all resolutions of the + // clade speciesClade2 + { + speciesClade2ResolutionProbability = + *(speciesClade2Resolution)->second; + q[speciesTreeResolution][gCladeId][slice] += + *(geneTreeCladeResolution)->second * + speciesClade1ResolutionProbability * + speciesClade2ResolutionProbability * + (q[speciesClade1Resolution][geneTreeCladeDaughter1][slice - 1] * + q[speciesClade2Resolution][geneTreeCladeDaughter2] + [slice - 1] + + q[speciesClade1Resolution][geneTreeCladeDaughter2][slice - 1] * + q[speciesClade2Resolution][geneTreeCladeDaughter1] + [slice - 1]) + } // End loop over resolutions of speciesClade2 + } // End loop over resolutions of speciesClade1 + } // End loop over resolutions of gCladeId + } // End loop over time slices + return; +} -void scALE::computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch (int gCladeId, - std::set < long int > speciesTreeResolution, - int numberOfSlicesPerBranch ) { - q[ speciesTreeResolution ][gCladeId][0] = 0.0; - int lastSlice = numberOfSlicesPerBranch - 1; - map< set,scalar_type> speciesTreeDaughterCladeResolutions; - for (std::set::iterator speciesTreeDaughterClade = speciesTreeResolution.begin() ; speciesTreeDaughterClade != speciesTreeResolution.end() ; ++speciesTreeDaughterClade) { - speciesTreeDaughterCladeResolutions = sale_pointer->Dip_counts[*(speciesTreeDaughterClade)]; - //Second loop, over the resolutions of the species tree clade speciesTreeDaughterClade. - for (map< set,scalar_type> :: iterator spDaughterResolution = speciesTreeDaughterCladeResolutions.begin(); spDaughterResolution != speciesTreeDaughterCladeResolutions.end(); ++spDaughterResolution) //Going through all resolutions of the clade speciesTreeDaughterClade - { - q[ speciesTreeResolution ][gCladeId][0] += *(spDaughterResolution)->second * q[ *(spDaughterResolution)->first ][gCladeId][lastSlice]; - } - } - return; +void scALE::computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch( + int gCladeId, std::set speciesTreeResolution, + int numberOfSlicesPerBranch) { + q[speciesTreeResolution][gCladeId][0] = 0.0; + int lastSlice = numberOfSlicesPerBranch - 1; + map, scalar_type> speciesTreeDaughterCladeResolutions; + for (std::set::iterator speciesTreeDaughterClade = + speciesTreeResolution.begin(); + speciesTreeDaughterClade != speciesTreeResolution.end(); + ++speciesTreeDaughterClade) { + speciesTreeDaughterCladeResolutions = + sale_pointer->Dip_counts[*(speciesTreeDaughterClade)]; + // Second loop, over the resolutions of the species tree clade + // speciesTreeDaughterClade. + for (map, scalar_type>::iterator spDaughterResolution = + speciesTreeDaughterCladeResolutions.begin(); + spDaughterResolution != speciesTreeDaughterCladeResolutions.end(); + ++spDaughterResolution) // Going through all resolutions of the clade + // speciesTreeDaughterClade + { + q[speciesTreeResolution][gCladeId][0] += + *(spDaughterResolution)->second * + q[*(spDaughterResolution)->first][gCladeId][lastSlice]; + } + } + return; } diff --git a/misc/scALE/scALE.h b/misc/scALE/scALE.h index c09f75d..4c5083f 100644 --- a/misc/scALE/scALE.h +++ b/misc/scALE/scALE.h @@ -17,60 +17,85 @@ // of a distribution of species trees, given a distribution of gene trees and // branchwise parameters of the coalescent. // Tree distributions are summarized by conditional clade probabilities. - // Using Dynamic programming, we traverse both the species tree distribution + // Using Dynamic programming, we traverse both the species tree distribution // and the gene tree distribution, one pair of clades at a time. - // This class contains an approx_posterior object for the species tree distribution, - // with branch-wise parameters of population size Ne times time T (one parameter NeT per branch), - // and an approx_posterior object for the gene tree distribution. + // This class contains an approx_posterior object for the species tree + distribution, + // with branch-wise parameters of population size Ne times time T (one + parameter NeT per branch), + // and an approx_posterior object for the gene tree distribution. *****************************************************************************/ -class exODT_model -{ +class exODT_model { private: - - std::map scalar_parameter;//del_loc - std::map > vector_parameter;//del_loc - std::map string_parameter;//del_loc - - std::vector speciesNames; + std::map scalar_parameter; // del_loc + std::map> vector_parameter; // del_loc + std::map string_parameter; // del_loc - - approx_posterior * sale_pointer; //Pointer to an approx_posterior object used to describe a species tree distribution. Used for dynamic programming in p for instance. - approx_posterior * gale_pointer; //Pointer to an approx_posterior object used to describe a gene tree distribution. Used for dynamic programming in p for instance. - std::map < std::set < long int >, double> NeTs; //branch-wise parameters of the coalescent process, one per resolved clade of the species tree. - - std::map < long int, std::string > geneCladeIdToSpecies; //Map between clade id (from the gene approx_posterior object) and species included in that clade. - std::map < int, std::vector < map < pair , int > > > branch_counts; //del-loc + std::vector speciesNames; - //std::map < long int, std::pair< long int, std::pair < scalar_type > > > q; //del-loc. Map between resolution of a clade (from the species tree approx_posterior object), and pair between clade id from the gene tree approx_posterior object and a vector containing the probability of observing the gene tree clade at each slice of the species tree branch according to the scALE model. - std::map < std::set< long int >, std::pair < long int, std::vector < scalar_type > > > q; //del-loc. Same as above, but instead of a map we use a vector. - - /****************************************************************************** - //Computes the probability of a given gene tree clade in a given species tree branch, at all time slices except 0. - //Uses formula: \frac{d P(\gamma, s, t)}{dt} = -\theta_s \sum_{\bar g} \Pi_{\bar g} \sum_{s',s''} \Pi_{s'} \Pi_{s''} P(\bar \gamma, s', t) P(\bar \gamma, s'', t) \\ + \theta_s \sum_{ g} \Pi_{ g} \sum_{s',s''}\Pi_{s'}\Pi_{s''}P( \gamma', s', t)P(\gamma'', s'', t) - //More clearly: the probability of seeing a gene tree clade gamma at time slice n+1 is equal to the probability at time slice n of seeing two daughter clades which then coalesce into gamma before time slice n minus the probability that gamma has coalesced with another clade, and thus disappeared. - *******************************************************************************/ - void computeProbabilityOfCladeInSpeciesTreeBranch (int gCladeId, - std::set < long int > speciesTreeResolution, - int numberOfSlicesPerBranch) ; - - /****************************************************************************** - //Computes the probability of a given gene tree clade at the beginning of a given species tree non-leaf branch, at time slice 0. - //Uses formula: P(\gamma, s, t_{bottom}^s)=\sum_{s'} \Pi_{s'} P (\gamma, s', t_{top}^{s'}) - *******************************************************************************/ - void computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch (int gCladeId, - std::set < long int > speciesTreeResolution, - int numberOfSlicesPerBranch) ; + approx_posterior + *sale_pointer; // Pointer to an approx_posterior object used to describe a + // species tree distribution. Used for dynamic programming + // in p for instance. + approx_posterior *gale_pointer; // Pointer to an approx_posterior object used + // to describe a gene tree distribution. Used + // for dynamic programming in p for instance. + std::map, double> + NeTs; // branch-wise parameters of the coalescent process, one per + // resolved clade of the species tree. + std::map + geneCladeIdToSpecies; // Map between clade id (from the gene + // approx_posterior object) and species included in + // that clade. + std::map, int>>> branch_counts; // del-loc + // std::map < long int, std::pair< long int, std::pair < scalar_type > > > q; + // //del-loc. Map between resolution of a clade (from the species tree + // approx_posterior object), and pair between clade id from the gene tree + // approx_posterior object and a vector containing the probability of + // observing the gene tree clade at each slice of the species tree branch + // according to the scALE model. + std::map, std::pair>> + q; // del-loc. Same as above, but instead of a map we use a vector. - -public: - void construct(); //Constructs an object given a species tree and population size. - scALE(); - ~scALE(); - scalar_type p(approx_posterior *gale); //Computes the probability of an approx_posterior corresponding to a gene tree distribution according to the species tree distribution and parameter values. + /****************************************************************************** + //Computes the probability of a given gene tree clade in a given species tree + branch, at all time slices except 0. + //Uses formula: \frac{d P(\gamma, s, t)}{dt} = -\theta_s \sum_{\bar g} + \Pi_{\bar g} \sum_{s',s''} \Pi_{s'} \Pi_{s''} P(\bar \gamma, s', t) P(\bar + \gamma, s'', t) \\ + \theta_s \sum_{ g} \Pi_{ g} + \sum_{s',s''}\Pi_{s'}\Pi_{s''}P( \gamma', s', t)P(\gamma'', s'', t) + //More clearly: the probability of seeing a gene tree clade gamma at time + slice n+1 is equal to the probability at time slice n of seeing two daughter + clades which then coalesce into gamma before time slice n minus the + probability that gamma has coalesced with another clade, and thus + disappeared. + *******************************************************************************/ + void computeProbabilityOfCladeInSpeciesTreeBranch( + int gCladeId, std::set speciesTreeResolution, + int numberOfSlicesPerBranch); + + /****************************************************************************** + //Computes the probability of a given gene tree clade at the beginning of a + given species tree non-leaf branch, at time slice 0. + //Uses formula: P(\gamma, s, t_{bottom}^s)=\sum_{s'} \Pi_{s'} P (\gamma, s', + t_{top}^{s'}) + *******************************************************************************/ + void computeProbabilityOfCladeAtBeginningOfSpeciesTreeBranch( + int gCladeId, std::set speciesTreeResolution, + int numberOfSlicesPerBranch); - +public: + void + construct(); // Constructs an object given a species tree and population size. + scALE(); + ~scALE(); + scalar_type + p(approx_posterior + *gale); // Computes the probability of an approx_posterior corresponding + // to a gene tree distribution according to the species tree + // distribution and parameter values. } #endif diff --git a/misc/times_undated.cpp b/misc/times_undated.cpp index 8404854..ebb8d81 100644 --- a/misc/times_undated.cpp +++ b/misc/times_undated.cpp @@ -1,92 +1,89 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ +int main(int argc, char **argv) { - - //we need a dated species tree in newick format + // we need a dated species tree in newick format string Sstring; - ifstream file_stream_S (argv[1]); - getline (file_stream_S,Sstring); - cout << "Read species tree from: " << argv[1] <<".."<observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); - if (argc>3) + if (argc > 3) model->set_model_parameter("gene_name_separators", argv[3]); - model->set_model_parameter("BOOT_STRAP_LABLES","yes"); + model->set_model_parameter("BOOT_STRAP_LABLES", "yes"); model->construct_undated(Sstring); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->calculate_undatedEs(); - int leaves=1; - map names; - if (ale->constructor_string.find("(")!=ale->constructor_string.npos) - { - tree_type * T1=TreeTemplateTools::parenthesisToTree(ale->constructor_string,false); - vector nodes1=T1->getLeaves(); - for (vector ::iterator it=nodes1.begin();it!=nodes1.end();it++) - { - vector tokens; - string name=(*it)->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - names[tokens[0]]+=1; - } - leaves=T1->getNumberOfLeaves(); + int leaves = 1; + map names; + if (ale->constructor_string.find("(") != ale->constructor_string.npos) { + tree_type *T1 = + TreeTemplateTools::parenthesisToTree(ale->constructor_string, false); + vector nodes1 = T1->getLeaves(); + for (vector::iterator it = nodes1.begin(); it != nodes1.end(); + it++) { + vector tokens; + string name = (*it)->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + names[tokens[0]] += 1; } - else - { - vector tokens; - boost::split(tokens,ale->constructor_string,boost::is_any_of(","),boost::token_compress_on); - for (vector ::iterator it=tokens.begin();it!=tokens.end();it++) - { - vector tokens2; - string name=(*it); - boost::split(tokens2,name,boost::is_any_of("_"),boost::token_compress_on); - names[tokens2[0]]+=1; - leaves+=1; - } + leaves = T1->getNumberOfLeaves(); + } else { + vector tokens; + boost::split(tokens, ale->constructor_string, boost::is_any_of(","), + boost::token_compress_on); + for (vector::iterator it = tokens.begin(); it != tokens.end(); + it++) { + vector tokens2; + string name = (*it); + boost::split(tokens2, name, boost::is_any_of("_"), + boost::token_compress_on); + names[tokens2[0]] += 1; + leaves += 1; } - + } - boost::timer * t = new boost::timer(); - string outname=ale_file+".utimes"; - ofstream fout( outname.c_str() ); - scalar_type times=100; + boost::timer *t = new boost::timer(); + string outname = ale_file + ".utimes"; + ofstream fout(outname.c_str()); + scalar_type times = 100; scalar_type ll; - - for (int i=0;i<100;i++) - ll=model->pun(ale); - fout << t->elapsed()/times << "\t"; + + for (int i = 0; i < 100; i++) + ll = model->pun(ale); + fout << t->elapsed() / times << "\t"; fout << ale->Dip_counts.size() << "\t"; fout << leaves << "\t"; - fout << names.size() << "\t"; + fout << names.size() << "\t"; fout << ale_file; //<< "\t"; fout << endl; - //fout << ll << endl; - + // fout << ll << endl; + return 0; } - diff --git a/misc/undated_fast.cpp b/misc/undated_fast.cpp index 052fe8c..ca4f2ab 100644 --- a/misc/undated_fast.cpp +++ b/misc/undated_fast.cpp @@ -1,141 +1,138 @@ #include "exODT.h" using namespace std; using namespace bpp; -static double EPSILON = numeric_limits< double >::min(); +static double EPSILON = numeric_limits::min(); -void exODT_model::construct_undated(string Sstring) -{ +void exODT_model::construct_undated(string Sstring) { daughter.clear(); son.clear(); name_node.clear(); node_name.clear(); node_ids.clear(); id_nodes.clear(); - - string_parameter["S_un"]=Sstring; - S=TreeTemplateTools::parenthesisToTree(string_parameter["S_un"], true// (string_parameter["BOOT_STRAP_LABLES"]=="yes") - ); - S_root = S->getRootNode(); - vector nodes = TreeTemplateTools::getNodes(*S_root); - - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) (*it)->setDistanceToFather(1); - - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) - if ((*it)->isLeaf()) - { - name_node[(*it)->getName()]=(*it); - node_name[(*it)]=(*it)->getName(); - } - else - { - vector leafnames=TreeTemplateTools::getLeavesNames(*(*it)); - sort(leafnames.begin(),leafnames.end()); - stringstream name; - for (vector ::iterator st=leafnames.begin();st!=leafnames.end();st++ ) - name<<(*st)<<"."; - - name_node[name.str()]=(*it); - node_name[(*it)]=name.str(); - } + string_parameter["S_un"] = Sstring; + S = TreeTemplateTools::parenthesisToTree( + string_parameter["S_un"], + true // (string_parameter["BOOT_STRAP_LABLES"]=="yes") + ); + S_root = S->getRootNode(); + vector nodes = TreeTemplateTools::getNodes(*S_root); + + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) + (*it)->setDistanceToFather(1); + + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) + if ((*it)->isLeaf()) { + name_node[(*it)->getName()] = (*it); + node_name[(*it)] = (*it)->getName(); + } else { + vector leafnames = TreeTemplateTools::getLeavesNames(*(*it)); + sort(leafnames.begin(), leafnames.end()); + stringstream name; + for (vector::iterator st = leafnames.begin(); + st != leafnames.end(); st++) + name << (*st) << "."; + + name_node[name.str()] = (*it); + node_name[(*it)] = name.str(); + } // register species - last_branch=0; - last_leaf=0; - - set saw; - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) - if ((*it).second->isLeaf()) - { - Node * node = (*it).second; - extant_species[last_branch]=node->getName(); - node_ids[node]=last_branch; - id_nodes[last_branch]=node; - last_branch++; - last_leaf++; - saw.insert(node); - // a leaf - daughter[last_branch]=-1; - // a leaf - son[last_branch]=-1; - } - //ad-hoc postorder - vector next_generation; - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) - if ((*it).second->isLeaf()) - { - Node * node = (*it).second; - next_generation.push_back(node); - } - while(next_generation.size()) - { - vector new_generation; - for (vector::iterator it=next_generation.begin();it!=next_generation.end();it++ ) - { - Node * node = (*it); - if (node->hasFather() ) - { - Node * father=node->getFather(); - vector sons=father->getSons(); - Node * sister; - if (sons[0]==node) sister=sons[1]; else sister=sons[0]; - - if (not node_ids.count(father) and saw.count(sister)) - { - node_ids[father]=last_branch; - id_nodes[last_branch]=father; - stringstream name; - name << last_branch; - father->setBranchProperty("ID",BppString(name.str())); - - last_branch++; - - saw.insert(father); - new_generation.push_back(father); - } - } - } - next_generation.clear(); - for (vector::iterator it=new_generation.begin();it!=new_generation.end();it++ ) - next_generation.push_back((*it)); + last_branch = 0; + last_leaf = 0; + + set saw; + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if ((*it).second->isLeaf()) { + Node *node = (*it).second; + extant_species[last_branch] = node->getName(); + node_ids[node] = last_branch; + id_nodes[last_branch] = node; + last_branch++; + last_leaf++; + saw.insert(node); + // a leaf + daughter[last_branch] = -1; + // a leaf + son[last_branch] = -1; } - - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - Node * node = (*it).first; - int branch = (*it).second; - stringstream out; - stringstream out1; - stringstream out2; - out1<hasBranchProperty("bootstrap") ) - { - rank2label[rank]=node->getBootstrapValue(); - cout <"<setBranchProperty("ID",BppString(out.str())); - } - - string_parameter["S_with_ranks"]=TreeTemplateTools::treeToParenthesis(*S,false,"ID"); - - - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) - if (not (*it).second->isLeaf()) - { - Node * node = (*it).second; - vector sons=node->getSons(); - daughter[node_ids[node]]=node_ids[sons[0]]; - son[node_ids[node]]=node_ids[sons[1]]; - //cout << node_ids[node] << " => " << node_ids[sons[0]] << " & " << node_ids[sons[1]] << endl; - //cout << node_name[node] << " => " << node_name[sons[0]] << " & " << node_name[sons[1]] << endl; - + // ad-hoc postorder + vector next_generation; + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if ((*it).second->isLeaf()) { + Node *node = (*it).second; + next_generation.push_back(node); + } + while (next_generation.size()) { + vector new_generation; + for (vector::iterator it = next_generation.begin(); + it != next_generation.end(); it++) { + Node *node = (*it); + if (node->hasFather()) { + Node *father = node->getFather(); + vector sons = father->getSons(); + Node *sister; + if (sons[0] == node) + sister = sons[1]; + else + sister = sons[0]; + + if (not node_ids.count(father) and saw.count(sister)) { + node_ids[father] = last_branch; + id_nodes[last_branch] = father; + stringstream name; + name << last_branch; + father->setBranchProperty("ID", BppString(name.str())); + + last_branch++; + + saw.insert(father); + new_generation.push_back(father); + } } + } + next_generation.clear(); + for (vector::iterator it = new_generation.begin(); + it != new_generation.end(); it++) + next_generation.push_back((*it)); + } + + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + Node *node = (*it).first; + int branch = (*it).second; + stringstream out; + stringstream out1; + stringstream out2; + out1 << t_begin[branch]; + out2 << t_end[branch]; + int rank = branch; + out << rank; + if (node->hasBranchProperty("bootstrap")) { + rank2label[rank] = node->getBootstrapValue(); + cout << rank2label[rank] << "->" << rank << endl; + } else { + rank2label[rank] = -1; + } + node->setBranchProperty("ID", BppString(out.str())); + } + + string_parameter["S_with_ranks"] = + TreeTemplateTools::treeToParenthesis(*S, false, "ID"); + + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if (not(*it).second->isLeaf()) { + Node *node = (*it).second; + vector sons = node->getSons(); + daughter[node_ids[node]] = node_ids[sons[0]]; + son[node_ids[node]] = node_ids[sons[1]]; + // cout << node_ids[node] << " => " << node_ids[sons[0]] << " & " << + // node_ids[sons[1]] << endl; cout << node_name[node] << " => " << + // node_name[sons[0]] << " & " << node_name[sons[1]] << endl; + } branch_counts["Os"].clear(); branch_counts["Ds"].clear(); branch_counts["Ts"].clear(); @@ -145,803 +142,790 @@ void exODT_model::construct_undated(string Sstring) branch_counts["copies"].clear(); branch_counts["singleton"].clear(); - for (int e=0;e tmp; - T_to_from.push_back(tmp); - for (int f=0;f tmp; + T_to_from.push_back(tmp); + for (int f = 0; f < last_branch; f++) + T_to_from[e].push_back(0); + } + + last_rank = last_branch; + set_model_parameter("N", 1); } -void exODT_model::calculate_undatedEs() -{ +void exODT_model::calculate_undatedEs() { uE.clear(); PD.clear(); PT.clear(); PL.clear(); - PS.clear(); - scalar_type P_T=0; - for (int f=0;f > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } +scalar_type exODT_model::pun(approx_posterior *ale) { + scalar_type survive = 0; + scalar_type root_sum = 0; + uq.clear(); + mPTuq.clear(); // XX + ale_pointer = ale; + + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } q.clear(); - - //directed partitions and their sizes - //vector g_ids; - //vector g_id_sizes; + // directed partitions and their sizes + // vector g_ids; + // vector g_id_sizes; g_ids.clear(); g_id_sizes.clear(); - - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root bipartition needs to be handled separately + + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); - root_i=g_ids.size()-1; + root_i = g_ids.size() - 1; // gene<->species mapping - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; - - if (g_id_sizes[i]==1) - { - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) - { - if ( ale->id_sets[g_id][i] ) - { - id=i; - break; - } - } - - string gene_name=ale->id_leaves[ id ]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - { - species_name=tokens[1]; - for (int fi=2;fi g_id2i; - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; - g_id2i[g_id]=i; - - if (not ( i<(int)uq.size() ) ) - { - vector tmp; - uq.push_back(tmp); - mPTuq.push_back(0); - } + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; + + if (g_id_sizes[i] == 1) { + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; + } + } + + string gene_name = ale->id_leaves[id]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) { + species_name = tokens[1]; + for (int fi = 2; fi < tokens.size(); fi++) + species_name += "_" + tokens[fi]; + } + // species_name=tokens[tokens.size()-1]; else - mPTuq[i]=0; - - for (int e=0;e g_id2i; + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; + g_id2i[g_id] = i; + + if (not(i < (int)uq.size())) { + vector tmp; + uq.push_back(tmp); + mPTuq.push_back(0); + } else + mPTuq[i] = 0; + + for (int e = 0; e < last_branch; e++) + if (not(e < (int)uq[i].size())) { + uq[i].push_back(0); + } else + uq[i][e] = 0; + } + + for (int iter = 0; iter < 4; iter++) { + for (int i = 0; i < (int)g_ids.size(); i++) { + + scalar_type new_mPTuq = 0; + + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + vector gp_is; + vector gpp_is; + vector p_part; + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + int gp_i = g_id2i[parts.first]; + int gpp_i = g_id2i[parts.second]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } + else { + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + long int gpp_id = ale->set_ids.at(not_gamma); + + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { + parts.push_back((*sit)); + } + long int gp_id = parts[0]; + // long int gpp_id=parts[1]; + + int gp_i = g_id2i[parts[0]]; + int gpp_i = g_id2i[parts[1]]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + for (int e = 0; e < last_branch; e++) { + scalar_type uq_sum = 0; + // S leaf and G leaf + if (e < last_leaf and is_a_leaf and + extant_species[e] == gid_sps[g_id]) { + // present + uq_sum += PS[e] * 1; + } + // G internal + if (not is_a_leaf) { + int N_parts = gp_is.size(); + for (int i = 0; i < N_parts; i++) { + int gp_i = gp_is[i]; + int gpp_i = gpp_is[i]; + scalar_type pp = p_part[i]; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // S event + uq_sum += + PS[e] * + (uq[gp_i][f] * uq[gpp_i][g] + uq[gp_i][g] * uq[gpp_i][f]) * + pp; + } + // D event + uq_sum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp; + // T event + uq_sum += + (uq[gp_i][e] * mPTuq[gpp_i] + uq[gpp_i][e] * mPTuq[gp_i]) * pp; + } + } + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // SL event + uq_sum += PS[e] * (uq[i][f] * uE[g] + uq[i][g] * uE[f]); + } + // DL event + uq_sum += PD[e] * (uq[i][e] * uE[e] * 2); + // TL event + uq_sum += (mPTuq[i] * uE[e] + uq[i][e] * mPTE); + if (uq_sum < EPSILON) + uq_sum = EPSILON; + uq[i][e] = uq_sum; + new_mPTuq += (PT[e] / (float)last_branch) * uq_sum; + } + mPTuq[i] = new_mPTuq; + + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### } - - - for (int iter=0;iter<4;iter++) - { - for (int i=0;i<(int)g_ids.size();i++) - { - - scalar_type new_mPTuq=0; - - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - vector gp_is; - vector gpp_is; - vector p_part; - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - int gp_i=g_id2i[parts.first]; - int gpp_i=g_id2i[parts.second]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - long int gpp_id = ale->set_ids.at(not_gamma); - - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { - parts.push_back((*sit)); - } - long int gp_id=parts[0]; - //long int gpp_id=parts[1]; - - int gp_i=g_id2i[parts[0]]; - int gpp_i=g_id2i[parts[1]]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - - for (int e=0;eBip_counts.count(g_id) and ale_pointer->Bip_counts[g_id]>0) - bl<Bip_bls[g_id]/ale_pointer->Bip_counts[g_id],(scalar_type)scalar_parameter["min_branch_lenghts"]); + if (ale_pointer->Bip_counts.count(g_id) and ale_pointer->Bip_counts[g_id] > 0) + bl << max(ale_pointer->Bip_bls[g_id] / ale_pointer->Bip_counts[g_id], + (scalar_type)scalar_parameter["min_branch_lenghts"]); else - bl<Bip_bls[g_id]/ale_pointer->observations,(scalar_type)scalar_parameter["min_branch_lenghts"]); - string branch_length=bl.str(); - - - vector gp_is; - vector gpp_is; - vector p_part; - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale_pointer->Dip_counts[g_id].begin(); kt != ale_pointer->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - int gp_i=g_id2i[parts.first]; - int gpp_i=g_id2i[parts.second]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - if (ale_pointer->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale_pointer->p_dip(g_id,gp_id,gpp_id)); - } - else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale_pointer->Bip_counts.begin(); it != ale_pointer->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale_pointer->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - long int gpp_id = ale_pointer->set_ids.at(not_gamma); - - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { - parts.push_back((*sit)); - } - long int gp_id=parts[0]; - //long int gpp_id=parts[1]; - - int gp_i=g_id2i[parts[0]]; - int gpp_i=g_id2i[parts[1]]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale_pointer->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale_pointer->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale_pointer->p_bip(gp_id)); - } - bip_parts.clear(); + bl << max(ale_pointer->Bip_bls[g_id] / ale_pointer->observations, + (scalar_type)scalar_parameter["min_branch_lenghts"]); + string branch_length = bl.str(); + + vector gp_is; + vector gpp_is; + vector p_part; + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale_pointer->Dip_counts[g_id].begin(); + kt != ale_pointer->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + int gp_i = g_id2i[parts.first]; + int gpp_i = g_id2i[parts.second]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + if (ale_pointer->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale_pointer->p_dip(g_id, gp_id, gpp_id)); } - - - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - scalar_type uq_sum=0; - // S leaf and G leaf - if (e, int> bip_parts; + for (map::iterator it = + ale_pointer->Bip_counts.begin(); + it != ale_pointer->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale_pointer->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + long int gpp_id = ale_pointer->set_ids.at(not_gamma); + + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); } - // G internal - if (not is_a_leaf) - { - int N_parts=gp_is.size(); - for (int i=0;i, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { + parts.push_back((*sit)); + } + long int gp_id = parts[0]; + // long int gpp_id=parts[1]; + + int gp_i = g_id2i[parts[0]]; + int gpp_i = g_id2i[parts[1]]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale_pointer->Bip_counts[gp_id] <= + scalar_parameter.at("min_bip_count") and + not ale_pointer->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale_pointer->p_bip(gp_id)); } - if (not (eset2name(ale_pointer->id_sets[g_id]) + branch_string + + ":" + branch_length; } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + } + // G internal + if (not is_a_leaf) { + int N_parts = gp_is.size(); + for (int i = 0; i < N_parts; i++) { + int gp_i = gp_is[i]; + int gpp_i = gpp_is[i]; + scalar_type pp = p_part[i]; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // S event + uq_resum += PS[e] * uq[gp_i][f] * uq[gpp_i][g] * pp + EPSILON; + if (r * uq_sum < uq_resum) { + register_Su(e, last_event); + return "(" + sample_undated(f, gp_i, "S") + "," + + sample_undated(g, gpp_i, "S") + ")." + estr + branch_string + + ":" + branch_length; + } + uq_resum += PS[e] * uq[gp_i][g] * uq[gpp_i][f] * pp + EPSILON; + if (r * uq_sum < uq_resum) { + register_Su(e, last_event); + return "(" + sample_undated(g, gp_i, "S") + "," + + sample_undated(f, gpp_i, "S") + ")." + estr + branch_string + + ":" + branch_length; + } + } + // D event + uq_resum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp + EPSILON; + if (r * uq_sum < uq_resum) { + register_D(e); + return "(" + sample_undated(e, gp_i, "D") + "," + + sample_undated(e, gpp_i, "D") + ").D@" + estr + branch_string + + ":" + branch_length; + } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - stringstream estring; - if (not (eset2name(ale_pointer->id_sets[g_id])+branch_string+":"+branch_length; - } + // T event + for (int f = 0; f < last_branch; f++) { + stringstream fstring; + if (not(f < last_leaf)) + fstring << f; + else + fstring << extant_species[f]; + string fstr = fstring.str(); + + uq_resum += + uq[gp_i][e] * (PT[f] / (float)last_branch) * uq[gpp_i][f] * pp + + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gpp_i]]); + Ttokens.push_back(Ttoken.str()); + + return "(" + sample_undated(e, gp_i, "S") + "," + + sample_undated(f, gpp_i, "T") + ").T@" + estr + "->" + fstr + + branch_string + ":" + branch_length; + } + uq_resum += + uq[gpp_i][e] * (PT[f] / (float)last_branch) * uq[gp_i][f] * pp + + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gp_i]]); + Ttokens.push_back(Ttoken.str()); + return "(" + sample_undated(e, gpp_i, "S") + "," + + sample_undated(f, gp_i, "T") + ").T@" + estr + "->" + fstr + + branch_string + ":" + branch_length; + } + } } - // G internal - if (not is_a_leaf) - { - int N_parts=gp_is.size(); - for (int i=0;i"<set2name(ale_pointer->id_sets[g_ids[gpp_i]]); - Ttokens.push_back(Ttoken.str()); - - return "("+sample_undated(e,gp_i,"S")+","+sample_undated(f,gpp_i,"T")+").T@"+estr+"->"+fstr+branch_string+":"+branch_length; - } - uq_resum+=uq[gpp_i][e]*(PT[f]/(float)last_branch)*uq[gp_i][f]*pp+EPSILON; - if (r*uq_sum"<set2name(ale_pointer->id_sets[g_ids[gp_i]]); - Ttokens.push_back(Ttoken.str()); - return "("+sample_undated(e,gpp_i,"S")+","+sample_undated(f,gp_i,"T")+").T@"+estr+"->"+fstr+branch_string+":"+branch_length; - } - - } - } + } + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // SL event + uq_resum += PS[e] * uq[i][f] * uE[g] + EPSILON; + if (r * uq_sum < uq_resum) { + register_Su(e, last_event); + register_L(g); + return sample_undated(f, i, "S", "." + estr + branch_string); } - if (not (e"<set2name(ale_pointer->id_sets[g_id]); - Ttokens.push_back(Ttoken.str()); - */ - register_L(e); - return sample_undated(f,i,"T",".T@"+estr+"->"+fstr); - } - uq_resum+=(PT[f]/(float)last_branch)*uE[f]*uq[i][e]+EPSILON; - if (r*uq_sum"<set2name(ale_pointer->id_sets[g_id]); + Ttokens.push_back(Ttoken.str()); + */ + register_L(e); + return sample_undated(f, i, "T", ".T@" + estr + "->" + fstr); + } + uq_resum += (PT[f] / (float)last_branch) * uE[f] * uq[i][e] + EPSILON; + if (r * uq_sum < uq_resum) { + return sample_undated(e, i, "S"); } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### cout << "sum error!" << endl; return "-!=-"; } -string exODT_model::counts_string_undated(scalar_type samples) -{ - +string exODT_model::counts_string_undated(scalar_type samples) { + stringstream out; - for (int e=0;e-1) - { - int f=daughter[e]; - int g=son[e]; - if (last_event=="S" or last_event=="O") branch_counts["singleton"].at(e)+=1; - branch_counts["copies"].at(e)+=1; - branch_counts["count"].at(f)+=1; - branch_counts["count"].at(g)+=1; - } +void exODT_model::register_Su(int e, string last_event) { + MLRec_events["S"] += 1; + if (e > -1) { + int f = daughter[e]; + int g = son[e]; + if (last_event == "S" or last_event == "O") + branch_counts["singleton"].at(e) += 1; + branch_counts["copies"].at(e) += 1; + branch_counts["count"].at(f) += 1; + branch_counts["count"].at(g) += 1; + } } -void exODT_model::register_leafu(int e,string last_event) -{ - if (e>-1) - { - branch_counts["copies"].at(e)+=1; - if (last_event=="S" or last_event=="O") branch_counts["singleton"].at(e)+=1; - } - //MLRec_events["genes"]+=1; +void exODT_model::register_leafu(int e, string last_event) { + if (e > -1) { + branch_counts["copies"].at(e) += 1; + if (last_event == "S" or last_event == "O") + branch_counts["singleton"].at(e) += 1; + } + // MLRec_events["genes"]+=1; } -void exODT_model::register_T_to_from(int e,int f) -{ - T_to_from[e][f]+=1; +void exODT_model::register_T_to_from(int e, int f) { T_to_from[e][f] += 1; } -} +string exODT_model::feSPR(int e, int f) { + tree_type *newS = TreeTemplateTools::parenthesisToTree( + string_parameter["S_un"], + true // (string_parameter["BOOT_STRAP_LABLES"]=="yes")); + ); + Node *newS_root = newS->getRootNode(); + vector nodes = TreeTemplateTools::getNodes(*newS_root); -string exODT_model::feSPR(int e, int f) -{ - tree_type * newS=TreeTemplateTools::parenthesisToTree(string_parameter["S_un"], true// (string_parameter["BOOT_STRAP_LABLES"]=="yes")); - ); - Node * newS_root = newS->getRootNode(); - vector nodes = TreeTemplateTools::getNodes(*newS_root); + string e_name = node_name[id_nodes[e]]; + string f_name = node_name[id_nodes[f]]; + ; - string e_name=node_name[id_nodes[e]]; - string f_name=node_name[id_nodes[f]];; - Node *e_node, *f_node; - - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) - { - string name_it; - if ((*it)->isLeaf()) - { - name_it=(*it)->getName(); - } - else - { - vector leafnames=TreeTemplateTools::getLeavesNames(*(*it)); - sort(leafnames.begin(),leafnames.end()); - stringstream name; - for (vector ::iterator st=leafnames.begin();st!=leafnames.end();st++ ) - name<<(*st)<<"."; - - name_it=name.str(); - } - if (name_it==e_name) e_node=(*it); - if (name_it==f_name) f_node=(*it); - } - - if (e==f) return string_parameter["S_un"]; - - bool e_below_f=false; - Node * node; - node=e_node; - while (node->hasFather()) - { - node=node->getFather(); - if (node==f_node) e_below_f=true; - } - if (e_below_f) - { - Node * swap_tmp=e_node; - e_node=f_node; - f_node=swap_tmp; - } - if (f_node->hasFather() and f_node->getFather()==e_node ) return string_parameter["S_un"]; - - Node * f_father=f_node->getFather(); - vector f_sons=f_father->getSons(); - Node * f_sister; - if (f_sons[0]==f_node) f_sister=f_sons[1]; else f_sister=f_sons[0]; - f_father->removeSon(f_sister); - if (f_father->hasFather()) - { - Node * f_grand_father=f_father->getFather(); - f_grand_father->removeSon(f_father); - f_grand_father->addSon(f_sister); - } - else - { - newS->setRootNode(f_sister); - } - if (e_node->hasFather()) - { - Node * e_father=e_node->getFather(); - e_father->removeSon(e_node); - e_father->addSon(f_father); + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + string name_it; + if ((*it)->isLeaf()) { + name_it = (*it)->getName(); + } else { + vector leafnames = TreeTemplateTools::getLeavesNames(*(*it)); + sort(leafnames.begin(), leafnames.end()); + stringstream name; + for (vector::iterator st = leafnames.begin(); + st != leafnames.end(); st++) + name << (*st) << "."; + + name_it = name.str(); } + if (name_it == e_name) + e_node = (*it); + if (name_it == f_name) + f_node = (*it); + } + + if (e == f) + return string_parameter["S_un"]; + + bool e_below_f = false; + Node *node; + node = e_node; + while (node->hasFather()) { + node = node->getFather(); + if (node == f_node) + e_below_f = true; + } + if (e_below_f) { + Node *swap_tmp = e_node; + e_node = f_node; + f_node = swap_tmp; + } + if (f_node->hasFather() and f_node->getFather() == e_node) + return string_parameter["S_un"]; + + Node *f_father = f_node->getFather(); + vector f_sons = f_father->getSons(); + Node *f_sister; + if (f_sons[0] == f_node) + f_sister = f_sons[1]; else + f_sister = f_sons[0]; + f_father->removeSon(f_sister); + if (f_father->hasFather()) { + Node *f_grand_father = f_father->getFather(); + f_grand_father->removeSon(f_father); + f_grand_father->addSon(f_sister); + } else { + newS->setRootNode(f_sister); + } + if (e_node->hasFather()) { + Node *e_father = e_node->getFather(); + e_father->removeSon(e_node); + e_father->addSon(f_father); + } else newS->setRootNode(f_father); f_father->addSon(e_node); - - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) (*it)->setDistanceToFather(1); - - return TreeTemplateTools::treeToParenthesis(*newS,false,"ID"); + + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) + (*it)->setDistanceToFather(1); + + return TreeTemplateTools::treeToParenthesis(*newS, false, "ID"); } -vector exODT_model::NNIs(int e) -{ +vector exODT_model::NNIs(int e) { vector NNIs; - int left_e,right_e,f; - - Node * root = id_nodes[e]; + int left_e, right_e, f; + + Node *root = id_nodes[e]; + + if (root->isLeaf()) + return NNIs; - if (root->isLeaf()) return NNIs; - - vector roots_sons=root->getSons(); + vector roots_sons = root->getSons(); - right_e=node_ids[roots_sons[0]]; - left_e=node_ids[roots_sons[1]]; + right_e = node_ids[roots_sons[0]]; + left_e = node_ids[roots_sons[1]]; if (roots_sons[0]->isLeaf()) ; - else - { - vector right_sons=roots_sons[0]->getSons(); - f=node_ids[right_sons[0]]; - NNIs.push_back(feSPR(left_e,f)); - f=node_ids[right_sons[1]]; - NNIs.push_back(feSPR(left_e,f)); - } - + else { + vector right_sons = roots_sons[0]->getSons(); + f = node_ids[right_sons[0]]; + NNIs.push_back(feSPR(left_e, f)); + f = node_ids[right_sons[1]]; + NNIs.push_back(feSPR(left_e, f)); + } + if (roots_sons[1]->isLeaf()) ; - else - { - vector left_sons=roots_sons[1]->getSons(); - f=node_ids[left_sons[0]]; - NNIs.push_back(feSPR(right_e,f)); - f=node_ids[left_sons[1]]; - NNIs.push_back(feSPR(right_e,f)); - } + else { + vector left_sons = roots_sons[1]->getSons(); + f = node_ids[left_sons[0]]; + NNIs.push_back(feSPR(right_e, f)); + f = node_ids[left_sons[1]]; + NNIs.push_back(feSPR(right_e, f)); + } return NNIs; - } diff --git a/src/ALE.cpp b/src/ALE.cpp index f0d45a8..c1ae87b 100644 --- a/src/ALE.cpp +++ b/src/ALE.cpp @@ -6,331 +6,302 @@ using namespace bpp; #include -//## aux. functions ## +// ## aux. functions ## -//from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed 12/13/11) -//"Given a set S, the power set (or powerset) of S, written P(S), or 2S, is the set of all subsets of S." -template set powerset(const Set& s, size_t n) -{ +// from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed +// 12/13/11) "Given a set S, the power set (or powerset) of S, written P(S), or +//2S, is the set of all subsets of S." +template set powerset(const Set &s, size_t n) { typedef typename Set::const_iterator SetCIt; typedef typename set::const_iterator PowerSetCIt; set res; - if(n > 0) { - set ps = powerset(s, n-1); - for(PowerSetCIt ss = ps.begin(); ss != ps.end(); ss++) - for(SetCIt el = s.begin(); el != s.end(); el++) { - Set subset(*ss); - subset.insert(*el); - res.insert(subset); + if (n > 0) { + set ps = powerset(s, n - 1); + for (PowerSetCIt ss = ps.begin(); ss != ps.end(); ss++) + for (SetCIt el = s.begin(); el != s.end(); el++) { + Set subset(*ss); + subset.insert(*el); + res.insert(subset); } res.insert(ps.begin(), ps.end()); } else res.insert(Set()); return res; } -//from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed 12/13/11) -template set powerset(const Set& s) -{ +// from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed +// 12/13/11) +template set powerset(const Set &s) { return powerset(s, s.size()); } - -//## approx_posterior class ## -approx_posterior::approx_posterior() -{ - //formal constructor must be followed by load state +// ## approx_posterior class ## +approx_posterior::approx_posterior() { + // formal constructor must be followed by load state ; } - -approx_posterior::approx_posterior(string tree_string) -{ +approx_posterior::approx_posterior(string tree_string) { constructor_string = tree_string; construct(constructor_string); } - -void approx_posterior::construct(string tree_string) -{ +void approx_posterior::construct(string tree_string) { // t=new boost::timer(); - last_leafset_id=0; - observations=0; - vector leaves; + last_leafset_id = 0; + observations = 0; + vector leaves; // ? name_separator="+"; - if ( tree_string.substr(0,1) !="(") - { - boost::trim(tree_string); - boost::split(leaves,tree_string,boost::is_any_of(","),boost::token_compress_on); - } - else - { - tree_type * tree = TreeTemplateTools::parenthesisToTree(tree_string,false);//del-loc - leaves = tree->getLeavesNames();//del-loc - } - int id=0; - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - - id++; - string leaf_name=(*it); - leaf_ids[leaf_name]=id; - Gamma_s.insert(id); - id_leaves[id]=leaf_name; - } - alpha=0; - beta=0; - Gamma_size=Gamma_s.size(); + if (tree_string.substr(0, 1) != "(") { + boost::trim(tree_string); + boost::split(leaves, tree_string, boost::is_any_of(","), + boost::token_compress_on); + } else { + tree_type *tree = + TreeTemplateTools::parenthesisToTree(tree_string, false); // del-loc + leaves = tree->getLeavesNames(); // del-loc + } + int id = 0; + for (vector::iterator it = leaves.begin(); it != leaves.end(); it++) { + + id++; + string leaf_name = (*it); + leaf_ids[leaf_name] = id; + Gamma_s.insert(id); + id_leaves[id] = leaf_name; + } + alpha = 0; + beta = 0; + Gamma_size = Gamma_s.size(); /* size_t lword = BipartitionTools::LWORD; nbint = (Gamma_size + lword - 1) / lword;*/ // size_t nbword = (Gamma_size + lword - 1) / lword; // nbint = nbword * lword / (CHAR_BIT * sizeof(int)); - Gamma = boost::dynamic_bitset<> (Gamma_size + 1) ;//new int[nbint]; - //All leaves are present in Gamma: - for (auto i = 0; i < Gamma_size + 1; i++) - { - Gamma[i] = 1; - } + Gamma = boost::dynamic_bitset<>(Gamma_size + 1); // new int[nbint]; + // All leaves are present in Gamma: + for (auto i = 0; i < Gamma_size + 1; i++) { + Gamma[i] = 1; + } - //maybe should use boost pow - //number of bipartitions of Gamma - K_Gamma=pow(2.,(int)Gamma_size-1)-1; - //number of unrooted trees on Gamma_size leaves - if (Gamma_size<3) - N_Gamma=1; + // maybe should use boost pow + // number of bipartitions of Gamma + K_Gamma = pow(2., (int)Gamma_size - 1) - 1; + // number of unrooted trees on Gamma_size leaves + if (Gamma_size < 3) + N_Gamma = 1; else - N_Gamma=boost::math::double_factorial(2*Gamma_size-5); + N_Gamma = boost::math::double_factorial(2 * Gamma_size - 5); - //XX - std::unordered_map< pair,scalar_type> temp ; - while (leaves.size()+1 > Dip_counts.size() ) + // XX + std::unordered_map, scalar_type> temp; + while (leaves.size() + 1 > Dip_counts.size()) Dip_counts.push_back(temp); - //XX - //del-locs + // XX + // del-locs leaves.clear(); - //delete tree; + // delete tree; } - -void approx_posterior::save_state(string fname) -{ - //constructor_string - ofstream fout( fname.c_str() ); - //must be first! - fout<< "#constructor_string" << endl; +void approx_posterior::save_state(string fname) { + // constructor_string + ofstream fout(fname.c_str()); + // must be first! + fout << "#constructor_string" << endl; boost::trim(constructor_string); - fout<< constructor_string << endl; + fout << constructor_string << endl; - fout<< "#observations" <::iterator jt=(*it).first.begin();jt!=(*it).first.end();jt++) - fout << "\t" << (*jt);*/ - fout << endl; + fout << "#set-id" << endl; + for (auto it = set_ids.begin(); it != set_ids.end(); it++) { + fout << (*it).second; + fout << "\t:"; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if (BipartitionTools::testBit((*it).first, i)) + if ((*it).first[i]) // if the leaf is present, we print it + fout << "\t" << i; } - fout<< "#END" << endl; - fout.close(); + /* for (set< int>::iterator + jt=(*it).first.begin();jt!=(*it).first.end();jt++) fout << "\t" << + (*jt);*/ + fout << endl; + } + fout << "#END" << endl; + fout.close(); } -void approx_posterior::load_state(string fname) -{ +void approx_posterior::load_state(string fname) { string tree_string; if (!fexists(fname)) { - cout << "Error, file "< tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + Bip_counts[atol(tokens[0].c_str())] = atof(tokens[1].c_str()); + } else if (reading == "#Bip_bls") { + // cout << reading << endl; + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + Bip_bls[atol(tokens[0].c_str())] = atof(tokens[1].c_str()); + } else if (reading == "#Dip_counts") { + // cout << reading << endl; + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + pair parts; + /*PREVECTORIZATION CODE + set parts; + parts.insert(atoi(tokens[1].c_str())); + parts.insert(atoi(tokens[2].c_str())); + Dip_counts[atol(tokens[0].c_str())][parts]=atof(tokens[3].c_str()); + */ + + parts.first = atoi(tokens[1].c_str()); + parts.second = atoi(tokens[2].c_str()); + if (atol(tokens[0].c_str()) >= (long int)(Dip_counts.size())) { + // XX + std::unordered_map, scalar_type> temp; + // 10.18 + while (atol(tokens[0].c_str()) > (long int)Dip_counts.size()) { + Dip_counts.push_back(temp); } - else if (reading=="#Bip_counts") - { - //cout << reading << endl; - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - Bip_counts[atol(tokens[0].c_str())]=atof(tokens[1].c_str()); - } - else if (reading=="#Bip_bls") - { - //cout << reading << endl; - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - Bip_bls[atol(tokens[0].c_str())]=atof(tokens[1].c_str()); - } - else if (reading=="#Dip_counts") - { - //cout << reading << endl; - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - pair parts; - /*PREVECTORIZATION CODE - set parts; - parts.insert(atoi(tokens[1].c_str())); - parts.insert(atoi(tokens[2].c_str())); - Dip_counts[atol(tokens[0].c_str())][parts]=atof(tokens[3].c_str()); - */ - - parts.first = atoi(tokens[1].c_str()); - parts.second = atoi(tokens[2].c_str()); - if ( atol(tokens[0].c_str()) >= (long int) ( Dip_counts.size() ) ) - { - //XX - std::unordered_map< pair,scalar_type> temp ; - //10.18 - while (atol(tokens[0].c_str()) > (long int) Dip_counts.size() ) { - Dip_counts.push_back(temp); - } - //10.18 - temp[parts]=atof(tokens[3].c_str()); - Dip_counts.push_back(temp); - } - else - { - Dip_counts[atol(tokens[0].c_str())][parts]=atof(tokens[3].c_str()); - } - } - else if (reading=="#last_leafset_id") - { - //cout << reading << endl; - boost::trim(line); - last_leafset_id=atol(line.c_str()); - } - else if (reading=="#leaf-id") - { - //cout << reading << endl; - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - int id=atoi(tokens[1].c_str()); - string leaf_name=tokens[0]; - leaf_ids[leaf_name]=id; - id_leaves[id]=leaf_name; - } - else if (reading=="#set-id") - { - //cout << reading << endl; - vector fields; - boost::trim(line); - boost::split(fields,line,boost::is_any_of(":"),boost::token_compress_on); - boost::trim(fields[0]); - long int set_id=atol(fields[0].c_str()); - vector tokens; - boost::trim(fields[1]); - boost::split(tokens,fields[1],boost::is_any_of("\t "),boost::token_compress_on); - boost::dynamic_bitset<> temp( Gamma_size + 1 ); - - for (vector::iterator it=tokens.begin();it!=tokens.end();it++) { //Setting the proper bits to 1 - temp[static_cast(atoi((*it).c_str()))] = 1; - } + // 10.18 + temp[parts] = atof(tokens[3].c_str()); + Dip_counts.push_back(temp); + } else { + Dip_counts[atol(tokens[0].c_str())][parts] = atof(tokens[3].c_str()); + } + } else if (reading == "#last_leafset_id") { + // cout << reading << endl; + boost::trim(line); + last_leafset_id = atol(line.c_str()); + } else if (reading == "#leaf-id") { + // cout << reading << endl; + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + int id = atoi(tokens[1].c_str()); + string leaf_name = tokens[0]; + leaf_ids[leaf_name] = id; + id_leaves[id] = leaf_name; + } else if (reading == "#set-id") { + // cout << reading << endl; + vector fields; + boost::trim(line); + boost::split(fields, line, boost::is_any_of(":"), + boost::token_compress_on); + boost::trim(fields[0]); + long int set_id = atol(fields[0].c_str()); + vector tokens; + boost::trim(fields[1]); + boost::split(tokens, fields[1], boost::is_any_of("\t "), + boost::token_compress_on); + boost::dynamic_bitset<> temp(Gamma_size + 1); + + for (vector::iterator it = tokens.begin(); it != tokens.end(); + it++) { // Setting the proper bits to 1 + temp[static_cast(atoi((*it).c_str()))] = 1; + } - // std::cout <<"setid : "<< set_id << " READING: " << temp << std::endl; - set_ids[temp]=set_id; - id_sets[set_id]=temp; - } + // std::cout <<"setid : "<< set_id << " READING: " << temp << std::endl; + set_ids[temp] = set_id; + id_sets[set_id] = temp; } + } } - //Attempt adding something for the root bipartition: - boost::dynamic_bitset<> temp (Gamma_size +1); - for (auto i = 1 ; i temp(Gamma_size + 1); + for (auto i = 1; i < Gamma_size + 1; ++i) { temp[i] = 1; } id_sets[-1] = temp; set_ids[temp] = -1; - for ( auto it = id_sets.begin(); it != id_sets.end(); it++ ) - { - size_t size = 0; - for (auto i=0; i< Gamma_size + 1; ++i) { - // if (BipartitionTools::testBit( (*it).second, static_cast(i) ) ) { - if ( (*it).second[i] ) { - size+=1; - } - } - set_sizes[ (*it).first ] = size; - size_ordered_bips[size].push_back( (*it).first ); - // std::cout << size << " AND "<< (*it).first <(i) ) ) + // { + if ((*it).second[i]) { + size += 1; + } } -/* for ( auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++ ) - { - VectorTools::print ( (*it).second ); - }*/ - - //cout <<"Bip " << Bip_counts.size()< leaf_set) } */ -string approx_posterior::set2name(boost::dynamic_bitset<> leaf_set) const -{ - string name=""; - for (auto i = 0; i< Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit(leaf_set, static_cast(i)) ) { - if ( leaf_set[i] ) { - //stringstream tmp; - //tmp< leaf_set) const { + string name = ""; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit(leaf_set, static_cast(i)) ) { + if (leaf_set[i]) { + // stringstream tmp; + // tmp< leaf_set) -{ - long int id=set_ids[leaf_set]; - if (!id) - { - last_leafset_id++; - set_ids[leaf_set]=last_leafset_id; - // TMP for debug - //Dip_levels[leaf_set.size()].push_back(last_leafset_id); - //id2name[last_leafset_id]=set2name(leaf_set); - //VEC - std::unordered_map< pair,scalar_type> tmp ; - Dip_counts.push_back(tmp); - //VEC - id_sets[last_leafset_id]=leaf_set; - Bip_bls[last_leafset_id]=0; - // std::cout << "notid: "<< last_leafset_id < leaf_set) { + long int id = set_ids[leaf_set]; + if (!id) { + last_leafset_id++; + set_ids[leaf_set] = last_leafset_id; + // TMP for debug + // Dip_levels[leaf_set.size()].push_back(last_leafset_id); + // id2name[last_leafset_id]=set2name(leaf_set); + // VEC + std::unordered_map, scalar_type> tmp; + Dip_counts.push_back(tmp); + // VEC + id_sets[last_leafset_id] = leaf_set; + Bip_bls[last_leafset_id] = 0; + // std::cout << "notid: "<< last_leafset_id <(2*Gamma_size-5); - n1=max(2,n1); - n2=max(2,n2); - return boost::math::double_factorial(2*n1-3)*boost::math::double_factorial(2*n2-3); +scalar_type approx_posterior::Bi(int n2) const { + int n1 = Gamma_size - n2; + if (n2 == 1 or n1 == 1) + return boost::math::double_factorial(2 * Gamma_size - 5); + n1 = max(2, n1); + n2 = max(2, n2); + return boost::math::double_factorial(2 * n1 - 3) * + boost::math::double_factorial(2 * n2 - 3); } - -scalar_type approx_posterior::Tri(int n2,int n3) const -{ - int n1=Gamma_size-n2-n3; - n1=max(2,n1); - n2=max(2,n2); - n3=max(2,n3); - return boost::math::double_factorial(2*n1-3)*boost::math::double_factorial(2*n2-3)*boost::math::double_factorial(2*n3-3); +scalar_type approx_posterior::Tri(int n2, int n3) const { + int n1 = Gamma_size - n2 - n3; + n1 = max(2, n1); + n2 = max(2, n2); + n3 = max(2, n3); + return boost::math::double_factorial(2 * n1 - 3) * + boost::math::double_factorial(2 * n2 - 3) * + boost::math::double_factorial(2 * n3 - 3); } - -scalar_type approx_posterior::binomial(int n,int m) const -{ - //maybe worth caching - return boost::math::binomial_coefficient(n,m); +scalar_type approx_posterior::binomial(int n, int m) const { + // maybe worth caching + return boost::math::binomial_coefficient(n, m); } - -scalar_type approx_posterior::trinomial(int n1,int n2, int n3) const -{ +scalar_type approx_posterior::trinomial(int n1, int n2, int n3) const { //(n,m)!=binomial(n+m,m) //(n1,n2,n3)!= (n1+n2,n3)! (n1,n2)! = binomial(n1+n2+n3,n3) binomial(n1+n2,n1) // binomial(|Gamma|,i+j) binomial(i+j,j) - //cf. http://mathworld.wolfram.com/MultinomialCoefficient.html - return binomial(n1+n2+n3,n3)*binomial(n1+n2,n2); + // cf. http://mathworld.wolfram.com/MultinomialCoefficient.html + return binomial(n1 + n2 + n3, n3) * binomial(n1 + n2, n2); } /* @@ -458,7 +419,8 @@ scalar_type approx_posterior::p_bip(set gamma) } -scalar_type approx_posterior::p_dip(set gamma,set gammap,set gammapp) +scalar_type approx_posterior::p_dip(set gamma,set gammap,set +gammapp) { if (Gamma_size<4) return 1; @@ -469,1267 +431,1239 @@ scalar_type approx_posterior::p_dip(set gamma,set gammap,set gamm } */ -scalar_type approx_posterior::p_bip(boost::dynamic_bitset<> gamma) const -{ - if (Gamma_size<4) - return 1; - long int g_id; - if (set_ids.count(gamma)) - g_id=set_ids.at(gamma); - else - g_id=-10; -// std::cout << "g_id: "<< g_id << " TO "<< p_bip(g_id) < gamma) const { + if (Gamma_size < 4) + return 1; + long int g_id; + if (set_ids.count(gamma)) + g_id = set_ids.at(gamma); + else + g_id = -10; + // std::cout << "g_id: "<< g_id << " TO "<< p_bip(g_id) < gamma, + boost::dynamic_bitset<> gammap, + boost::dynamic_bitset<> gammapp) const { + if (Gamma_size < 4) + return 1; + long int g_id; + if (set_ids.count(gamma)) + g_id = set_ids.at(gamma); + else + g_id = -10; -scalar_type approx_posterior::p_dip(boost::dynamic_bitset<> gamma, boost::dynamic_bitset<> gammap, boost::dynamic_bitset<> gammapp) const -{ - if (Gamma_size<4) - return 1; - long int g_id; - if (set_ids.count(gamma)) - g_id=set_ids.at(gamma); - else - g_id=-10; - - long int gp_id; - if (set_ids.count(gammap)) - gp_id=set_ids.at(gammap); - else - gp_id=-10; - - long int gpp_id; - if (set_ids.count(gammapp)) - gpp_id=set_ids.at(gammapp); - else - gpp_id=-10; - - //std::cout << "g_id: "<< g_id << " AND "<< gp_id << " AND "<< gpp_id << " TO: "<< p_dip(g_id, gp_id, gpp_id) <<" " << p_dip( g_id, gpp_id, gp_id) <gp_id) - return p_dip( g_id, gp_id, gpp_id); - else - return p_dip( g_id, gpp_id, gp_id); -} + long int gp_id; + if (set_ids.count(gammap)) + gp_id = set_ids.at(gammap); + else + gp_id = -10; + long int gpp_id; + if (set_ids.count(gammapp)) + gpp_id = set_ids.at(gammapp); + else + gpp_id = -10; -scalar_type approx_posterior::p_bip(long int g_id) const -{ - if (Gamma_size<4) - return 1; + // std::cout << "g_id: "<< g_id << " AND "<< gp_id << " AND "<< gpp_id << " + // TO: "<< p_dip(g_id, gp_id, gpp_id) <<" " << p_dip( g_id, gpp_id, gp_id) + // < gp_id) + return p_dip(g_id, gp_id, gpp_id); + else + return p_dip(g_id, gpp_id, gp_id); +} - scalar_type Bip_count=0; +scalar_type approx_posterior::p_bip(long int g_id) const { + if (Gamma_size < 4) + return 1; - if (Bip_counts.count(g_id)==0 or g_id==-10 or !g_id) - { - //never saw gamma in sample - Bip_count=0; - } - else - { - Bip_count=Bip_counts.at(g_id); - } - //if ( gamma.size()==1 or (int)gamma.size()==Gamma_size-1) Bip_count=observations; - if (set_sizes.count(g_id)==0 or g_id==-10) Bip_count=0; - else if ( set_sizes.at(g_id)==1 or set_sizes.at(g_id)==Gamma_size-1) Bip_count=observations; + scalar_type Bip_count = 0; - if ( alpha>0 ) - return Bip_count / ( observations+alpha ) + ( alpha/N_Gamma*Bi ( set_sizes.at ( g_id ) ) ) / ( observations+alpha ); + if (Bip_counts.count(g_id) == 0 or g_id == -10 or !g_id) { + // never saw gamma in sample + Bip_count = 0; + } else { + Bip_count = Bip_counts.at(g_id); + } + // if ( gamma.size()==1 or (int)gamma.size()==Gamma_size-1) + // Bip_count=observations; + if (set_sizes.count(g_id) == 0 or g_id == -10) + Bip_count = 0; + else if (set_sizes.at(g_id) == 1 or set_sizes.at(g_id) == Gamma_size - 1) + Bip_count = observations; + + if (alpha > 0) + return Bip_count / (observations + alpha) + + (alpha / N_Gamma * Bi(set_sizes.at(g_id))) / (observations + alpha); else return Bip_count / observations; } - -scalar_type approx_posterior::p_dip(long int g_id,long int gp_id,long int gpp_id) const -{ - if (Gamma_size<4) +scalar_type approx_posterior::p_dip(long int g_id, long int gp_id, + long int gpp_id) const { + if (Gamma_size < 4) return 1; - scalar_type beta_switch=1; - scalar_type Dip_count=0,Bip_count=0; - if (Bip_counts.count(g_id)==0 or g_id==-10 or !g_id) - { - //never saw gamma in sample - beta_switch=0.; - Bip_count=0; - Dip_count=0; + scalar_type beta_switch = 1; + scalar_type Dip_count = 0, Bip_count = 0; + if (Bip_counts.count(g_id) == 0 or g_id == -10 or !g_id) { + // never saw gamma in sample + beta_switch = 0.; + Bip_count = 0; + Dip_count = 0; + } else { + // set parts; + // parts.insert(gp_id); + // parts.insert(gpp_id); + pair parts; + if (gpp_id > gp_id) { + parts.first = gp_id; + parts.second = gpp_id; + } else { + parts.first = gpp_id; + parts.second = gp_id; } - else - { - //set parts; - //parts.insert(gp_id); - //parts.insert(gpp_id); - pair parts; - if (gpp_id>gp_id) - { - parts.first = gp_id; - parts.second = gpp_id; - } - else - { - parts.first = gpp_id; - parts.second = gp_id; - } - - Bip_count=Bip_counts.at(g_id); - //Dip_count=Dip_counts.at(g_id).at(parts); - if (gp_id==-10 or gpp_id==-10 or Dip_counts.at(g_id).count(parts)==0 or !gp_id or !gpp_id) - { - //never saw gammap-gammapp partition in sample - Dip_count=0; - } - else - { - Dip_count=Dip_counts.at(g_id).at(parts); - } + + Bip_count = Bip_counts.at(g_id); + // Dip_count=Dip_counts.at(g_id).at(parts); + if (gp_id == -10 or gpp_id == -10 or + Dip_counts.at(g_id).count(parts) == 0 or !gp_id or !gpp_id) { + // never saw gammap-gammapp partition in sample + Dip_count = 0; + } else { + Dip_count = Dip_counts.at(g_id).at(parts); } - if (set_sizes.count(g_id)==0 or set_sizes.at(g_id)==1 or set_sizes.at(g_id)==Gamma_size-1) Bip_count=observations; + } + if (set_sizes.count(g_id) == 0 or set_sizes.at(g_id) == 1 or + set_sizes.at(g_id) == Gamma_size - 1) + Bip_count = observations; // ? above is correct or not ? - //cout << "Dip:"<0 or beta>0 ) - return ( Dip_count + ( alpha/N_Gamma*Tri ( set_sizes.at ( gp_id ),set_sizes.at ( gpp_id ) ) ) + beta_switch*beta/ ( pow ( 2.,set_sizes.at ( g_id )-1 )-1 ) ) / ( Bip_count + ( alpha/N_Gamma*Bi ( set_sizes.at ( g_id ) ) ) + beta_switch*beta ); - else - return Dip_count/Bip_count; + // cout << "Dip:"< 0 or beta > 0) + return (Dip_count + + (alpha / N_Gamma * Tri(set_sizes.at(gp_id), set_sizes.at(gpp_id))) + + beta_switch * beta / (pow(2., set_sizes.at(g_id) - 1) - 1)) / + (Bip_count + (alpha / N_Gamma * Bi(set_sizes.at(g_id))) + + beta_switch * beta); + else + return Dip_count / Bip_count; } - // an unrooted tree given by its Newick string (which can be rooted) -map < boost::dynamic_bitset<> ,scalar_type > approx_posterior::recompose(string G_string) const -{ - map < boost::dynamic_bitset<> ,scalar_type > return_map; - map dedges;//del-loc - map > neighbor;//del-loc - - tree_type * G = TreeTemplateTools::parenthesisToTree(G_string,false);//del-loc - - if (G->isRooted()) G->unroot(); - vector nodes= G -> getNodes(); //del-loc - - //Find all directed edges - for( vector::iterator it=nodes.begin(); it!=nodes.end(); it++) - { - Node * from = *it; - if (from -> hasFather()) - { - Node * father = from->getFather(); - neighbor[from].push_back(father); - } - if (! from->isLeaf() ) - { - vector sons=from->getSons(); //del-loc - for( vector::iterator it_sons=sons.begin(); it_sons!=sons.end(); it_sons++) - neighbor[from].push_back((*it_sons)); - sons.clear(); - } - for( vector::iterator it_tos=neighbor[from].begin(); it_tos!=neighbor[from].end(); it_tos++) - { - dedge_type dedge; - dedge.first = from; - dedge.second = *it_tos; - dedges[dedge]=0; - } +map, scalar_type> +approx_posterior::recompose(string G_string) const { + map, scalar_type> return_map; + map dedges; // del-loc + map> neighbor; // del-loc + + tree_type *G = + TreeTemplateTools::parenthesisToTree(G_string, false); // del-loc + + if (G->isRooted()) + G->unroot(); + vector nodes = G->getNodes(); // del-loc + + // Find all directed edges + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + Node *from = *it; + if (from->hasFather()) { + Node *father = from->getFather(); + neighbor[from].push_back(father); } + if (!from->isLeaf()) { + vector sons = from->getSons(); // del-loc + for (vector::iterator it_sons = sons.begin(); + it_sons != sons.end(); it_sons++) + neighbor[from].push_back((*it_sons)); + sons.clear(); + } + for (vector::iterator it_tos = neighbor[from].begin(); + it_tos != neighbor[from].end(); it_tos++) { + dedge_type dedge; + dedge.first = from; + dedge.second = *it_tos; + dedges[dedge] = 0; + } + } nodes.clear(); - map > flat_names; //del-loc - map q; //del-loc + map> flat_names; // del-loc + map q; // del-loc // Name all leaves - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - - dedge_type dedge=(*it).first; - Node * from = dedge.first; - Node * to = dedge.second; - //visit dedges from a leaf as these can be named - if (from->isLeaf()) - { - if (flat_names.find (dedge) == flat_names.end() ) { - //int* temp = new int[nbint]; - boost::dynamic_bitset<> temp(Gamma_size+1); - /* for (auto i=0; i< nbint; ++i) { //Resetting all bits - temp[i] = 0; - //BipartitionTools::bit0( temp, static_cast(i) ); - }*/ - flat_names[dedge] = temp; + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + + dedge_type dedge = (*it).first; + Node *from = dedge.first; + Node *to = dedge.second; + // visit dedges from a leaf as these can be named + if (from->isLeaf()) { + if (flat_names.find(dedge) == flat_names.end()) { + // int* temp = new int[nbint]; + boost::dynamic_bitset<> temp(Gamma_size + 1); + /* for (auto i=0; i< nbint; ++i) { //Resetting all bits + temp[i] = 0; + //BipartitionTools::bit0( temp, static_cast(i) ); + }*/ + flat_names[dedge] = temp; + } + // BipartitionTools::bit1(flat_names.at(dedge), static_cast( + // leaf_ids.at(from->getName() ) ) ); + flat_names.at(dedge)[static_cast(leaf_ids.at(from->getName()))] = 1; + // flat_names[dedge].insert(leaf_ids[from->getName()]); + q[dedge] = 1; + return_map[flat_names[dedge]] = q[dedge]; + // mark named + dedges[dedge] = -1; + // proceed to dedges in next level - dedges from cherries can now be + // named and at least one cherry must exist + for (vector::iterator it_tos = neighbor[to].begin(); + it_tos != neighbor[to].end(); it_tos++) + if ((*it_tos) != from) { + dedge_type dedge_out; + dedge_out.first = to; + dedge_out.second = *it_tos; + dedges[dedge_out] += 1; } - //BipartitionTools::bit1(flat_names.at(dedge), static_cast( leaf_ids.at(from->getName() ) ) ); - flat_names.at(dedge)[ static_cast( leaf_ids.at(from->getName() ) ) ] = 1; - // flat_names[dedge].insert(leaf_ids[from->getName()]); - q[dedge]=1; - return_map[flat_names[dedge]]=q[dedge]; - //mark named - dedges[dedge]=-1; - //proceed to dedges in next level - dedges from cherries can now be named and at least one cherry must exist - for( vector::iterator it_tos=neighbor[to].begin(); it_tos!=neighbor[to].end(); it_tos++) - if ((*it_tos)!=from) - { - dedge_type dedge_out; - dedge_out.first = to; - dedge_out.second = *it_tos; - dedges[dedge_out]+=1; - } - } } + } - bool edges_left=false; - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - dedge_type dedge=(*it).first; - if (dedges[dedge]!=-1) - edges_left=true; //Couldn't we add a "break" here? + bool edges_left = false; + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + dedge_type dedge = (*it).first; + if (dedges[dedge] != -1) + edges_left = true; // Couldn't we add a "break" here? + } + while (edges_left) { + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + dedge_type dedge = (*it).first; + // Process edges that can be named + if (dedges[dedge] == 2) { + Node *from = dedge.first; + Node *to = dedge.second; + vector dedges_in; // del-loc + for (vector::iterator it_tos = neighbor[from].begin(); + it_tos != neighbor[from].end(); it_tos++) + if (*it_tos != to) { + dedge_type dedge_in; + dedge_in.first = *it_tos; + dedge_in.second = from; + dedges_in.push_back(dedge_in); + } + + /* + set leaf_set_in_1=flat_names[dedges_in[0]]; + set leaf_set_in_2=flat_names[dedges_in[1]]; + //flat naming + for (set::iterator + sit=leaf_set_in_1.begin();sit!=leaf_set_in_1.end();sit++) + flat_names[dedge].insert((*sit)); + for (set::iterator + sit=leaf_set_in_2.begin();sit!=leaf_set_in_2.end();sit++) + flat_names[dedge].insert((*sit)); + */ + /* + int* leaf_set_in_1=flat_names[dedges_in[0]]; + int* leaf_set_in_2=flat_names[dedges_in[1]]; + */ + boost::dynamic_bitset<> leaf_set_in_1(Gamma_size + 1); + leaf_set_in_1 = flat_names[dedges_in[0]]; + boost::dynamic_bitset<> leaf_set_in_2(Gamma_size + 1); + leaf_set_in_2 = flat_names[dedges_in[1]]; + if (flat_names.find(dedge) == flat_names.end()) { + // int* temp = new int[nbint]; + boost::dynamic_bitset<> temp(Gamma_size + 1); + /* for (auto i=0; i< nbint; ++i) { //Resetting all bits + temp[i] = 0; + //BipartitionTools::bit0( temp, static_cast(i) ); + }*/ + flat_names[dedge] = temp; + } + // BipartitionTools::bitOr(flat_names[dedge], leaf_set_in_1, + // leaf_set_in_2, nbint); + flat_names[dedge] = leaf_set_in_1 | leaf_set_in_2; + + // flat naming + // dip_type dip; + // dip.first=set2id(flat_names[dedge]); + // dip.second.insert(set2id(leaf_set_in_1)); + // dip.second.insert(set2id(leaf_set_in_2)); + // XX + // cout << set2name(leaf_set_in_1) << " | "<< set2name(leaf_set_in_2) << + // endl; cout << q[dedges_in[0]] << " " << q[dedges_in[1]]<< " " << + // p_dip(flat_names[dedge],leaf_set_in_1,leaf_set_in_2) << endl; + q[dedge] = q[dedges_in[0]] * q[dedges_in[1]] * + p_dip(flat_names[dedge], leaf_set_in_1, leaf_set_in_2); + return_map[flat_names[dedge]] = q[dedge]; + + // mark named + dedges[dedge] = -1; + // proceed to dedges in next level - new dedges can now be named + for (vector::iterator it_tos = neighbor[to].begin(); + it_tos != neighbor[to].end(); it_tos++) + if ((*it_tos) != from) { + dedge_type dedge_out; + dedge_out.first = to; + dedge_out.second = *it_tos; + dedges[dedge_out] += 1; + } + dedges_in.clear(); + } } - while (edges_left) - { - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - dedge_type dedge=(*it).first; - //Process edges that can be named - if (dedges[dedge]==2) - { - Node * from = dedge.first; - Node * to = dedge.second; - vector dedges_in; //del-loc - for( vector::iterator it_tos=neighbor[from].begin(); it_tos!=neighbor[from].end(); it_tos++) - if (*it_tos!=to) - { - dedge_type dedge_in; - dedge_in.first = *it_tos; - dedge_in.second = from; - dedges_in.push_back(dedge_in); - } - - /* - set leaf_set_in_1=flat_names[dedges_in[0]]; - set leaf_set_in_2=flat_names[dedges_in[1]]; - //flat naming - for (set::iterator sit=leaf_set_in_1.begin();sit!=leaf_set_in_1.end();sit++) - flat_names[dedge].insert((*sit)); - for (set::iterator sit=leaf_set_in_2.begin();sit!=leaf_set_in_2.end();sit++) - flat_names[dedge].insert((*sit)); - */ - /* - int* leaf_set_in_1=flat_names[dedges_in[0]]; - int* leaf_set_in_2=flat_names[dedges_in[1]]; - */ - boost::dynamic_bitset<> leaf_set_in_1(Gamma_size+1); - leaf_set_in_1 =flat_names[dedges_in[0]] ; - boost::dynamic_bitset<> leaf_set_in_2(Gamma_size+1); - leaf_set_in_2 =flat_names[dedges_in[1]] ; - if (flat_names.find (dedge) == flat_names.end() ) { - //int* temp = new int[nbint]; - boost::dynamic_bitset<> temp( Gamma_size+1 ); - /* for (auto i=0; i< nbint; ++i) { //Resetting all bits - temp[i] = 0; - //BipartitionTools::bit0( temp, static_cast(i) ); - }*/ - flat_names[dedge] = temp; - } - // BipartitionTools::bitOr(flat_names[dedge], leaf_set_in_1, leaf_set_in_2, nbint); - flat_names[dedge] = leaf_set_in_1 | leaf_set_in_2 ; - - //flat naming - //dip_type dip; - //dip.first=set2id(flat_names[dedge]); - //dip.second.insert(set2id(leaf_set_in_1)); - //dip.second.insert(set2id(leaf_set_in_2)); - //XX - //cout << set2name(leaf_set_in_1) << " | "<< set2name(leaf_set_in_2) << endl; - //cout << q[dedges_in[0]] << " " << q[dedges_in[1]]<< " " << p_dip(flat_names[dedge],leaf_set_in_1,leaf_set_in_2) << endl; - q[dedge]=q[dedges_in[0]]*q[dedges_in[1]]*p_dip(flat_names[dedge],leaf_set_in_1,leaf_set_in_2); - return_map[flat_names[dedge]]=q[dedge]; - - //mark named - dedges[dedge]=-1; - //proceed to dedges in next level - new dedges can now be named - for( vector::iterator it_tos=neighbor[to].begin(); it_tos!=neighbor[to].end(); it_tos++) - if ((*it_tos)!=from) - { - dedge_type dedge_out; - dedge_out.first = to; - dedge_out.second = *it_tos; - dedges[dedge_out]+=1; - } - dedges_in.clear(); - } - } - edges_left=false; - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - dedge_type dedge=(*it).first; - if (dedges[dedge]!=-1) - edges_left=true; - } + edges_left = false; + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + dedge_type dedge = (*it).first; + if (dedges[dedge] != -1) + edges_left = true; } + } - //del-locs + // del-locs dedges.clear(); - for( map >::iterator it=neighbor.begin(); it!=neighbor.end(); it++) + for (map>::iterator it = neighbor.begin(); + it != neighbor.end(); it++) (*it).second.clear(); neighbor.clear(); delete G; - for( auto it=flat_names.begin(); it!=flat_names.end(); it++) { + for (auto it = flat_names.begin(); it != flat_names.end(); it++) { //(*it).second.clear(); - for (auto i=0; i< Gamma_size+1; ++i) { //Resetting all bits - (*it).second[i] = 0; - //BipartitionTools::bit0( (*it).second, static_cast(i) ); - } + for (auto i = 0; i < Gamma_size + 1; ++i) { // Resetting all bits + (*it).second[i] = 0; + // BipartitionTools::bit0( (*it).second, static_cast(i) ); } + } flat_names.clear(); q.clear(); return return_map; } - // an unrooted tree given by its Newick string (which can be rooted) -void approx_posterior::decompose(string G_string, set * bip_ids ,scalar_type weight) -{ - //VEC - std::unordered_map< pair,scalar_type> tmp ; +void approx_posterior::decompose(string G_string, set *bip_ids, + scalar_type weight) { + // VEC + std::unordered_map, scalar_type> tmp; Dip_counts.push_back(tmp); - //VEC + // VEC - //vector return_dips; - map dedges;//del-loc - map > neighbor;//del-loc + // vector return_dips; + map dedges; // del-loc + map> neighbor; // del-loc - tree_type * G = TreeTemplateTools::parenthesisToTree(G_string,false);//del-loc + tree_type *G = + TreeTemplateTools::parenthesisToTree(G_string, false); // del-loc - if (G->isRooted()) G->unroot(); - vector nodes= G -> getNodes(); //del-loc + if (G->isRooted()) + G->unroot(); + vector nodes = G->getNodes(); // del-loc - //Find all directed edges - for( vector::iterator it=nodes.begin(); it!=nodes.end(); it++) - { - Node * from = *it; - if (from -> hasFather()) - { - Node * father = from->getFather(); - neighbor[from].push_back(father); - } - if (! from->isLeaf() ) - { - vector sons=from->getSons(); //del-loc - for( vector::iterator it_sons=sons.begin(); it_sons!=sons.end(); it_sons++) - neighbor[from].push_back((*it_sons)); - sons.clear(); - } - for( vector::iterator it_tos=neighbor[from].begin(); it_tos!=neighbor[from].end(); it_tos++) - { - dedge_type dedge; - dedge.first = from; - dedge.second = *it_tos; - dedges[dedge]=0; - } + // Find all directed edges + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + Node *from = *it; + if (from->hasFather()) { + Node *father = from->getFather(); + neighbor[from].push_back(father); + } + if (!from->isLeaf()) { + vector sons = from->getSons(); // del-loc + for (vector::iterator it_sons = sons.begin(); + it_sons != sons.end(); it_sons++) + neighbor[from].push_back((*it_sons)); + sons.clear(); + } + for (vector::iterator it_tos = neighbor[from].begin(); + it_tos != neighbor[from].end(); it_tos++) { + dedge_type dedge; + dedge.first = from; + dedge.second = *it_tos; + dedges[dedge] = 0; } + } nodes.clear(); - - map > flat_names; //del-loc + map> flat_names; // del-loc // Name all leaves - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - - dedge_type dedge=(*it).first; - Node * from = dedge.first; - Node * to = dedge.second; - //visit dedges from a leaf as these can be named - if (from->isLeaf()) - { - if (flat_names.find (dedge) == flat_names.end() ) { - boost::dynamic_bitset<> temp(Gamma_size+1); - flat_names[dedge] = temp; - } - /* if (flat_names.find (dedge) == flat_names.end() ) { - int* temp = new int[nbint]; - for (auto i=0; i< Gamma_size; ++i) { //Resetting all bits - temp[i] = 0; - //BipartitionTools::bit0( temp, static_cast(i) ); - } - flat_names[dedge] = temp; + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + + dedge_type dedge = (*it).first; + Node *from = dedge.first; + Node *to = dedge.second; + // visit dedges from a leaf as these can be named + if (from->isLeaf()) { + if (flat_names.find(dedge) == flat_names.end()) { + boost::dynamic_bitset<> temp(Gamma_size + 1); + flat_names[dedge] = temp; + } + /* if (flat_names.find (dedge) == flat_names.end() ) { + int* temp = new int[nbint]; + for (auto i=0; i< Gamma_size; ++i) { //Resetting all bits + temp[i] = 0; + //BipartitionTools::bit0( temp, static_cast(i) ); + } + flat_names[dedge] = temp; + } + BipartitionTools::bit1(flat_names[dedge], static_cast( + leaf_ids[from->getName()] ) ); + */ + flat_names.at(dedge)[static_cast(leaf_ids[from->getName()])] = 1; + // flat_names[dedge].insert(leaf_ids[from->getName()]); + // bl - hack + long int g_id = set2id(flat_names[dedge]); + if (from->hasDistanceToFather()) + Bip_bls[g_id] += from->getDistanceToFather(); + else + Bip_bls[g_id] += 0; + // mark named + dedges[dedge] = -1; + // proceed to dedges in next level - dedges from cherries can now be + // named and at least one cherry must exist + for (vector::iterator it_tos = neighbor[to].begin(); + it_tos != neighbor[to].end(); it_tos++) + if ((*it_tos) != from) { + dedge_type dedge_out; + dedge_out.first = to; + dedge_out.second = *it_tos; + dedges[dedge_out] += 1; } - BipartitionTools::bit1(flat_names[dedge], static_cast( leaf_ids[from->getName()] ) ); - */ - flat_names.at(dedge)[static_cast( leaf_ids[from->getName()] )] = 1; - // flat_names[dedge].insert(leaf_ids[from->getName()]); - //bl - hack - long int g_id=set2id(flat_names[dedge]); - if (from->hasDistanceToFather()) - Bip_bls[g_id]+=from->getDistanceToFather(); - else - Bip_bls[g_id]+=0; - //mark named - dedges[dedge]=-1; - //proceed to dedges in next level - dedges from cherries can now be named and at least one cherry must exist - for( vector::iterator it_tos=neighbor[to].begin(); it_tos!=neighbor[to].end(); it_tos++) - if ((*it_tos)!=from) - { - dedge_type dedge_out; - dedge_out.first = to; - dedge_out.second = *it_tos; - dedges[dedge_out]+=1; - } - } - } - - - bool edges_left=false; - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - dedge_type dedge=(*it).first; - if (dedges[dedge]!=-1) - edges_left=true; } + } + bool edges_left = false; + for (map::iterator it = dedges.begin(); it != dedges.end(); + it++) { + dedge_type dedge = (*it).first; + if (dedges[dedge] != -1) + edges_left = true; + } - if (G -> getLeaves().size()==2) - { - Bip_counts[(long int) 1]+=weight; + if (G->getLeaves().size() == 2) { + Bip_counts[(long int)1] += weight; - } - else - while (edges_left) - { - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - - dedge_type dedge=(*it).first; - //Process edges that can be named - if (dedges[dedge]==2) - { - Node * from = dedge.first; - Node * to = dedge.second; - vector dedges_in; //del-loc - for( vector::iterator it_tos=neighbor[from].begin(); it_tos!=neighbor[from].end(); it_tos++) - if (*it_tos!=to) - { - dedge_type dedge_in; - dedge_in.first = *it_tos; - dedge_in.second = from; - dedges_in.push_back(dedge_in); - } - /* - set leaf_set_in_1=flat_names[dedges_in[0]]; - set leaf_set_in_2=flat_names[dedges_in[1]]; - //flat naming - for (set::iterator sit=leaf_set_in_1.begin();sit!=leaf_set_in_1.end();sit++) - flat_names[dedge].insert((*sit)); - for (set::iterator sit=leaf_set_in_2.begin();sit!=leaf_set_in_2.end();sit++) - flat_names[dedge].insert((*sit)); - //flat naming - */ - boost::dynamic_bitset<> leaf_set_in_1=flat_names[dedges_in[0]]; - boost::dynamic_bitset<> leaf_set_in_2=flat_names[dedges_in[1]]; - if (flat_names.find (dedge) == flat_names.end() ) { - boost::dynamic_bitset<> temp( Gamma_size+1 ); - flat_names[dedge] = temp; - } - /* - if (flat_names.find (dedge) == flat_names.end() ) { - int* temp = new int[nbint]; - for (auto i=0; i< Gamma_size; ++i) { //Resetting all bits - temp[i] = 0; - //BipartitionTools::bit0( temp, static_cast(i) ); - } - flat_names[dedge] = temp; + } else + while (edges_left) { + for (map::iterator it = dedges.begin(); + it != dedges.end(); it++) { + + dedge_type dedge = (*it).first; + // Process edges that can be named + if (dedges[dedge] == 2) { + Node *from = dedge.first; + Node *to = dedge.second; + vector dedges_in; // del-loc + for (vector::iterator it_tos = neighbor[from].begin(); + it_tos != neighbor[from].end(); it_tos++) + if (*it_tos != to) { + dedge_type dedge_in; + dedge_in.first = *it_tos; + dedge_in.second = from; + dedges_in.push_back(dedge_in); + } + /* + set leaf_set_in_1=flat_names[dedges_in[0]]; + set leaf_set_in_2=flat_names[dedges_in[1]]; + //flat naming + for (set::iterator + sit=leaf_set_in_1.begin();sit!=leaf_set_in_1.end();sit++) + flat_names[dedge].insert((*sit)); + for (set::iterator + sit=leaf_set_in_2.begin();sit!=leaf_set_in_2.end();sit++) + flat_names[dedge].insert((*sit)); + //flat naming + */ + boost::dynamic_bitset<> leaf_set_in_1 = flat_names[dedges_in[0]]; + boost::dynamic_bitset<> leaf_set_in_2 = flat_names[dedges_in[1]]; + if (flat_names.find(dedge) == flat_names.end()) { + boost::dynamic_bitset<> temp(Gamma_size + 1); + flat_names[dedge] = temp; + } + /* + if (flat_names.find (dedge) == flat_names.end() ) { + int* temp = new int[nbint]; + for (auto i=0; i< Gamma_size; ++i) { //Resetting all bits + temp[i] = 0; + //BipartitionTools::bit0( temp, static_cast(i) ); } - BipartitionTools::bitOr(flat_names[dedge], leaf_set_in_1, leaf_set_in_2, nbint); - */ - flat_names[dedge] = leaf_set_in_1 | leaf_set_in_2; - - long int g_id=set2id(flat_names[dedge]); - - //set parts; - //parts.insert(set2id(leaf_set_in_1)); - //parts.insert(set2id(leaf_set_in_2)); - - pair parts; - long int tmp_id1=set2id(leaf_set_in_1); - long int tmp_id2=set2id(leaf_set_in_2); - if (tmp_id1hasFather() and from->getFather()==to) - { - if (from->hasDistanceToFather()) - Bip_bls[g_id]+=from->getDistanceToFather(); - else - Bip_bls[g_id]+=0; - } - else if (to->hasFather() and to->getFather()==from) - { - if (to->hasDistanceToFather()) - Bip_bls[g_id]+=to->getDistanceToFather(); - else - Bip_bls[g_id]+=0; - } - else - { - cout << "impossible" <insert(g_id); - - //dip.first=g_id; - //dip.second=parts; - //return_dips.push_back(dip); - - //mark named - dedges[dedge]=-1; - //proceed to dedges in next level - new dedges can now be named - for( vector::iterator it_tos=neighbor[to].begin(); it_tos!=neighbor[to].end(); it_tos++) - if ((*it_tos)!=from) - { - dedge_type dedge_out; - dedge_out.first = to; - dedge_out.second = *it_tos; - dedges[dedge_out]+=1; - } - dedges_in.clear(); - - } - - } - edges_left=false; - for( map::iterator it=dedges.begin(); it!=dedges.end(); it++) - { - dedge_type dedge=(*it).first; - if (dedges[dedge]!=-1) - edges_left=true; - } + flat_names[dedge] = temp; + } + BipartitionTools::bitOr(flat_names[dedge], leaf_set_in_1, + leaf_set_in_2, nbint); + */ + flat_names[dedge] = leaf_set_in_1 | leaf_set_in_2; + + long int g_id = set2id(flat_names[dedge]); + + // set parts; + // parts.insert(set2id(leaf_set_in_1)); + // parts.insert(set2id(leaf_set_in_2)); + + pair parts; + long int tmp_id1 = set2id(leaf_set_in_1); + long int tmp_id2 = set2id(leaf_set_in_2); + if (tmp_id1 < tmp_id2) { + parts.first = tmp_id1; + parts.second = tmp_id2; + } else { + parts.first = tmp_id2; + parts.second = tmp_id1; + } + // bl - hack + + if (from->hasFather() and from->getFather() == to) { + if (from->hasDistanceToFather()) + Bip_bls[g_id] += from->getDistanceToFather(); + else + Bip_bls[g_id] += 0; + } else if (to->hasFather() and to->getFather() == from) { + if (to->hasDistanceToFather()) + Bip_bls[g_id] += to->getDistanceToFather(); + else + Bip_bls[g_id] += 0; + } else { + cout << "impossible" << endl; + } + + // bl - hack + // INTEGRATED COUNTING + Dip_counts[g_id][parts] += weight; + + Bip_counts[g_id] += weight; + + // bipartion naming + if (bip_ids != NULL) + bip_ids->insert(g_id); + + // dip.first=g_id; + // dip.second=parts; + // return_dips.push_back(dip); + + // mark named + dedges[dedge] = -1; + // proceed to dedges in next level - new dedges can now be named + for (vector::iterator it_tos = neighbor[to].begin(); + it_tos != neighbor[to].end(); it_tos++) + if ((*it_tos) != from) { + dedge_type dedge_out; + dedge_out.first = to; + dedge_out.second = *it_tos; + dedges[dedge_out] += 1; + } + dedges_in.clear(); + } + } + edges_left = false; + for (map::iterator it = dedges.begin(); + it != dedges.end(); it++) { + dedge_type dedge = (*it).first; + if (dedges[dedge] != -1) + edges_left = true; } + } - //del-locs + // del-locs dedges.clear(); - for( map >::iterator it=neighbor.begin(); it!=neighbor.end(); it++) + for (map>::iterator it = neighbor.begin(); + it != neighbor.end(); it++) (*it).second.clear(); neighbor.clear(); delete G; - for( auto it=flat_names.begin(); it!=flat_names.end(); it++) { + for (auto it = flat_names.begin(); it != flat_names.end(); it++) { //(*it).second.clear(); - for (auto i=0; i< Gamma_size+1; ++i) { //Resetting all bits - (*it).second[i] = 0; - //BipartitionTools::bit0( (*it).second, static_cast(i) ); - } + for (auto i = 0; i < Gamma_size + 1; ++i) { // Resetting all bits + (*it).second[i] = 0; + // BipartitionTools::bit0( (*it).second, static_cast(i) ); } + } flat_names.clear(); - //return return_dips; + // return return_dips; } -void approx_posterior::observation(vector trees, bool count_topologies, scalar_type weight) -{ - for (vector::iterator it=trees.begin();it!=trees.end();it++) - { - //cout << (*it) << endl; - if (count_topologies) - { - set bip_ids;//del-loc - decompose(*it,&bip_ids,weight);//del-loc - string bip_string="|"; - for (set ::iterator st=bip_ids.begin();st!=bip_ids.end();st++) - bip_string+=(*st)+"|"; - if (bipstring_trees.count(bip_string)==0) - { - bipstring_trees[bip_string]=*it; - tree_bipstrings[*it]=bip_string; - } - tree_counts[bipstring_trees[bip_string]]+=1; - bip_ids.clear(); - } - else decompose(*it,NULL,weight);//del-loc - observations+=weight; - } - //cout << "obsdone." << endl; +void approx_posterior::observation(vector trees, bool count_topologies, + scalar_type weight) { + for (vector::iterator it = trees.begin(); it != trees.end(); it++) { + // cout << (*it) << endl; + if (count_topologies) { + set bip_ids; // del-loc + decompose(*it, &bip_ids, weight); // del-loc + string bip_string = "|"; + for (set::iterator st = bip_ids.begin(); st != bip_ids.end(); st++) + bip_string += (*st) + "|"; + if (bipstring_trees.count(bip_string) == 0) { + bipstring_trees[bip_string] = *it; + tree_bipstrings[*it] = bip_string; + } + tree_counts[bipstring_trees[bip_string]] += 1; + bip_ids.clear(); + } else + decompose(*it, NULL, weight); // del-loc + observations += weight; + } + // cout << "obsdone." << endl; set_sizes.clear(); - for (map > :: iterator it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) + for (map>::iterator it = size_ordered_bips.begin(); + it != size_ordered_bips.end(); it++) (*it).second.clear(); size_ordered_bips.clear(); - for (auto it = id_sets.begin(); it != id_sets.end(); it++) - { - size_t size = 0; - for (auto i=0; i< Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit( (*it).second, i) ) { - if ( (*it).second[i] ) { - size++; - } - } - set_sizes[(*it).first] = size; - size_ordered_bips[ size ].push_back( (*it).first ); + for (auto it = id_sets.begin(); it != id_sets.end(); it++) { + size_t size = 0; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit( (*it).second, i) ) { + if ((*it).second[i]) { + size++; + } } + set_sizes[(*it).first] = size; + size_ordered_bips[size].push_back((*it).first); + } } // of an unrooted tree given by its Newick string (which can be rooted) -scalar_type approx_posterior::p(string tree_string) const -{ - scalar_type p=0; - map < boost::dynamic_bitset<>,scalar_type> rec_map=recompose( tree_string); - for ( auto it=rec_map.begin();it!=rec_map.end();it++) - { - p=(*it).second; - //std::cout << "p: "<< p << std::endl; - boost::dynamic_bitset<> gamma=(*it).first; - boost::dynamic_bitset<> not_gamma = ~gamma; +scalar_type approx_posterior::p(string tree_string) const { + scalar_type p = 0; + map, scalar_type> rec_map = recompose(tree_string); + for (auto it = rec_map.begin(); it != rec_map.end(); it++) { + p = (*it).second; + // std::cout << "p: "<< p << std::endl; + boost::dynamic_bitset<> gamma = (*it).first; + boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; + not_gamma[0] = 0; -/* for (auto i=0; i< Gamma_size; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma,gamma, nbint);*/ - /* for (set::iterator st=Gamma.begin();st!=Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - - p*=rec_map[not_gamma]*p_bip(gamma); - if (std::isnan(p) ) p = 0;//NumConstants::VERY_TINY (); - //std::cout << "rec_map[not_gamma]: "<::iterator st=Gamma.begin();st!=Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + + p *= rec_map[not_gamma] * p_bip(gamma); + if (std::isnan(p)) + p = 0; // NumConstants::VERY_TINY (); + // std::cout << "rec_map[not_gamma]: "< approx_posterior::mpp_tree() const -{ - map qmpp; //del-loc. Map between a bipartition id and its maximum posterior probability. - for ( auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) - for ( auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - long int g_id=(*jt); - // leaves - if ((*it).first==1) - qmpp[g_id]=1; - else - { - scalar_type max_cp=0; - //Go through all resolutions of clade g_id - for ( auto kt = Dip_counts[g_id].begin(); kt != Dip_counts[g_id].end(); kt++) - { - long int gp_id=(*kt).first.first; - long int gpp_id=(*kt).first.second; - scalar_type cp=p_dip(g_id,gp_id,gpp_id)*qmpp[gp_id]*qmpp[gpp_id]; - if (cp>max_cp) max_cp=cp; - } - qmpp[g_id]=max_cp; - } - } - //Now we have maximum posterior probability estimates for all sets of leaves (=bipartitions) - //The second loop computes the maximum posterior probability estimate from all the trees that can be amalgamated (=all the trees that can be written as the junction of two leaf sets) - scalar_type max_pp=0,sum_pp=0; - long int max_bip=-1,max_not_bip=-1; - //we look at everything twice.. - for ( auto it = Bip_counts.begin(); it != Bip_counts.end(); it++) - { - long int g_id=(*it).first; - /* set gamma=id_sets[g_id]; - set not_gamma; - for (set::iterator st=Gamma.begin();st!=Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - boost::dynamic_bitset<> gamma=id_sets.at(g_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - /* for (auto i=0; i< nbint; ++i) { - not_gamma[i] = 0; +pair approx_posterior::mpp_tree() const { + map qmpp; // del-loc. Map between a bipartition id and + // its maximum posterior probability. + for (auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) + for (auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) { + long int g_id = (*jt); + // leaves + if ((*it).first == 1) + qmpp[g_id] = 1; + else { + scalar_type max_cp = 0; + // Go through all resolutions of clade g_id + for (auto kt = Dip_counts[g_id].begin(); kt != Dip_counts[g_id].end(); + kt++) { + long int gp_id = (*kt).first.first; + long int gpp_id = (*kt).first.second; + scalar_type cp = + p_dip(g_id, gp_id, gpp_id) * qmpp[gp_id] * qmpp[gpp_id]; + if (cp > max_cp) + max_cp = cp; } - BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ - long int not_g_id = set_ids.at(not_gamma); - scalar_type pp=qmpp[g_id]*qmpp[not_g_id]*p_bip(g_id); - sum_pp+=pp; - if (max_pp gamma=id_sets[g_id]; + set not_gamma; + for (set::iterator st=Gamma.begin();st!=Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + boost::dynamic_bitset<> gamma = id_sets.at(g_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /* for (auto i=0; i< nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ + long int not_g_id = set_ids.at(not_gamma); + scalar_type pp = qmpp[g_id] * qmpp[not_g_id] * p_bip(g_id); + sum_pp += pp; + if (max_pp < pp) { + max_pp = pp; + max_bip = g_id; + max_not_bip = not_g_id; } + } stringstream bs; - //we looked at everything twice.. - bs< return_pair; - return_pair.first=max_tree; - return_pair.second=max_pp; + // cout << max_tree << endl; + pair return_pair; + return_pair.first = max_tree; + return_pair.second = max_pp; return return_pair; } - -string approx_posterior::mpp_backtrack(long int g_id, map * qmpp) const -{ - //leaf - if (set_sizes.at(g_id)==1) - { - stringstream bs; - bs< *qmpp) const { + // leaf + if (set_sizes.at(g_id) == 1) { + stringstream bs; + bs << Bip_bls.at(g_id) / observations; + int id = 0; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit( id_sets.at(g_id), i) ) { + if (id_sets.at(g_id)[i]) { + id = i; + break; + } } - scalar_type max_cp=0,sum_cp=0; - long int max_gp_id=-1; - long int max_gpp_id=-1; - for ( auto kt = Dip_counts.at(g_id).begin(); kt != Dip_counts.at(g_id).end(); kt++) - { - long int gp_id= (*kt).first.first; - long int gpp_id=(*kt).first.second; - scalar_type cp=p_dip(g_id,gp_id,gpp_id)*(*qmpp).at(gp_id)*(*qmpp).at(gpp_id); - sum_cp+=cp; - if (cp>max_cp) {max_cp=cp; max_gp_id=gp_id; max_gpp_id=gpp_id;} + return id_leaves.at(id) + ":" + bs.str(); + } + + scalar_type max_cp = 0, sum_cp = 0; + long int max_gp_id = -1; + long int max_gpp_id = -1; + for (auto kt = Dip_counts.at(g_id).begin(); kt != Dip_counts.at(g_id).end(); + kt++) { + long int gp_id = (*kt).first.first; + long int gpp_id = (*kt).first.second; + scalar_type cp = + p_dip(g_id, gp_id, gpp_id) * (*qmpp).at(gp_id) * (*qmpp).at(gpp_id); + sum_cp += cp; + if (cp > max_cp) { + max_cp = cp; + max_gp_id = gp_id; + max_gpp_id = gpp_id; } + } stringstream bs; - bs< gamma; - scalar_type sum=0; - for ( auto it=Bip_counts.begin();it!=Bip_counts.end();it++) - { - sum+=(*it).second; - } - scalar_type rnd=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type re_sum=0; + scalar_type sum = 0; + for (auto it = Bip_counts.begin(); it != Bip_counts.end(); it++) { + sum += (*it).second; + } + scalar_type rnd = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type re_sum = 0; long int g_id; - for (auto it=Bip_counts.begin();it!=Bip_counts.end();it++) - { - re_sum+=(*it).second; - g_id=(*it).first; - if (re_sum>sum*rnd) - break; - } + for (auto it = Bip_counts.begin(); it != Bip_counts.end(); it++) { + re_sum += (*it).second; + g_id = (*it).first; + if (re_sum > sum * rnd) + break; + } gamma = id_sets.at(g_id); /*set not_gamma; for (int i=1;i not_gamma = ~gamma; - not_gamma[0] = 0; - /* for (auto i=0; i< nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ - return "("+random_split(gamma)+":1,"+random_split(not_gamma)+":1);\n"; + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /* for (auto i=0; i< nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ + return "(" + random_split(gamma) + ":1," + random_split(not_gamma) + ":1);\n"; } - -string approx_posterior::random_split(boost::dynamic_bitset<> gamma) const -{ +string approx_posterior::random_split(boost::dynamic_bitset<> gamma) const { // if gamma contains only a leaf we return its name - vector gamma_v; + vector gamma_v; // std::set-s are ordered and SHOULD have random acces, but don't, hence this -// for (set::iterator sit=gamma.begin();sit!=gamma.end();sit++) gamma_v.push_back((*sit)); - for (auto i =0 ; i < Gamma_size + 1 ; ++i) { - if ( gamma[i] ) - gamma_v.push_back( i ); - } - int gamma_size = gamma_v.size(); - if ( gamma_size == 1 ) - return id_leaves.at( gamma_v[0]); - scalar_type p_sum=0; - //rnd for choosing directed partition - scalar_type rnd=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - long int gp_id,gpp_id,g_id; - scalar_type Bip_count,beta_switch=1; - g_id=set_ids.at(gamma); - boost::dynamic_bitset<> gammap; - boost::dynamic_bitset<> gammapp; - if (!g_id) - { - //never saw gamma in sample - beta_switch=0.; - Bip_count=0; - } + // for (set::iterator sit=gamma.begin();sit!=gamma.end();sit++) + // gamma_v.push_back((*sit)); + for (auto i = 0; i < Gamma_size + 1; ++i) { + if (gamma[i]) + gamma_v.push_back(i); + } + int gamma_size = gamma_v.size(); + if (gamma_size == 1) + return id_leaves.at(gamma_v[0]); + scalar_type p_sum = 0; + // rnd for choosing directed partition + scalar_type rnd = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + long int gp_id, gpp_id, g_id; + scalar_type Bip_count, beta_switch = 1; + g_id = set_ids.at(gamma); + boost::dynamic_bitset<> gammap; + boost::dynamic_bitset<> gammapp; + if (!g_id) { + // never saw gamma in sample + beta_switch = 0.; + Bip_count = 0; + } - for (int gp_size=1 ; gp_size <= (int)gamma_v.size()/2; gp_size++) - { - int saw=0; - //see if a directed partition is the one we choose - if (g_id) - for (auto dit=Dip_counts[g_id].begin();dit!=Dip_counts[g_id].end();dit++) - { - vector parts_v; - gp_id=(*dit).first.first; - gpp_id=(*dit).first.second; - int gp_id_size = 0; + for (int gp_size = 1; gp_size <= (int)gamma_v.size() / 2; gp_size++) { + int saw = 0; + // see if a directed partition is the one we choose + if (g_id) + for (auto dit = Dip_counts[g_id].begin(); dit != Dip_counts[g_id].end(); + dit++) { + vector parts_v; + gp_id = (*dit).first.first; + gpp_id = (*dit).first.second; + int gp_id_size = 0; int gpp_id_size = 0; - boost::dynamic_bitset<> gp_id_bitvec = id_sets.at(gp_id); - boost::dynamic_bitset<> gpp_id_bitvec = id_sets.at(gpp_id); - for (auto i =0 ; i < Gamma_size + 1 ; ++i) { - if ( gp_id_bitvec[i] ) - gp_id_size++; - if ( gpp_id_bitvec[i] ) - gpp_id_size++; - } - int this_size=min( gp_id_size, gpp_id_size); - if ( this_size==gp_size ) - { - p_sum+=p_dip(gamma,id_sets.at(gp_id),id_sets.at(gpp_id)); - //see if this directed partition is the one we choose - if (rnd (gamma_size); - gammapp = boost::dynamic_bitset<> (gamma_size); - /* - for (auto i =0 ; i < nbint ; ++i) { - gammap[i] = 0; - gammapp[i] = 0; - }*/ - - /* I propose a rewriting for that, using getSample instead of giveIntRandomNumberBetweenZeroAndEntry several times - for (int i=0;i gammapv ( gp_size, 0 ); - RandomTools::getSample ( gamma_v, gammapv ); - for (int i=0;i parts; - if (gpp_id>gp_id) - { - parts.first = gp_id; - parts.second = gpp_id; - } - else - { - parts.first = gpp_id; - parts.second = gp_id; - } - - - if (Dip_counts.at(g_id).at(parts)==0) stop=true; - } - break; - } + boost::dynamic_bitset<> gp_id_bitvec = id_sets.at(gp_id); + boost::dynamic_bitset<> gpp_id_bitvec = id_sets.at(gpp_id); + for (auto i = 0; i < Gamma_size + 1; ++i) { + if (gp_id_bitvec[i]) + gp_id_size++; + if (gpp_id_bitvec[i]) + gpp_id_size++; + } + int this_size = min(gp_id_size, gpp_id_size); + if (this_size == gp_size) { + p_sum += p_dip(gamma, id_sets.at(gp_id), id_sets.at(gpp_id)); + // see if this directed partition is the one we choose + if (rnd < p_sum) + p_sum = -1; + saw += 1; + } + if (p_sum < 0) { + gammap = id_sets.at(gp_id); + gammapp = id_sets.at(gpp_id); + break; + } + } + if (p_sum < 0) + break; + // sum the prob.s of all unobserved bipartitons + Bip_count = Bip_counts.at(g_id); + if (gamma_size == 1 or gamma_size == Gamma_size - 1) + Bip_count = observations; + int nbip = binomial(gamma_size, gp_size); + if (gamma_size - gp_size == gp_size) + nbip /= 2; + p_sum += + (0 + (alpha / N_Gamma * Tri(gp_size, gamma_size - gp_size)) + + beta_switch * beta / (pow(2., (int)gamma_size - 1) - 1)) / + (Bip_count + (alpha / N_Gamma * Bi(gamma_size)) + beta_switch * beta) * + (nbip - saw); + + // see if an unsampled directed partition is the one we choose + + if (rnd < p_sum) + p_sum = -1; + if (p_sum < 0) { + // pick one of these at random + bool stop = false; + while (!stop) { + // reset gammap and gammapp + // gammap.clear();gammapp.clear(); + gammap = boost::dynamic_bitset<>(gamma_size); + gammapp = boost::dynamic_bitset<>(gamma_size); + /* + for (auto i =0 ; i < nbint ; ++i) { + gammap[i] = 0; + gammapp[i] = 0; + }*/ + + /* I propose a rewriting for that, using getSample instead of + giveIntRandomNumberBetweenZeroAndEntry several times for (int + i=0;i gammapv(gp_size, 0); + RandomTools::getSample(gamma_v, gammapv); + for (int i = 0; i < gp_size; i++) { + gammap[gammapv[i]] = 1; + // BipartitionTools::bit1( gammap, gammapv[i]) ; + } + gammapp = ~gammap; + gammapp[0] = 0; + // BipartitionTools::bitNot( gammapp, gammap, nbint) ; + gp_id = set_ids.at(gammap); + gpp_id = set_ids.at(gammapp); + + pair parts; + if (gpp_id > gp_id) { + parts.first = gp_id; + parts.second = gpp_id; + } else { + parts.first = gpp_id; + parts.second = gp_id; + } + if (Dip_counts.at(g_id).at(parts) == 0) + stop = true; + } + break; } - return "("+random_split(gammap)+":1,"+random_split(gammapp)+":1)"; - + } + return "(" + random_split(gammap) + ":1," + random_split(gammapp) + ":1)"; } - -vector approx_posterior::all_trees( boost::dynamic_bitset<> gamma) const -{ - vector< string > all_trees_g; - set gamma_s ; //not very efficient to use sets again, but this function is rarely used - for (auto i =0 ; i < Gamma_size + 1 ; ++i) { - // if ( BipartitionTools::testBit( gamma, i) ) { - if ( gamma[i] ) { - gamma_s.insert( i ); - } - } - if (gamma_s.size()==1) - { - all_trees_g.push_back(id_leaves.at( *(gamma_s.begin()) )); +vector +approx_posterior::all_trees(boost::dynamic_bitset<> gamma) const { + vector all_trees_g; + set gamma_s; // not very efficient to use sets again, but this function + // is rarely used + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit( gamma, i) ) { + if (gamma[i]) { + gamma_s.insert(i); } - else - { - set< set > P_gamma=powerset< set >(gamma_s);//del-loc - for (set >::iterator st=P_gamma.begin();st!=P_gamma.end();st++) - { - if (gamma_s.size()>(*st).size() and (*st).size()>0 and (*st).count(*(gamma_s.begin()))==1) - { - - set not_st;//del-loc - //int* st_bitV = new int[nbint]; - boost::dynamic_bitset<> st_bitV ( Gamma_size + 1 ); - /*for (auto i =0 ; i < nbint ; ++i) { - st_bitV[i] = 0; - // BipartitionTools::bit0( st_bitV, i) ; - }*/ - for (auto it = (*st).begin() ; it != (*st).end() ; ++it) { - //BipartitionTools::bit1( st_bitV, (*it) ) ; - st_bitV[ (*it) ] = 1; - } - + } + if (gamma_s.size() == 1) { + all_trees_g.push_back(id_leaves.at(*(gamma_s.begin()))); + } else { + set> P_gamma = powerset>(gamma_s); // del-loc + for (set>::iterator st = P_gamma.begin(); st != P_gamma.end(); + st++) { + if (gamma_s.size() > (*st).size() and (*st).size() > 0 and + (*st).count(*(gamma_s.begin())) == 1) { + + set not_st; // del-loc + // int* st_bitV = new int[nbint]; + boost::dynamic_bitset<> st_bitV(Gamma_size + 1); + /*for (auto i =0 ; i < nbint ; ++i) { + st_bitV[i] = 0; + // BipartitionTools::bit0( st_bitV, i) ; + }*/ + for (auto it = (*st).begin(); it != (*st).end(); ++it) { + // BipartitionTools::bit1( st_bitV, (*it) ) ; + st_bitV[(*it)] = 1; + } - vector< string > all_trees_gp=all_trees( st_bitV );//del-loc + vector all_trees_gp = all_trees(st_bitV); // del-loc - for (set::iterator nst=gamma_s.begin();nst!=gamma_s.end();nst++) - if ((*st).count(*nst)==0) - not_st.insert(*nst); + for (set::iterator nst = gamma_s.begin(); nst != gamma_s.end(); + nst++) + if ((*st).count(*nst) == 0) + not_st.insert(*nst); - /* int* not_st_bitV = new int[nbint]; + /* int* not_st_bitV = new int[nbint]; - for (auto i =0 ; i < nbint ; ++i) { - not_st_bitV[i] = 0; - //BipartitionTools::bit0( not_st_bitV, i) ; - } - for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { - BipartitionTools::bit1( not_st_bitV, (*it) ) ; - } + for (auto i =0 ; i < nbint ; ++i) { + not_st_bitV[i] = 0; + //BipartitionTools::bit0( not_st_bitV, i) ; + } + for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { + BipartitionTools::bit1( not_st_bitV, (*it) ) ; + } */ - boost::dynamic_bitset<> not_st_bitV ( Gamma_size + 1 ); - for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { - not_st_bitV[(*it) ] = 1 ; - } - + boost::dynamic_bitset<> not_st_bitV(Gamma_size + 1); + for (auto it = not_st.begin(); it != not_st.end(); ++it) { + not_st_bitV[(*it)] = 1; + } - vector< string > all_trees_gpp=all_trees( not_st_bitV );//del-loc + vector all_trees_gpp = all_trees(not_st_bitV); // del-loc - for (vector::iterator lt=all_trees_gp.begin();lt!=all_trees_gp.end();lt++) - for (vector::iterator rt=all_trees_gpp.begin();rt!=all_trees_gpp.end();rt++) - { - all_trees_g.push_back("("+(*lt)+","+(*rt)+")"); - } - not_st.clear(); - all_trees_gp.clear(); - all_trees_gpp.clear(); - } - } + for (vector::iterator lt = all_trees_gp.begin(); + lt != all_trees_gp.end(); lt++) + for (vector::iterator rt = all_trees_gpp.begin(); + rt != all_trees_gpp.end(); rt++) { + all_trees_g.push_back("(" + (*lt) + "," + (*rt) + ")"); + } + not_st.clear(); + all_trees_gp.clear(); + all_trees_gpp.clear(); + } + } - P_gamma.clear(); + P_gamma.clear(); + } + // if (Gamma==gamma) + if (Gamma_size == (int)gamma_s.size()) + for (vector::iterator it = all_trees_g.begin(); + it != all_trees_g.end(); it++) { + (*it) += ";\n"; } -// if (Gamma==gamma) - if ( Gamma_size == (int) gamma_s.size() ) - for (vector::iterator it=all_trees_g.begin();it!=all_trees_g.end();it++) - { - (*it)+=";\n"; - } return all_trees_g; } - -scalar_type approx_posterior::count_all_trees(boost::dynamic_bitset<> gamma) const -{ - scalar_type count_trees_g=0; - set gamma_s ; //not very efficient to use sets again, but this function is rarely used - for ( auto i =0 ; i < Gamma_size + 1 ; ++i) { -// if ( BipartitionTools::testBit( gamma, i) ) { - if ( gamma[ i] ) { - gamma_s.insert( i ); - } - } - if (gamma_s.size()==1) - { - count_trees_g=1; +scalar_type +approx_posterior::count_all_trees(boost::dynamic_bitset<> gamma) const { + scalar_type count_trees_g = 0; + set gamma_s; // not very efficient to use sets again, but this function + // is rarely used + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit( gamma, i) ) { + if (gamma[i]) { + gamma_s.insert(i); } - else - { - set< set > P_gamma=powerset< set >(gamma_s);//del-loc - for (set >::iterator st=P_gamma.begin();st!=P_gamma.end();st++) - { - if (gamma_s.size()>(*st).size() and (*st).size()>0 and (*st).count(*(gamma_s.begin()))==1) - { - - set not_st;//del-loc - /* - int* st_bitV = new int[nbint]; - - for (auto i =0 ; i < nbint ; ++i) { - st_bitV[i] = 0; - //BipartitionTools::bit0( st_bitV, i) ; - } - for (auto it = (*st).begin() ; it != (*st).end() ; ++it) { - BipartitionTools::bit1( st_bitV, (*it) ) ; - } - */ - boost::dynamic_bitset<> st_bitV (Gamma_size + 1); - for (auto it = (*st).begin() ; it != (*st).end() ; ++it) { - st_bitV[ (*it) ] = 1 ; - } - - - scalar_type count_trees_gp=count_all_trees( st_bitV );//del-loc - - for (set::iterator nst=gamma_s.begin();nst!=gamma_s.end();nst++) - if ((*st).count(*nst)==0) - not_st.insert(*nst); - /* - int* not_st_bitV = new int[nbint]; - for (auto i =0 ; i < nbint ; ++i) { - not_st_bitV[i] = 0; - //BipartitionTools::bit0( not_st_bitV, i) ; - } - for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { - BipartitionTools::bit1( not_st_bitV, (*it) ) ; - } - */ - boost::dynamic_bitset<> not_st_bitV (Gamma_size + 1); - for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { - not_st_bitV[ (*it) ] = 1 ; - } + } + if (gamma_s.size() == 1) { + count_trees_g = 1; + } else { + set> P_gamma = powerset>(gamma_s); // del-loc + for (set>::iterator st = P_gamma.begin(); st != P_gamma.end(); + st++) { + if (gamma_s.size() > (*st).size() and (*st).size() > 0 and + (*st).count(*(gamma_s.begin())) == 1) { + + set not_st; // del-loc + /* + int* st_bitV = new int[nbint]; + + for (auto i =0 ; i < nbint ; ++i) { + st_bitV[i] = 0; + //BipartitionTools::bit0( st_bitV, i) ; + } + for (auto it = (*st).begin() ; it != (*st).end() ; ++it) { + BipartitionTools::bit1( st_bitV, (*it) ) ; + } + */ + boost::dynamic_bitset<> st_bitV(Gamma_size + 1); + for (auto it = (*st).begin(); it != (*st).end(); ++it) { + st_bitV[(*it)] = 1; + } - scalar_type count_trees_gpp=count_all_trees( not_st_bitV );//del-loc + scalar_type count_trees_gp = count_all_trees(st_bitV); // del-loc + + for (set::iterator nst = gamma_s.begin(); nst != gamma_s.end(); + nst++) + if ((*st).count(*nst) == 0) + not_st.insert(*nst); + /* + int* not_st_bitV = new int[nbint]; + for (auto i =0 ; i < nbint ; ++i) { + not_st_bitV[i] = 0; + //BipartitionTools::bit0( not_st_bitV, i) ; + } + for (auto it = not_st.begin() ; it != not_st.end() ; ++it) { + BipartitionTools::bit1( not_st_bitV, (*it) ) ; + } + */ + boost::dynamic_bitset<> not_st_bitV(Gamma_size + 1); + for (auto it = not_st.begin(); it != not_st.end(); ++it) { + not_st_bitV[(*it)] = 1; + } - count_trees_g+=count_trees_gp*count_trees_gpp; - not_st.clear(); - } - } + scalar_type count_trees_gpp = count_all_trees(not_st_bitV); // del-loc - P_gamma.clear(); + count_trees_g += count_trees_gp * count_trees_gpp; + not_st.clear(); + } } + + P_gamma.clear(); + } return count_trees_g; } +scalar_type approx_posterior::count_trees() const { + scalar_type count_trees_g = 0; -scalar_type approx_posterior::count_trees() const -{ - scalar_type count_trees_g=0; - - map g_id_count;//del-loc + map g_id_count; // del-loc cout << "." << endl; - for ( auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) - for ( auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - long int g_id=(*jt); - // leaves - if ((*it).first==1) - g_id_count[g_id]=1; - else - { - g_id_count[g_id]=0; - for ( auto kt = Dip_counts.at(g_id).begin(); kt != Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id= parts.first; - long int gpp_id= parts.second; - - g_id_count[g_id]+=g_id_count[gp_id]*g_id_count[gpp_id]; - } - } + for (auto it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) + for (auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) { + long int g_id = (*jt); + // leaves + if ((*it).first == 1) + g_id_count[g_id] = 1; + else { + g_id_count[g_id] = 0; + for (auto kt = Dip_counts.at(g_id).begin(); + kt != Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + + g_id_count[g_id] += g_id_count[gp_id] * g_id_count[gpp_id]; + } } + } cout << ".." << endl; - for ( auto it = Bip_counts.begin(); it != Bip_counts.end(); it++) - { - long int g_id=(*it).first; - /* set gamma=id_sets[g_id]; - set not_gamma; - for (set::iterator st=Gamma.begin();st!=Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - boost::dynamic_bitset<> gamma=id_sets.at(g_id); - boost::dynamic_bitset<> not_gamma = ~gamma; //new int[nbint]; - not_gamma[0] = 0; - /* for (auto i =0 ; i < nbint ; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ - size_t gamma_size = 0; - size_t not_gamma_size = 0; - for (auto i = 0; i < Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit(gamma, i) ) { - if ( gamma[ i] ) { - gamma_size++; - } - // if ( BipartitionTools::testBit(not_gamma, i) ) { - if ( not_gamma[ i] ) { - not_gamma_size++; - } - } - if ( gamma_size > not_gamma_size ) - count_trees_g+=g_id_count[set_ids.at(gamma)]*g_id_count[set_ids.at(not_gamma)];//count_trees(set_ids[gamma])*count_trees(set_ids[not_gamma]); - else if ( gamma_size == not_gamma_size ) - count_trees_g+=g_id_count[set_ids.at(gamma)]*g_id_count[set_ids.at(not_gamma)]/2.0;//count_trees(set_ids[gamma])*count_trees(set_ids[not_gamma])/2.0; - //cout << count_trees(gamma) << " " << set2name(gamma) << " " << count_trees(not_gamma) << " " << set2name(not_gamma) < gamma=id_sets[g_id]; + set not_gamma; + for (set::iterator st=Gamma.begin();st!=Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + boost::dynamic_bitset<> gamma = id_sets.at(g_id); + boost::dynamic_bitset<> not_gamma = ~gamma; // new int[nbint]; + not_gamma[0] = 0; + /* for (auto i =0 ; i < nbint ; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, nbint);*/ + size_t gamma_size = 0; + size_t not_gamma_size = 0; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit(gamma, i) ) { + if (gamma[i]) { + gamma_size++; + } + // if ( BipartitionTools::testBit(not_gamma, i) ) { + if (not_gamma[i]) { + not_gamma_size++; + } } + if (gamma_size > not_gamma_size) + count_trees_g += + g_id_count[set_ids.at(gamma)] * + g_id_count[set_ids.at( + not_gamma)]; // count_trees(set_ids[gamma])*count_trees(set_ids[not_gamma]); + else if (gamma_size == not_gamma_size) + count_trees_g += + g_id_count[set_ids.at(gamma)] * g_id_count[set_ids.at(not_gamma)] / + 2.0; // count_trees(set_ids[gamma])*count_trees(set_ids[not_gamma])/2.0; + // cout << count_trees(gamma) << " " << set2name(gamma) << " " << + // count_trees(not_gamma) << " " << set2name(not_gamma) <,long int> set_ids;//del-loc - //std::map< long int, std::set > id_sets;//del-loc - //long int g_id=set_ids[gamma]; - boost::dynamic_bitset<> gamma=id_sets.at(g_id); - int gamma_size=0; - for (auto i = 0; i < Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit(gamma, i) ) { - if ( gamma[ i] ) { - gamma_size++; - } +scalar_type approx_posterior::count_trees(long int g_id) const { + scalar_type count_trees_g = 0; + // std::map ,long int> set_ids;//del-loc + // std::map< long int, std::set > id_sets;//del-loc + // long int g_id=set_ids[gamma]; + boost::dynamic_bitset<> gamma = id_sets.at(g_id); + int gamma_size = 0; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit(gamma, i) ) { + if (gamma[i]) { + gamma_size++; } - //gamma.size(); - if (gamma_size==1) - { - count_trees_g=1; + } + // gamma.size(); + if (gamma_size == 1) { + count_trees_g = 1; + } else { + set P_gamma; //=powerset< set >(gamma);//del-loc + for (auto kt = Dip_counts[g_id].begin(); kt != Dip_counts[g_id].end(); + kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + P_gamma.insert(gp_id); + P_gamma.insert(gpp_id); } - else - { - set< long int > P_gamma;//=powerset< set >(gamma);//del-loc - for ( auto kt = Dip_counts[g_id].begin(); kt != Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id = parts.first; - long int gpp_id = parts.second; - P_gamma.insert(gp_id); - P_gamma.insert(gpp_id); - } - for ( auto st=P_gamma.begin();st!=P_gamma.end();st++) - { - boost::dynamic_bitset<> gammap=id_sets.at( (*st) ); - int gammap_size=0; - bool sameFirstElement = false; - for (auto i = 0; i < Gamma_size +1; ++i) { - // if ( BipartitionTools::testBit(gammap, i) ) { - if ( gammap[ i] ) { - gammap_size++; - } + for (auto st = P_gamma.begin(); st != P_gamma.end(); st++) { + boost::dynamic_bitset<> gammap = id_sets.at((*st)); + int gammap_size = 0; + bool sameFirstElement = false; + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit(gammap, i) ) { + if (gammap[i]) { + gammap_size++; } + } - for (auto i = 0; i < Gamma_size+1; ++i) { -// if (BipartitionTools::testBit(gamma, i) ) { - if (gamma[ i] ) { + for (auto i = 0; i < Gamma_size + 1; ++i) { + // if (BipartitionTools::testBit(gamma, i) ) { + if (gamma[i]) { -// if ( BipartitionTools::testBit(gammap, i) ) { - if ( gammap[ i] ) { + // if ( BipartitionTools::testBit(gammap, i) ) { + if (gammap[i]) { - sameFirstElement = true; - } - break; - } + sameFirstElement = true; + } + break; } - if ( gamma_size > gammap_size and gammap_size > 0 and sameFirstElement ) - { - /* int* not_gammap = new int[nbint];//del-loc - for (auto i = 0; i < nbint; ++i) { - not_gammap[i] = 0; - } - BipartitionTools::bitNot(not_gammap, gammap, nbint); - */ - boost::dynamic_bitset<> not_gammap = ~ gammap; - not_gammap[0] = 0; - scalar_type count_trees_gp=count_trees((*st));//del-loc - /* for (set::iterator nst=gamma.begin();nst!=gamma.end();nst++) - if (gammap.count(*nst)==0) - not_gammap.insert(*nst); */ - scalar_type count_trees_gpp=count_trees(set_ids.at(not_gammap));//del-loc - // not_gammap.clear(); - - count_trees_g+=count_trees_gp*count_trees_gpp; - } - // gammap.clear(); - } - //gamma.clear(); - P_gamma.clear(); + } + if (gamma_size > gammap_size and gammap_size > 0 and sameFirstElement) { + /* int* not_gammap = new int[nbint];//del-loc + for (auto i = 0; i < nbint; ++i) { + not_gammap[i] = 0; + } + BipartitionTools::bitNot(not_gammap, gammap, nbint); + */ + boost::dynamic_bitset<> not_gammap = ~gammap; + not_gammap[0] = 0; + scalar_type count_trees_gp = count_trees((*st)); // del-loc + /* for (set::iterator nst=gamma.begin();nst!=gamma.end();nst++) + if (gammap.count(*nst)==0) + not_gammap.insert(*nst); */ + scalar_type count_trees_gpp = + count_trees(set_ids.at(not_gammap)); // del-loc + // not_gammap.clear(); + + count_trees_g += count_trees_gp * count_trees_gpp; + } + // gammap.clear(); } + // gamma.clear(); + P_gamma.clear(); + } return count_trees_g; } - // of an unrooted tree given by its Newick string (which can be rooted) -scalar_type approx_posterior::nbipp(string tree_string) const -{ - scalar_type n=0; - scalar_type c=0; - - map < boost::dynamic_bitset<>,scalar_type> rec_map=recompose( tree_string); - for ( auto it=rec_map.begin();it!=rec_map.end();it++) - { - boost::dynamic_bitset<> gamma=(*it).first; - if (Bip_counts.at(set_ids.at(gamma) ) ) n+=1; - c+=1; - } - return n/c; +scalar_type approx_posterior::nbipp(string tree_string) const { + scalar_type n = 0; + scalar_type c = 0; + + map, scalar_type> rec_map = recompose(tree_string); + for (auto it = rec_map.begin(); it != rec_map.end(); it++) { + boost::dynamic_bitset<> gamma = (*it).first; + if (Bip_counts.at(set_ids.at(gamma))) + n += 1; + c += 1; + } + return n / c; } -//Set the value for the alpha parameter -void approx_posterior::setAlpha ( scalar_type a ) { +// Set the value for the alpha parameter +void approx_posterior::setAlpha(scalar_type a) { alpha = a; return; } -//Set the value for the beta parameter -void approx_posterior::setBeta ( scalar_type b ) { +// Set the value for the beta parameter +void approx_posterior::setBeta(scalar_type b) { beta = b; return; } - -std::vector < std::string > approx_posterior::getLeafNames() const -{ - std::vector leafNames (leaf_ids.size(), ""); - for ( auto it = leaf_ids.begin() ; it != leaf_ids.end() ; ++it ) - { - leafNames.push_back ( (*it).first ) ; - } +std::vector approx_posterior::getLeafNames() const { + std::vector leafNames(leaf_ids.size(), ""); + for (auto it = leaf_ids.begin(); it != leaf_ids.end(); ++it) { + leafNames.push_back((*it).first); + } return leafNames; } - -void approx_posterior::computeOrderedVectorOfClades (vector & ids, vector & id_sizes) -{ - //I sort the directed partitions by size (number of gene tree leaves) to ensure that we calculate things in the proper order (smaller to larger) - for (map > :: iterator it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) - { - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - ids.push_back((*jt)); - id_sizes.push_back((*it).first); - } +void approx_posterior::computeOrderedVectorOfClades( + vector &ids, vector &id_sizes) { + // I sort the directed partitions by size (number of gene tree leaves) to + // ensure that we calculate things in the proper order (smaller to larger) + for (map>::iterator it = size_ordered_bips.begin(); + it != size_ordered_bips.end(); it++) { + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + ids.push_back((*jt)); + id_sizes.push_back((*it).first); } - //root bipartition needs to be handled separately (and last, given it's the largest) + } + // root bipartition needs to be handled separately (and last, given it's the + // largest) ids.push_back(-1); id_sizes.push_back(Gamma_size); return; - - } diff --git a/src/ALE.h b/src/ALE.h index 6a5ed95..7086d0c 100644 --- a/src/ALE.h +++ b/src/ALE.h @@ -1,37 +1,35 @@ -//all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; +// all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; #pragma once #define ALE_VERSION "1.0" -#include -#include -#include #include -#include -#include +#include +#include #include #include +#include +#include #include -#include -#include -#include -#include -#include -#include #include #include - +#include +#include +#include +#include +#include +#include +#include #include "pairHasher.h" -//#include +// #include typedef bpp::TreeTemplate tree_type; -//typedef long double scalar_type; -typedef long double scalar_type; -typedef std::pair dedge_type; -//typedef std::pair > dip_type; - +// typedef long double scalar_type; +typedef long double scalar_type; +typedef std::pair dedge_type; +// typedef std::pair > dip_type; /**************************************************************************** // approx_posterior class. @@ -41,134 +39,240 @@ typedef std::pair dedge_type; // leaf set = bipartition = clade *****************************************************************************/ -class approx_posterior -{ - - public: - //must load - scalar_type observations; //Number of trees observed to build the approx_posterior object. +class approx_posterior { +public: + // must load + scalar_type observations; // Number of trees observed to build the + // approx_posterior object. - //no need to load - std::string constructor_string; //string representing the tree in Newick format + // no need to load + std::string + constructor_string; // string representing the tree in Newick format scalar_type alpha; scalar_type beta; - std::map tree_counts; - - ~approx_posterior() - { - leaf_ids.clear(); - id_leaves.clear(); - set_ids.clear(); - id_sets.clear(); - Bip_counts.clear(); - for (std::vector < std::unordered_map< std::pair ,scalar_type> >::iterator it=Dip_counts.begin();it!=Dip_counts.end();it++) - (*it).clear(); - Dip_counts.clear(); - Gamma_s.clear(); - tree_bipstrings.clear(); - bipstring_trees.clear(); - set_sizes.clear(); - for (std::map > :: iterator it = size_ordered_bips.begin(); it != size_ordered_bips.end(); it++) - (*it).second.clear(); - size_ordered_bips.clear(); - Bip_bls.clear(); - } - - approx_posterior(); //Does nothing. Formal constructor must be followed by load_state. - approx_posterior(std::string tree); //Constructs a basic instance by calling construct. - void construct(std::string tree_string); //Constructs a basic instance. - - void save_state(std::string fname) ; //Writes the object to a file. + std::map tree_counts; + + ~approx_posterior() { + leaf_ids.clear(); + id_leaves.clear(); + set_ids.clear(); + id_sets.clear(); + Bip_counts.clear(); + for (std::vector, + scalar_type>>::iterator it = + Dip_counts.begin(); + it != Dip_counts.end(); it++) + (*it).clear(); + Dip_counts.clear(); + Gamma_s.clear(); + tree_bipstrings.clear(); + bipstring_trees.clear(); + set_sizes.clear(); + for (std::map>::iterator it = + size_ordered_bips.begin(); + it != size_ordered_bips.end(); it++) + (*it).second.clear(); + size_ordered_bips.clear(); + Bip_bls.clear(); + } + + approx_posterior(); // Does nothing. Formal constructor must be followed by + // load_state. + approx_posterior( + std::string tree); // Constructs a basic instance by calling construct. + void construct(std::string tree_string); // Constructs a basic instance. + + void save_state(std::string fname); // Writes the object to a file. void load_state(std::string fname); - void observation(std::vector trees,bool count_topologies=false, scalar_type weight=1.0); //Given a vector of trees, fills an approx_posterior object by recursively calling decompose, and then doing some more counts. - scalar_type p(std::string tree) const; //Computes the probability of a string tree. Calls recompose on the tree, and then uses the map returned by recompose to compute the probability of the whole tree. - scalar_type nbipp(std::string tree) const; //Computes the proportion of bipartitions already in the approx_posterior object that are present in the tree - scalar_type binomial(int n,int m) const; //Computes the binomial coefficient. - scalar_type trinomial(int n1,int n2,int n3) const; //Computes the multinomial coefficient for 3 elements. - - std::pair mpp_tree() const; //Returns the maximum a posteriori tree that can be amalgamated from the approx_prior object. Uses a double-recursive traversal of all bipartitions. - std::string mpp_backtrack(long int g_id, std::map * qmpp) const;//Recursive function that, given a bipartition id and a map associating bipartition ids to their maximum a posteriori value, builds the maximum a posteriori tree, complete with (average) branch lengths. - std::string random_tree() const; //Function that returns a random tree with unit branch lengths. Calls random_split. - std::vector all_trees() const {return all_trees(Gamma);}; //del-loc. Builds all rooted trees that can be built based on the complete set of leaves. - - - //no need to load - std::set Gamma_s; //del-loc. Set containing all leaf ids. ~Clade of all leaves in the tree. - - boost::dynamic_bitset<> Gamma; //bit vector with all bits to 1 (all species present) - int Gamma_size; //Number of leaves. - // size_t nbint; //Number of ints in a bitvector used to store a partition - // int filterForXor; //Integer used for xor operations to leave the unused bits of the last integer of the bitvector to 0 - scalar_type K_Gamma; //number of bipartitions of Gamma. - long double N_Gamma; //number of unrooted trees on Gamma_size leaves - std::string name_separator; //Character used between leaf names when writing the content of a leaf set. - std::map tree_bipstrings; //del-loc. Map between tree string and string containing all bipartitions in the tree. - std::map bipstring_trees; //del-loc. Dual from above. Map between string containing all bipartitions in the tree and tree string. - std::map set_sizes; //del-loc. Map between a bipartition id and the sizes of the corresponding leaf set. - std::map > size_ordered_bips; //del-loc. Map between bipartition size, and the ids of all bipartitions of this size. - - //must load - long int last_leafset_id; //Total number of sets of leaves (=bipartitions) observed in the posterior. - std::map leaf_ids; //del-loc. Map between species name and leaf id. Leaf ids go from 1 to Gamma_size. - std::map id_leaves; //del-loc. Map between leaf id and species name. Dual from above. - std::map Bip_counts; //del-loc. For each bipartition, gives the number of times it was observed. - std::map Bip_bls; //del-loc. Sum of the branch lengths associated to the bipartitions. - - //VECTORIZED BELOW std::map ,scalar_type> > Dip_counts; //del-loc. Contains the frequency of triplets: mother clade and its two daughter clades. Map between the bipartition id of the mother clade and another map containing a set of bipartition ids (couldn't it be just a pair, in the case of bifurcating trees?) and the frequency of the associated triplet of bipartitions. - - std::vector < std::unordered_map< std::pair,scalar_type> > Dip_counts; - // std::map ,long int> set_ids; //del-loc. Map between a set of leaf ids and the corresponding bipartition index. -// std::map< long int, std::set > id_sets; //del-loc. Dual from above. Map between a bipartition index and the corresponding leaf ids. - - std::map < boost::dynamic_bitset<>,long int> set_ids; //del-loc. Map between a bit vector of leaf ids and the corresponding bipartition index. - std::map< long int, boost::dynamic_bitset<> > id_sets; //del-loc. Dual from above. Map between a bipartition index and the corresponding leaf id bit vector. - - - //nuisance vars - boost::timer * t; - - //algorithmic - void decompose(std::string G_string,std::set * bip_ids=NULL , scalar_type weight=1.0); //Parses a tree in string format and updates the approx_prior object accordingly (notably updates the Bip_bls, Bip_counts, Dip_counts, and set_ids + id_sets through set2id) - std::map < boost::dynamic_bitset<> ,scalar_type> recompose(std::string G_string) const; //For a given input tree string, returns a map between all sets of leaves contained in the tree and their corresponding conditional clade probability. + void observation(std::vector trees, + bool count_topologies = false, + scalar_type weight = + 1.0); // Given a vector of trees, fills an + // approx_posterior object by recursively calling + // decompose, and then doing some more counts. + scalar_type p(std::string tree) + const; // Computes the probability of a string tree. Calls recompose on + // the tree, and then uses the map returned by recompose to compute + // the probability of the whole tree. + scalar_type nbipp(std::string tree) + const; // Computes the proportion of bipartitions already in the + // approx_posterior object that are present in the tree + scalar_type binomial(int n, int m) const; // Computes the binomial + // coefficient. + scalar_type trinomial(int n1, int n2, int n3) + const; // Computes the multinomial coefficient for 3 elements. + + std::pair + mpp_tree() const; // Returns the maximum a posteriori tree that can be + // amalgamated from the approx_prior object. Uses a + // double-recursive traversal of all bipartitions. + std::string mpp_backtrack(long int g_id, + std::map *qmpp) + const; // Recursive function that, given a bipartition id and a map + // associating bipartition ids to their maximum a posteriori value, + // builds the maximum a posteriori tree, complete with (average) + // branch lengths. + std::string random_tree() const; // Function that returns a random tree with + // unit branch lengths. Calls random_split. + std::vector all_trees() const { + return all_trees(Gamma); + }; // del-loc. Builds all rooted trees that can be built based on the complete + // set of leaves. + + // no need to load + std::set Gamma_s; // del-loc. Set containing all leaf ids. ~Clade of all + // leaves in the tree. + + boost::dynamic_bitset<> + Gamma; // bit vector with all bits to 1 (all species present) + int Gamma_size; // Number of leaves. + // size_t nbint; //Number of ints in a bitvector used to store a partition int + // filterForXor; //Integer used for xor operations to leave the unused bits of + // the last integer of the bitvector to 0 + scalar_type K_Gamma; // number of bipartitions of Gamma. + long double N_Gamma; // number of unrooted trees on Gamma_size leaves + std::string name_separator; // Character used between leaf names when writing + // the content of a leaf set. + std::map + tree_bipstrings; // del-loc. Map between tree string and string containing + // all bipartitions in the tree. + std::map + bipstring_trees; // del-loc. Dual from above. Map between string + // containing all bipartitions in the tree and tree + // string. + std::map + set_sizes; // del-loc. Map between a bipartition id and the sizes of the + // corresponding leaf set. + std::map> + size_ordered_bips; // del-loc. Map between bipartition size, and the ids + // of all bipartitions of this size. + + // must load + long int last_leafset_id; // Total number of sets of leaves (=bipartitions) + // observed in the posterior. + std::map + leaf_ids; // del-loc. Map between species name and leaf id. Leaf ids go + // from 1 to Gamma_size. + std::map id_leaves; // del-loc. Map between leaf id and + // species name. Dual from above. + std::map + Bip_counts; // del-loc. For each bipartition, gives the number of times it + // was observed. + std::map Bip_bls; // del-loc. Sum of the branch lengths + // associated to the bipartitions. + + // VECTORIZED BELOW std::map ,scalar_type> > Dip_counts; //del-loc. Contains the frequency of + // triplets: mother clade and its two daughter clades. Map between the + // bipartition id of the mother clade and another map containing a set of + // bipartition ids (couldn't it be just a pair, in the case of bifurcating + // trees?) and the frequency of the associated triplet of bipartitions. + + std::vector, scalar_type>> + Dip_counts; + // std::map ,long int> set_ids; //del-loc. Map between a set + // of leaf ids and the corresponding bipartition index. + // std::map< long int, std::set > id_sets; //del-loc. Dual from + // above. Map between a bipartition index and the corresponding leaf ids. + + std::map, long int> + set_ids; // del-loc. Map between a bit vector of leaf ids and the + // corresponding bipartition index. + std::map> + id_sets; // del-loc. Dual from above. Map between a bipartition index and + // the corresponding leaf id bit vector. + + // nuisance vars + boost::timer *t; + + // algorithmic + void decompose( + std::string G_string, std::set *bip_ids = NULL, + scalar_type weight = + 1.0); // Parses a tree in string format and updates the approx_prior + // object accordingly (notably updates the Bip_bls, Bip_counts, + // Dip_counts, and set_ids + id_sets through set2id) + std::map, scalar_type> recompose(std::string G_string) + const; // For a given input tree string, returns a map between all sets of + // leaves contained in the tree and their corresponding conditional + // clade probability. void register_leafset(std::string); - long int set2id( boost::dynamic_bitset<> leaf_set) ; //If the set exists, returns the set id, otherwise creates a new set id for this set and returns it. - - //numeric - scalar_type Bi(int n2) const; //Returns the total number of binary tree topologies possible given a fixed bipartition between n2 leaves on one side and Gamma_size-n2 leaves on the other side. - scalar_type Tri(int n2,int n3) const; //Returns the total number of binary tree topologies possible given a fixed trifurcation between n2 leaves in one clade, n3 in another, and Gamma_size-n2-n3 leaves in the last one. - - scalar_type p_dip(long int g_id,long int gp_id,long int gpp_id) const; //Probability of a trifurcation given by the ids of the clades. - //scalar_type p_dip(std::set gamma,std::set gammap,std::set gammapp); //Probability of a trifurcation given by the leaf sets of the clades. - scalar_type p_dip(boost::dynamic_bitset<> gamma, boost::dynamic_bitset<> gammap, boost::dynamic_bitset<> gammapp) const; //Probability of a trifurcation given by the leaf sets of the clades. - - scalar_type p_bip(long int g_id) const; //Probability of a bipartition given by its id. Uses the correction term alpha. -// scalar_type p_bip(std::set gamma); //Probability of a bipartition given by its leaf set. - scalar_type p_bip(boost::dynamic_bitset<> gamma) const; //Probability of a bipartition given by its leaf set. - - - //nuisance - std::string set2name( boost::dynamic_bitset<> leaf_set) const; //Prints the leaf names of leaves contained in a leaf set - std::string random_split( boost::dynamic_bitset<> gamma) const; //Recursive function that returns a random subtree given a leaf set as input and given the approx_posterior object. Can return clades never observed in the posterior sample. - std::vector all_trees( boost::dynamic_bitset<> gamma) const; //del-loc. Builds all rooted trees that can be built with leaf set gamma. - scalar_type count_trees() const; //Counts trees that can be amalgamated with the approx_posterior object with the complete leaf set, without actually building these trees. - scalar_type count_trees(long int g_id) const; //Counts trees that can be amalgamated with the leaf set with id g_id, without actually building these trees. - - scalar_type count_all_trees( boost::dynamic_bitset<> gamma) const; //Counts all trees that can be built with the leaf set gamma, without actually building these trees. - void setAlpha ( scalar_type a ) ; //Set the value for the alpha parameter used for normalizing counts - void setBeta ( scalar_type b ); //Set the value for the beta parameter used for normalizing counts - std::vector < std::string > getLeafNames() const; //get a vector containing all leaf names in the ale. - void computeOrderedVectorOfClades (std::vector & ids, std::vector & id_sizes); //fills the ids and id_sizes maps, ids of clades ordered by their size. + long int + set2id(boost::dynamic_bitset<> + leaf_set); // If the set exists, returns the set id, otherwise + // creates a new set id for this set and returns it. + + // numeric + scalar_type + Bi(int n2) const; // Returns the total number of binary tree topologies + // possible given a fixed bipartition between n2 leaves on + // one side and Gamma_size-n2 leaves on the other side. + scalar_type Tri(int n2, int n3) + const; // Returns the total number of binary tree topologies possible + // given a fixed trifurcation between n2 leaves in one clade, n3 in + // another, and Gamma_size-n2-n3 leaves in the last one. + + scalar_type p_dip(long int g_id, long int gp_id, long int gpp_id) + const; // Probability of a trifurcation given by the ids of the clades. + // scalar_type p_dip(std::set gamma,std::set gammap,std::set + // gammapp); //Probability of a trifurcation given by the leaf sets of the + // clades. + scalar_type p_dip(boost::dynamic_bitset<> gamma, + boost::dynamic_bitset<> gammap, + boost::dynamic_bitset<> gammapp) + const; // Probability of a trifurcation given by the leaf sets of the + // clades. + + scalar_type + p_bip(long int g_id) const; // Probability of a bipartition given by its id. + // Uses the correction term alpha. + // scalar_type p_bip(std::set gamma); //Probability of a bipartition + // given by its leaf set. + scalar_type p_bip(boost::dynamic_bitset<> gamma) + const; // Probability of a bipartition given by its leaf set. + + // nuisance + std::string set2name(boost::dynamic_bitset<> leaf_set) + const; // Prints the leaf names of leaves contained in a leaf set + std::string random_split(boost::dynamic_bitset<> gamma) + const; // Recursive function that returns a random subtree given a leaf + // set as input and given the approx_posterior object. Can return + // clades never observed in the posterior sample. + std::vector all_trees(boost::dynamic_bitset<> gamma) + const; // del-loc. Builds all rooted trees that can be built with leaf set + // gamma. + scalar_type + count_trees() const; // Counts trees that can be amalgamated with the + // approx_posterior object with the complete leaf set, + // without actually building these trees. + scalar_type count_trees(long int g_id) + const; // Counts trees that can be amalgamated with the leaf set with id + // g_id, without actually building these trees. + + scalar_type count_all_trees(boost::dynamic_bitset<> gamma) + const; // Counts all trees that can be built with the leaf set gamma, + // without actually building these trees. + void setAlpha(scalar_type a); // Set the value for the alpha parameter used + // for normalizing counts + void setBeta(scalar_type b); // Set the value for the beta parameter used for + // normalizing counts + std::vector + getLeafNames() const; // get a vector containing all leaf names in the ale. + void computeOrderedVectorOfClades( + std::vector &ids, + std::vector &id_sizes); // fills the ids and id_sizes maps, ids + // of clades ordered by their size. private: ; }; - - -//from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed 12/13/11) -//"Given a set S, the power set (or powerset) of S, written P(S), or 2S, is the set of all subsets of S." -template std::set powerset(const Set& s, size_t n); -//from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed 12/13/11) -template std::set powerset(const Set& s); +// from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed +// 12/13/11) "Given a set S, the power set (or powerset) of S, written P(S), or +//2S, is the set of all subsets of S." +template std::set powerset(const Set &s, size_t n); +// from: http://rosettacode.org/wiki/Power_set#Recursive_version (accessed +// 12/13/11) +template std::set powerset(const Set &s); diff --git a/src/ALE_tutorial.cpp b/src/ALE_tutorial.cpp index 96fa6bc..ec21c57 100644 --- a/src/ALE_tutorial.cpp +++ b/src/ALE_tutorial.cpp @@ -2,102 +2,124 @@ #include "ALE_util.h" using namespace std; -scalar_type count_trees(approx_posterior * ale) -{ - //For clade g, a split is defined by the the complementary subclades gp and gpp - //..similar to the ALE manuscripts (gamma',gamma''|gamma) notation.. - //..or put differently C[1]==g,C[2]==gp and C[3]==gp .. +scalar_type count_trees(approx_posterior *ale) { + // For clade g, a split is defined by the the complementary subclades gp and + // gpp + //..similar to the ALE manuscripts (gamma',gamma''|gamma) notation.. + //..or put differently C[1]==g,C[2]==gp and C[3]==gp .. - // The loop below goes over all clades, i.e. the same as line 3 in algo 2 ( ..in increasing size order do:) - // g_id_count stores the number of trees (subtrees of G) that can be amalgamated for the clade g. - // In the approx_posterior object each clade has an id, a long int, that I call, e.g. g_id - map g_id_count;//del-loc - // ale->size_ordered_bips is a map: int "size of clade" -> vector "vector of clade ids" - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - long int g_id=(*jt); + // The loop below goes over all clades, i.e. the same as line 3 in algo 2 ( + // ..in increasing size order do:) g_id_count stores the number of trees + // (subtrees of G) that can be amalgamated for the clade g. In the + // approx_posterior object each clade has an id, a long int, that I call, e.g. + // g_id + map g_id_count; // del-loc + // ale->size_ordered_bips is a map: int "size of clade" -> vector + // "vector of clade ids" + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + long int g_id = (*jt); - // leaves have size one, and only one tree can be amalgamated - if ((*it).first==1) - g_id_count[g_id]=1; - else - { - g_id_count[g_id]=0; - // for non-leaves we have the recursion - // n(g) = sum "over splits of g" n(gp)*n(gpp) - // the ale->Dip_counts object is a bit confusing, but it was the most efficient way I could find to store splits - // ale->Dip_counts is a map: long int "clade id" -> map < set,scalar_type> "a map object recording a split and the number of time we saw it". - // the map < set,scalar_type> "a map object recording a split and the number of time we saw it" - // consist of a set which has always two parts, gp and gpp (i.e. C[2] and C[3]) and the scalar_type is a double recording the number of times we saw this split - // in short the loop below implements << sum "over splits of g" >> - for (map< set,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - // n(g) += n(gp)*n(gpp) - g_id_count[g_id]+=g_id_count[gp_id]*g_id_count[gpp_id]; - } - } + // leaves have size one, and only one tree can be amalgamated + if ((*it).first == 1) + g_id_count[g_id] = 1; + else { + g_id_count[g_id] = 0; + // for non-leaves we have the recursion + // n(g) = sum "over splits of g" n(gp)*n(gpp) + // the ale->Dip_counts object is a bit confusing, but it was the most + // efficient way I could find to store splits ale->Dip_counts is a map: + // long int "clade id" -> map < set,scalar_type> "a map object + // recording a split and the number of time we saw it". the map < + // set,scalar_type> "a map object recording a split and the + // number of time we saw it" consist of a set which has always two + // parts, gp and gpp (i.e. C[2] and C[3]) and the scalar_type is a + // double recording the number of times we saw this split in short the + // loop below implements << sum "over splits of g" >> + for (map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + // n(g) += n(gp)*n(gpp) + g_id_count[g_id] += g_id_count[gp_id] * g_id_count[gpp_id]; + } } + } - scalar_type count_trees_g=0; + scalar_type count_trees_g = 0; // I handle separately splits that correspond to roots of G - // I record the number of times I saw a clade such that its father was the root.. - // the loop below implements << sum "over splits of g" >> for splits corresponding to roots of G - // ale->Bip_counts map: long int "clade id" -> scalar_type "number of times we saw this split" - // here this is like C[2] -> number of times C[2] was present such that C[1] = all genes.. - // with the condition that C[2] is the smaller subclade of L(G), else if |C[2]|=|C[3]| we record both C[2] and C[3] taking care to consider this below (messy sorry) - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int g_id=(*it).first; - // ale->id_sets is a map: long int "clade id" -> set "set of gene ids" - set gamma=ale->id_sets[g_id]; - // since this is a root L(G)/C[2]=C[3] - set not_gamma; - for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st); - - if ( gamma.size()>not_gamma.size()) - // n(g) += n(gp)*n(gpp) - count_trees_g+=g_id_count[ale->set_ids[gamma]]*g_id_count[ale->set_ids[not_gamma]]; - else if (gamma.size()==not_gamma.size()) - // for the case where |C[2]|=|C[3]|, we divide by two to compensate for over-counting - count_trees_g+=g_id_count[ale->set_ids[gamma]]*g_id_count[ale->set_ids[not_gamma]]/2.0; - } + // I record the number of times I saw a clade such that its father was the + // root.. the loop below implements << sum "over splits of g" >> for splits + // corresponding to roots of G ale->Bip_counts map: long int "clade id" -> + // scalar_type "number of times we saw this split" here this is like C[2] -> + // number of times C[2] was present such that C[1] = all genes.. with the + // condition that C[2] is the smaller subclade of L(G), else if |C[2]|=|C[3]| + // we record both C[2] and C[3] taking care to consider this below (messy + // sorry) + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int g_id = (*it).first; + // ale->id_sets is a map: long int "clade id" -> set "set of gene ids" + set gamma = ale->id_sets[g_id]; + // since this is a root L(G)/C[2]=C[3] + set not_gamma; + for (set::iterator st = ale->Gamma.begin(); st != ale->Gamma.end(); + st++) + if (gamma.count(*st) == 0) + not_gamma.insert(*st); + + if (gamma.size() > not_gamma.size()) + // n(g) += n(gp)*n(gpp) + count_trees_g += + g_id_count[ale->set_ids[gamma]] * g_id_count[ale->set_ids[not_gamma]]; + else if (gamma.size() == not_gamma.size()) + // for the case where |C[2]|=|C[3]|, we divide by two to compensate for + // over-counting + count_trees_g += g_id_count[ale->set_ids[gamma]] * + g_id_count[ale->set_ids[not_gamma]] / 2.0; + } g_id_count.clear(); return count_trees_g; } +int main(int argc, char **argv) { -int main(int argc, char ** argv) -{ - - string ale_file=argv[1]; - string ale_name=ale_file+".ale"; - approx_posterior * ale; - int burnin=0; - if (argc>2) - burnin=atoi(argv[2]); - ale=observe_ALE_from_file(ale_file,burnin); - cout << "# observe "<< ale->observations << "trees from: " << argv[1] << endl; + string ale_file = argv[1]; + string ale_name = ale_file + ".ale"; + approx_posterior *ale; + int burnin = 0; + if (argc > 2) + burnin = atoi(argv[2]); + ale = observe_ALE_from_file(ale_file, burnin); + cout << "# observe " << ale->observations << "trees from: " << argv[1] + << endl; ale->save_state(ale_name); - cout << "# saved in "<< ale_name<observations<<" trees from: " << ale_name <<".. with :" << ale->count_trees() << " possible amalgamations .." << endl << endl ; + // some info about our ale: + cout << "Read summary of tree sample for " << ale->observations + << " trees from: " << ale_name << ".. with :" << ale->count_trees() + << " possible amalgamations .." << endl + << endl; delete ale; - - ale=load_ALE_from_file(ale_name); - - // above I put a version of the count_trees function which has comments to explain the interface the approx_posterior class provides for iterating over splits, it should give the same result as the previous count - cout << " counting again .. :" << count_trees(ale) << " possible amalgamations .." << endl << endl ; - return 1; + ale = load_ALE_from_file(ale_name); + + // above I put a version of the count_trees function which has comments to + // explain the interface the approx_posterior class provides for iterating + // over splits, it should give the same result as the previous count + cout << " counting again .. :" << count_trees(ale) + << " possible amalgamations .." << endl + << endl; + return 1; } diff --git a/src/ALE_util.cpp b/src/ALE_util.cpp index 21110e4..a75614a 100644 --- a/src/ALE_util.cpp +++ b/src/ALE_util.cpp @@ -2,80 +2,71 @@ using namespace std; using namespace bpp; - - -approx_posterior * observe_ALE_from_file(vector fnames, int burnin,int every,int until) -{ - +approx_posterior *observe_ALE_from_file(vector fnames, int burnin, + int every, int until) { vector trees; - for (vector::iterator it=fnames.begin();it!=fnames.end();it++) + for (vector::iterator it = fnames.begin(); it != fnames.end(); it++) { + string fname = (*it); + if (!fexists(fname)) { + cout << "Error, file " << fname << " does not seem accessible." << endl; + exit(1); + } + ifstream file_stream(fname.c_str()); + int tree_i = 0; + if (file_stream.is_open()) // ########## read trees ############ { - string fname=(*it); - if (!fexists(fname)) { - cout << "Error, file "< burnin and tree_i % every == 0) + trees.push_back(line); + } else if (line.find(";") != line.npos) { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of(",;: "), + boost::token_compress_on); + string name = tokens[0]; + + approx_posterior *ale = new approx_posterior(); // NO del-loc + string tmp = "tmp_single_ale"; + ofstream fout(tmp.c_str()); + fout << "#constructor_string\n"; + fout << name << endl; + fout << "#observations\n" + "1\n" + "#Bip_counts\n" + "#Bip_bls\n" + "1 1\n" + "#Dip_counts\n" + "#last_leafset_id\n" + "1\n" + "#leaf-id\n"; + fout << name << " 1" << endl; + fout << "#set-id\n" + "1 : 1\n" + "#END\n"; + fout.close(); + + ale->load_state(tmp); + return ale; + } } - ifstream file_stream (fname.c_str()); - int tree_i=0; - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - if (line.find("(")!=line.npos ) - { - tree_i++; - if (tree_i>burnin and tree_i%every==0) trees.push_back(line); - } - else if (line.find(";")!=line.npos) - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of(",;: "),boost::token_compress_on); - string name=tokens[0]; - - approx_posterior* ale=new approx_posterior();// NO del-loc - string tmp="tmp_single_ale"; - ofstream fout( tmp.c_str() ); - fout << "#constructor_string\n"; - fout << name << endl; - fout << - "#observations\n" - "1\n" - "#Bip_counts\n" - "#Bip_bls\n" - "1 1\n" - "#Dip_counts\n" - "#last_leafset_id\n" - "1\n" - "#leaf-id\n"; - fout << name <<" 1"<load_state(tmp); - return ale; - } - - } - } } - if (trees.size()<1) + } + if (trees.size() < 1) return NULL; vector observe_trees; - if (until==-1) - until=trees.size(); - for (int i=0;iobservation(observe_trees); @@ -85,68 +76,64 @@ approx_posterior * observe_ALE_from_file(vector fnames, int burnin,int e return ale; } -approx_posterior * observe_ALE_from_file(string fname, int burnin,int every,int until) -{ +approx_posterior *observe_ALE_from_file(string fname, int burnin, int every, + int until) { vector trees; if (!fexists(fname)) { - cout << "Error, file "<burnin and tree_i%every==0) trees.push_back(line); - } - else if (line.find(";")!=line.npos) - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of(",;: "),boost::token_compress_on); - string name=tokens[0]; - - approx_posterior* ale=new approx_posterior();// NO del-loc - string tmp="tmp_single_ale"; - ofstream fout( tmp.c_str() ); - fout << "#constructor_string\n"; - fout << name << endl; - fout << - "#observations\n" - "1\n" - "#Bip_counts\n" - "#Bip_bls\n" - "1 1\n" - "#Dip_counts\n" - "#last_leafset_id\n" - "1\n" - "#leaf-id\n"; - fout << name <<" 1"<load_state(tmp); - return ale; - } - } + ifstream file_stream(fname.c_str()); + int tree_i = 0; + if (file_stream.is_open()) // ########## read trees ############ + { + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + if (line.find("(") != line.npos) { + tree_i++; + if (tree_i > burnin and tree_i % every == 0) + trees.push_back(line); + } else if (line.find(";") != line.npos) { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of(",;: "), + boost::token_compress_on); + string name = tokens[0]; + + approx_posterior *ale = new approx_posterior(); // NO del-loc + string tmp = "tmp_single_ale"; + ofstream fout(tmp.c_str()); + fout << "#constructor_string\n"; + fout << name << endl; + fout << "#observations\n" + "1\n" + "#Bip_counts\n" + "#Bip_bls\n" + "1 1\n" + "#Dip_counts\n" + "#last_leafset_id\n" + "1\n" + "#leaf-id\n"; + fout << name << " 1" << endl; + fout << "#set-id\n" + "1 : 1\n" + "#END\n"; + fout.close(); + + ale->load_state(tmp); + return ale; + } } + } - approx_posterior* ale=new approx_posterior(trees[0]);// NO del-loc + approx_posterior *ale = new approx_posterior(trees[0]); // NO del-loc vector observe_trees; - if (until==-1) - until=trees.size(); - for (int i=0;iobservation(observe_trees); @@ -157,282 +144,261 @@ approx_posterior * observe_ALE_from_file(string fname, int burnin,int every,int return ale; } -approx_posterior * observe_ALE_from_string(string tree) -{ +approx_posterior *observe_ALE_from_string(string tree) { vector trees; trees.push_back(tree); - approx_posterior* ale=new approx_posterior(trees[0]);// NO del-loc - ale->observation(trees,false); + approx_posterior *ale = new approx_posterior(trees[0]); // NO del-loc + ale->observation(trees, false); return ale; } -approx_posterior * observe_ALE_from_strings(vector trees) -{ - approx_posterior* ale=new approx_posterior(trees[0]);// NO del-loc - ale->observation(trees,false); +approx_posterior *observe_ALE_from_strings(vector trees) { + approx_posterior *ale = new approx_posterior(trees[0]); // NO del-loc + ale->observation(trees, false); return ale; } -//this function is specific to a dataset with a particular problem with species name seporators -//DO NOT USE AS IS -approx_posterior * observe_ALE_from_nexus(string fname, int burnin,int every,int until) -{ +// this function is specific to a dataset with a particular problem with species +// name seporators DO NOT USE AS IS +approx_posterior *observe_ALE_from_nexus(string fname, int burnin, int every, + int until) { vector trees; - map translate; - ifstream file_stream (fname.c_str()); - int tree_i=0; - cout << "reading nexus." < tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of(",; "),boost::token_compress_on); - vector name_tokens; - //cout << tokens[1] << endl;; - boost::split(name_tokens,tokens[1],boost::is_any_of("_"),boost::token_compress_on); - string new_name=name_tokens[0]+"-"+name_tokens[1]; - for (int i = 2 ; i<(int)name_tokens.size();i++) - new_name+="_"+name_tokens[i]; - translate[tokens[0]]=new_name; - } - } - else - { - if (line.find("(")!=line.npos) - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of(" "),boost::token_compress_on); - tree_type * tree=TreeTemplateTools::parenthesisToTree(tokens[4],false); - vector leaves=tree->getLeaves(); - for (vector :: iterator it=leaves.begin();it!=leaves.end();it++) - (*it)->setName(translate[(*it)->getName()]); - tree_i++; - if (tree_i>burnin and tree_i%every==0) trees.push_back(TreeTemplateTools::treeToParenthesis(*tree)); - delete tree; - - } - } - } + map translate; + ifstream file_stream(fname.c_str()); + int tree_i = 0; + cout << "reading nexus." << endl; + + if (file_stream.is_open()) // ########## read trees ############ + { + bool header = true; + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + if (line.find("tree gen") != line.npos) + header = false; + if (header) { + if (line.find("Param:") == line.npos and + line.find("begin trees") == line.npos and + line.find("translate") == line.npos and + line.find("#NEXUS") == line.npos and line.find("ID") == line.npos) { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of(",; "), + boost::token_compress_on); + vector name_tokens; + // cout << tokens[1] << endl;; + boost::split(name_tokens, tokens[1], boost::is_any_of("_"), + boost::token_compress_on); + string new_name = name_tokens[0] + "-" + name_tokens[1]; + for (int i = 2; i < (int)name_tokens.size(); i++) + new_name += "_" + name_tokens[i]; + translate[tokens[0]] = new_name; + } + } else { + if (line.find("(") != line.npos) { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of(" "), + boost::token_compress_on); + tree_type *tree = + TreeTemplateTools::parenthesisToTree(tokens[4], false); + vector leaves = tree->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) + (*it)->setName(translate[(*it)->getName()]); + tree_i++; + if (tree_i > burnin and tree_i % every == 0) + trees.push_back(TreeTemplateTools::treeToParenthesis(*tree)); + delete tree; + } + } } - cout << "translated nexus." < observe_trees; - if (until==-1) - until=trees.size(); - for (int i=0;iobservation(observe_trees,false); + cout << "start observe." << endl; + ale->observation(observe_trees, false); trees.clear(); observe_trees.clear(); return ale; } -approx_posterior * load_ALE_from_file(string fname) -{ - approx_posterior* ale=new approx_posterior();// NO del-loc +approx_posterior *load_ALE_from_file(string fname) { + approx_posterior *ale = new approx_posterior(); // NO del-loc ale->load_state(fname); return ale; } -string save_ALE_to_file(string fname) -{ +string save_ALE_to_file(string fname) { vector trees; - ifstream file_stream (fname.c_str()); - - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - if (line.find("(")!=line.npos) - trees.push_back(line); - } + ifstream file_stream(fname.c_str()); + + if (file_stream.is_open()) // ########## read trees ############ + { + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + if (line.find("(") != line.npos) + trees.push_back(line); } - approx_posterior* ale=new approx_posterior(trees[0]);// del-loc - ale->observation(trees,false); + } + approx_posterior *ale = new approx_posterior(trees[0]); // del-loc + ale->observation(trees, false); - vector tokens; + vector tokens; boost::trim(fname); - boost::split(tokens,fname,boost::is_any_of("."),boost::token_compress_on); - fname=tokens[0]; - - ofstream fout( (fname+".trees").c_str() ); - fout << "#tree" << " " << "pp" << " " << "alepp" << endl; - - for ( map< string , int >::iterator it=ale->tree_counts.begin();it!=ale->tree_counts.end();it++) - fout << (*it).first << " " << (*it).second/ale->observations << " " << ale->p((*it).first) << endl; - - ale->save_state(fname+".ale"); + boost::split(tokens, fname, boost::is_any_of("."), boost::token_compress_on); + fname = tokens[0]; + + ofstream fout((fname + ".trees").c_str()); + fout << "#tree" + << " " + << "pp" + << " " + << "alepp" << endl; + + for (map::iterator it = ale->tree_counts.begin(); + it != ale->tree_counts.end(); it++) + fout << (*it).first << " " << (*it).second / ale->observations << " " + << ale->p((*it).first) << endl; + + ale->save_state(fname + ".ale"); delete ale; - return fname+".ale"; + return fname + ".ale"; } -string canonical_branch_lengths( string Sstring) -{ - tree_type * S =TreeTemplateTools::parenthesisToTree(Sstring,false);//del-loc - vector nodes=S->getNodes();//del-loc - map node2height;//del-loc - map height2node;//del-loc - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++) - { - if ((*it)->isLeaf()) - { - node2height[(*it)]=0; - } - else - { - vector sons=(*it)->getSons(); - scalar_type h0 = sons[0]->getDistanceToFather() + node2height[sons[0]]; - scalar_type h1 = sons[1]->getDistanceToFather() + node2height[sons[1]]; - if (abs(h0-h1)>1e-3) - { - cout << " tree is not ultrametric! with diff " << abs(h0-h1)<< endl; - h0=max(h0,h1); - } - node2height[(*it)]=h0; - if (height2node.count(h0)) - { - cout << " tree is degenerate! at height " << h0 < node2rank;//del-loc - //map rank2node;//del-loc - int rank=0; - - map new_height;//del-loc - int n=S->getNumberOfLeaves(); - scalar_type rank_height=0; - - for (map ::iterator hit=height2node.begin();hit!=height2node.end();hit++) - { - rank_height+=1.0/(scalar_type)(n-rank); - rank+=1; - //rank2node[rank]=(*hit).second; - //node2rank[(*hit).second]=rank; - new_height[(*hit).second]=rank_height; +string canonical_branch_lengths(string Sstring) { + tree_type *S = TreeTemplateTools::parenthesisToTree(Sstring, false); // del-loc + vector nodes = S->getNodes(); // del-loc + map node2height; // del-loc + map height2node; // del-loc + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + if ((*it)->isLeaf()) { + node2height[(*it)] = 0; + } else { + vector sons = (*it)->getSons(); + scalar_type h0 = sons[0]->getDistanceToFather() + node2height[sons[0]]; + scalar_type h1 = sons[1]->getDistanceToFather() + node2height[sons[1]]; + if (abs(h0 - h1) > 1e-3) { + cout << " tree is not ultrametric! with diff " << abs(h0 - h1) << endl; + h0 = max(h0, h1); + } + node2height[(*it)] = h0; + if (height2node.count(h0)) { + cout << " tree is degenerate! at height " << h0 << endl; + height2node[h0 + 1e-6] = (*it); + } else + height2node[h0] = (*it); } + } + // map node2rank;//del-loc + // map rank2node;//del-loc + int rank = 0; + + map new_height; // del-loc + int n = S->getNumberOfLeaves(); + scalar_type rank_height = 0; + + for (map::iterator hit = height2node.begin(); + hit != height2node.end(); hit++) { + rank_height += 1.0 / (scalar_type)(n - rank); + rank += 1; + // rank2node[rank]=(*hit).second; + // node2rank[(*hit).second]=rank; + new_height[(*hit).second] = rank_height; + } - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++) - if ((*it)->hasFather()) - { - if ((*it)->isLeaf()) - { - scalar_type fathers_height= new_height[ (*it)->getFather() ]; - (*it)->setDistanceToFather(fathers_height/rank_height); - } - else - { - scalar_type fathers_height= new_height[ (*it)->getFather() ]; - (*it)->setDistanceToFather((fathers_height- new_height[ (*it)])/rank_height); - } + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) + if ((*it)->hasFather()) { + if ((*it)->isLeaf()) { + scalar_type fathers_height = new_height[(*it)->getFather()]; + (*it)->setDistanceToFather(fathers_height / rank_height); + } else { + scalar_type fathers_height = new_height[(*it)->getFather()]; + (*it)->setDistanceToFather((fathers_height - new_height[(*it)]) / + rank_height); } + } node2height.clear(); height2node.clear(); - //node2rank.clear(); - //rank2node.clear(); + // node2rank.clear(); + // rank2node.clear(); new_height.clear(); nodes.clear(); - Sstring=TreeTemplateTools::treeToParenthesis(*S); + Sstring = TreeTemplateTools::treeToParenthesis(*S); delete S; return Sstring; } +void canonical_branch_lengths(tree_type *S) { + vector nodes = S->getNodes(); // del-loc + map node2height; // del-loc + map height2node; // del-loc + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + if ((*it)->isLeaf()) { + node2height[(*it)] = 0; + } else { + vector sons = (*it)->getSons(); + scalar_type h0 = sons[0]->getDistanceToFather() + node2height[sons[0]]; + scalar_type h1 = sons[1]->getDistanceToFather() + node2height[sons[1]]; + if (abs(h0 - h1) > 1e-3) { + cout << " tree is not ultrametric! with diff " << abs(h0 - h1) << endl; + h0 = max(h0, h1); + } + node2height[(*it)] = h0; + if (height2node.count(h0)) { + cout << " tree is degenerate! at height " << h0 << endl; + height2node[h0 + 1e-6] = (*it); + } else + height2node[h0] = (*it); + } + } + // map node2rank;//del-loc + // map rank2node;//del-loc + int rank = 0; + + map new_height; // del-loc + int n = S->getNumberOfLeaves(); + scalar_type rank_height = 0; + + for (map::iterator hit = height2node.begin(); + hit != height2node.end(); hit++) { + rank_height += 1.0 / (scalar_type)(n - rank); + rank += 1; + // rank2node[rank]=(*hit).second; + // node2rank[(*hit).second]=rank; + new_height[(*hit).second] = rank_height; + } - -void canonical_branch_lengths( tree_type * S ) -{ - vector nodes=S->getNodes();//del-loc - map node2height;//del-loc - map height2node;//del-loc - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++) - { - if ((*it)->isLeaf()) - { - node2height[(*it)]=0; - } - else - { - vector sons=(*it)->getSons(); - scalar_type h0 = sons[0]->getDistanceToFather() + node2height[sons[0]]; - scalar_type h1 = sons[1]->getDistanceToFather() + node2height[sons[1]]; - if (abs(h0-h1)>1e-3) - { - cout << " tree is not ultrametric! with diff " << abs(h0-h1)<< endl; - h0=max(h0,h1); - } - node2height[(*it)]=h0; - if (height2node.count(h0)) - { - cout << " tree is degenerate! at height " << h0 <::iterator it = nodes.begin(); it != nodes.end(); it++) + if ((*it)->hasFather()) { + if ((*it)->isLeaf()) { + scalar_type fathers_height = new_height[(*it)->getFather()]; + (*it)->setDistanceToFather(fathers_height / rank_height); + } else { + scalar_type fathers_height = new_height[(*it)->getFather()]; + (*it)->setDistanceToFather((fathers_height - new_height[(*it)]) / + rank_height); + } } - //map node2rank;//del-loc - //map rank2node;//del-loc - int rank=0; + node2height.clear(); + height2node.clear(); - map new_height;//del-loc - int n=S->getNumberOfLeaves(); - scalar_type rank_height=0; + // node2rank.clear(); + // rank2node.clear(); - for (map ::iterator hit=height2node.begin();hit!=height2node.end();hit++) - { - rank_height+=1.0/(scalar_type)(n-rank); - rank+=1; - //rank2node[rank]=(*hit).second; - //node2rank[(*hit).second]=rank; - new_height[(*hit).second]=rank_height; - } + new_height.clear(); + + nodes.clear(); - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++) - if ((*it)->hasFather()) - { - if ((*it)->isLeaf()) - { - scalar_type fathers_height= new_height[ (*it)->getFather() ]; - (*it)->setDistanceToFather(fathers_height/rank_height); - } - else - { - scalar_type fathers_height= new_height[ (*it)->getFather() ]; - (*it)->setDistanceToFather((fathers_height- new_height[ (*it)])/rank_height); - } - } - node2height.clear(); - height2node.clear(); - - //node2rank.clear(); - //rank2node.clear(); - - new_height.clear(); - - nodes.clear(); - - return ; + return; } diff --git a/src/ALE_util.h b/src/ALE_util.h index 7df02b3..e0e723d 100644 --- a/src/ALE_util.h +++ b/src/ALE_util.h @@ -1,20 +1,25 @@ -//all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; +// all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; #pragma once #include "ALE.h" -inline bool fexists(const std::string& filename) -{ +inline bool fexists(const std::string &filename) { std::ifstream ifile(filename); return ifile.good(); }; -approx_posterior * observe_ALE_from_nexus(std::string fname,int burnin=100, int every=1,int until=-1); -approx_posterior * observe_ALE_from_file(std::string fname,int burnin=100, int every=1,int until=-1); // NO del-loc -approx_posterior * observe_ALE_from_file(std::vector fnames,int burnin=100, int every=1,int until=-1); // NO del-loc -approx_posterior * observe_ALE_from_strings(std::vector trees); // NO del-loc -approx_posterior * observe_ALE_from_string(std::string tree); // NO del-loc -approx_posterior * load_ALE_from_file(std::string fname); // NO del-loc +approx_posterior *observe_ALE_from_nexus(std::string fname, int burnin = 100, + int every = 1, int until = -1); +approx_posterior *observe_ALE_from_file(std::string fname, int burnin = 100, + int every = 1, + int until = -1); // NO del-loc +approx_posterior *observe_ALE_from_file(std::vector fnames, + int burnin = 100, int every = 1, + int until = -1); // NO del-loc +approx_posterior * +observe_ALE_from_strings(std::vector trees); // NO del-loc +approx_posterior *observe_ALE_from_string(std::string tree); // NO del-loc +approx_posterior *load_ALE_from_file(std::string fname); // NO del-loc std::string save_ALE_to_file(std::string fname); std::string canonical_branch_lengths(std::string Sstring); -void canonical_branch_lengths( tree_type * S ); +void canonical_branch_lengths(tree_type *S); diff --git a/src/ALEadd.cpp b/src/ALEadd.cpp index bbc5b46..7b7793c 100644 --- a/src/ALEadd.cpp +++ b/src/ALEadd.cpp @@ -3,85 +3,81 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - cout << "ALEadd using ALE v"<< ALE_VERSION < tokens; - boost::split(tokens,next_field,boost::is_any_of("="),boost::token_compress_on); - if (tokens[0]=="burnin") - burnin=atoi(tokens[1].c_str()); - else if (tokens[0]=="every") - every=atoi(tokens[1].c_str()); - else if (tokens[0]=="until") - until=atoi(tokens[1].c_str()); - else if (tokens[0]=="weight") - weight=atof(tokens[1].c_str()); - else if (tokens[0]=="outfile") - ale_name=tokens[1]; - } - ale=load_ALE_from_file(ale_file); - cout << "." << endl; + for (int i = 3; i < argc; i++) { + string next_field = argv[i]; + vector tokens; + boost::split(tokens, next_field, boost::is_any_of("="), + boost::token_compress_on); + if (tokens[0] == "burnin") + burnin = atoi(tokens[1].c_str()); + else if (tokens[0] == "every") + every = atoi(tokens[1].c_str()); + else if (tokens[0] == "until") + until = atoi(tokens[1].c_str()); + else if (tokens[0] == "weight") + weight = atof(tokens[1].c_str()); + else if (tokens[0] == "outfile") + ale_name = tokens[1]; + } + ale = load_ALE_from_file(ale_file); + cout << "." << endl; vector trees; - ifstream file_stream (trees_file.c_str()); - int tree_i=0; - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - if (line.find("(")!=line.npos) - { - if (tree_i>=burnin and tree_i%every==0) - { - cout << line; - trees.push_back(line); - } - tree_i++; - } - } + ifstream file_stream(trees_file.c_str()); + int tree_i = 0; + if (file_stream.is_open()) // ########## read trees ############ + { + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + if (line.find("(") != line.npos) { + if (tree_i >= burnin and tree_i % every == 0) { + cout << line; + trees.push_back(line); + } + tree_i++; + } } + } - cout << ".." << endl; + cout << ".." << endl; vector observe_trees; - if (until==-1) - until=trees.size(); - for (int i=0;iobservation(observe_trees,false,weight); + if (until == -1) + until = trees.size(); + for (int i = 0; i < min((int)trees.size(), until); i++) { + observe_trees.push_back(trees[i]); + } + ale->observation(observe_trees, false, weight); + cout << "# " << observe_trees.size() << " new tree(s) observed with weight " + << weight << " from: " << argv[2]; + cout << "; " << burnin << " trees burnin discarded." << endl; - cout <<"# " << observe_trees.size() << " new tree(s) observed with weight "<observations << " tree(s) from: " << argv[1] << " and " << argv[2] << endl; + cout << "# .ale with " << ale->observations << " tree(s) from: " << argv[1] + << " and " << argv[2] << endl; ale->save_state(ale_name); - cout << "# saved in "<< ale_name<count_trees()<< endl; - //cout << ale->count_all_trees(ale->Gamma)<< endl; + cout << ale->count_trees() << endl; + // cout << ale->count_all_trees(ale->Gamma)<< endl; return 1; } diff --git a/src/ALEevaluate_undated.cpp b/src/ALEevaluate_undated.cpp index d7cff2c..bdc4aa3 100644 --- a/src/ALEevaluate_undated.cpp +++ b/src/ALEevaluate_undated.cpp @@ -5,22 +5,19 @@ using namespace std; using namespace bpp; - string readTreeFromFile(string fname) { if (!fexists(fname)) { - cout << "Error, file "< gene_tree_strs ; // Silly: we need to produce a vector with a single element... + string head = gene_tree_file; + string ale_name = head + ".ale"; + // Reading the gene tree from within the file + string gene_tree_str = readTreeFromFile(gene_tree_file); + approx_posterior *ale = new approx_posterior(gene_tree_str); + vector gene_tree_strs; // Silly: we need to produce a vector with a + // single element... gene_tree_strs.push_back(gene_tree_str); ale->observation(gene_tree_strs); - cout << "\n\tObserved "<< ale->observations << " gene tree(s) from: " << argv[2] <observations + << " gene tree(s) from: " << argv[2] << endl; - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); // Getting the other options - scalar_type samples=100; - scalar_type O_R=1,beta=1; - scalar_type delta=0.01,tau=0.01,lambda=0.1; + scalar_type samples = 100; + scalar_type O_R = 1, beta = 1; + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; string fractionMissingFile = ""; bool outputyn = false; - for (int i=3;i tokens; - boost::split(tokens,next_field,boost::is_any_of("="),boost::token_compress_on); - if (tokens[0]=="sample") - samples=atoi(tokens[1].c_str()); - else if (tokens[0]=="separators") - model->set_model_parameter("gene_name_separators", tokens[1]); - else if (tokens[0]=="delta") - { - delta=atof(tokens[1].c_str()); + for (int i = 3; i < argc; i++) { + string next_field = argv[i]; + vector tokens; + boost::split(tokens, next_field, boost::is_any_of("="), + boost::token_compress_on); + if (tokens[0] == "sample") + samples = atoi(tokens[1].c_str()); + else if (tokens[0] == "separators") + model->set_model_parameter("gene_name_separators", tokens[1]); + else if (tokens[0] == "delta") { + delta = atof(tokens[1].c_str()); cout << "\n\tDelta fixed to " << delta << endl; - } - else if (tokens[0]=="tau") - { - tau=atof(tokens[1].c_str()); + } else if (tokens[0] == "tau") { + tau = atof(tokens[1].c_str()); cout << "\n\tTau fixed to " << tau << endl; - } - else if (tokens[0]=="lambda") - { - lambda=atof(tokens[1].c_str()); + } else if (tokens[0] == "lambda") { + lambda = atof(tokens[1].c_str()); cout << "Lambda fixed to " << lambda << endl; - } - else if (tokens[0]=="O_R") - { - O_R=atof(tokens[1].c_str()); + } else if (tokens[0] == "O_R") { + O_R = atof(tokens[1].c_str()); cout << "\n\tO_R set to " << O_R << endl; - } - else if (tokens[0]=="beta") - { - beta=atof(tokens[1].c_str()); + } else if (tokens[0] == "beta") { + beta = atof(tokens[1].c_str()); cout << "\n\tBeta set to " << beta << endl; - } - else if (tokens[0]=="fraction_missing") - { - fractionMissingFile=tokens[1]; - cout << "\n\tFile containing fractions of missing genes set to " << fractionMissingFile << endl; - } - else if (tokens[0]=="outputFiles") - { - if (tokens[1] == "y" || tokens[1] == "yes" || tokens[1] == "Y" || tokens[1] == "YES") { + } else if (tokens[0] == "fraction_missing") { + fractionMissingFile = tokens[1]; + cout << "\n\tFile containing fractions of missing genes set to " + << fractionMissingFile << endl; + } else if (tokens[0] == "outputFiles") { + if (tokens[1] == "y" || tokens[1] == "yes" || tokens[1] == "Y" || + tokens[1] == "YES") { outputyn = true; } } @@ -116,45 +106,45 @@ int main(int argc, char ** argv) // Constructing the ALE_undated object and computing the logLk. - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); model->construct_undated(species_tree_str, fractionMissingFile); model->set_model_parameter("seq_beta", beta); model->set_model_parameter("O_R", O_R); - //a set of inital rates + // a set of inital rates model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_undatedEs(); double loglk = log(model->pun(ale, true)); - cout << "\n\tReconciliation model likelihood computed, logLk: " < sample_strings; - vector sample_trees; - boost::progress_display pd( samples ); + cout << "\n\tSampling reconciled gene trees.." << endl; + vector sample_strings; + vector sample_trees; + boost::progress_display pd(samples); - for (int i=0;isample_undated(); + string sample_tree = model->sample_undated(); sample_strings.push_back(sample_tree); - if (ale->last_leafset_id>3) - { + if (ale->last_leafset_id > 3) { - tree_type * G=TreeTemplateTools::parenthesisToTree(sample_tree,false); + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); - vector leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); (*it)->setName(tokens[0]); tokens.clear(); } @@ -163,67 +153,73 @@ int main(int argc, char ** argv) } } vector tokens; - boost::split(tokens,gene_tree_file,boost::is_any_of("/"),boost::token_compress_on); - ale_name=tokens[tokens.size()-1]; - string outname=ale_name+".uml_rec"; - ofstream fout( outname.c_str() ); - fout << "#ALEevaluate using ALE v"<< ALE_VERSION <<"; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Gene tree from:\t"<logl: " << loglk << endl; - fout << "rate of\t Duplications\tTransfers\tLosses" <MLRec_events["D"]/samples << "\t" << model->MLRec_events["T"]/samples << "\t" << model->MLRec_events["L"]/samples<< "\t" << model->MLRec_events["S"]/samples <MLRec_events["D"] / samples << "\t" + << model->MLRec_events["T"] / samples << "\t" + << model->MLRec_events["L"] / samples << "\t" + << model->MLRec_events["S"] / samples << endl; fout << endl; - fout << "# of\t Duplications\tTransfers\tLosses\tOriginations\tcopies" <counts_string_undated(samples); fout.close(); cout << "Results in: " << outname << endl; - if (ale->last_leafset_id>3) - { - cout << "Calculating MRP consensus tree."<last_leafset_id > 3) { + cout << "Calculating MRP consensus tree." << endl; + Tree *con_tree = TreeTools::MRP(sample_trees); - string con_name=ale_name+".ucons_tree"; + string con_name = ale_name + ".ucons_tree"; - ofstream con_out( con_name.c_str() ); - con_out << "#ALEsample using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<last_branch;e++) - for (int f=0;flast_branch;f++) - if (model->T_to_from[e][f]>0) - { - if (elast_leaf) - tout << "\t" << model->node_name[model->id_nodes[e]]; - else - tout << "\t" << e; - if (flast_leaf) - tout << "\t" << model->node_name[model->id_nodes[f]]; - else - tout << "\t" << f; - tout << "\t" << model->T_to_from[e][f]/samples << endl; - } + string t_name = ale_name + ".uTs"; + ofstream tout(t_name.c_str()); + tout << "#from\tto\tfreq.\n"; + + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) + if (model->T_to_from[e][f] > 0) { + if (e < model->last_leaf) + tout << "\t" << model->node_name[model->id_nodes[e]]; + else + tout << "\t" << e; + if (f < model->last_leaf) + tout << "\t" << model->node_name[model->id_nodes[f]]; + else + tout << "\t" << f; + tout << "\t" << model->T_to_from[e][f] / samples << endl; + } tout.close(); cout << "Transfers in: " << t_name << endl; } diff --git a/src/ALEmcmc_undated.cpp b/src/ALEmcmc_undated.cpp index 5bf7c93..895f434 100644 --- a/src/ALEmcmc_undated.cpp +++ b/src/ALEmcmc_undated.cpp @@ -1,37 +1,40 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; - /************************************************************************ * Scaling move to change the value of a Real Positive (e.g. a rate). * The lambda parameter here represents a scale. ************************************************************************/ -//This version of the scaling function ensures that the new value is not larger than some maximum value -double scaleDoubleConstrained ( const double& value, const double& maxi, const double& lambda, double& hastingsRatio, const bool verbose ) { - double newValue = value; - // Generate new value (no reflection, so we simply abort later if we propose value here outside of support) - double u = RandomTools::giveRandomNumberBetweenZeroAndEntry ( 1.0 ); - double scalingFactor = std::exp( lambda * ( u - 0.5 ) ); - newValue *= scalingFactor; - if (newValue < 0.00001 ) { - newValue = 0.00001; - } - if (newValue > maxi ) { - newValue = maxi; - } - - // compute the Hastings ratio - hastingsRatio = scalingFactor ; - return newValue; +// This version of the scaling function ensures that the new value is not larger +// than some maximum value +double scaleDoubleConstrained(const double &value, const double &maxi, + const double &lambda, double &hastingsRatio, + const bool verbose) { + double newValue = value; + // Generate new value (no reflection, so we simply abort later if we propose + // value here outside of support) + double u = RandomTools::giveRandomNumberBetweenZeroAndEntry(1.0); + double scalingFactor = std::exp(lambda * (u - 0.5)); + newValue *= scalingFactor; + if (newValue < 0.00001) { + newValue = 0.00001; + } + if (newValue > maxi) { + newValue = maxi; + } + + // compute the Hastings ratio + hastingsRatio = scalingFactor; + return newValue; } /************************************************************************ @@ -40,21 +43,22 @@ double scaleDoubleConstrained ( const double& value, const double& maxi, const d * P(value) = lambda exp (-lambda*value) * mean:1/lambda ************************************************************************/ -double computeExponentialLogProbability ( const double& param, const double& value ) { - return (std::log(param) - param * value); +double computeExponentialLogProbability(const double ¶m, + const double &value) { + return (std::log(param) - param * value); } - ////////////////////////////////////////// ////////////////////////////////////////// -double computeLogLk (exODT_model* model, approx_posterior * ale, const double &o, const double &d, const double &t, const double &l) { +double computeLogLk(exODT_model *model, approx_posterior *ale, const double &o, + const double &d, const double &t, const double &l) { model->set_model_parameter("O_R", o); model->set_model_parameter("delta", d); model->set_model_parameter("tau", t); model->set_model_parameter("lambda", l); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_undatedEs(); double ll = log(model->pun(ale)); return ll; @@ -63,19 +67,25 @@ double computeLogLk (exODT_model* model, approx_posterior * ale, const double &o ////////////////////////////////////////// ////////////////////////////////////////// -double computeLogPrior ( const double &o, const double &d, const double &t, const double &l, const double &priorOrigination, const double &priorDelta, const double &priorTau, const double &priorLambda) { +double computeLogPrior(const double &o, const double &d, const double &t, + const double &l, const double &priorOrigination, + const double &priorDelta, const double &priorTau, + const double &priorLambda) { double pp = 0.0; - pp += computeExponentialLogProbability ( priorOrigination, o); - pp += computeExponentialLogProbability ( priorDelta, d); - pp += computeExponentialLogProbability ( priorTau, t); - pp += computeExponentialLogProbability ( priorLambda, l); + pp += computeExponentialLogProbability(priorOrigination, o); + pp += computeExponentialLogProbability(priorDelta, d); + pp += computeExponentialLogProbability(priorTau, t); + pp += computeExponentialLogProbability(priorLambda, l); return pp; } ////////////////////////////////////////// ////////////////////////////////////////// -void acceptMove(double ¤tOrigination, double ¤tDelta, double ¤tTau, double ¤tLambda, const double &newOrigination, const double &newDelta, const double &newTau, const double &newLambda) { +void acceptMove(double ¤tOrigination, double ¤tDelta, + double ¤tTau, double ¤tLambda, + const double &newOrigination, const double &newDelta, + const double &newTau, const double &newLambda) { currentOrigination = newOrigination; currentDelta = newDelta; currentTau = newTau; @@ -85,37 +95,42 @@ void acceptMove(double ¤tOrigination, double ¤tDelta, double &cu ////////////////////////////////////////// ////////////////////////////////////////// -void rejectMove( const double ¤tOrigination, const double ¤tDelta, const double ¤tTau, const double ¤tLambda, double &newOrigination, double &newDelta, double &newTau, double &newLambda) { -newOrigination = currentOrigination; -newDelta = currentDelta; -newTau = currentTau; -newLambda = currentLambda; +void rejectMove(const double ¤tOrigination, const double ¤tDelta, + const double ¤tTau, const double ¤tLambda, + double &newOrigination, double &newDelta, double &newTau, + double &newLambda) { + newOrigination = currentOrigination; + newDelta = currentDelta; + newTau = currentTau; + newLambda = currentLambda; } ////////////////////////////////////////// ////////////////////////////////////////// -void sampleTree (exODT_model* model, approx_posterior * ale, double &o, double &d, double &t, double &l, vector &sample_strings, vector &sample_trees) { +void sampleTree(exODT_model *model, approx_posterior *ale, double &o, double &d, + double &t, double &l, vector &sample_strings, + vector &sample_trees) { model->set_model_parameter("O_R", o); model->set_model_parameter("delta", d); model->set_model_parameter("tau", t); model->set_model_parameter("lambda", l); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_undatedEs(); - string sample_tree=model->sample_undated(); + string sample_tree = model->sample_undated(); sample_strings.push_back(sample_tree); - if (ale->last_leafset_id>3) - { + if (ale->last_leafset_id > 3) { - tree_type * G=TreeTemplateTools::parenthesisToTree(sample_tree,false); + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); - vector leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); (*it)->setName(tokens[0]); tokens.clear(); } @@ -124,223 +139,238 @@ void sampleTree (exODT_model* model, approx_posterior * ale, double &o, double & } } - -void fillTToFrom(exODT_model* model, map& tToFrom) { - for (int e=0;elast_branch;e++) { - for (int f=0;flast_branch;f++) { - if (model->T_to_from[e][f]>0) - { - string name1, name2, names; - if (elast_leaf) { - name1 = model->node_name[model->id_nodes[e]]; +void fillTToFrom(exODT_model *model, map &tToFrom) { + for (int e = 0; e < model->last_branch; e++) { + for (int f = 0; f < model->last_branch; f++) { + if (model->T_to_from[e][f] > 0) { + string name1, name2, names; + if (e < model->last_leaf) { + name1 = model->node_name[model->id_nodes[e]]; + } else { + name1 = std::to_string(e); + } + if (f < model->last_leaf) { + name2 = model->node_name[model->id_nodes[f]]; + } else { + name2 = std::to_string(f); + } + names = name1 + "\t" + name2; + map::iterator it = tToFrom.find(names); + if (it != tToFrom.end()) { + // element found; + tToFrom[names] += model->T_to_from[e][f]; + } else { + tToFrom[names] = model->T_to_from[e][f]; + } } - else { - name1 = std::to_string(e); - } - if (flast_leaf) { - name2 = model->node_name[model->id_nodes[f]]; - } - else { - name2 = std::to_string(f); - } - names = name1 + "\t" + name2; - map::iterator it = tToFrom.find(names); - if(it != tToFrom.end()) - { - //element found; - tToFrom[names] += model->T_to_from[e][f]; - } - else { - tToFrom[names] = model->T_to_from[e][f]; - } - } - } + } } return; } - ////////////////////////////////////////// ////////////////////////////////////////// -int main(int argc, char ** argv) -{ - cout << "ALEmcmc using ALE v"<< ALE_VERSION <observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + + // Getting the radical for output files: vector tokens; - boost::split(tokens,ale_file,boost::is_any_of("/"),boost::token_compress_on); - ale_file=tokens[tokens.size()-1]; - - - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); - - scalar_type samples=100; - - //a set of inital rates - double priorOrigination =1.0, priorDelta=0.01,priorTau=0.01,priorLambda=0.1; - size_t sampling_rate = 1; - scalar_type beta=1; - - string fractionMissingFile = ""; - map >rate_multipliers; - - string outputSpeciesTree = ""; - model->set_model_parameter("undatedBL",false); - model->set_model_parameter("reldate",false); - - for (int i=3;i tokens; - boost::split(tokens,next_field,boost::is_any_of("=:"),boost::token_compress_on); - if (tokens[0]=="sample") - samples=atof(tokens[1].c_str()); - else if (tokens[0]=="separators") - model->set_model_parameter("gene_name_separators", tokens[1]); - else if (tokens[0]=="delta") - { - priorDelta=atof(tokens[1].c_str()); - cout << "# priorDelta fixed to " << priorDelta << endl; - } - else if (tokens[0]=="tau") - { - priorTau=atof(tokens[1].c_str()); - cout << "# priorTau fixed to " << priorTau << endl; - } - else if (tokens[0]=="lambda") - { - priorLambda=atof(tokens[1].c_str()); - cout << "# priorLambda fixed to " << priorLambda << endl; - - } - else if (tokens[0]=="O_R") - { - priorOrigination=atof(tokens[1].c_str()); - cout << "# priorOrigination set to " << priorOrigination << endl; - } - else if (tokens[0]=="beta") - { - beta=atof(tokens[1].c_str()); - cout << "# beta set to " << beta << endl; - } - else if (tokens[0]=="sampling_rate") - { - sampling_rate=atoi(tokens[1].c_str()); - cout << "# sampling_rate set to " << sampling_rate << endl; - } - else if (tokens[0]=="fraction_missing") - { - fractionMissingFile=tokens[1]; - cout << "# File containing fractions of missing genes set to " << fractionMissingFile << endl; - } - else if (tokens[0]=="output_species_tree") - { - std::string valArg = boost::algorithm::to_lower_copy(tokens[1]); - if (valArg == "y" || valArg == "ye" || valArg == "yes" ) { - outputSpeciesTree= ale_file + ".spTree"; - cout << "# outputting the annotated species tree to "<< outputSpeciesTree << endl; - } - } - else if (tokens[0]=="S_branch_lengths") - { - model->set_model_parameter("undatedBL",true); - if (tokens.size()==1) - { - model->set_model_parameter("root_BL", 1); - cout << "# unsing branch lengths of input S tree as rate multipliers with 1 at root! "<< endl; - } - else - { - scalar_type root_rm=atof(tokens[1].c_str()); - model->set_model_parameter("root_BL", root_rm); - cout << "# unsing branch lengths of input S tree as rate multipliers with "<set_model_parameter("reldate",true); - } - - } - - - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); - model->set_model_parameter("seq_beta", beta); - for (auto it=rate_multipliers.begin();it!=rate_multipliers.end();it++) - for (auto jt=(*it).second.begin();jt!=(*it).second.end();jt++) - { - model->vector_parameter[(*it).first][(*jt).first]=(*jt).second; + boost::split(tokens, ale_file, boost::is_any_of("/"), + boost::token_compress_on); + ale_file = tokens[tokens.size() - 1]; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + scalar_type samples = 100; + + // a set of inital rates + double priorOrigination = 1.0, priorDelta = 0.01, priorTau = 0.01, + priorLambda = 0.1; + size_t sampling_rate = 1; + scalar_type beta = 1; + + string fractionMissingFile = ""; + map> rate_multipliers; + + string outputSpeciesTree = ""; + model->set_model_parameter("undatedBL", false); + model->set_model_parameter("reldate", false); + + for (int i = 3; i < argc; i++) { + string next_field = argv[i]; + vector tokens; + boost::split(tokens, next_field, boost::is_any_of("=:"), + boost::token_compress_on); + if (tokens[0] == "sample") + samples = atof(tokens[1].c_str()); + else if (tokens[0] == "separators") + model->set_model_parameter("gene_name_separators", tokens[1]); + else if (tokens[0] == "delta") { + priorDelta = atof(tokens[1].c_str()); + cout << "# priorDelta fixed to " << priorDelta << endl; + } else if (tokens[0] == "tau") { + priorTau = atof(tokens[1].c_str()); + cout << "# priorTau fixed to " << priorTau << endl; + } else if (tokens[0] == "lambda") { + priorLambda = atof(tokens[1].c_str()); + cout << "# priorLambda fixed to " << priorLambda << endl; + + } else if (tokens[0] == "O_R") { + priorOrigination = atof(tokens[1].c_str()); + cout << "# priorOrigination set to " << priorOrigination << endl; + } else if (tokens[0] == "beta") { + beta = atof(tokens[1].c_str()); + cout << "# beta set to " << beta << endl; + } else if (tokens[0] == "sampling_rate") { + sampling_rate = atoi(tokens[1].c_str()); + cout << "# sampling_rate set to " << sampling_rate << endl; + } else if (tokens[0] == "fraction_missing") { + fractionMissingFile = tokens[1]; + cout << "# File containing fractions of missing genes set to " + << fractionMissingFile << endl; + } else if (tokens[0] == "output_species_tree") { + std::string valArg = boost::algorithm::to_lower_copy(tokens[1]); + if (valArg == "y" || valArg == "ye" || valArg == "yes") { + outputSpeciesTree = ale_file + ".spTree"; + cout << "# outputting the annotated species tree to " + << outputSpeciesTree << endl; } + } else if (tokens[0] == "S_branch_lengths") { + model->set_model_parameter("undatedBL", true); + if (tokens.size() == 1) { + model->set_model_parameter("root_BL", 1); + cout << "# unsing branch lengths of input S tree as rate multipliers " + "with 1 at root! " + << endl; + } else { + scalar_type root_rm = atof(tokens[1].c_str()); + model->set_model_parameter("root_BL", root_rm); + cout << "# unsing branch lengths of input S tree as rate multipliers " + "with " + << root_rm << " at root! " << endl; + } + + } else if (tokens[0] == "rate_multiplier") { + string rate_name = tokens[1]; + int e = atoi(tokens[2].c_str()); + scalar_type rm = atof(tokens[3].c_str()); + cout << "# rate multiplier for rate " << rate_name + << " on branch with ID " << e << " set to " << rm << endl; + rate_multipliers["rate_multiplier_" + rate_name][e] = rm; + } else if (tokens[0] == "reldate") { + cout << "Respecting realtive ages from input S tree, please make sure " + "input S tree is ultrametric!" + << endl; + model->set_model_parameter("reldate", true); + } + } + + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); + model->set_model_parameter("seq_beta", beta); + for (auto it = rate_multipliers.begin(); it != rate_multipliers.end(); it++) + for (auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) { + model->vector_parameter[(*it).first][(*jt).first] = (*jt).second; + } - model->construct_undated(Sstring, fractionMissingFile); - double currentOrigination = RandomTools::randExponential(priorOrigination) ; - double currentDelta = RandomTools::randExponential(priorDelta) ; - double currentTau = RandomTools::randExponential(priorTau) ; - double currentLambda = RandomTools::randExponential(priorLambda) ; + double currentOrigination = RandomTools::randExponential(priorOrigination); + double currentDelta = RandomTools::randExponential(priorDelta); + double currentTau = RandomTools::randExponential(priorTau); + double currentLambda = RandomTools::randExponential(priorLambda); double newOrigination = currentOrigination; double newDelta = currentDelta; double newTau = currentTau; double newLambda = currentLambda; - double currentLogLikelihood = computeLogLk (model, ale, currentOrigination, currentDelta, currentTau, currentLambda); + double currentLogLikelihood = computeLogLk( + model, ale, currentOrigination, currentDelta, currentTau, currentLambda); double newLogLikelihood = currentLogLikelihood; - double currentLogPrior = computeLogPrior (currentOrigination, currentDelta, currentTau, currentLambda, priorOrigination, priorDelta, priorTau, priorLambda) ; + double currentLogPrior = computeLogPrior( + currentOrigination, currentDelta, currentTau, currentLambda, + priorOrigination, priorDelta, priorTau, priorLambda); double newLogPrior = currentLogPrior; - std::cout << "Initial logLK: "<< currentLogLikelihood << " and logPrior: "<< currentLogPrior < moveWeights; - moveWeights.push_back( 1 ) ; // originationId - moveWeights.push_back( 1 ); // deltaId - moveWeights.push_back( 1 ); // lambdaId - moveWeights.push_back( 1 ); // tauId + moveWeights.push_back(1); // originationId + moveWeights.push_back(1); // deltaId + moveWeights.push_back(1); // lambdaId + moveWeights.push_back(1); // tauId size_t move = 0; - std::vector< int > order = VectorTools::seq ( 0, (int) moveWeights.size(), 1 ); + std::vector order = VectorTools::seq(0, (int)moveWeights.size(), 1); double maxSumDTL = 10; double maxOrigination = 1000000; double hastingsRatio = 0.0; bool verbose = false; double scale = 1; - std::vector scaleMoveParameters ; - scaleMoveParameters.push_back (0.1) ; - scaleMoveParameters.push_back (1) ; - scaleMoveParameters.push_back (10) ; + std::vector scaleMoveParameters; + scaleMoveParameters.push_back(0.1); + scaleMoveParameters.push_back(1); + scaleMoveParameters.push_back(10); std::vector scaleWeights; scaleWeights.push_back(1); scaleWeights.push_back(1); scaleWeights.push_back(1); double threshold = 0.0; double acceptanceProbability = 0.0; - std::vector orderScaleMoveParameters = VectorTools::seq ( 0, (int) scaleMoveParameters.size(), 1 ); - vector sample_strings; - vector sample_trees; + std::vector orderScaleMoveParameters = + VectorTools::seq(0, (int)scaleMoveParameters.size(), 1); + vector sample_strings; + vector sample_trees; size_t i = 0; // Summary variables double numSpeciations = 0.0; double numDuplications = 0.0; double numTransfers = 0.0; double numLosses = 0.0; - map tToFrom ; - //boost::progress_display pd( i ); - - string mcmcoutname=ale_file+"_umcmc.csv"; - ofstream mcmcout( mcmcoutname.c_str() ); - - mcmcout << "Iteration" << "\t"<< "LogLk" << "\t" << "LogPrior" << "\t" << "Origination" << "\t" << "Delta" << "\t" << "Tau" << "\t" << "Lambda" < tToFrom; + // boost::progress_display pd( i ); + + string mcmcoutname = ale_file + "_umcmc.csv"; + ofstream mcmcout(mcmcoutname.c_str()); + + mcmcout << "Iteration" + << "\t" + << "LogLk" + << "\t" + << "LogPrior" + << "\t" + << "Origination" + << "\t" + << "Delta" + << "\t" + << "Tau" + << "\t" + << "Lambda" << std::endl; + std::cout << "Iteration" + << "\t" + << "LogLk" + << "\t" + << "LogPrior" + << "\t" + << "Origination" + << "\t" + << "Delta" + << "\t" + << "Tau" + << "\t" + << "Lambda" << std::endl; // BURNIN loop size_t burninLength = 100; - std::cout << "BURNIN during "< threshold) { //accept - acceptMove(currentOrigination, currentDelta, currentTau, currentLambda, newOrigination, newDelta, newTau, newLambda); + newLogLikelihood = + computeLogLk(model, ale, newOrigination, newDelta, newTau, newLambda); + newLogPrior = + computeLogPrior(newOrigination, newDelta, newTau, newLambda, + priorOrigination, priorDelta, priorTau, priorLambda); + // Accept or reject? + acceptanceProbability = exp((newLogLikelihood + newLogPrior) - + (currentLogLikelihood + currentLogPrior)) * + hastingsRatio; + threshold = RandomTools::giveRandomNumberBetweenZeroAndEntry(1.0); + if (acceptanceProbability > threshold) { // accept + acceptMove(currentOrigination, currentDelta, currentTau, currentLambda, + newOrigination, newDelta, newTau, newLambda); currentLogLikelihood = newLogLikelihood; currentLogPrior = newLogPrior; + } else { + rejectMove(currentOrigination, currentDelta, currentTau, currentLambda, + newOrigination, newDelta, newTau, newLambda); } - else { - rejectMove(currentOrigination, currentDelta, currentTau, currentLambda, newOrigination, newDelta, newTau, newLambda); - } - std::cout << i <<"\t"<< currentLogLikelihood << "\t" << currentLogPrior << "\t" << currentOrigination << "\t" << currentDelta << "\t" << currentTau << "\t" << currentLambda < threshold) { //accept - acceptMove(currentOrigination, currentDelta, currentTau, currentLambda, newOrigination, newDelta, newTau, newLambda); + newLogLikelihood = + computeLogLk(model, ale, newOrigination, newDelta, newTau, newLambda); + newLogPrior = + computeLogPrior(newOrigination, newDelta, newTau, newLambda, + priorOrigination, priorDelta, priorTau, priorLambda); + // Accept or reject? + acceptanceProbability = exp((newLogLikelihood + newLogPrior) - + (currentLogLikelihood + currentLogPrior)) * + hastingsRatio; + threshold = RandomTools::giveRandomNumberBetweenZeroAndEntry(1.0); + if (acceptanceProbability > threshold) { // accept + acceptMove(currentOrigination, currentDelta, currentTau, currentLambda, + newOrigination, newDelta, newTau, newLambda); currentLogLikelihood = newLogLikelihood; currentLogPrior = newLogPrior; + } else { + rejectMove(currentOrigination, currentDelta, currentTau, currentLambda, + newOrigination, newDelta, newTau, newLambda); } - else { - rejectMove(currentOrigination, currentDelta, currentTau, currentLambda, newOrigination, newDelta, newTau, newLambda); - } - if (i % sampling_rate == 0 ) { + if (i % sampling_rate == 0) { model->MLRec_events.clear(); model->reset_T_to_from(); - sampleTree (model, ale, currentOrigination, currentDelta, currentTau, currentLambda, sample_strings, sample_trees); + sampleTree(model, ale, currentOrigination, currentDelta, currentTau, + currentLambda, sample_strings, sample_trees); numSpeciations += model->MLRec_events["S"]; numDuplications += model->MLRec_events["D"]; numTransfers += model->MLRec_events["T"]; numLosses += model->MLRec_events["L"]; fillTToFrom(model, tToFrom); - std::cout << i <<"\t"<< currentLogLikelihood << "\t" << currentLogPrior << "\t" << currentOrigination << "\t" << currentDelta << "\t" << currentTau << "\t" << currentLambda <string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Input ale from:\t"<logl: " << mlll << endl; - //fout << "rate of\t Duplications\tTransfers\tLosses" <logl: " << mlll << endl; + // fout << "rate of\t Duplications\tTransfers\tLosses" <counts_string_undated(samples); fout.close(); -// Outputting the species tree to its own file: + // Outputting the species tree to its own file: if (outputSpeciesTree != "") { - ofstream fout( outputSpeciesTree.c_str() ); - fout <string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout.close(); } cout << "Results in: " << outname << endl; - if (ale->last_leafset_id>3) - { - cout << "Calculating MRP consensus tree."<last_leafset_id > 3) { + cout << "Calculating MRP consensus tree." << endl; + Tree *con_tree = TreeTools::MRP(sample_trees); + + string con_name = ale_file + ".ucons_mcmc_tree"; + + ofstream con_out(con_name.c_str()); + con_out << "#ALEmcmc_undated using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + // TreeTools::computeBootstrapValues(*con_tree,sample_trees); + string con_tree_sup = TreeTemplateTools::treeToParenthesis(*con_tree); + con_out << con_tree_sup << endl; + cout << endl << "Consensus tree in " << con_name << endl; + con_out.close(); + } - string t_name=ale_file+"_mcmc.uTs"; - ofstream tout( t_name.c_str() ); - tout <<"#from\tto\tfreq.\n"; + string t_name = ale_file + "_mcmc.uTs"; + ofstream tout(t_name.c_str()); + tout << "#from\tto\tfreq.\n"; std::map::const_iterator it; - for(it = tToFrom.begin(); it != tToFrom.end(); it++) { - tout << "\t" << it->first << "\t" << it->second/samples << endl; + for (it = tToFrom.begin(); it != tToFrom.end(); it++) { + tout << "\t" << it->first << "\t" << it->second / samples << endl; } - tout.close(); + tout.close(); cout << "Transfers in: " << t_name << endl; return 0; } diff --git a/src/ALEml.cpp b/src/ALEml.cpp index 2089fd1..9fe2667 100644 --- a/src/ALEml.cpp +++ b/src/ALEml.cpp @@ -1,350 +1,349 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" #include - -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; bool delta_fixed; bool tau_fixed; bool lambda_fixed; - exODT_model* model_pointer; - approx_posterior* ale_pointer; -public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1//,double sigma_hat_start=1. - ,bool delta_fixed_in=false,bool tau_fixed_in=false,bool lambda_fixed_in=false) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-5, 1, true, true ); - delta_fixed=delta_fixed_in; - tau_fixed=tau_fixed_in; - lambda_fixed=lambda_fixed_in; - - if (not delta_fixed) - { - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - cout << "#optimizing delta rate"<< endl; - } - if (not tau_fixed) - { - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - cout << "#optimizing tau rate"<< endl; - } - if (not lambda_fixed) - { - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - cout << "#optimizing lambda rate"<< endl; - } + exODT_model *model_pointer; + approx_posterior *ale_pointer; +public: + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, + double lambda_start = 0.1 //,double sigma_hat_start=1. + , + bool delta_fixed_in = false, bool tau_fixed_in = false, + bool lambda_fixed_in = false) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-5, 1, true, true); + delta_fixed = delta_fixed_in; + tau_fixed = tau_fixed_in; + lambda_fixed = lambda_fixed_in; + + if (not delta_fixed) { + addParameter_(new Parameter("delta", delta_start, constraint)); + cout << "#optimizing delta rate" << endl; + } + if (not tau_fixed) { + addParameter_(new Parameter("tau", tau_start, constraint)); + cout << "#optimizing tau rate" << endl; + } + if (not lambda_fixed) { + addParameter_(new Parameter("lambda", lambda_start, constraint)); + cout << "#optimizing lambda rate" << endl; + } } - p_fun* clone() const { return new p_fun(*this); } + p_fun *clone() const { return new p_fun(*this); } public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); + void setParameters(const ParameterList &pl) throw(ParameterNotFoundException, + ConstraintException, + Exception) { + matchParametersValues(pl); + } + double getValue() const throw(Exception) { return fval_; } + void fireParameterChanged(const ParameterList &pl) { + double delta, tau, lambda; + if (not delta_fixed) { + delta = getParameterValue("delta"); + model_pointer->set_model_parameter("delta", delta); } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta,tau,lambda; - if (not delta_fixed) - { - delta = getParameterValue("delta"); - model_pointer->set_model_parameter("delta",delta); - } - if (not tau_fixed) - { - tau = getParameterValue("tau"); - model_pointer->set_model_parameter("tau",tau); - } - if (not lambda_fixed) - { - lambda = getParameterValue("lambda"); - model_pointer->set_model_parameter("lambda",lambda); - } - - model_pointer->calculate_EGb(); - double y=-log(model_pointer->p(ale_pointer)); - cout << "delta=" << delta << "\t tau=" << tau << "\t lambda=" << lambda << "\t ll=" << -y <set_model_parameter("tau", tau); + } + if (not lambda_fixed) { + lambda = getParameterValue("lambda"); + model_pointer->set_model_parameter("lambda", lambda); } -}; - - - - -int main(int argc, char ** argv) -{ - cout << "ALEml using ALE v"<< ALE_VERSION <calculate_EGb(); + double y = -log(model_pointer->p(ale_pointer)); + cout << "delta=" << delta << "\t tau=" << tau << "\t lambda=" << lambda + << "\t ll=" << -y << endl; + fval_ = y; + } +}; - return 0; - } +int main(int argc, char **argv) { + cout << "ALEml using ALE v" << ALE_VERSION << endl; + + if (argc < 3) { + cout << "\nUsage:\n ./ALEml species_tree.newick gene_tree_sample.ale " + "sample=number_of_samples separators=gene_name_separator " + "O_R=OriginationAtRoot delta=DuplicationRate tau=TransferRate " + "lambda=LossRate beta=weight_of_sequence_evidence " + "fraction_missing=file_with_fraction_of_missing_genes_per_species " + "output_species_tree=n S_branch_lengths:root_length " + "rate_mutiplier:rate_name:branch_id:value" + << endl; + cout << "\n1st example: we fix the DTL values and do not perform any " + "optimization \n ./ALEml species_tree.newick gene_tree_sample.ale " + "sample=100 separators=_ delta=0.05 tau=0.1 lambda=0.2 " + << endl; + cout << "\n2nd example: we fix the T value to 0 to get a DL-only model and " + "optimize the DL parameters \n ./ALEml species_tree.newick " + "gene_tree_sample.ale sample=100 separators=_ tau=0\n" + << endl; + + return 0; + } - //we need a dared species tree in newick format + // we need a dared species tree in newick format string Sstring; - string S_treefile=argv[1]; + string S_treefile = argv[1]; if (!fexists(argv[1])) { - cout << "Error, file "<observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + // Getting the radical for output files: vector tokens; - boost::split(tokens,ale_file,boost::is_any_of("/"),boost::token_compress_on); - ale_file=tokens[tokens.size()-1]; - boost::split(tokens,S_treefile,boost::is_any_of("/"),boost::token_compress_on); - ale_file=tokens[tokens.size()-1]+"_"+ale_file; - - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); - - scalar_type samples=100; - scalar_type O_R=1,beta=1; - bool delta_fixed=false; - bool tau_fixed=false; - bool lambda_fixed=false; - scalar_type delta=1e-2,tau=1e-2,lambda=1e-1; + boost::split(tokens, ale_file, boost::is_any_of("/"), + boost::token_compress_on); + ale_file = tokens[tokens.size() - 1]; + boost::split(tokens, S_treefile, boost::is_any_of("/"), + boost::token_compress_on); + ale_file = tokens[tokens.size() - 1] + "_" + ale_file; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + scalar_type samples = 100; + scalar_type O_R = 1, beta = 1; + bool delta_fixed = false; + bool tau_fixed = false; + bool lambda_fixed = false; + scalar_type delta = 1e-2, tau = 1e-2, lambda = 1e-1; string fractionMissingFile = ""; string outputSpeciesTree = ""; - //############################################################ - for (int i=3;i tokens; - boost::split(tokens,next_field,boost::is_any_of("=:"),boost::token_compress_on); - if (tokens[0]=="sample") - samples=atoi(tokens[1].c_str()); - else if (tokens[0]=="separators") + // ############################################################ + for (int i = 3; i < argc; i++) { + string next_field = argv[i]; + + vector tokens; + boost::split(tokens, next_field, boost::is_any_of("=:"), + boost::token_compress_on); + if (tokens[0] == "sample") + samples = atoi(tokens[1].c_str()); + else if (tokens[0] == "separators") model->set_model_parameter("gene_name_separators", tokens[1]); - else if (tokens[0]=="delta") - { - delta=atof(tokens[1].c_str()); - delta_fixed=true; + else if (tokens[0] == "delta") { + delta = atof(tokens[1].c_str()); + delta_fixed = true; cout << "# delta fixed to " << delta << endl; - } - else if (tokens[0]=="tau") - { - tau=atof(tokens[1].c_str()); - tau_fixed=true; + } else if (tokens[0] == "tau") { + tau = atof(tokens[1].c_str()); + tau_fixed = true; cout << "# tau fixed to " << tau << endl; - } - else if (tokens[0]=="lambda") - { - lambda=atof(tokens[1].c_str()); - lambda_fixed=true; + } else if (tokens[0] == "lambda") { + lambda = atof(tokens[1].c_str()); + lambda_fixed = true; cout << "# lambda fixed to " << lambda << endl; - } - else if (tokens[0]=="O_R") - { - O_R=atof(tokens[1].c_str()); + } else if (tokens[0] == "O_R") { + O_R = atof(tokens[1].c_str()); cout << "# NOT YET IMPLEMENTED O_R set to " << O_R << endl; - } - else if (tokens[0]=="beta") - { - beta=atof(tokens[1].c_str()); + } else if (tokens[0] == "beta") { + beta = atof(tokens[1].c_str()); cout << "# beta set to " << beta << endl; - } - else if (tokens[0]=="fraction_missing") - { - fractionMissingFile=tokens[1]; - cout << "# NOT YET IMPLEMENTED File containing fractions of missing genes set to " << fractionMissingFile << endl; - } - else if (tokens[0]=="output_species_tree") - { + } else if (tokens[0] == "fraction_missing") { + fractionMissingFile = tokens[1]; + cout << "# NOT YET IMPLEMENTED File containing fractions of missing " + "genes set to " + << fractionMissingFile << endl; + } else if (tokens[0] == "output_species_tree") { std::string valArg = boost::algorithm::to_lower_copy(tokens[1]); - if (valArg == "y" || valArg == "ye" || valArg == "yes" ) { - outputSpeciesTree= ale_file + ".spTree"; - cout << "# outputting the annotated species tree to "<< outputSpeciesTree << endl; + if (valArg == "y" || valArg == "ye" || valArg == "yes") { + outputSpeciesTree = ale_file + ".spTree"; + cout << "# outputting the annotated species tree to " + << outputSpeciesTree << endl; + } else { + cout << "# NOT outputting the annotated species tree to " + << outputSpeciesTree << endl; } - else { - cout << "# NOT outputting the annotated species tree to "<< outputSpeciesTree << endl; - - } - } - } - //############################################################ - - + // ############################################################ - int D=2; - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + int D = 2; + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); - model->set_model_parameter("min_D",D); - model->set_model_parameter("grid_delta_t",0.1); + model->set_model_parameter("min_D", D); + model->set_model_parameter("grid_delta_t", 0.1); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_EGb(); - - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); optimizer->setMessageHandler(0); optimizer->setVerbose(2); optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. scalar_type mlll; - if (not (delta_fixed and tau_fixed and lambda_fixed) ) - { - cout << "#optimizing rates" << endl; - optimizer->optimize(); - if (not delta_fixed) delta=optimizer->getParameterValue("delta"); - if (not tau_fixed) tau=optimizer->getParameterValue("tau"); - if (not lambda_fixed) lambda=optimizer->getParameterValue("lambda"); - mlll=-optimizer->getFunctionValue(); - } - else - { - mlll=log(model->p(ale)); - } - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="<optimize(); + if (not delta_fixed) + delta = optimizer->getParameterValue("delta"); + if (not tau_fixed) + tau = optimizer->getParameterValue("tau"); + if (not lambda_fixed) + lambda = optimizer->getParameterValue("lambda"); + mlll = -optimizer->getFunctionValue(); + } else { + mlll = log(model->p(ale)); + } + cout << endl + << "ML rates: " + << " delta=" << delta << "; tau=" << tau + << "; lambda=" << lambda //<<"; sigma="< sample_strings; - vector sample_trees; - boost::progress_display pd( samples ); - - for (int i=0;isample(false); - sample_strings.push_back(sample_tree); - - if (ale->last_leafset_id>3) - { - - tree_type * G=TreeTemplateTools::parenthesisToTree(sample_tree,false); - vector leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); - } + cout << "Sampling reconciled gene trees.." << endl; + vector sample_strings; + vector sample_trees; + boost::progress_display pd(samples); + + for (int i = 0; i < samples; i++) { + ++pd; + string sample_tree = model->sample(false); + sample_strings.push_back(sample_tree); + + if (ale->last_leafset_id > 3) { + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); + } + leaves.clear(); + sample_trees.push_back(G); } + } /*cout << "Calculating ML reconciled gene tree.."< res = model->p_MLRec(ale); //and output it.. */ - string outname=ale_file+".ml_rec"; - ofstream fout( outname.c_str() ); - fout << "#ALEml using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Input ale from:\t"<logl: " << mlll << endl; - fout << "rate of\t Duplications\tTransfers\tLosses" <MLRec_events["D"]/samples << "\t" << model->MLRec_events["T"]/samples << "\t" << model->MLRec_events["L"]/samples<< "\t" << model->MLRec_events["S"]/samples <MLRec_events["D"] / samples << "\t" + << model->MLRec_events["T"] / samples << "\t" + << model->MLRec_events["L"] / samples << "\t" + << model->MLRec_events["S"] / samples << endl; fout << endl; - fout << "# of\t Duplications\tTransfers\tLosses\tOriginations\tcopies" <counts_string(samples); fout.close(); // Outputting the species tree to its own file: if (outputSpeciesTree != "") { - ofstream fout( outputSpeciesTree.c_str() ); - fout <string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout.close(); } cout << "Results in: " << outname << endl; - if (ale->last_leafset_id>3) - { - cout << "Calculating MRP consensus tree."<last_leafset_id > 3) { + cout << "Calculating MRP consensus tree." << endl; + Tree *con_tree = TreeTools::MRP(sample_trees); + + string con_name = ale_file + ".cons_tree"; + + ofstream con_out(con_name.c_str()); + con_out << "#ALEsample using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + TreeTools::computeBootstrapValues(*con_tree, sample_trees); + string con_tree_sup = TreeTemplateTools::treeToParenthesis(*con_tree); + con_out << con_tree_sup << endl; + cout << endl << "Consensus tree in " << con_name << endl; + } - string t_name=ale_file+".Ts"; - ofstream tout( t_name.c_str() ); - tout <<"#Transfer & duplications tokens: \n"; - tout <<"#D|rank|named_branch|g_id\n;"; - tout <<"#T(|rank|t|named_branch|g_id)>(|rank|t|named_branch|gp_id)>.. or \n"; - tout <<"#>(|rank|t|named_branch|-1), where g_id=-1 is the root of G.\n"; + string t_name = ale_file + ".Ts"; + ofstream tout(t_name.c_str()); + tout << "#Transfer & duplications tokens: \n"; + tout << "#D|rank|named_branch|g_id\n;"; + tout << "#T(|rank|t|named_branch|g_id)>(|rank|t|named_branch|gp_id)>.. or \n"; + tout << "#>(|rank|t|named_branch|-1), where g_id=-1 is the root of G.\n"; - for (std::vector::iterator it=model->Ttokens.begin();it!=model->Ttokens.end();it++) - tout << (*it) <::iterator it = model->Ttokens.begin(); + it != model->Ttokens.end(); it++) + tout << (*it) << endl; tout.close(); cout << "Transfers in: " << t_name << endl; diff --git a/src/ALEml_scaled.cpp b/src/ALEml_scaled.cpp index f4a41df..5035dba 100644 --- a/src/ALEml_scaled.cpp +++ b/src/ALEml_scaled.cpp @@ -1,175 +1,182 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - exODT_model* model_pointer; - approx_posterior* ale_pointer; -public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1,double sigma_hat_start=1.) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - addParameter_( new Parameter("sigma_hat", sigma_hat_start, constraint) ) ; + exODT_model *model_pointer; + approx_posterior *ale_pointer; +public: + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, double lambda_start = 0.1, + double sigma_hat_start = 1.) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); + addParameter_(new Parameter("sigma_hat", sigma_hat_start, constraint)); } - p_fun* clone() const { return new p_fun(*this); } + p_fun *clone() const { return new p_fun(*this); } public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); - } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - double sigma_hat = getParameterValue("sigma_hat"); - - model_pointer->set_model_parameter("delta",delta); - model_pointer->set_model_parameter("tau",tau); - model_pointer->set_model_parameter("lambda",lambda); - model_pointer->set_model_parameter("sigma_hat",sigma_hat); - model_pointer->calculate_EGb(); - double y=-log(model_pointer->p(ale_pointer)); - cout <set_model_parameter("delta", delta); + model_pointer->set_model_parameter("tau", tau); + model_pointer->set_model_parameter("lambda", lambda); + model_pointer->set_model_parameter("sigma_hat", sigma_hat); + model_pointer->calculate_EGb(); + double y = -log(model_pointer->p(ale_pointer)); + cout << endl + << "delta=" << delta << "\t tau=" << tau << "\t lambda=" << lambda + << "\t lambda=" << sigma_hat << "\t ll=" << -y << endl; + fval_ = y; + } }; +int main(int argc, char **argv) { + cout << "ALEml using ALE v" << ALE_VERSION << endl; -int main(int argc, char ** argv) -{ - cout << "ALEml using ALE v"<< ALE_VERSION <observations<<" trees from: " << ale_file <<".."<3) + ifstream file_stream_S(argv[1]); + getline(file_stream_S, Sstring); + cout << "Read species tree from: " << argv[1] << ".." << endl; + // we need an .ale file containing observed conditional clade probabilities + // cf. ALEobserve + string ale_file = argv[2]; + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + cout << "Read summary of tree sample for " << ale->observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + int D = 3; + if (argc > 3) model->set_model_parameter("gene_name_separators", argv[3]); - - model->set_model_parameter("min_D",D); - model->set_model_parameter("grid_delta_t",0.005); + model->set_model_parameter("min_D", D); + model->set_model_parameter("grid_delta_t", 0.005); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; - if (argc>6) - delta=atof(argv[4]),tau=atof(argv[5]),lambda=atof(argv[6]); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); + + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; + if (argc > 6) + delta = atof(argv[4]), tau = atof(argv[5]), lambda = atof(argv[6]); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_EGb(); - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); optimizer->setMessageHandler(0); optimizer->setVerbose(2); optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. + // FunctionStopCondition stop(optimizer, 1);//1e-1); + // optimizer->setStopCondition(stop); + // TEMP + // optimizer->setMaximumNumberOfEvaluations( 10 ); - // FunctionStopCondition stop(optimizer, 1);//1e-1); - // optimizer->setStopCondition(stop); - //TEMP - //optimizer->setMaximumNumberOfEvaluations( 10 ); + optimizer->optimize(); - optimizer->optimize(); + // optimizer->getParameters().printParameters(cout); + delta = optimizer->getParameterValue("delta"); + tau = optimizer->getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + scalar_type sigma_hat = optimizer->getParameterValue("sigma_hat"); - //optimizer->getParameters().printParameters(cout); - delta=optimizer->getParameterValue("delta"); - tau=optimizer->getParameterValue("tau"); - lambda=optimizer->getParameterValue("lambda"); - scalar_type sigma_hat=optimizer->getParameterValue("sigma_hat"); + scalar_type mlll = -optimizer->getFunctionValue(); + cout << endl + << "ML rates: " + << " delta=" << delta << "; tau=" << tau << "; lambda=" << lambda + << "; sigma=" << sigma_hat << "." << endl; - scalar_type mlll=-optimizer->getFunctionValue(); - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="< delta_LL; + map delta_LL; cout << "LL=" << mlll << endl; - cout << "Calculating ML reconciled gene tree.."< res = model->p_MLRec(ale); - //and output it.. - string outname=ale_file+".ml_rec"; - ofstream fout( outname.c_str() ); - fout << "#ALEml using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Input ale from:\t"<MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; fout << endl; - fout << "# of\t Duplications\tTransfers\tLosses\tgene copies" <counts_string(); fout.close(); cout << "Results in: " << outname << endl; diff --git a/src/ALEml_undated.cpp b/src/ALEml_undated.cpp index c1ed7b7..1dc6430 100644 --- a/src/ALEml_undated.cpp +++ b/src/ALEml_undated.cpp @@ -1,18 +1,15 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; bool delta_fixed; @@ -20,523 +17,573 @@ class p_fun: bool lambda_fixed; bool DT_fixed; bool MLOR; - - bool no_T=false; - exODT_model* model_pointer; - approx_posterior* ale_pointer; + + bool no_T = false; + exODT_model *model_pointer; + approx_posterior *ale_pointer; + public: - p_fun(exODT_model* model,approx_posterior* ale,vector ml_branch_ids,vector ml_ratetype_names,double delta_start=0.1,double tau_start=0.1,double lambda_start=0.5//,double sigma_hat_start=1. - ,bool delta_fixed_in=false,bool tau_fixed_in=false,bool lambda_fixed_in=false, bool DT_fixed_in=false, bool MLOR_in=false) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint; - if (no_T) constraint= new IntervalConstraint ( 1e-6, 100-1e-7, true, true ); - else constraint= new IntervalConstraint ( 1e-10, 100-1e-7, true, true ); - - IntervalConstraint* rate_multiplier_constraint = new IntervalConstraint ( 1e-7, 10000-1e-7, true, true ); - delta_fixed=delta_fixed_in; - tau_fixed=tau_fixed_in; - lambda_fixed=lambda_fixed_in; - DT_fixed=DT_fixed_in; - MLOR=MLOR_in; - if (tau_start<1e-10) no_T=true; - if (not delta_fixed and not DT_fixed) - { - addParameter_( new Parameter("delta", delta_start , constraint) ) ; - cout << "#optimizing delta rate"<< endl; - } - if (not tau_fixed and not DT_fixed) - { - addParameter_( new Parameter("tau",tau_start, constraint) ) ; - cout << "#optimizing tau rate"<< endl; - } - if (not lambda_fixed) - { - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - cout << "#optimizing lambda rate"<< endl; - } - if (DT_fixed) - { - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - cout << "#optimizing delta and tau rates with fixed D/T ratio"<< endl; - } - if (MLOR) - { - IntervalConstraint* OR_constraint= new IntervalConstraint ( 1e-10, 1000-1e-7, true, true ); - addParameter_( new Parameter("O_R", 1., OR_constraint) ) ; - cout << "#optimizing O_R"<< endl; + p_fun(exODT_model *model, approx_posterior *ale, vector ml_branch_ids, + vector ml_ratetype_names, double delta_start = 0.1, + double tau_start = 0.1, + double lambda_start = 0.5 //,double sigma_hat_start=1. + , + bool delta_fixed_in = false, bool tau_fixed_in = false, + bool lambda_fixed_in = false, bool DT_fixed_in = false, + bool MLOR_in = false) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint; + if (no_T) + constraint = new IntervalConstraint(1e-6, 100 - 1e-7, true, true); + else + constraint = new IntervalConstraint(1e-10, 100 - 1e-7, true, true); + + IntervalConstraint *rate_multiplier_constraint = + new IntervalConstraint(1e-7, 10000 - 1e-7, true, true); + delta_fixed = delta_fixed_in; + tau_fixed = tau_fixed_in; + lambda_fixed = lambda_fixed_in; + DT_fixed = DT_fixed_in; + MLOR = MLOR_in; + if (tau_start < 1e-10) + no_T = true; + if (not delta_fixed and not DT_fixed) { + addParameter_(new Parameter("delta", delta_start, constraint)); + cout << "#optimizing delta rate" << endl; + } + if (not tau_fixed and not DT_fixed) { + addParameter_(new Parameter("tau", tau_start, constraint)); + cout << "#optimizing tau rate" << endl; + } + if (not lambda_fixed) { + addParameter_(new Parameter("lambda", lambda_start, constraint)); + cout << "#optimizing lambda rate" << endl; + } + if (DT_fixed) { + addParameter_(new Parameter("tau", tau_start, constraint)); + cout << "#optimizing delta and tau rates with fixed D/T ratio" << endl; + } + if (MLOR) { + IntervalConstraint *OR_constraint = + new IntervalConstraint(1e-10, 1000 - 1e-7, true, true); + addParameter_(new Parameter("O_R", 1., OR_constraint)); + cout << "#optimizing O_R" << endl; + } - } + // vector ml_branch_ids; + // vector ml_ratetype_names; - //vector ml_branch_ids; - //vector ml_ratetype_names; - - for (int i=0;i public_ml_branch_ids; vector public_ml_ratetype_names; - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { + void setParameters(const ParameterList &pl) throw(ParameterNotFoundException, + ConstraintException, + Exception) { matchParametersValues(pl); } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - if (not delta_fixed and not DT_fixed) - { - double delta = getParameterValue("delta"); - model_pointer->set_model_parameter("delta",delta); - } - if (not tau_fixed and not DT_fixed) - { - double tau = getParameterValue("tau"); - model_pointer->set_model_parameter("tau",tau); - if (tau<1e-10) no_T=true; - } - if (not lambda_fixed) - { - double lambda = getParameterValue("lambda"); - model_pointer->set_model_parameter("lambda",lambda); - } - if (DT_fixed) - { - double tau = getParameterValue("tau"); - model_pointer->set_model_parameter("tau",tau); - double delta = tau * model_pointer->scalar_parameter["DT_ratio"]; - model_pointer->set_model_parameter("delta",delta); - } - if (MLOR) - { - double O_R = getParameterValue("O_R"); - model_pointer->set_model_parameter("O_R",O_R); - } - - for (int i=0;ivector_parameter[name][e]=multiplier; - } + double getValue() const throw(Exception) { return fval_; } + void fireParameterChanged(const ParameterList &pl) { + if (not delta_fixed and not DT_fixed) { + double delta = getParameterValue("delta"); + model_pointer->set_model_parameter("delta", delta); + } + if (not tau_fixed and not DT_fixed) { + double tau = getParameterValue("tau"); + model_pointer->set_model_parameter("tau", tau); + if (tau < 1e-10) + no_T = true; + } + if (not lambda_fixed) { + double lambda = getParameterValue("lambda"); + model_pointer->set_model_parameter("lambda", lambda); + } + if (DT_fixed) { + double tau = getParameterValue("tau"); + model_pointer->set_model_parameter("tau", tau); + double delta = tau * model_pointer->scalar_parameter["DT_ratio"]; + model_pointer->set_model_parameter("delta", delta); + } + if (MLOR) { + double O_R = getParameterValue("O_R"); + model_pointer->set_model_parameter("O_R", O_R); + } + for (int i = 0; i < public_ml_branch_ids.size(); i++) { + int e = public_ml_branch_ids[i]; + string name = public_ml_ratetype_names[i]; + stringstream branch; + branch << e; + scalar_type multiplier = + getParameterValue("rm_" + name + "_" + branch.str()); + model_pointer->vector_parameter[name][e] = multiplier; + } model_pointer->calculate_undatedEs(); - double y=-log(model_pointer->pun(ale_pointer,false,no_T)); - //cout <observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + + // Getting the radical for output files: vector tokens; - boost::split(tokens,ale_file,boost::is_any_of("/"),boost::token_compress_on); - ale_file=tokens[tokens.size()-1]; - boost::split(tokens,S_treefile,boost::is_any_of("/"),boost::token_compress_on); - - ale_file=tokens[tokens.size()-1]+"_"+ale_file; - - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); - - scalar_type samples=100; - scalar_type O_R=1,beta=1; - bool delta_fixed=false; - bool tau_fixed=false; - bool lambda_fixed=false; - bool DT_fixed=false; - - scalar_type delta=1e-2,tau=1e-2,lambda=1e-1,DT_ratio=0.05; + boost::split(tokens, ale_file, boost::is_any_of("/"), + boost::token_compress_on); + ale_file = tokens[tokens.size() - 1]; + boost::split(tokens, S_treefile, boost::is_any_of("/"), + boost::token_compress_on); + + ale_file = tokens[tokens.size() - 1] + "_" + ale_file; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + scalar_type samples = 100; + scalar_type O_R = 1, beta = 1; + bool delta_fixed = false; + bool tau_fixed = false; + bool lambda_fixed = false; + bool DT_fixed = false; + + scalar_type delta = 1e-2, tau = 1e-2, lambda = 1e-1, DT_ratio = 0.05; string fractionMissingFile = ""; string outputSpeciesTree = ""; - map >rate_multipliers; + map> rate_multipliers; vector ml_branch_ids; vector ml_ratetype_names; - bool MRP=false; - bool MLOR=false; - - model->set_model_parameter("undatedBL",false); - model->set_model_parameter("reldate",false); - MLOR=false; - - for (int i=3;i tokens; - boost::split(tokens,next_field,boost::is_any_of("=:"),boost::token_compress_on); - if (tokens[0]=="sample") - samples=atoi(tokens[1].c_str()); - else if (tokens[0]=="separators") - model->set_model_parameter("gene_name_separators", tokens[1]); - else if (tokens[0]=="delta") - { - delta=atof(tokens[1].c_str()); - delta_fixed=true; - cout << "# delta fixed to " << delta << endl; - } - else if (tokens[0]=="tau") - { - tau=atof(tokens[1].c_str()); - tau_fixed=true; - if (tau<1e-10) - { - no_T=true; - cout << "# tau fixed to no transfer!" << endl; - tau=1e-19; - } - else cout << "# tau fixed to " << tau << endl; - } - else if (tokens[0]=="lambda") - { - lambda=atof(tokens[1].c_str()); - lambda_fixed=true; - cout << "# lambda fixed to " << lambda << endl; - } - else if (tokens[0]=="DT") - { - DT_ratio=atof(tokens[1].c_str()); - DT_fixed=true; - model->set_model_parameter("DT_ratio", DT_ratio); - cout << "# D/T ratio fixed to " << model->scalar_parameter["DT_ratio"] << endl; - } - else if (tokens[0]=="O_R") - { - O_R=atof(tokens[1].c_str()); - cout << "# O_R set to " << O_R << endl; - } - else if (tokens[0]=="beta") - { - beta=atof(tokens[1].c_str()); - cout << "# beta set to " << beta << endl; - } - else if (tokens[0]=="fraction_missing") - { - fractionMissingFile=tokens[1]; - cout << "# File containing fractions of missing genes set to " << fractionMissingFile << endl; - } - else if (tokens[0]=="S_branch_lengths") - { - model->set_model_parameter("undatedBL",true); - if (tokens.size()==1) - { - model->set_model_parameter("root_BL", 1); - cout << "# unsing branch lengths of input S tree as rate multipliers with 1 at root! "<< endl; - } - else - { - scalar_type root_rm=atof(tokens[1].c_str()); - model->set_model_parameter("root_BL", root_rm); - cout << "# unsing branch lengths of input S tree as rate multipliers with "<set_model_parameter("reldate",true); - } - else if (tokens[0]=="MLOR") - { - cout << "Optimizing root origination multiplier." << endl; - MLOR=true; - } - - else if (tokens[0]=="rate_multiplier") - { - string rate_name=tokens[1]; - int e=atoi(tokens[2].c_str()); - scalar_type rm=atof(tokens[3].c_str()); - if (rm>=-1) - { - cout << "# rate multiplier for rate " << rate_name << " on branch with ID " << e<< " set to " << rm << endl; - rate_multipliers["rate_multiplier_"+rate_name][e]=rm; - } - else - { - cout << "# rate multiplier for rate " << rate_name << " on branch with ID " << e<< " to be optimized " << endl; - ml_branch_ids.push_back(e); - ml_ratetype_names.push_back("rate_multiplier_"+rate_name); - } - } - else if (tokens[0]=="output_species_tree") - { - std::string valArg = boost::algorithm::to_lower_copy(tokens[1]); - if (valArg == "y" || valArg == "ye" || valArg == "yes" ) { - outputSpeciesTree= ale_file + ".spTree"; - cout << "# outputting the annotated species tree to "<< outputSpeciesTree << endl; - } - else { - cout << "# NOT outputting the annotated species tree to "<< outputSpeciesTree << endl; - - } - - } - else if (tokens[0]=="seed") - { - long seed = atoi(tokens[1].c_str()); - cout << "Set random seed to " <set_model_parameter("undatedBL", false); + model->set_model_parameter("reldate", false); + MLOR = false; + + for (int i = 3; i < argc; i++) { + string next_field = argv[i]; + + vector tokens; + boost::split(tokens, next_field, boost::is_any_of("=:"), + boost::token_compress_on); + if (tokens[0] == "sample") + samples = atoi(tokens[1].c_str()); + else if (tokens[0] == "separators") + model->set_model_parameter("gene_name_separators", tokens[1]); + else if (tokens[0] == "delta") { + delta = atof(tokens[1].c_str()); + delta_fixed = true; + cout << "# delta fixed to " << delta << endl; + } else if (tokens[0] == "tau") { + tau = atof(tokens[1].c_str()); + tau_fixed = true; + if (tau < 1e-10) { + no_T = true; + cout << "# tau fixed to no transfer!" << endl; + tau = 1e-19; + } else + cout << "# tau fixed to " << tau << endl; + } else if (tokens[0] == "lambda") { + lambda = atof(tokens[1].c_str()); + lambda_fixed = true; + cout << "# lambda fixed to " << lambda << endl; + } else if (tokens[0] == "DT") { + DT_ratio = atof(tokens[1].c_str()); + DT_fixed = true; + model->set_model_parameter("DT_ratio", DT_ratio); + cout << "# D/T ratio fixed to " << model->scalar_parameter["DT_ratio"] + << endl; + } else if (tokens[0] == "O_R") { + O_R = atof(tokens[1].c_str()); + cout << "# O_R set to " << O_R << endl; + } else if (tokens[0] == "beta") { + beta = atof(tokens[1].c_str()); + cout << "# beta set to " << beta << endl; + } else if (tokens[0] == "fraction_missing") { + fractionMissingFile = tokens[1]; + cout << "# File containing fractions of missing genes set to " + << fractionMissingFile << endl; + } else if (tokens[0] == "S_branch_lengths") { + model->set_model_parameter("undatedBL", true); + if (tokens.size() == 1) { + model->set_model_parameter("root_BL", 1); + cout << "# unsing branch lengths of input S tree as rate multipliers " + "with 1 at root! " + << endl; + } else { + scalar_type root_rm = atof(tokens[1].c_str()); + model->set_model_parameter("root_BL", root_rm); + cout << "# unsing branch lengths of input S tree as rate multipliers " + "with " + << root_rm << " at root! " << endl; + } + } else if (tokens[0] == "reldate") { + cout << "Respecting realtive ages from input S tree, please make sure " + "input S tree is ultrametric!" + << endl; + model->set_model_parameter("reldate", true); + } else if (tokens[0] == "MLOR") { + cout << "Optimizing root origination multiplier." << endl; + MLOR = true; } - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); - model->construct_undated(Sstring, fractionMissingFile); - - for (auto it=rate_multipliers.begin();it!=rate_multipliers.end();it++) - for (auto jt=(*it).second.begin();jt!=(*it).second.end();jt++) - { - model->vector_parameter[(*it).first][(*jt).first]=(*jt).second; + else if (tokens[0] == "rate_multiplier") { + string rate_name = tokens[1]; + int e = atoi(tokens[2].c_str()); + scalar_type rm = atof(tokens[3].c_str()); + if (rm >= -1) { + cout << "# rate multiplier for rate " << rate_name + << " on branch with ID " << e << " set to " << rm << endl; + rate_multipliers["rate_multiplier_" + rate_name][e] = rm; + } else { + cout << "# rate multiplier for rate " << rate_name + << " on branch with ID " << e << " to be optimized " << endl; + ml_branch_ids.push_back(e); + ml_ratetype_names.push_back("rate_multiplier_" + rate_name); } - + } else if (tokens[0] == "output_species_tree") { + std::string valArg = boost::algorithm::to_lower_copy(tokens[1]); + if (valArg == "y" || valArg == "ye" || valArg == "yes") { + outputSpeciesTree = ale_file + ".spTree"; + cout << "# outputting the annotated species tree to " + << outputSpeciesTree << endl; + } else { + cout << "# NOT outputting the annotated species tree to " + << outputSpeciesTree << endl; + } + + } else if (tokens[0] == "seed") { + long seed = atoi(tokens[1].c_str()); + cout << "Set random seed to " << seed << endl; + RandomTools::setSeed(seed); + } else if (tokens[0] == "MRP") { + bool MRP = true; + } + } + + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); + model->construct_undated(Sstring, fractionMissingFile); + + for (auto it = rate_multipliers.begin(); it != rate_multipliers.end(); it++) + for (auto jt = (*it).second.begin(); jt != (*it).second.end(); jt++) { + model->vector_parameter[(*it).first][(*jt).first] = (*jt).second; + } + model->set_model_parameter("seq_beta", beta); model->set_model_parameter("O_R", O_R); - //a set of inital rates + // a set of inital rates model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_undatedEs(); - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); optimizer->setMessageHandler(0); optimizer->setVerbose(2); optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. scalar_type mlll; - - if (not (delta_fixed and tau_fixed and lambda_fixed and not MLOR ) ) // not all rates fixed - { - cout << "#optimizing rates" << endl; - optimizer->optimize(); - if (not delta_fixed and not DT_fixed) delta=optimizer->getParameterValue("delta"); - if (not tau_fixed) tau=optimizer->getParameterValue("tau"); - if (DT_fixed) delta = tau * model->scalar_parameter["DT_ratio"]; - if (not lambda_fixed) lambda=optimizer->getParameterValue("lambda"); - if (MLOR) O_R=optimizer->getParameterValue("O_R"); - mlll=-optimizer->getFunctionValue(); - - } - else - { - mlll=log(model->pun(ale,false,no_T)); - } + + if (not(delta_fixed and tau_fixed and lambda_fixed and + not MLOR)) // not all rates fixed + { + cout << "#optimizing rates" << endl; + optimizer->optimize(); + if (not delta_fixed and not DT_fixed) + delta = optimizer->getParameterValue("delta"); + if (not tau_fixed) + tau = optimizer->getParameterValue("tau"); + if (DT_fixed) + delta = tau * model->scalar_parameter["DT_ratio"]; + if (not lambda_fixed) + lambda = optimizer->getParameterValue("lambda"); + if (MLOR) + O_R = optimizer->getParameterValue("O_R"); + mlll = -optimizer->getFunctionValue(); + + } else { + mlll = log(model->pun(ale, false, no_T)); + } stringstream ml_rate_multipliers; - for (int i=0;igetParameterValue("rm_"+name+"_"+branch.str()); - ml_rate_multipliers << name << "\t" << e << "\t" << multiplier << ";\n"; - } + for (int i = 0; i < ml_branch_ids.size(); i++) { + int e = ml_branch_ids[i]; + string name = ml_ratetype_names[i]; + stringstream branch; + branch << e; + scalar_type multiplier = + optimizer->getParameterValue("rm_" + name + "_" + branch.str()); + ml_rate_multipliers << name << "\t" << e << "\t" << multiplier << ";\n"; + } - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="<0) cout << "ML rate multipliers:\n" << ml_rate_multipliers.str(); + if (ml_branch_ids.size() > 0) + cout << "ML rate multipliers:\n" << ml_rate_multipliers.str(); cout << "LL=" << mlll << endl; - cout << "Sampling reconciled gene trees.."< sample_strings; - vector sample_trees; - boost::progress_display pd( samples ); - - for (int i=0;isample_undated(no_T); - sample_strings.push_back(sample_tree); - if (ale->last_leafset_id>3 and MRP) - { - - tree_type * G=TreeTemplateTools::parenthesisToTree(sample_tree,false); - - vector leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); - } + cout << "Sampling reconciled gene trees.." << endl; + vector sample_strings; + vector sample_trees; + boost::progress_display pd(samples); + + for (int i = 0; i < samples; i++) { + ++pd; + string sample_tree = model->sample_undated(no_T); + sample_strings.push_back(sample_tree); + if (ale->last_leafset_id > 3 and MRP) { + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); + } + leaves.clear(); + sample_trees.push_back(G); } + } /*cout << "Calculating ML reconciled gene tree.."< res = model->p_MLRec(ale); //and output it.. */ - string outname=ale_file+".uml_rec"; - ofstream fout( outname.c_str() ); - fout << "#ALEml_undated using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout << endl; - fout << "Input ale from:\t"<logl: " << mlll << endl; - fout << "rate of\t Duplications\tTransfers\tLosses" <0) - { - fout << "ML rate multipliers:\n" << ml_rate_multipliers.str(); - fout << endl; - } - fout << samples << " reconciled G-s:\n"< 0) { + fout << "ML rate multipliers:\n" << ml_rate_multipliers.str(); + fout << endl; + } + fout << samples << " reconciled G-s:\n" << endl; + for (int i = 0; i < samples; i++) { + fout << sample_strings[i] << endl; + } - //fout << "reconciled G:\t"<< res.first <MLRec_events["D"]/samples << "\t" << model->MLRec_events["T"]/samples << "\t" << model->MLRec_events["L"]/samples<< "\t" << model->MLRec_events["S"]/samples <MLRec_events["D"] / samples << "\t" + << model->MLRec_events["T"] / samples << "\t" + << model->MLRec_events["L"] / samples << "\t" + << model->MLRec_events["S"] / samples << endl; fout << endl; - fout << "# of\t Duplications\tTransfers\tLosses\tOriginations\tcopies\tsingletons\textinction_prob\tpresence\tLL" <counts_string_undated(samples); fout.close(); // Outputting the species tree to its own file: if (outputSpeciesTree != "") { - ofstream fout( outputSpeciesTree.c_str() ); - fout <string_parameter["S_with_ranks"] <string_parameter["S_with_ranks"] << endl; fout.close(); } cout << "Results in: " << outname << endl; - if (ale->last_leafset_id>3 and MRP) - { - // cout << "Calculating consensus tree."<last_leafset_id > 3 and MRP) { + // cout << "Calculating consensus tree."<last_branch;e++) - for (int f=0;flast_branch;f++) - if (model->T_to_from[e][f]>0) - { - if (elast_leaf) - tout << model->node_name[model->id_nodes[e]] <<"("<last_leaf) - tout << "\t" << model->node_name[model->id_nodes[f]] <<"("<T_to_from[e][f]/samples << endl; - } + // con_tree= TreeTools::thresholdConsensus(sample_trees,0.5); + + string con_name = ale_file + ".ucons_tree"; + + ofstream con_out(con_name.c_str()); + con_out << "#ALEsample using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + // TreeTools::computeBootstrapValues(*con_tree,sample_trees); + string con_tree_sup = TreeTemplateTools::treeToParenthesis(*con_tree); + con_out << con_tree_sup << endl; + cout << endl << "Consensus tree in " << con_name << endl; + } + + string t_name = ale_file + ".uTs"; + ofstream tout(t_name.c_str()); + tout << "#from\tto\tfreq.\n"; + + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) + if (model->T_to_from[e][f] > 0) { + if (e < model->last_leaf) + tout << model->node_name[model->id_nodes[e]] << "(" << e << ")"; + else + tout << "\t" << e; + if (f < model->last_leaf) + tout << "\t" << model->node_name[model->id_nodes[f]] << "(" << f + << ")"; + else + tout << "\t" << f; + tout << "\t" << model->T_to_from[e][f] / samples << endl; + } tout.close(); cout << "Transfers in: " << t_name << endl; return 0; diff --git a/src/ALEobserve.cpp b/src/ALEobserve.cpp index e25e79f..ac159cf 100644 --- a/src/ALEobserve.cpp +++ b/src/ALEobserve.cpp @@ -3,43 +3,41 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - cout << "ALEobserve using ALE v"<< ALE_VERSION < ale_files; ale_files.push_back(first_file); - for (int i=2;i tokens; - boost::split(tokens,next_field,boost::is_any_of("="),boost::token_compress_on); - if (tokens[0]=="burnin") - burnin=atoi(tokens[1].c_str()); - else - ale_files.push_back(argv[i]); - } - ale=observe_ALE_from_file(ale_files,burnin); - cout << "# observe "<< ale->observations << " tree(s) from: " << argv[1] ; - for (int i=2;i tokens; + boost::split(tokens, next_field, boost::is_any_of("="), + boost::token_compress_on); + if (tokens[0] == "burnin") + burnin = atoi(tokens[1].c_str()); + else + ale_files.push_back(argv[i]); + } + ale = observe_ALE_from_file(ale_files, burnin); + cout << "# observe " << ale->observations << " tree(s) from: " << argv[1]; + for (int i = 2; i < argc - 1; i++) cout << " " << argv[i]; cout << endl; - cout << burnin<<" burn in per file discarded."<save_state(ale_name); - cout << "# saved in "<< ale_name<mpp_tree().first << endl; + cout << "# saved in " << ale_name << endl; + cout << "# mpp tree from sample: " << endl; + cout << ale->mpp_tree().first << endl; return 0; } diff --git a/src/ALEprune.cpp b/src/ALEprune.cpp index 57e6cd7..eedb64c 100644 --- a/src/ALEprune.cpp +++ b/src/ALEprune.cpp @@ -2,411 +2,402 @@ using namespace std; using namespace bpp; -int prune(string * tree,vector keep_list,vector * keep_leaves) -{ - - map keep_map; - for (vector::iterator it=keep_list.begin();it!=keep_list.end();it++) keep_map[(*it)]=1; - tree_type * T=TreeTemplateTools::parenthesisToTree(*tree,false,"ID"); - vector leaves=T->getLeaves(); - int N_leaves=leaves.size(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - boost::trim(name); - vector tokens; - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - string sp=tokens[0]; - //string sp=name; - if (keep_map.count(sp)==1) (*keep_leaves).push_back(name); - if (not keep_map.count(sp)==1 ) - { - if (N_leaves>3) TreeTemplateTools::dropLeaf(*T,name); - N_leaves--; - } - } - if (N_leaves>3) *tree=TreeTemplateTools::treeToParenthesis(*T,false,"ID"); +int prune(string *tree, vector keep_list, vector *keep_leaves) { + + map keep_map; + for (vector::iterator it = keep_list.begin(); it != keep_list.end(); + it++) + keep_map[(*it)] = 1; + tree_type *T = TreeTemplateTools::parenthesisToTree(*tree, false, "ID"); + vector leaves = T->getLeaves(); + int N_leaves = leaves.size(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); it++) { + string name = (*it)->getName(); + boost::trim(name); + vector tokens; + boost::split(tokens, name, boost::is_any_of("_"), boost::token_compress_on); + string sp = tokens[0]; + // string sp=name; + if (keep_map.count(sp) == 1) + (*keep_leaves).push_back(name); + if (not keep_map.count(sp) == 1) { + if (N_leaves > 3) + TreeTemplateTools::dropLeaf(*T, name); + N_leaves--; + } + } + if (N_leaves > 3) + *tree = TreeTemplateTools::treeToParenthesis(*T, false, "ID"); return N_leaves; } -int cout_small(vector keep_leaves,string ale_name) -{ +int cout_small(vector keep_leaves, string ale_name) { ofstream fout(ale_name.c_str()); - cout << "#wrting "<< ale_name << endl; - if (keep_leaves.size()==1) - { - fout << "#constructor_string"<< endl; - fout << keep_leaves[0]<< endl; - fout << "#observations"<< endl; - fout << "1"<< endl; - fout << "#Bip_counts"<< endl; - fout << "#Bip_bls"<< endl; - fout << "1 1"<< endl; - fout << "#Dip_counts"<< endl; - fout << "#last_leafset_id"<< endl; - fout << "1"<< endl; - fout << "#leaf-id"<< endl; - fout << keep_leaves[0]+" 1"<< endl; - fout << "#set-id"<< endl; - fout << "1 : 1"<< endl; - fout << "#END"<< endl; - } - else if (keep_leaves.size()==2) - { - fout << "#constructor_string"<< endl; - fout << keep_leaves[0]+","+keep_leaves[1]<< endl; - fout << "#observations"<< endl; - fout << "1"<< endl; - fout << "#Bip_counts" << endl; - fout << "1 1"<< endl; - fout << "2 1"<< endl; - fout << "#Bip_bls"<< endl; - fout << "1 1"<< endl; - fout << "2 1"<< endl; - fout << "#Dip_counts"<< endl; - fout << "#last_leafset_id"<< endl; - fout << "2"<< endl; - fout << "#leaf-id"<< endl; - fout << keep_leaves[0]+" 1"<< endl; - fout << keep_leaves[1]+" 2"<< endl; - fout << "#set-id"<< endl; - fout << "1 : 1"<< endl; - fout << "2 : 2"<< endl; - fout << "#END"<< endl; - } - else if (keep_leaves.size()==3) - { - fout << "#constructor_string"<< endl; - fout << keep_leaves[0]+","+keep_leaves[1]+","+keep_leaves[2]<< endl; - fout << "#observations"<< endl; - fout << "1"<< endl; - fout << "#Bip_counts" << endl; - fout << "4 1"<< endl; - fout << "5 1"<< endl; - fout << "6 1"<< endl; - fout << "#Bip_bls"<< endl; - fout << "1 1"<< endl; - fout << "2 1"<< endl; - fout << "3 1"<< endl; - fout << "4 1"<< endl; - fout << "5 1"<< endl; - fout << "6 1"<< endl; - fout << "#Dip_counts"<< endl; - fout << "4 2 3 1"<< endl; - fout << "5 1 3 1"<< endl; - fout << "6 1 2 1"<< endl; - fout << "#last_leafset_id"<< endl; - fout << "6"<< endl; - fout << "#leaf-id"<< endl; - fout << keep_leaves[0]+" 1"<< endl; - fout << keep_leaves[1]+" 2"<< endl; - fout << keep_leaves[2]+" 3"<< endl; - fout << "#set-id"<< endl; - fout << "1 : 1"<< endl; - fout << "2 : 2"<< endl; - fout << "6 : 1 2"<< endl; - fout << "3 : 3"<< endl; - fout << "5 : 1 3"<< endl; - fout << "4 : 2 3"<< endl; - fout << "#END"<< endl; - } + cout << "#wrting " << ale_name << endl; + if (keep_leaves.size() == 1) { + fout << "#constructor_string" << endl; + fout << keep_leaves[0] << endl; + fout << "#observations" << endl; + fout << "1" << endl; + fout << "#Bip_counts" << endl; + fout << "#Bip_bls" << endl; + fout << "1 1" << endl; + fout << "#Dip_counts" << endl; + fout << "#last_leafset_id" << endl; + fout << "1" << endl; + fout << "#leaf-id" << endl; + fout << keep_leaves[0] + " 1" << endl; + fout << "#set-id" << endl; + fout << "1 : 1" << endl; + fout << "#END" << endl; + } else if (keep_leaves.size() == 2) { + fout << "#constructor_string" << endl; + fout << keep_leaves[0] + "," + keep_leaves[1] << endl; + fout << "#observations" << endl; + fout << "1" << endl; + fout << "#Bip_counts" << endl; + fout << "1 1" << endl; + fout << "2 1" << endl; + fout << "#Bip_bls" << endl; + fout << "1 1" << endl; + fout << "2 1" << endl; + fout << "#Dip_counts" << endl; + fout << "#last_leafset_id" << endl; + fout << "2" << endl; + fout << "#leaf-id" << endl; + fout << keep_leaves[0] + " 1" << endl; + fout << keep_leaves[1] + " 2" << endl; + fout << "#set-id" << endl; + fout << "1 : 1" << endl; + fout << "2 : 2" << endl; + fout << "#END" << endl; + } else if (keep_leaves.size() == 3) { + fout << "#constructor_string" << endl; + fout << keep_leaves[0] + "," + keep_leaves[1] + "," + keep_leaves[2] + << endl; + fout << "#observations" << endl; + fout << "1" << endl; + fout << "#Bip_counts" << endl; + fout << "4 1" << endl; + fout << "5 1" << endl; + fout << "6 1" << endl; + fout << "#Bip_bls" << endl; + fout << "1 1" << endl; + fout << "2 1" << endl; + fout << "3 1" << endl; + fout << "4 1" << endl; + fout << "5 1" << endl; + fout << "6 1" << endl; + fout << "#Dip_counts" << endl; + fout << "4 2 3 1" << endl; + fout << "5 1 3 1" << endl; + fout << "6 1 2 1" << endl; + fout << "#last_leafset_id" << endl; + fout << "6" << endl; + fout << "#leaf-id" << endl; + fout << keep_leaves[0] + " 1" << endl; + fout << keep_leaves[1] + " 2" << endl; + fout << keep_leaves[2] + " 3" << endl; + fout << "#set-id" << endl; + fout << "1 : 1" << endl; + fout << "2 : 2" << endl; + fout << "6 : 1 2" << endl; + fout << "3 : 3" << endl; + fout << "5 : 1 3" << endl; + fout << "4 : 2 3" << endl; + fout << "#END" << endl; + } return 1; } -int main(int argc, char ** argv) -{ - ifstream ale_stream (argv[1]); - ifstream keep_stream (argv[2]); +int main(int argc, char **argv) { + ifstream ale_stream(argv[1]); + ifstream keep_stream(argv[2]); - string ale_name=argv[1]; + string ale_name = argv[1]; vector tokens; - boost::split(tokens,ale_name,boost::is_any_of("."),boost::token_compress_on); - //ale_name=tokens[0]; + boost::split(tokens, ale_name, boost::is_any_of("."), + boost::token_compress_on); + // ale_name=tokens[0]; - string footer=argv[2]; - boost::split(tokens,footer,boost::is_any_of("/"),boost::token_compress_on); - footer=tokens[tokens.size()-1]; + string footer = argv[2]; + boost::split(tokens, footer, boost::is_any_of("/"), boost::token_compress_on); + footer = tokens[tokens.size() - 1]; + + ale_name = ale_name + "_" + footer + ".ale"; - ale_name=ale_name+"_"+footer+".ale"; - vector keep_list; vector keep_leaves; string line; - while(! keep_stream.eof()) - { - getline (keep_stream,line); - if (line.find("(")!=line.npos) - { - tree_type * T=TreeTemplateTools::parenthesisToTree(line,false,"ID"); - vector leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - boost::trim(name); - vector tokens; - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - string sp=tokens[0]; - //string sp=name; - keep_list.push_back(sp); - } - break; - } - else - { - boost::trim(line); - vector tokens; - //boost::split(tokens,line,boost::is_any_of("_"),boost::token_compress_on); - //string sp=tokens[0]; - string sp=line; - keep_list.push_back(sp); - } + while (!keep_stream.eof()) { + getline(keep_stream, line); + if (line.find("(") != line.npos) { + tree_type *T = TreeTemplateTools::parenthesisToTree(line, false, "ID"); + vector leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + boost::trim(name); + vector tokens; + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + string sp = tokens[0]; + // string sp=name; + keep_list.push_back(sp); + } + break; + } else { + boost::trim(line); + vector tokens; + // boost::split(tokens,line,boost::is_any_of("_"),boost::token_compress_on); + // string sp=tokens[0]; + string sp = line; + keep_list.push_back(sp); } - - string field=""; + } + + string field = ""; string constructor_string; int N_leaves; int observations; long int last_leafset_id; - map Bip_counts; - map > > Dip_counts; - map < boost::dynamic_bitset<>,long int> set_ids; - map< long int, boost::dynamic_bitset<> > id_sets; - - map id_leaves; - map leaf_ids; - while(! ale_stream.eof()) - { - getline (ale_stream,line); - boost::trim(line); - if (line[0]=='#') field=line; - else - { - if (field=="#constructor_string") - { - constructor_string=line; - } - else if (field=="#observations") - { - boost::trim(line); - observations=atof(line.c_str()); - } - else if (field=="#Bip_counts") - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - Bip_counts[atol(tokens[0].c_str())]=atof(tokens[1].c_str()); - } - else if (field=="#Dip_counts") - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - Dip_counts[atol(tokens[0].c_str())][atol(tokens[1].c_str())][atol(tokens[2].c_str())]=atof(tokens[3].c_str()); - } - else if (field=="#last_leafset_id") - { - boost::trim(line); - last_leafset_id=atol(line.c_str()); - } - else if (field=="#leaf-id") - { - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - long int id=atol(tokens[1].c_str()); - string leaf_name=tokens[0]; - leaf_ids[leaf_name]=id; - id_leaves[id]=leaf_name; - } - else if (field=="#set-id") - { - vector fields; - boost::trim(line); - boost::split(fields,line,boost::is_any_of(":"),boost::token_compress_on); - boost::trim(fields[0]); - long int set_id=atol(fields[0].c_str()); - vector tokens; - boost::trim(fields[1]); - boost::split(tokens,fields[1],boost::is_any_of("\t "),boost::token_compress_on); - boost::dynamic_bitset<> temp( leaf_ids.size()+1 ); - - for (vector::iterator it=tokens.begin();it!=tokens.end();it++) { //Setting the proper bits to 1 - temp[static_cast(atoi((*it).c_str()))] = 1; //cout << id_leaves[static_cast(atoi((*it).c_str()))] << " "; - } - - //std::cout <<"setid : "<< set_id << " READING: " << temp << std::endl; - set_ids[temp]=set_id; - id_sets[set_id]=temp; - } - - } + map Bip_counts; + map>> Dip_counts; + map, long int> set_ids; + map> id_sets; + + map id_leaves; + map leaf_ids; + while (!ale_stream.eof()) { + getline(ale_stream, line); + boost::trim(line); + if (line[0] == '#') + field = line; + else { + if (field == "#constructor_string") { + constructor_string = line; + } else if (field == "#observations") { + boost::trim(line); + observations = atof(line.c_str()); + } else if (field == "#Bip_counts") { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + Bip_counts[atol(tokens[0].c_str())] = atof(tokens[1].c_str()); + } else if (field == "#Dip_counts") { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + Dip_counts[atol(tokens[0].c_str())][atol(tokens[1].c_str())] + [atol(tokens[2].c_str())] = atof(tokens[3].c_str()); + } else if (field == "#last_leafset_id") { + boost::trim(line); + last_leafset_id = atol(line.c_str()); + } else if (field == "#leaf-id") { + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + long int id = atol(tokens[1].c_str()); + string leaf_name = tokens[0]; + leaf_ids[leaf_name] = id; + id_leaves[id] = leaf_name; + } else if (field == "#set-id") { + vector fields; + boost::trim(line); + boost::split(fields, line, boost::is_any_of(":"), + boost::token_compress_on); + boost::trim(fields[0]); + long int set_id = atol(fields[0].c_str()); + vector tokens; + boost::trim(fields[1]); + boost::split(tokens, fields[1], boost::is_any_of("\t "), + boost::token_compress_on); + boost::dynamic_bitset<> temp(leaf_ids.size() + 1); + + for (vector::iterator it = tokens.begin(); it != tokens.end(); + it++) { // Setting the proper bits to 1 + temp[static_cast(atoi((*it).c_str()))] = + 1; // cout << id_leaves[static_cast(atoi((*it).c_str()))] << + // " "; + } + + // std::cout <<"setid : "<< set_id << " READING: " << temp << std::endl; + set_ids[temp] = set_id; + id_sets[set_id] = temp; + } } - N_leaves=prune(&constructor_string,keep_list,&keep_leaves); - - if (N_leaves<4) return cout_small(keep_leaves,ale_name); - - - boost::dynamic_bitset<> keep_set( leaf_ids.size()+1 ); - long int new_last_leafset_id=1; - map new_id_leaves; - map new_leaf_ids; - map > > new_Dip_counts; - map < boost::dynamic_bitset<>,long int> new_set_ids; - map< long int, boost::dynamic_bitset<> > new_id_sets; - map< long int, long int > new_ids; - - map < boost::dynamic_bitset<> , boost::dynamic_bitset<> > new_sets; - map < boost::dynamic_bitset<> ,scalar_type> new_Bip_counts; - - - for (vector::iterator it=keep_leaves.begin() ;it!=keep_leaves.end() ; it++) - { - string name=(*it); - boost::dynamic_bitset<> old_set( leaf_ids.size()+1 ); - size_t i = leaf_ids[name]; - keep_set[i] = 1; //! XX ! - old_set[i] = 1; //! XX ! - - cout << i << " " << name << " " << leaf_ids[name] <<" "<< keep_set << " " << old_set << endl; - - new_ids[i]=new_last_leafset_id; - new_leaf_ids[name] = new_last_leafset_id; - new_id_leaves[new_last_leafset_id] = name; - - new_set_ids[old_set]=new_last_leafset_id; - new_id_sets[new_last_leafset_id]=old_set; - new_sets[old_set] = old_set; - - new_last_leafset_id++; - - } - - - for (map < boost::dynamic_bitset<>,long int>::iterator it = set_ids.begin(); it!=set_ids.end(); it++) - { - boost::dynamic_bitset<> old_set = (*it).first; - boost::dynamic_bitset<> new_set = old_set & keep_set; - new_sets[old_set] = new_set; - //cout <<(*it).second<<" "<< old_set << " -> " << new_set << endl; - //if (old_set!=keep_set and not (new_set.none())) new_sets[old_set] = new_set; - - if (old_set!=keep_set and new_set!=keep_set and not (new_set.none()) and (new_set_ids.count(new_set)==0) and Bip_counts.count((*it).second)) - new_Bip_counts[new_set] = 0; - } - - for (map < boost::dynamic_bitset<> ,scalar_type> ::iterator it=new_Bip_counts.begin(); it!=new_Bip_counts.end(); it++) - { - new_set_ids[(*it).first]=new_last_leafset_id; - new_id_sets[new_last_leafset_id]=(*it).first; - new_last_leafset_id++; - } - - for (map > >::iterator it = Dip_counts.begin(); it!=Dip_counts.end(); it++) - { - boost::dynamic_bitset<> gamma = id_sets[(*it).first]; - for (map > ::iterator jt = (*it).second.begin(); jt!=(*it).second.end(); jt++) - { - boost::dynamic_bitset<> gamma_p = id_sets[(*jt).first]; - for ( map ::iterator kt = (*jt).second.begin(); kt!=(*jt).second.end(); kt++) - { - boost::dynamic_bitset<> gamma_pp = id_sets[(*kt).first]; - - if (new_sets[gamma].none() or new_sets[gamma_p].none() or new_sets[gamma_pp].none() or (new_sets[gamma]==keep_set)) - { - ; - } - else - { - long int new_g_id = new_set_ids[new_sets[gamma ]]; - long int new_gp_id = new_set_ids[new_sets[gamma_p ]]; - long int new_gpp_id = new_set_ids[new_sets[gamma_pp]]; - //cout << "---------------------------------" << endl; - //cout << keep_set << " " << keep_set << " " << keep_set << endl; - //cout << gamma << " " << gamma_p << " " << gamma_pp << endl; - //cout << new_sets[gamma] << " " << new_sets[gamma_p] << " " << new_sets[gamma_pp] << endl; - //cout << "---------------------------------" << endl; - new_Bip_counts[new_sets[gamma ]]+= (*kt).second; - if (new_gp_id keep_set(leaf_ids.size() + 1); + long int new_last_leafset_id = 1; + map new_id_leaves; + map new_leaf_ids; + map>> new_Dip_counts; + map, long int> new_set_ids; + map> new_id_sets; + map new_ids; + + map, boost::dynamic_bitset<>> new_sets; + map, scalar_type> new_Bip_counts; + + for (vector::iterator it = keep_leaves.begin(); + it != keep_leaves.end(); it++) { + string name = (*it); + boost::dynamic_bitset<> old_set(leaf_ids.size() + 1); + size_t i = leaf_ids[name]; + keep_set[i] = 1; //! XX ! + old_set[i] = 1; //! XX ! + + cout << i << " " << name << " " << leaf_ids[name] << " " << keep_set << " " + << old_set << endl; + + new_ids[i] = new_last_leafset_id; + new_leaf_ids[name] = new_last_leafset_id; + new_id_leaves[new_last_leafset_id] = name; + + new_set_ids[old_set] = new_last_leafset_id; + new_id_sets[new_last_leafset_id] = old_set; + new_sets[old_set] = old_set; + + new_last_leafset_id++; + } + + for (map, long int>::iterator it = set_ids.begin(); + it != set_ids.end(); it++) { + boost::dynamic_bitset<> old_set = (*it).first; + boost::dynamic_bitset<> new_set = old_set & keep_set; + new_sets[old_set] = new_set; + // cout <<(*it).second<<" "<< old_set << " -> " << new_set << endl; + // if (old_set!=keep_set and not (new_set.none())) new_sets[old_set] = + // new_set; + + if (old_set != keep_set and new_set != keep_set and not(new_set.none()) and + (new_set_ids.count(new_set) == 0) and Bip_counts.count((*it).second)) + new_Bip_counts[new_set] = 0; + } + + for (map, scalar_type>::iterator it = + new_Bip_counts.begin(); + it != new_Bip_counts.end(); it++) { + new_set_ids[(*it).first] = new_last_leafset_id; + new_id_sets[new_last_leafset_id] = (*it).first; + new_last_leafset_id++; + } + + for (map>>::iterator it = + Dip_counts.begin(); + it != Dip_counts.end(); it++) { + boost::dynamic_bitset<> gamma = id_sets[(*it).first]; + for (map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) { + boost::dynamic_bitset<> gamma_p = id_sets[(*jt).first]; + for (map::iterator kt = (*jt).second.begin(); + kt != (*jt).second.end(); kt++) { + boost::dynamic_bitset<> gamma_pp = id_sets[(*kt).first]; + + if (new_sets[gamma].none() or new_sets[gamma_p].none() or + new_sets[gamma_pp].none() or (new_sets[gamma] == keep_set)) { + ; + } else { + long int new_g_id = new_set_ids[new_sets[gamma]]; + long int new_gp_id = new_set_ids[new_sets[gamma_p]]; + long int new_gpp_id = new_set_ids[new_sets[gamma_pp]]; + // cout << "---------------------------------" << endl; + // cout << keep_set << " " << keep_set << " " << keep_set << endl; + // cout << gamma << " " << gamma_p << " " << gamma_pp << endl; + // cout << new_sets[gamma] << " " << new_sets[gamma_p] << " " << + // new_sets[gamma_pp] << endl; cout << + // "---------------------------------" << endl; + new_Bip_counts[new_sets[gamma]] += (*kt).second; + if (new_gp_id < new_gpp_id) + new_Dip_counts[new_g_id][new_gp_id][new_gpp_id] += (*kt).second; + else + new_Dip_counts[new_g_id][new_gpp_id][new_gp_id] += (*kt).second; + } + } } + } ofstream fout(ale_name.c_str()); - cout << "#writing "<< ale_name << endl; + cout << "#writing " << ale_name << endl; - fout << "#constructor_string"<< endl; + fout << "#constructor_string" << endl; fout << constructor_string; - fout << "#observations"<< endl; - fout << observations<< endl; + fout << "#observations" << endl; + fout << observations << endl; fout << "#Bip_counts" << endl; - for (map < boost::dynamic_bitset<> ,scalar_type>::iterator it=new_Bip_counts.begin(); it!=new_Bip_counts.end(); it++) - { - fout << new_set_ids[(*it).first] << "\t" << (*it).second << endl; + for (map, scalar_type>::iterator it = + new_Bip_counts.begin(); + it != new_Bip_counts.end(); it++) { + fout << new_set_ids[(*it).first] << "\t" << (*it).second << endl; + } + + fout << "#Bip_bls" << endl; + for (map, long int>::iterator it = + new_set_ids.begin(); + it != new_set_ids.end(); it++) { + long int g_id = (*it).second; + // fout << (*it).first << "\t"; + fout << g_id << "\t" << 1 << endl; + } + + fout << "#Dip_counts" << endl; + for (map>>::iterator it = + new_Dip_counts.begin(); + it != new_Dip_counts.end(); it++) { + boost::dynamic_bitset<> gamma = new_id_sets[(*it).first]; + + for (map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) { + boost::dynamic_bitset<> gamma_p = new_id_sets[(*jt).first]; + + for (map::iterator kt = (*jt).second.begin(); + kt != (*jt).second.end(); kt++) { + boost::dynamic_bitset<> gamma_pp = new_id_sets[(*kt).first]; + long int new_g_id = new_set_ids[gamma]; + long int new_gp_id = new_set_ids[gamma_p]; + long int new_gpp_id = new_set_ids[gamma_pp]; + fout << new_g_id << "\t" << new_gp_id << "\t" << new_gpp_id << "\t" + << (*kt).second << endl; + } } - - fout << "#Bip_bls"<< endl; - for (map < boost::dynamic_bitset<>,long int> ::iterator it=new_set_ids.begin() ;it!=new_set_ids.end() ; it++) - { - long int g_id=(*it).second; - //fout << (*it).first << "\t"; - fout << g_id << "\t" << 1 << endl; - } - - fout << "#Dip_counts"<< endl; - for (map > >::iterator it = new_Dip_counts.begin(); it!=new_Dip_counts.end(); it++) - { - boost::dynamic_bitset<> gamma = new_id_sets[(*it).first]; - - for (map > ::iterator jt = (*it).second.begin(); jt!=(*it).second.end(); jt++) - { - boost::dynamic_bitset<> gamma_p = new_id_sets[(*jt).first]; - - for ( map ::iterator kt = (*jt).second.begin(); kt!=(*jt).second.end(); kt++) - { - boost::dynamic_bitset<> gamma_pp = new_id_sets[(*kt).first]; - long int new_g_id = new_set_ids[gamma ]; - long int new_gp_id = new_set_ids[gamma_p ]; - long int new_gpp_id = new_set_ids[gamma_pp]; - fout << new_g_id << "\t" << new_gp_id << "\t" << new_gpp_id << "\t" << (*kt).second << endl; - } - } - } - - fout << "#last_leafset_id"<< endl; - fout << new_last_leafset_id-1 << endl; - - fout << "#leaf-id"<< endl; - for (vector::iterator it=keep_leaves.begin() ;it!=keep_leaves.end() ; it++) - { - fout << (*it) << "\t" << new_leaf_ids[(*it)] <::iterator it = keep_leaves.begin(); + it != keep_leaves.end(); it++) { + fout << (*it) << "\t" << new_leaf_ids[(*it)] << endl; + } + + fout << "#set-id" << endl; + for (map, long int>::iterator it = + new_set_ids.begin(); + it != new_set_ids.end(); it++) { + boost::dynamic_bitset<> gamma = (*it).first; + long int g_id = (*it).second; + // fout << gamma << "\t:"; + fout << g_id << "\t:"; + + size_t i = gamma.find_first(); + while (i != boost::dynamic_bitset<>::npos) { + fout << "\t" << new_ids[i]; + i = gamma.find_next(i); } - - fout << "#set-id"<< endl; - for (map < boost::dynamic_bitset<>,long int> ::iterator it=new_set_ids.begin() ;it!=new_set_ids.end() ; it++) - { - boost::dynamic_bitset<> gamma=(*it).first; - long int g_id=(*it).second; - //fout << gamma << "\t:"; - fout << g_id << "\t:"; - - size_t i = gamma.find_first(); - while(i != boost::dynamic_bitset<>::npos) - { - fout << "\t" << new_ids[i]; - i = gamma.find_next(i); - } - fout << endl; - } - fout << "#END"<< endl; + fout << endl; + } + fout << "#END" << endl; return 1; - - - - } diff --git a/src/ALEsample.cpp b/src/ALEsample.cpp index d05c760..eff7748 100644 --- a/src/ALEsample.cpp +++ b/src/ALEsample.cpp @@ -1,54 +1,55 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" #include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - cout << "ALEsample using ALE v"<< ALE_VERSION <observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise the model + exODT_model *model = new exODT_model(); // - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); - model->set_model_parameter("min_D",3); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", 3); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; - string append=""; - if (argc>4) - append=argv[4]; - if (argc>7) - delta=atof(argv[5]),tau=atof(argv[6]),lambda=atof(argv[7]); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); + + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; + string append = ""; + if (argc > 4) + append = argv[4]; + if (argc > 7) + delta = atof(argv[5]), tau = atof(argv[6]), lambda = atof(argv[7]); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); @@ -56,134 +57,179 @@ int main(int argc, char ** argv) model->calculate_EGb(); - //p(ale) calculates the probability of the obsorved set of gene trees, i.e. Pi(Gamma) - //cf. ALEPAPER - scalar_type old_p=model->p(ale); - - int steps=0,accapted=0,sampled=0; - int N_samples=1000; - if (argc>3) N_samples=atoi(argv[3]); - int burnin=100; - if (argc>4) burnin=atoi(argv[4]); - bool allprint=false; - int print_mod=10; - int subsamples=10; - vector sample_trees; + // p(ale) calculates the probability of the obsorved set of gene trees, i.e. + // Pi(Gamma) cf. ALEPAPER + scalar_type old_p = model->p(ale); + + int steps = 0, accapted = 0, sampled = 0; + int N_samples = 1000; + if (argc > 3) + N_samples = atoi(argv[3]); + int burnin = 100; + if (argc > 4) + burnin = atoi(argv[4]); + bool allprint = false; + int print_mod = 10; + int subsamples = 10; + vector sample_trees; vector sample_strings; - cout << "Starting burnin.." << endl; - - string rate_name=ale_file+append+".ratelist"; - string event_name=ale_file+append+".eventlist"; - string sample_name=ale_file+append+".treelist"; - - ofstream rate_out( rate_name.c_str() ); - ofstream event_out( event_name.c_str() ); - ofstream sample_out( sample_name.c_str() ); - - rate_out << "#ALEsample using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"< ds; - //rate proposal - for (int i=0;i<3;i++) - { - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type d; - if (r<1./3.) d=RandomTools::randExponential(0.001)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else if (r<2./3.) d=RandomTools::randExponential(0.01)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else d=RandomTools::randExponential(0.1)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - ds.push_back(d); - } - scalar_type new_delta=delta+ds[0]; - scalar_type new_tau=tau+ds[1]; - scalar_type new_lambda=lambda+ds[2]; - - //boundaries - if (new_delta<1e-6) new_delta=1e-6; - if (new_delta>10-1e-6) new_delta=10-1e-6; - if (new_tau<1e-6) new_tau=1e-6; - if (new_tau>10-1e-6) new_tau=10-1e-6; - if (new_lambda<1e-6) new_lambda=1e-6; - if (new_lambda>10-1e-6) new_lambda=10-1e-6; - - //likelihood - model->set_model_parameter("delta",new_delta); - model->set_model_parameter("tau",new_tau); - model->set_model_parameter("lambda",new_lambda); + cout << "Starting burnin.." << endl; + + string rate_name = ale_file + append + ".ratelist"; + string event_name = ale_file + append + ".eventlist"; + string sample_name = ale_file + append + ".treelist"; + + ofstream rate_out(rate_name.c_str()); + ofstream event_out(event_name.c_str()); + ofstream sample_out(sample_name.c_str()); + + rate_out << "#ALEsample using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + rate_out << "#" + << "sample" + << "\t" + << "step" + << "\t" + << "duplication_rate" + << "\t" + << "transfer_rate" + << "\t" + << "loss_rate" + << "\t" + << "log_likelihood" << endl; + + event_out << "#ALEsample using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + event_out << "#" + << "subsample" + << "\t" + << "sample" + << "\t" + << "step" + << "\t" + << "duplications" + << "\t" + << "transfers" + << "\t" + << "losses" + << "\t" + << "speciations" << endl; + + sample_out << "#ALEsample using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl; + + boost::progress_display pd(burnin); + + while (sampled < N_samples) { + vector ds; + // rate proposal + for (int i = 0; i < 3; i++) { + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type d; + if (r < 1. / 3.) + d = RandomTools::randExponential(0.001) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else if (r < 2. / 3.) + d = RandomTools::randExponential(0.01) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else + d = RandomTools::randExponential(0.1) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + ds.push_back(d); + } + scalar_type new_delta = delta + ds[0]; + scalar_type new_tau = tau + ds[1]; + scalar_type new_lambda = lambda + ds[2]; + + // boundaries + if (new_delta < 1e-6) + new_delta = 1e-6; + if (new_delta > 10 - 1e-6) + new_delta = 10 - 1e-6; + if (new_tau < 1e-6) + new_tau = 1e-6; + if (new_tau > 10 - 1e-6) + new_tau = 10 - 1e-6; + if (new_lambda < 1e-6) + new_lambda = 1e-6; + if (new_lambda > 10 - 1e-6) + new_lambda = 10 - 1e-6; + + // likelihood + model->set_model_parameter("delta", new_delta); + model->set_model_parameter("tau", new_tau); + model->set_model_parameter("lambda", new_lambda); + model->calculate_EGb(); + scalar_type new_p = model->p(ale); + if (new_p >= old_p or + new_p / old_p > RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) { + old_p = new_p; + delta = new_delta; + tau = new_tau; + lambda = new_lambda; + if (accapted < burnin) + ++pd; + if (accapted == burnin) { + cout << "\nFinished burnin. \n Sampling:" << endl; + pd.restart(N_samples); + } + accapted++; + } + steps++; + + if ((accapted > burnin and steps % print_mod == 0) or allprint) { + sampled++; + ++pd; + model->set_model_parameter("delta", delta); + model->set_model_parameter("tau", tau); + model->set_model_parameter("lambda", lambda); model->calculate_EGb(); - scalar_type new_p=model->p(ale); - if (new_p>=old_p or new_p/old_p>RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) - { - old_p=new_p; - delta=new_delta; tau=new_tau; lambda=new_lambda; - if (accaptedburnin and steps%print_mod==0 ) or allprint) - { - sampled++; - ++pd; - model->set_model_parameter("delta",delta); - model->set_model_parameter("tau",tau); - model->set_model_parameter("lambda",lambda); - model->calculate_EGb(); - old_p= model->p(ale); - - rate_out << sampled << "\t" << steps << "\t" << delta << "\t" << tau << "\t" << lambda << "\t" << log(old_p) << endl; - - for (int i=0;isample(false); - sample_out << sample_tree << endl; - tree_type * G=TreeTemplateTools::parenthesisToTree(sample_tree,false); - vector leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); - event_out << i << "\t" << sampled << "\t" << steps << "\t" << model->MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <p(ale); + + rate_out << sampled << "\t" << steps << "\t" << delta << "\t" << tau + << "\t" << lambda << "\t" << log(old_p) << endl; + + for (int i = 0; i < subsamples; i++) { + string sample_tree = model->sample(false); + sample_out << sample_tree << endl; + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); + } + leaves.clear(); + sample_trees.push_back(G); + event_out << i << "\t" << sampled << "\t" << steps << "\t" + << model->MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" + << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; + } } - cout << "Calculating MRP consensus tree."<p(tree)) << endl; return 1; diff --git a/src/computeALEcomplexity.cpp b/src/computeALEcomplexity.cpp index c3cba13..6e07def 100644 --- a/src/computeALEcomplexity.cpp +++ b/src/computeALEcomplexity.cpp @@ -1,67 +1,72 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" #include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree +int main(int argc, char **argv) { + // we need a species tree if (argc < 5) { - std::cout << "\n\n\t\tUsage: computeALEcomplexity speciesTreeFile ALEfile delta tau lambda \n"<sample_species(10); + // simulation->sample_species(10); - - //for (vector::iterator it=simulation->gene_trees.begin();it!=simulation->gene_trees.end();it++) - //cout << (*it) << endl; + // for (vector::iterator + // it=simulation->gene_trees.begin();it!=simulation->gene_trees.end();it++) + // cout << (*it) << endl; - //cout << simulation->S_string << endl; + // cout << simulation->S_string << endl; - model->set_model_parameter("min_D",1); - model->set_model_parameter("grid_delta_t",0.01); - model->set_model_parameter("DD",10); + model->set_model_parameter("min_D", 1); + model->set_model_parameter("grid_delta_t", 0.01); + model->set_model_parameter("DD", 10); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("delta",delta); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - model->set_model_parameter("leaf_events",1); + model->set_model_parameter("leaf_events", 1); model->calculate_EGb(); - //Computing time: + // Computing time: double ll; double time1, time2, time3; - boost::timer * t = new boost::timer(); + boost::timer *t = new boost::timer(); ll = model->p(ale); time1 = t->elapsed(); @@ -71,33 +76,34 @@ int main(int argc, char ** argv) time3 = t->elapsed(); time3 = time3 - time2; time2 = time2 - time1; - + vector leafNames = ale->getLeafNames(); - //Now, getting the number of species in the current ale_file - std::map scalar_parameter; - std::map string_parameter; - string_parameter["gene_name_separators"]="_@"; - scalar_parameter["species_field"]=0; - vector speciesPresent; - for (auto gene_name = leafNames.begin(); gene_name != leafNames.end(); ++gene_name) { - vector tokens; - boost::split(tokens,*gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - speciesPresent.push_back( species_name ); + // Now, getting the number of species in the current ale_file + std::map scalar_parameter; + std::map string_parameter; + string_parameter["gene_name_separators"] = "_@"; + scalar_parameter["species_field"] = 0; + vector speciesPresent; + for (auto gene_name = leafNames.begin(); gene_name != leafNames.end(); + ++gene_name) { + vector tokens; + boost::split(tokens, *gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + speciesPresent.push_back(species_name); } size_t numSpecies = (VectorTools::unique(speciesPresent)).size(); - // gene<->species mapping - - cout << ale_file << "\t" << ll << "\t" << leafNames.size() << "\t" << numSpecies << "\t" << time1 << "\t" << time2 << "\t" << time3 << "\t" << (time1+time2+time3)/3 << endl; - cout.flush(); + // gene<->species mapping + cout << ale_file << "\t" << ll << "\t" << leafNames.size() << "\t" + << numSpecies << "\t" << time1 << "\t" << time2 << "\t" << time3 << "\t" + << (time1 + time2 + time3) / 3 << endl; + cout.flush(); return 1; - - } - diff --git a/src/exODT.cpp b/src/exODT.cpp index 76d08c9..b6b4913 100644 --- a/src/exODT.cpp +++ b/src/exODT.cpp @@ -4,487 +4,459 @@ using namespace std; using namespace bpp; -exODT_model::exODT_model() -{ - //some default parameters - string_parameter["BOOTSTRAP_LABELS"]="no"; - string_parameter["gene_name_separators"]="_@"; - scalar_parameter["species_field"]=0; - scalar_parameter["event_node"]=0; - scalar_parameter["min_bip_count"]=-1; - scalar_parameter["min_branch_lenghts"]=0; +exODT_model::exODT_model() { + // some default parameters + string_parameter["BOOTSTRAP_LABELS"] = "no"; + string_parameter["gene_name_separators"] = "_@"; + scalar_parameter["species_field"] = 0; + scalar_parameter["event_node"] = 0; + scalar_parameter["min_bip_count"] = -1; + scalar_parameter["min_branch_lenghts"] = 0; // length of "stem" branch above root - scalar_parameter["stem_length"]=1; - //number of discretization slices (subslices) per time slice - scalar_parameter["D"]=3; - scalar_parameter["grid_delta_t"]=0.005; - scalar_parameter["min_D"]=3; - //number of subdiscretizations for ODE calculations - scalar_parameter["DD"]=10; - Ee_y = vector (100,0.0);//del-loc - - Ee_y_1=0.0; + scalar_parameter["stem_length"] = 1; + // number of discretization slices (subslices) per time slice + scalar_parameter["D"] = 3; + scalar_parameter["grid_delta_t"] = 0.005; + scalar_parameter["min_D"] = 3; + // number of subdiscretizations for ODE calculations + scalar_parameter["DD"] = 10; + Ee_y = vector(100, 0.0); // del-loc + + Ee_y_1 = 0.0; // map Ge_y;//del-loc - Ge_y = vector (100,0.0);//del-loc - Ge_y_1=0.0; - - E_k1 = vector (100,0.0); - E_k2 = vector (100,0.0); - E_k3 = vector (100,0.0); - E_k4 = vector (100,0.0);//del-loc. Maps used for Runge-Kutta computations (4 stages). - - E_k1_1= 0.0; - E_k2_1 = 0.0; - E_k3_1 = 0.0; - E_k4_1=0.0; - - G_k1= vector (100,0.0); - G_k2= vector (100,0.0); - G_k3= vector (100,0.0); - G_k4 = vector (100,0.0);//del-loc. Maps used for Runge-Kutta computations (4 stages). + Ge_y = vector(100, 0.0); // del-loc + Ge_y_1 = 0.0; + + E_k1 = vector(100, 0.0); + E_k2 = vector(100, 0.0); + E_k3 = vector(100, 0.0); + E_k4 = vector( + 100, 0.0); // del-loc. Maps used for Runge-Kutta computations (4 stages). + + E_k1_1 = 0.0; + E_k2_1 = 0.0; + E_k3_1 = 0.0; + E_k4_1 = 0.0; + + G_k1 = vector(100, 0.0); + G_k2 = vector(100, 0.0); + G_k3 = vector(100, 0.0); + G_k4 = vector( + 100, 0.0); // del-loc. Maps used for Runge-Kutta computations (4 stages). G_k1_1 = 0.0; - G_k2_1= 0.0; + G_k2_1 = 0.0; G_k3_1 = 0.0; G_k4_1 = 0.0; - - } -double exODT_model::height(Node *node) -{ - if( not node->isLeaf()) - { - vector sons = node->getSons(); - return 0.5*(sons[0]->getDistanceToFather()+height(sons[0]) + sons[1]->getDistanceToFather()+height(sons[1])); - } - else +double exODT_model::height(Node *node) { + if (not node->isLeaf()) { + vector sons = node->getSons(); + return 0.5 * (sons[0]->getDistanceToFather() + height(sons[0]) + + sons[1]->getDistanceToFather() + height(sons[1])); + } else return 0; } -void exODT_model::construct(const string& Sstring, const scalar_type& N, const string& fractionMissingFile) -{ - string_parameter["S_in"]=Sstring; - //virtual branch - alpha=-1; - last_branch=0; - - S=TreeTemplateTools::parenthesisToTree(string_parameter["S_in"], (string_parameter["BOOTSTRAP_LABELS"]=="yes") -);//del-loc - S_root = S->getRootNode();//del-loc - vector leaves = TreeTemplateTools::getLeaves(*S_root);//del-loc - //sort leaves according to name - map leaf_sort; //map storing leaves according to their names - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++ ) - leaf_sort[(*it)->getName()]=(*it); +void exODT_model::construct(const string &Sstring, const scalar_type &N, + const string &fractionMissingFile) { + string_parameter["S_in"] = Sstring; + // virtual branch + alpha = -1; + last_branch = 0; + + S = TreeTemplateTools::parenthesisToTree( + string_parameter["S_in"], + (string_parameter["BOOTSTRAP_LABELS"] == "yes")); // del-loc + S_root = S->getRootNode(); // del-loc + vector leaves = TreeTemplateTools::getLeaves(*S_root); // del-loc + // sort leaves according to name + map leaf_sort; // map storing leaves according to their names + for (vector::iterator it = leaves.begin(); it != leaves.end(); it++) + leaf_sort[(*it)->getName()] = (*it); leaves.clear(); - for (map ::iterator it=leaf_sort.begin();it!=leaf_sort.end();it++ ) + for (map::iterator it = leaf_sort.begin(); + it != leaf_sort.end(); it++) leaves.push_back((*it).second); leaf_sort.clear(); - //leaves is now sorted by alphabetical order of the names - map next_generation; //Map between node and slice id of descendant nodes. - map node_ts; //Map between node and its time. + // leaves is now sorted by alphabetical order of the names + map + next_generation; // Map between node and slice id of descendant nodes. + map node_ts; // Map between node and its time. - map species_order; + map species_order; // register extant species - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - Node * node = (*it); - // a leaf - id_ranks[last_branch]=0; - daughters[last_branch].push_back(-1); - // a leaf - daughters[last_branch].push_back(-1); - extant_species[last_branch]=node->getName(); - species_order[node->getName()]=-1; - node_ts[node]=0; - branch_ts[last_branch]=0; - node_ids[node]=last_branch; - id_nodes[last_branch]=node; - next_generation[node->getFather()]=-1; - last_branch++; - } - - // make sure S is ultrametric - while(1) - { - vector tmp; - bool stop=true; - for (map ::iterator it=next_generation.begin();it!=next_generation.end();it++ ) - if (next_generation[(*it).first]==-1) //father of leaves at first, then a node that needs to be examined - { - Node * node = (*it).first; - vector sons=node->getSons();//del-loc - if (node_ts.count(sons[0])!=0 and node_ts.count(sons[1])!=0) - { - - scalar_type l0= sons[0]->getDistanceToFather(); - scalar_type l1= sons[1]->getDistanceToFather(); - scalar_type h0= node_ts[sons[0]]; - scalar_type h1= node_ts[sons[1]]; - scalar_type d0= l0+h0; - scalar_type d1= l1+h1; - scalar_type d = (d0+d1) / 2; - - sons[0]->setDistanceToFather(d-h0); - sons[1]->setDistanceToFather(d-h1); - next_generation[node]=1; - node_ts[node]=d; - if (node->hasFather()) - { - next_generation[node->getFather()]=-1; - stop=false; - } - } - sons.clear(); - } - if (stop) - break; - } + for (vector::iterator it = leaves.begin(); it != leaves.end(); it++) { + Node *node = (*it); + // a leaf + id_ranks[last_branch] = 0; + daughters[last_branch].push_back(-1); + // a leaf + daughters[last_branch].push_back(-1); + extant_species[last_branch] = node->getName(); + species_order[node->getName()] = -1; + node_ts[node] = 0; + branch_ts[last_branch] = 0; + node_ids[node] = last_branch; + id_nodes[last_branch] = node; + next_generation[node->getFather()] = -1; + last_branch++; + } + + // make sure S is ultrametric + while (1) { + vector tmp; + bool stop = true; + for (map::iterator it = next_generation.begin(); + it != next_generation.end(); it++) + if (next_generation[(*it).first] == + -1) // father of leaves at first, then a node that needs to be + // examined + { + Node *node = (*it).first; + vector sons = node->getSons(); // del-loc + if (node_ts.count(sons[0]) != 0 and node_ts.count(sons[1]) != 0) { + + scalar_type l0 = sons[0]->getDistanceToFather(); + scalar_type l1 = sons[1]->getDistanceToFather(); + scalar_type h0 = node_ts[sons[0]]; + scalar_type h1 = node_ts[sons[1]]; + scalar_type d0 = l0 + h0; + scalar_type d1 = l1 + h1; + scalar_type d = (d0 + d1) / 2; + + sons[0]->setDistanceToFather(d - h0); + sons[1]->setDistanceToFather(d - h1); + next_generation[node] = 1; + node_ts[node] = d; + if (node->hasFather()) { + next_generation[node->getFather()] = -1; + stop = false; + } + } + sons.clear(); + } + if (stop) + break; + } // and has height one - scalar_type h=node_ts[S_root]; - //h=1; - scalar_type tree_heigth=1;//node_ts[S_root]/h; - for (map ::iterator it=node_ts.begin();it!=node_ts.end();it++ ) - { - (*it).second/=h; - if ((*it).first->hasFather()) - { - scalar_type l=(*it).first->getDistanceToFather(); - (*it).first->setDistanceToFather(l/h); - } + scalar_type h = node_ts[S_root]; + // h=1; + scalar_type tree_heigth = 1; // node_ts[S_root]/h; + for (map::iterator it = node_ts.begin(); + it != node_ts.end(); it++) { + (*it).second /= h; + if ((*it).first->hasFather()) { + scalar_type l = (*it).first->getDistanceToFather(); + (*it).first->setDistanceToFather(l / h); } + } string_parameter["S"] = TreeTemplateTools::treeToParenthesis(*S); - //cout << string_parameter["S"] << endl;//XX - - //determine time order and time slices - map t_nodes; - for (map ::iterator it=node_ts.begin();it!=node_ts.end();it++ ) - { - scalar_type t=(*it).second; - Node * node=(*it).first; - //nonleaves - if (t>0) - { - //degenerate speciation times, where >1 nodes have same age .. should be avoided! - while (t_nodes.count(t)!=0 ) - t+=1e-5; - t_nodes[t]=node; - } - } - for (map ::iterator it=t_nodes.begin();it!=t_nodes.end();it++ ) - {//we update node_ts - scalar_type t=(*it).first; - Node * node=(*it).second; - node_ts[node]=t; - } - - last_rank=1; //the rank of the slice above the leaves - for (map ::iterator it=t_nodes.begin();it!=t_nodes.end();it++ ) - {//We go through the nodes, ordered according to their age. - scalar_type t=(*it).first; - Node * node=(*it).second; - branch_ts[last_branch]=t; - id_ranks[last_branch]=last_rank; - rank_ids[last_rank]=last_branch; - node_ids[node]=last_branch; - id_nodes[last_branch]=node; - vector sons=node->getSons();//del-loc - daughters[last_branch].push_back(node_ids[sons[0]]); - daughters[last_branch].push_back(node_ids[sons[1]]); - father[node_ids[sons[0]]]=last_branch; - father[node_ids[sons[1]]]=last_branch; - - t_end[last_branch]=t; - if (node->hasFather()) - t_begin[last_branch]=node_ts[node->getFather()]; - //the root - else - t_begin[last_branch]=t_end[last_branch]+scalar_parameter["stem_length"]; - last_rank++; - last_branch++; - sons.clear(); + // cout << string_parameter["S"] << endl;//XX + + // determine time order and time slices + map t_nodes; + for (map::iterator it = node_ts.begin(); + it != node_ts.end(); it++) { + scalar_type t = (*it).second; + Node *node = (*it).first; + // nonleaves + if (t > 0) { + // degenerate speciation times, where >1 nodes have same age .. should be + // avoided! + while (t_nodes.count(t) != 0) + t += 1e-5; + t_nodes[t] = node; } + } + for (map::iterator it = t_nodes.begin(); + it != t_nodes.end(); it++) { // we update node_ts + scalar_type t = (*it).first; + Node *node = (*it).second; + node_ts[node] = t; + } + + last_rank = 1; // the rank of the slice above the leaves + for (map::iterator it = t_nodes.begin(); + it != t_nodes.end(); + it++) { // We go through the nodes, ordered according to their age. + scalar_type t = (*it).first; + Node *node = (*it).second; + branch_ts[last_branch] = t; + id_ranks[last_branch] = last_rank; + rank_ids[last_rank] = last_branch; + node_ids[node] = last_branch; + id_nodes[last_branch] = node; + vector sons = node->getSons(); // del-loc + daughters[last_branch].push_back(node_ids[sons[0]]); + daughters[last_branch].push_back(node_ids[sons[1]]); + father[node_ids[sons[0]]] = last_branch; + father[node_ids[sons[1]]] = last_branch; + + t_end[last_branch] = t; + if (node->hasFather()) + t_begin[last_branch] = node_ts[node->getFather()]; + // the root + else + t_begin[last_branch] = + t_end[last_branch] + scalar_parameter["stem_length"]; + last_rank++; + last_branch++; + sons.clear(); + } // extant_taxa map for id-ing branches across trees - int i=0; - for (map::iterator it=species_order.begin();it!=species_order.end();it++ ) - { - (*it).second=i; - //cout << (*it).first << " " << (*it).second << endl; - i++; - } - vector nodes = TreeTemplateTools::getNodes(*S_root); - //map extant_taxa; - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) - { - stringstream name; - if (not (*it)->isLeaf()) - { - Node * node=(*it); - vector tmp = TreeTemplateTools::getLeaves(*node); - map tmp2; - for (vector ::iterator jt=tmp.begin();jt!=tmp.end();jt++ ) - { - //cout << (*jt)->getName() << ":" << species_order[ (*jt)->getName() ]<getName()]=-1; - } - for (map::iterator jt=tmp2.begin();jt!=tmp2.end();jt++ ) - { - name<< species_order[(*jt).first] << "."; - } - } - else - { - name<< species_order[(*it)->getName()] << "."; - } - string taxa_name=name.str(); - int branch = node_ids[(*it)]; - //taxa_name.pop_back(); - extant_taxa[ branch ]=taxa_name; - //cout << branch << " " << extant_taxa[branch] << endl; + int i = 0; + for (map::iterator it = species_order.begin(); + it != species_order.end(); it++) { + (*it).second = i; + // cout << (*it).first << " " << (*it).second << endl; + i++; + } + vector nodes = TreeTemplateTools::getNodes(*S_root); + // map extant_taxa; + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + stringstream name; + if (not(*it)->isLeaf()) { + Node *node = (*it); + vector tmp = TreeTemplateTools::getLeaves(*node); + map tmp2; + for (vector::iterator jt = tmp.begin(); jt != tmp.end(); jt++) { + // cout << (*jt)->getName() << ":" << species_order[ (*jt)->getName() + // ]<getName()] = -1; + } + for (map::iterator jt = tmp2.begin(); jt != tmp2.end(); + jt++) { + name << species_order[(*jt).first] << "."; + } + } else { + name << species_order[(*it)->getName()] << "."; } + string taxa_name = name.str(); + int branch = node_ids[(*it)]; + // taxa_name.pop_back(); + extant_taxa[branch] = taxa_name; + // cout << branch << " " << extant_taxa[branch] << endl; + } // extant_taxa map end. - //set t_begin for terminal branches - for (map ::iterator it=extant_species.begin();it!=extant_species.end();it++ ) - { - int branch = (*it).first; - Node * node=id_nodes[branch]; - t_begin[branch]=node_ts[node->getFather()]; + // set t_begin for terminal branches + for (map::iterator it = extant_species.begin(); + it != extant_species.end(); it++) { + int branch = (*it).first; + Node *node = id_nodes[branch]; + t_begin[branch] = node_ts[node->getFather()]; + } + + for (int rank = 0; rank < last_rank; rank++) { + // terminal time slice terminated by present + if (rank == 0) { + for (int branch = 0; branch < last_branch; branch++) + if (t_end[branch] == 0) { + time_slices[rank].push_back(branch); + branch_slices[branch].push_back(rank); + } + } else { + // time slice terminated by next speciation + int terminating_branch = rank_ids[rank]; + for (vector::iterator it = time_slices[rank - 1].begin(); + it != time_slices[rank - 1].end(); it++) { + int branch = (*it); + if (father[branch] != terminating_branch) { + time_slices[rank].push_back(branch); + branch_slices[branch].push_back(rank); + } + } + // terminating branch is last in time_slices + time_slices[rank].push_back(terminating_branch); + branch_slices[terminating_branch].push_back(rank); } - - for (int rank=0;rank ::iterator it=time_slices[rank-1].begin();it!=time_slices[rank-1].end();it++) - { - int branch = (*it); - if (father[branch]!=terminating_branch) - { - time_slices[rank].push_back(branch); - branch_slices[branch].push_back(rank); - } - } - //terminating branch is last in time_slices - time_slices[rank].push_back(terminating_branch); - branch_slices[terminating_branch].push_back(rank); - } + } + + for (int rank = 0; rank < last_rank; rank++) { + scalar_type slice_end; + int terminating_branch; + if (rank + 1 < last_rank) { + terminating_branch = rank_ids[rank]; + slice_end = t_end[terminating_branch]; + } else if (rank == 0) + // rank 0 arrives at present + slice_end = 0; + else + // root is at t=1 + slice_end = tree_heigth; + scalar_type slice_begin; + if (rank + 1 < last_rank) + slice_begin = t_end[rank_ids[rank + 1]]; + else + // stem above root ends itself + slice_begin = t_begin[rank_ids[rank]]; + + scalar_type slice_height = slice_begin - slice_end; + + time_slice_times[rank].push_back(slice_end); + // we calculate the local D + scalar_type delta_t = scalar_parameter["grid_delta_t"]; + int min_D = scalar_parameter["min_D"]; + + int local_D = max((int)ceil(slice_height / delta_t), min_D); + // cout << rank << " " << local_D << " " << slice_height<< " " + // <::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - Node * node = (*it).first; - int branch = (*it).second; - stringstream out; - stringstream out1; - stringstream out2; - out1<hasBranchProperty("bootstrap") ) - { - rank2label[rank]=node->getBootstrapValue(); - //cout <"<setBranchProperty("ID",BppString(out.str())); + time_slice_begins[rank] = slice_begin; + + // for (scalar_type + // internal_interval=1;internal_interval::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + Node *node = (*it).first; + int branch = (*it).second; + stringstream out; + stringstream out1; + stringstream out2; + out1 << t_begin[branch]; + out2 << t_end[branch]; + int rank = id_ranks[branch]; + out << rank; + if (node->hasBranchProperty("bootstrap")) { + rank2label[rank] = node->getBootstrapValue(); + // cout <"<::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - //event node approximation - //cf. section S1.4 of the Supporting Material of www.pnas.org/cgi/doi/10.1073/pnas.1202997109 - //compatible with p(ale) - //not compatible with sample() and p_MLRec(ale) - set_model_parameter("event_node",0); - //the calculation of depends only very weakly on the value of N - //default value of N=1e6 is set in exODT.h - set_model_parameter("N",1); - set_model_parameter("sigma_hat",1); - - //if we assume the that height of the species tree is equal to its expected value under the colaescent - // cf. http://arxiv.org/abs/1211.4606 - //Delta is sigma, i.e. the speciation rate of the Moran model in http://arxiv.org/abs/1211.4606 and ALEPAPER - set_model_parameter("Delta_bar",N); - //Lambda is not used - set_model_parameter("Lambda_bar",N); + node->setBranchProperty("ID", BppString(out.str())); + } + string_parameter["S_with_ranks"] = + TreeTemplateTools::treeToParenthesis(*S, false, "ID"); + cout << string_parameter["S_with_ranks"] << endl; // XX + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + // event node approximation + // cf. section S1.4 of the Supporting Material of + // www.pnas.org/cgi/doi/10.1073/pnas.1202997109 compatible with p(ale) not + // compatible with sample() and p_MLRec(ale) + set_model_parameter("event_node", 0); + // the calculation of depends only very weakly on the value of N + // default value of N=1e6 is set in exODT.h + set_model_parameter("N", 1); + set_model_parameter("sigma_hat", 1); + + // if we assume the that height of the species tree is equal to its expected + // value under the colaescent + // cf. http://arxiv.org/abs/1211.4606 + // Delta is sigma, i.e. the speciation rate of the Moran model in + // http://arxiv.org/abs/1211.4606 and ALEPAPER + set_model_parameter("Delta_bar", N); + // Lambda is not used + set_model_parameter("Lambda_bar", N); // delta is the gene duplication rate - set_model_parameter("delta",0.2); + set_model_parameter("delta", 0.2); // tau is the gene transfer rate - set_model_parameter("tau",0.17); + set_model_parameter("tau", 0.17); // lambda is the gene loss rate - set_model_parameter("lambda",1.0); + set_model_parameter("lambda", 1.0); // O_R is the multiplier for O at the root - set_model_parameter("O_R",1.0); - set_model_parameter("seq_beta",1.0); - - for (int branch=0;branch (leaves.size(), 0.0); - //Put user-defined values, if available - if (fractionMissingFile == "") { - - } - else { - fraction_missing = readFractionMissingFile(fractionMissingFile); - } - - - //del-locs + set_model_parameter("O_R", 1.0); + set_model_parameter("seq_beta", 1.0); + + for (int branch = 0; branch < last_branch; branch++) { + branch_counts["Os"].push_back(0); + branch_counts["Ds"].push_back(0); + branch_counts["Ts"].push_back(0); + branch_counts["Tfroms"].push_back(0); + branch_counts["Ls"].push_back(0); + branch_counts["count"].push_back(0); + branch_counts["saw"].push_back(0); + branch_counts["O_LL"].push_back(0); + branch_counts["presence"].push_back(0); + branch_counts["copies"].push_back(0); + branch_counts["singleton"].push_back(0); + } + + // Put default values for the fraction of missing genes at the leaves. + vector_parameter["fraction_missing"] = + vector(leaves.size(), 0.0); + // Put user-defined values, if available + if (fractionMissingFile == "") { + + } else { + fraction_missing = readFractionMissingFile(fractionMissingFile); + } + + // del-locs node_ts.clear(); next_generation.clear(); leaves.clear(); } - -void exODT_model::set_model_parameter(string name,string value) -{ - string_parameter[name]=value; +void exODT_model::set_model_parameter(string name, string value) { + string_parameter[name] = value; } +void exODT_model::set_model_parameter(string name, scalar_type value) { -void exODT_model::set_model_parameter(string name,scalar_type value) -{ - - if (name=="delta" or name=="tau" or name=="lambda") - { - scalar_type N=vector_parameter["N"][0]; - vector_parameter[name].clear(); - for (int branch=0;branch value_vector) -{ - if (name=="delta" or name=="tau" or name=="lambda") - { - scalar_type N=vector_parameter["N"][0]; - vector_parameter[name].clear(); - scalar_type avg=0; - scalar_type c=0; - for (int branch=0;branch value_vector) { + if (name == "delta" or name == "tau" or name == "lambda") { + scalar_type N = vector_parameter["N"][0]; + vector_parameter[name].clear(); + scalar_type avg = 0; + scalar_type c = 0; + for (int branch = 0; branch < last_branch; branch++) { + if (name == "tau") { + vector_parameter[name].push_back(value_vector[branch] / N); + avg += value_vector[branch] / N; + } else { + vector_parameter[name].push_back(value_vector[branch]); + avg += value_vector[branch]; + } + c += 1; } + scalar_parameter[name + "_avg"] = avg / c; + } else // if (name=="N" or name=="Delta_bar" or name=="Lambda_bar" ) + { + vector_parameter[name].clear(); + for (int rank = 0; rank < last_rank; rank++) + vector_parameter[name].push_back(value_vector[rank]); + } } diff --git a/src/exODT.h b/src/exODT.h index 58ab6fc..7cde972 100644 --- a/src/exODT.h +++ b/src/exODT.h @@ -1,8 +1,7 @@ -//all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; +// all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; #include "ALE.h" -//using namespace std; -struct step -{ +// using namespace std; +struct step { int e; int ep; int epp; @@ -14,28 +13,26 @@ struct step std::string event; }; - /**************************************************************************** // exODT_model class. // This class contains the description of a time-sliced species tree, // with parameters of duplication, transfer and loss, and a "population size", // and an approx_posterior object. This object can compute the probability of - // an approx_posterior given the species tree and values for the ODTL parameters. + // an approx_posterior given the species tree and values for the ODTL + parameters. // Contains lots of maps to map time slices to nodes, nodes to time slices, // nodes to ages... // Each time slice ends at a speciation node, with a particular age. // Each time slice has a rank, with the most recent one with rank 0, // and the older one with rank number_of_species. *****************************************************************************/ -class exODT_model -{ - public: - - std::map rank2label; +class exODT_model { +public: + std::map rank2label; - std::map scalar_parameter;//del_loc - std::map > vector_parameter;//del_loc - std::map string_parameter;//del_loc + std::map scalar_parameter; // del_loc + std::map> vector_parameter; // del_loc + std::map string_parameter; // del_loc int signal; std::string signal_string; @@ -43,73 +40,114 @@ class exODT_model int alpha; int last_branch; int last_rank; - approx_posterior * ale_pointer; //Pointer to an approx_posterior object on which dynamic programming is performed in p for instance. + approx_posterior + *ale_pointer; // Pointer to an approx_posterior object on which dynamic + // programming is performed in p for instance. - //Runge-Krutta variables - std::vector Ee_y ;//del-loc + // Runge-Krutta variables + std::vector Ee_y; // del-loc scalar_type Ee_y_1; // map Ge_y;//del-loc - std::vector Ge_y ;//del-loc + std::vector Ge_y; // del-loc scalar_type Ge_y_1; - std::vector E_k1 ; - std::vector E_k2 ; - std::vector E_k3 ; - std::vector E_k4 ;//del-loc. Maps used for Runge-Kutta computations (4 stages). + std::vector E_k1; + std::vector E_k2; + std::vector E_k3; + std::vector + E_k4; // del-loc. Maps used for Runge-Kutta computations (4 stages). scalar_type E_k1_1, E_k2_1, E_k3_1, E_k4_1; std::vector G_k1; std::vector G_k2; std::vector G_k3; - std::vector G_k4;//del-loc. Maps used for Runge-Kutta computations (4 stages). + std::vector + G_k4; // del-loc. Maps used for Runge-Kutta computations (4 stages). scalar_type G_k1_1, G_k2_1, G_k3_1, G_k4_1; - - std::map father; //del-loc. Map between node id and id of its father. - std::map > daughters; //del-loc. Map between node id and ids of its daughters (-1 if node is a leaf). - std::map daughter; - std::map son; - - std::map extant_species; //del-loc. Map between leaf id (0 to # of leaves) and leaf name. - std::map extant_taxa; // extant_taxa map for id-ing branches across trees - - - std::map branch_ts; //del-loc. Map between branch identified by the id of the node it ends at, and time of the time slice. - std::maprank_ids; //del-loc. Map between rank of a time slice and the id of the node that terminates it. - std::mapid_ranks; //del-loc. Map between node id and rank of the time slice it terminates. Time slice at leaves has rank 0. - - tree_type * S; //Species tree - bpp::Node * S_root; //Root of the species tree - std::mapnode_ids; //Map between node and its id. - std::mapid_nodes; //Dual from the above, map between node id and node. - - std::map t_begin; //del-loc. Map between the id of a node, and the beginning of the branch that leads to it. - std::map t_end; //del-loc. Map between the id of a node, and the end of the time slice it defines, corresponding to the age of this node. - - std::map > time_slices; //del-loc. Map between rank of time slice and indices of branches going through it. Terminating branch is last in the vector. - std::map > branch_slices; //del-loc. Map between a branch and all the time slices it traverses. - std::map > time_slice_times; //del-loc. Map between rank of time slice and all the end times of the sub-slices inside this time slice. - std::map time_slice_begins; //del-loc. Map between rank of time slice and begin time of this time slice. - - //Variables used for computing. - std::map > Ee; //del-loc. Probability (scalar value) that a gene present at a given time slice (whose rank is the int key) at time the first scalar key is getting extinct before reaching extant species. - std::map name_node; - std::map node_name; - std::map > ancestral_names; - std::map > ancestral; - std::map > below; - - std::vector < std::vector > ancestors; // contains the ancestors of a given branch; useful to forbid transfers to them. + std::map + father; // del-loc. Map between node id and id of its father. + std::map> + daughters; // del-loc. Map between node id and ids of its daughters (-1 if + // node is a leaf). + std::map daughter; + std::map son; + + std::map extant_species; // del-loc. Map between leaf id (0 + // to # of leaves) and leaf name. + std::map + extant_taxa; // extant_taxa map for id-ing branches across trees + + std::map + branch_ts; // del-loc. Map between branch identified by the id of the node + // it ends at, and time of the time slice. + std::map rank_ids; // del-loc. Map between rank of a time slice and + // the id of the node that terminates it. + std::map + id_ranks; // del-loc. Map between node id and rank of the time slice it + // terminates. Time slice at leaves has rank 0. + + tree_type *S; // Species tree + bpp::Node *S_root; // Root of the species tree + std::map node_ids; // Map between node and its id. + std::map + id_nodes; // Dual from the above, map between node id and node. + + std::map + t_begin; // del-loc. Map between the id of a node, and the beginning of + // the branch that leads to it. + std::map + t_end; // del-loc. Map between the id of a node, and the end of the time + // slice it defines, corresponding to the age of this node. + + std::map> + time_slices; // del-loc. Map between rank of time slice and indices of + // branches going through it. Terminating branch is last in + // the vector. + std::map> + branch_slices; // del-loc. Map between a branch and all the time slices it + // traverses. + std::map> + time_slice_times; // del-loc. Map between rank of time slice and all the + // end times of the sub-slices inside this time slice. + std::map + time_slice_begins; // del-loc. Map between rank of time slice and begin + // time of this time slice. + + // Variables used for computing. + std::map> + Ee; // del-loc. Probability (scalar value) that a gene present at a given + // time slice (whose rank is the int key) at time the first scalar key + // is getting extinct before reaching extant species. + std::map name_node; + std::map node_name; + std::map> ancestral_names; + std::map> ancestral; + std::map> below; + + std::vector> + ancestors; // contains the ancestors of a given branch; useful to forbid + // transfers to them. std::vector fm; // Fraction of genes missing at the tips - std::vector uE; // Probability for a gene to become extinct on each branch - scalar_type mPTE; // Mean probability across all branches for a gene to be transferred to branch h and then become extinct on that branch h - std::vector mPTE_ancestral_correction; // branch-wise adjustments of mPTE to obtain the branch-wise probability for a gene to be transferred to branch h and then become extinct on that branch h by doing mPTE - mPTE_ancestral_correction[e]. These branch-wise corrections are here to forbid transfers to ancestors of a branch. + std::vector + uE; // Probability for a gene to become extinct on each branch + scalar_type + mPTE; // Mean probability across all branches for a gene to be transferred + // to branch h and then become extinct on that branch h + std::vector + mPTE_ancestral_correction; // branch-wise adjustments of mPTE to obtain + // the branch-wise probability for a gene to be + // transferred to branch h and then become + // extinct on that branch h by doing mPTE - + // mPTE_ancestral_correction[e]. These + // branch-wise corrections are here to forbid + // transfers to ancestors of a branch. int root_i; - std::vector < std::vector > uq; - std::vector < scalar_type > mPTuq; - std::vector < std::vector > mPTuq_ancestral_correction; + std::vector> uq; + std::vector mPTuq; + std::vector> mPTuq_ancestral_correction; std::vector PD; // Duplication probability, per branch std::vector wT; // Transfer probability, per branch @@ -123,163 +161,226 @@ class exODT_model std::vector PL; // Loss probability, per branch std::vector PS; // Speciation probability, per branch int last_leaf; - std::map > Ge; //del-loc. Probability (scalar value) that a gene present at a given time slice (whose rank is the int key) actually reaches extant species. - std::map > > q; //del-loc. Map between clade id (from the approx_posterior object) and a map between the time of a subslice and a map between branch id and probability of the clade given the ODTL model. - - std::vector< std::vector < std::vector < std::map > > > qvec;// NO del-loc !! - - std::map > > q_step; //del-loc - std::map gid_sps; //del-loc. Map between clade id (from the approx_posterior object) and species included in that clade. - - std::map MLRec_events; //del-loc - std::map > branch_counts; //del-loc - std::vector Ttokens; //del-loc - - std::map > gid_events; //del-loc - std::map > gid_times; //del-loc - std::map > gid_branches; //del-loc - std::map > gid_gidp; //del-loc - std::map > gid_gidpp; //del-loc + std::map> + Ge; // del-loc. Probability (scalar value) that a gene present at a given + // time slice (whose rank is the int key) actually reaches extant + // species. + std::map>> + q; // del-loc. Map between clade id (from the approx_posterior object) and + // a map between the time of a subslice and a map between branch id and + // probability of the clade given the ODTL model. + + std::vector>>> + qvec; // NO del-loc !! + + std::map>> + q_step; // del-loc + std::map + gid_sps; // del-loc. Map between clade id (from the approx_posterior + // object) and species included in that clade. + + std::map MLRec_events; // del-loc + std::map> branch_counts; // del-loc + std::vector Ttokens; // del-loc + + std::map> gid_events; // del-loc + std::map> gid_times; // del-loc + std::map> gid_branches; // del-loc + std::map> gid_gidp; // del-loc + std::map> gid_gidpp; // del-loc std::map fraction_missing; double height(bpp::Node *node); - - void construct_undated(const std::string& Sstring, const std::string& fractionMissingFile=""); //Constructs an object given a species tree and file containing fractions of missing genes per species. + + void construct_undated( + const std::string &Sstring, + const std::string &fractionMissingFile = + ""); // Constructs an object given a species tree and file containing + // fractions of missing genes per species. void calculate_undatedEs(); - scalar_type pun(approx_posterior *ale, bool verbose=false,bool no_T=false); + scalar_type pun(approx_posterior *ale, bool verbose = false, + bool no_T = false); std::string feSPR(int e, int f); std::vector NNIs(int e); - std::string sample_undated(bool no_T=false); - std::string sample_undated(int e,int i,std::string last_event,std::string branch_string="",bool no_T=false); - std::vector g_ids; - std::vector g_id_sizes; - std::map g_id2i; + std::string sample_undated(bool no_T = false); + std::string sample_undated(int e, int i, std::string last_event, + std::string branch_string = "", bool no_T = false); + std::vector g_ids; + std::vector g_id_sizes; + std::map g_id2i; - //implemented in exODT.cpp + // implemented in exODT.cpp - void construct(const std::string& Sstring, const scalar_type& N=1e6, const std::string& fractionMissingFile=""); //Constructs an object given a species tree, population size and file containing fractions of missing genes per species. + void construct( + const std::string &Sstring, const scalar_type &N = 1e6, + const std::string &fractionMissingFile = + ""); // Constructs an object given a species tree, population size and + // file containing fractions of missing genes per species. exODT_model(); - ~exODT_model() - { - rank_ids.clear(); - id_ranks.clear(); - father.clear(); - for (std::map >::iterator it=daughters.begin();it!=daughters.end();it++) - (*it).second.clear(); - daughters.clear(); - extant_species.clear(); - branch_ts.clear(); - rank_ids.clear(); - id_ranks.clear(); - t_begin.clear(); - t_end.clear(); - for (std::map >::iterator it=time_slices.begin();it!=time_slices.end();it++) - (*it).second.clear(); - time_slices.clear(); - for (std::map >::iterator it=branch_slices.begin();it!=branch_slices.end();it++) - (*it).second.clear(); - for (std::map >::iterator it=time_slice_times.begin();it!=time_slice_times.end();it++) - (*it).second.clear(); - time_slice_times.clear(); - time_slice_begins.clear(); - scalar_parameter.clear(); - for (std::map >::iterator it=vector_parameter.begin();it!=vector_parameter.end();it++)//del_loc - (*it).second.clear(); - vector_parameter.clear(); - string_parameter.clear(); - node_ids.clear(); - id_nodes.clear(); - delete S; - for (std::map >::iterator it=Ee.begin();it!=Ee.end();it++)//del_loc - (*it).second.clear(); - Ee.clear(); - for (std::map >::iterator it=Ge.begin();it!=Ge.end();it++)//del_loc - (*it).second.clear(); - Ge.clear(); - Ee.clear(); - for (std::map > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } - q.clear(); - for (std::map > >::iterator it=q_step.begin();it!=q_step.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } - q_step.clear(); - gid_sps.clear(); - MLRec_events.clear(); - for (std::map >::iterator it=branch_counts.begin();it!=branch_counts.end();it++)//del_loc - (*it).second.clear(); - branch_counts.clear(); - - for (std::map >::iterator it=gid_events.begin();it!=gid_events.end();it++)//del_loc - (*it).second.clear(); - gid_events.clear(); - - for (std::map >::iterator it=gid_times.begin();it!=gid_times.end();it++)//del_loc - (*it).second.clear(); - gid_times.clear(); - - for (std::map >::iterator it=gid_branches.begin();it!=gid_branches.end();it++)//del_loc - (*it).second.clear(); - gid_branches.clear(); - - - Ttokens.clear(); - }; - - void set_model_parameter(std::string name,std::string value); //Sets the value of a string parameter. - void set_model_parameter(std::string,scalar_type); //Sets the value of a scalar parameter. - void set_model_parameter(std::string,std::vector); //Sets the value of a vector of scalars parameter. - - //implemented in model.cpp - scalar_type p(approx_posterior *ale); //Computes the probability of an approx_posterior according to the species tree and parameter values. + ~exODT_model() { + rank_ids.clear(); + id_ranks.clear(); + father.clear(); + for (std::map>::iterator it = daughters.begin(); + it != daughters.end(); it++) + (*it).second.clear(); + daughters.clear(); + extant_species.clear(); + branch_ts.clear(); + rank_ids.clear(); + id_ranks.clear(); + t_begin.clear(); + t_end.clear(); + for (std::map>::iterator it = time_slices.begin(); + it != time_slices.end(); it++) + (*it).second.clear(); + time_slices.clear(); + for (std::map>::iterator it = branch_slices.begin(); + it != branch_slices.end(); it++) + (*it).second.clear(); + for (std::map>::iterator it = + time_slice_times.begin(); + it != time_slice_times.end(); it++) + (*it).second.clear(); + time_slice_times.clear(); + time_slice_begins.clear(); + scalar_parameter.clear(); + for (std::map>::iterator it = + vector_parameter.begin(); + it != vector_parameter.end(); it++) // del_loc + (*it).second.clear(); + vector_parameter.clear(); + string_parameter.clear(); + node_ids.clear(); + id_nodes.clear(); + delete S; + for (std::map>::iterator it = + Ee.begin(); + it != Ee.end(); it++) // del_loc + (*it).second.clear(); + Ee.clear(); + for (std::map>::iterator it = + Ge.begin(); + it != Ge.end(); it++) // del_loc + (*it).second.clear(); + Ge.clear(); + Ee.clear(); + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } + q.clear(); + for (std::map>>::iterator it = + q_step.begin(); + it != q_step.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } + q_step.clear(); + gid_sps.clear(); + MLRec_events.clear(); + for (std::map>::iterator it = + branch_counts.begin(); + it != branch_counts.end(); it++) // del_loc + (*it).second.clear(); + branch_counts.clear(); + + for (std::map>::iterator it = + gid_events.begin(); + it != gid_events.end(); it++) // del_loc + (*it).second.clear(); + gid_events.clear(); + + for (std::map>::iterator it = + gid_times.begin(); + it != gid_times.end(); it++) // del_loc + (*it).second.clear(); + gid_times.clear(); + + for (std::map>::iterator it = + gid_branches.begin(); + it != gid_branches.end(); it++) // del_loc + (*it).second.clear(); + gid_branches.clear(); + + Ttokens.clear(); + }; + + void set_model_parameter( + std::string name, + std::string value); // Sets the value of a string parameter. + void + set_model_parameter(std::string, + scalar_type); // Sets the value of a scalar parameter. + void set_model_parameter( + std::string, std::vector); // Sets the value of a vector of + // scalars parameter. + + // implemented in model.cpp + scalar_type p(approx_posterior * + ale); // Computes the probability of an approx_posterior + // according to the species tree and parameter values. void calculate_EG(); - void calculate_EGb(); //Fills Ee. Calculates extinction probabilities per branch and per time slice. - - //implemented in traceback.cpp - std::pair p_MLRec(approx_posterior *ale,bool lowmem=true); - std::pair traceback(); - std::string traceback(long int g_id,scalar_type t,scalar_type rank,int e,scalar_type branch_length,std::string branch_events,std::string transfer_token=""); + void calculate_EGb(); // Fills Ee. Calculates extinction probabilities per + // branch and per time slice. + + // implemented in traceback.cpp + std::pair p_MLRec(approx_posterior *ale, + bool lowmem = true); + std::pair traceback(); + std::string traceback(long int g_id, scalar_type t, scalar_type rank, int e, + scalar_type branch_length, std::string branch_events, + std::string transfer_token = ""); void register_O(int e); void register_D(int e); void register_Tto(int e); void register_Tfrom(int e); void register_L(int e); void register_S(int e); - void register_Su(int e,std::string last_event); - void register_T_to_from(int e,int f); - void reset_T_to_from( ); + void register_Su(int e, std::string last_event); + void register_T_to_from(int e, int f); + void reset_T_to_from(); - std::vector < std::vector > T_to_from; + std::vector> T_to_from; void register_leaf(int e); - void register_leafu(int e,std::string last_event); + void register_leafu(int e, std::string last_event); void register_Ttoken(std::string token); - //implemented in traceback_lowmem.cpp - not done - std::pair p_MLRec_lowmem(approx_posterior *ale); - std::string traceback_lowmem(long int g_id,scalar_type t,scalar_type rank,int e,scalar_type branch_length,std::string branch_events,std::string transfer_token=""); - //implemented in sample.cpp - std::string sample(bool max_rec=false); - std::string sample(bool S_node,long int g_id,int t_i,scalar_type rank,int e,scalar_type branch_length,std::string branch_events, std::string transfer_toke="",bool max_rec=false); - - - void show_counts(std::string name, bool as_branch_length=true, bool per_copy=false); - std::string counts_string(scalar_type samples=1); - std::string counts_string_undated(scalar_type samples=1); + // implemented in traceback_lowmem.cpp - not done + std::pair p_MLRec_lowmem(approx_posterior *ale); + std::string traceback_lowmem(long int g_id, scalar_type t, scalar_type rank, + int e, scalar_type branch_length, + std::string branch_events, + std::string transfer_token = ""); + // implemented in sample.cpp + std::string sample(bool max_rec = false); + std::string sample(bool S_node, long int g_id, int t_i, scalar_type rank, + int e, scalar_type branch_length, + std::string branch_events, std::string transfer_toke = "", + bool max_rec = false); + + void show_counts(std::string name, bool as_branch_length = true, + bool per_copy = false); + std::string counts_string(scalar_type samples = 1); + std::string counts_string_undated(scalar_type samples = 1); void show_rates(std::string name); std::string gid_string(long int g_id); - std::string vertical_string(long int g_id,std::string ancestral_string="",scalar_type t_0=-1); + std::string vertical_string(long int g_id, std::string ancestral_string = "", + scalar_type t_0 = -1); - private: +private: ; - }; diff --git a/src/exODT_sim.cpp b/src/exODT_sim.cpp index f40ebfd..19c602d 100644 --- a/src/exODT_sim.cpp +++ b/src/exODT_sim.cpp @@ -2,150 +2,149 @@ using namespace std; using namespace bpp; -exODT_sim::exODT_sim( int N_in, long int S_seed_in, scalar_type init_t_in, scalar_type sigma_in) -{ - if (S_seed_in==-1) - S_seed=good_seed(); +exODT_sim::exODT_sim(int N_in, long int S_seed_in, scalar_type init_t_in, + scalar_type sigma_in) { + if (S_seed_in == -1) + S_seed = good_seed(); else - S_seed=S_seed_in; + S_seed = S_seed_in; - N=N_in; + N = N_in; - if (sigma_in==-1) - sigma=N; + if (sigma_in == -1) + sigma = N; else - sigma=sigma_in; + sigma = sigma_in; - init_t=init_t_in; - - RandomTools::setSeed(S_seed); - - cout << "# Species seed is : " << S_seed << endl; - - next_index=0; - - for (int i=0;i family; - long int mother=population[birth]; - long int daugther=next_index; - next_index++; - long int son=next_index; - next_index++; - family.push_back(mother); - family.push_back(daugther); - family.push_back(son); - - population[birth]=daugther; - population[death]=son; - - families.push_back(family); - event_times[species_event]=t; - - species_event++; - - } - number_of_species_events=species_event; -} - -string exODT_sim::sample_species(int n_in) -{ - n=n_in; - - //sample + init_t = init_t_in; - vector population_indicies;//del-loc - for (int i=0;i sampled_population_counts;//del-loc - map strings; - map age; - for (int i=0;i family; + long int mother = population[birth]; + long int daugther = next_index; + next_index++; + long int son = next_index; + next_index++; + family.push_back(mother); + family.push_back(daugther); + family.push_back(son); + + population[birth] = daugther; + population[death] = son; + + families.push_back(family); + event_times[species_event] = t; + + species_event++; + } + number_of_species_events = species_event; +} - int rank=0; - for(vector >::reverse_iterator event=families.rbegin();event!=families.rend();event++) - { - scalar_type t_event=event_times[species_event-1]; - long int mother=(*event)[0]; - long int daugther=(*event)[1]; - long int son=(*event)[2]; - if (sampled_population_counts[daugther]==1 and sampled_population_counts[son]==1) - { - rank++; - sampled_population_counts[daugther]=0; - sampled_population_counts[son]=0; - sampled_population_counts[mother]=1; - stringstream sons_bl; - stringstream daugthers_bl; - stringstream rank_bs; - rank_bs << rank; - sons_bl << t_event - age[son]; - daugthers_bl << t_event - age[daugther]; - strings[mother]="("+strings[daugther]+":"+daugthers_bl.str()+","+strings[son]+":"+sons_bl.str()+")"+rank_bs.str(); - age[mother]=t_event; - lca=mother; - } - else if (sampled_population_counts[daugther]==1) - { - sampled_population_counts[daugther]=0; - sampled_population_counts[mother]=1; - strings[mother]=strings[daugther]; - age[mother]=age[daugther]; - fca=mother; - } - else if (sampled_population_counts[son]==1) - { - sampled_population_counts[son]=0; - sampled_population_counts[mother]=1; - strings[mother]=strings[son]; - age[mother]=age[son]; - fca=mother; - } - species_event--; +string exODT_sim::sample_species(int n_in) { + n = n_in; + + // sample + + vector population_indicies; // del-loc + for (int i = 0; i < N; i++) + population_indicies.push_back(i); + + for (int i = 0; i < n; i++) + sampled_population_indicies.push_back(-1); + + RandomTools::getSample(population_indicies, sampled_population_indicies); + + for (int i = 0; i < n; i++) + sampled_population.push_back(population[sampled_population_indicies[i]]); + + // traceback + + map sampled_population_counts; // del-loc + map strings; + map age; + for (int i = 0; i < n; i++) { + long int extant_species = sampled_population[i]; + stringstream extant_species_name; + extant_species_name << i; // extant_species; + strings[extant_species] = extant_species_name.str(); + sampled_population_counts[extant_species] = 1; + age[extant_species] = 0; + } + + fca = -1; + + int rank = 0; + for (vector>::reverse_iterator event = families.rbegin(); + event != families.rend(); event++) { + scalar_type t_event = event_times[species_event - 1]; + long int mother = (*event)[0]; + long int daugther = (*event)[1]; + long int son = (*event)[2]; + if (sampled_population_counts[daugther] == 1 and + sampled_population_counts[son] == 1) { + rank++; + sampled_population_counts[daugther] = 0; + sampled_population_counts[son] = 0; + sampled_population_counts[mother] = 1; + stringstream sons_bl; + stringstream daugthers_bl; + stringstream rank_bs; + rank_bs << rank; + sons_bl << t_event - age[son]; + daugthers_bl << t_event - age[daugther]; + strings[mother] = "(" + strings[daugther] + ":" + daugthers_bl.str() + + "," + strings[son] + ":" + sons_bl.str() + ")" + + rank_bs.str(); + age[mother] = t_event; + lca = mother; + } else if (sampled_population_counts[daugther] == 1) { + sampled_population_counts[daugther] = 0; + sampled_population_counts[mother] = 1; + strings[mother] = strings[daugther]; + age[mother] = age[daugther]; + fca = mother; + } else if (sampled_population_counts[son] == 1) { + sampled_population_counts[son] = 0; + sampled_population_counts[mother] = 1; + strings[mother] = strings[son]; + age[mother] = age[son]; + fca = mother; } + species_event--; + } stringstream root_bl; - root_bl< random_tree_population; - map random_tree_ages; - for (int i=0;i1) - { - int Nr=random_tree_population.size(); - scalar_type t_next=RandomTools::randExponential(1./(2*Nr)); - t+=t_next; - int i=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - int j=i; - while (i==j) j=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - stringstream tmp; - tmp<<"("<i) - { - random_tree_population.erase(random_tree_population.begin()+j); - random_tree_population.erase(random_tree_population.begin()+i); - } - else - { - random_tree_population.erase(random_tree_population.begin()+i); - random_tree_population.erase(random_tree_population.begin()+j); - } - random_tree_population.push_back(tmp.str()); + vector random_tree_population; + map random_tree_ages; + for (int i = 0; i < n; i++) { + stringstream tmp; + tmp << i; + random_tree_population.push_back(tmp.str()); + random_tree_ages[tmp.str()] = 0; + } + scalar_type t = 0; + while (random_tree_population.size() > 1) { + int Nr = random_tree_population.size(); + scalar_type t_next = RandomTools::randExponential(1. / (2 * Nr)); + t += t_next; + int i = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + int j = i; + while (i == j) + j = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + stringstream tmp; + tmp << "(" << random_tree_population[i] << ":" + << t - random_tree_ages[random_tree_population[i]] << "," + << random_tree_population[j] << ":" + << t - random_tree_ages[random_tree_population[j]] << ")"; + random_tree_ages[tmp.str()] = t; + if (j > i) { + random_tree_population.erase(random_tree_population.begin() + j); + random_tree_population.erase(random_tree_population.begin() + i); + } else { + random_tree_population.erase(random_tree_population.begin() + i); + random_tree_population.erase(random_tree_population.begin() + j); } + random_tree_population.push_back(tmp.str()); + } /* stringstream rfname; @@ -202,326 +200,331 @@ string exODT_sim::sample_species(int n_in) r_out << random_tree_population[0]<<";" << endl; */ - R_string=random_tree_population[0]+";"; + R_string = random_tree_population[0] + ";"; - //del-locs + // del-locs population_indicies.clear(); sampled_population_counts.clear(); return S_string; } -vector exODT_sim::simulate_gene_trees(int G_n,scalar_type delta,scalar_type tau,scalar_type lambda,scalar_type omega,bool only_root, long int G_seed_in,bool event_string) -{ +vector exODT_sim::simulate_gene_trees( + int G_n, scalar_type delta, scalar_type tau, scalar_type lambda, + scalar_type omega, bool only_root, long int G_seed_in, bool event_string) { - if (G_seed_in==-1) - G_seed=good_seed(); + if (G_seed_in == -1) + G_seed = good_seed(); else - G_seed=G_seed_in; + G_seed = G_seed_in; - cout << "# Gene tree simulation seed is : " << G_seed << endl; + cout << "# Gene tree simulation seed is : " << G_seed << endl; // genes RandomTools::setSeed(G_seed); - //we seed genes .. this part is trvilally parallelizable.. - long int gene_count=0; - map > population_of_genes; - for (int i=0;i genes_in_species_i; - //we could have more genes per species .. - for (int j=0;j > > gene_families; - map gene_event_times; - map gene_event_types; - - scalar_type t=init_t; - species_event=0; + // we seed genes .. this part is trvilally parallelizable.. + long int gene_count = 0; + map> population_of_genes; + for (int i = 0; i < N; i++) { + vector genes_in_species_i; + // we could have more genes per species .. + for (int j = 0; j < G_n; j++) { + if (i == fca or not only_root) { + genes_in_species_i.push_back(gene_count); + gene_count++; + } + } + population_of_genes[i] = genes_in_species_i; + }; + // cout <<"GC: "<< gene_count << endl; + //..we replay species history + species_event = 0; + // and record gene stories.. + long int next_gene = gene_count; + vector>> gene_families; + map gene_event_times; + map gene_event_types; + + scalar_type t = init_t; + species_event = 0; cout << "#gene stories .." << endl; - long long gene_event=0; - boost::progress_display show_progress( number_of_species_events ); - while(species_event t_species_event) - { - t-=t_next; - - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(rate_sum); - if (r family; - long int father=population_of_genes[species][gene]; - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - - vector < vector > fam_vec; - fam_vec.push_back(family); - gene_families.push_back(fam_vec); - gene_event++; - - gene_event_times[father]=t; - if (event_string) gene_event_types[father]="D"; else gene_event_types[father]=""; - - population_of_genes[species][gene]=daugther; - population_of_genes[species].push_back(son); - gene_count++; - // cout << "D."< family; - long int father=population_of_genes[species][gene]; - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - vector < vector > fam_vec; - fam_vec.push_back(family); - gene_families.push_back(fam_vec); - gene_event++; - - gene_event_times[father]=t; - if (event_string) gene_event_types[father]="T"; else gene_event_types[father]=""; - - population_of_genes[species][gene]=daugther; - population_of_genes[T_to].push_back(son); - gene_count++; - //cout << "T."< genes_in_daugther_species; - vector genes_in_son_species; - vector < vector > fam_vec; - - for (vector ::iterator gene=population_of_genes[births[species_event]].begin();gene!=population_of_genes[births[species_event]].end();gene++) - { - vector family; - long int father=(*gene); - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - fam_vec.push_back(family); - - genes_in_daugther_species.push_back(daugther); - genes_in_son_species.push_back(son); - gene_count++; - if (event_string) gene_event_types[father]="S"; else gene_event_types[father]=""; - gene_event_times[father]=t; - - } - - gene_count-=population_of_genes[deaths[species_event]].size(); - - population_of_genes[births[species_event]].clear(); - population_of_genes[births[species_event]]=genes_in_daugther_species; - population_of_genes[deaths[species_event]]=genes_in_son_species; - - //cout << "S."< t_species_event) { + t -= t_next; + + scalar_type r = + RandomTools::giveRandomNumberBetweenZeroAndEntry(rate_sum); + if (r < gene_count * delta + gene_count * tau + gene_count * lambda) { + int gene = -1; + int species = -1; + + long int gene_r = + RandomTools::giveIntRandomNumberBetweenZeroAndEntry(gene_count); + long int gene_re_count = 0; + gene_re_count = 0; + for (int i = 0; i < N; i++) { + if (gene_r < + gene_re_count + (long int)population_of_genes[i].size()) { + gene = (gene_re_count + (long int)population_of_genes[i].size() - + gene_r) - + 1; + species = i; + break; + } + gene_re_count += population_of_genes[i].size(); + } + + if (r < gene_count * delta) + // D + { + // cout << "D"< family; + long int father = population_of_genes[species][gene]; + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + + vector> fam_vec; + fam_vec.push_back(family); + gene_families.push_back(fam_vec); + gene_event++; + + gene_event_times[father] = t; + if (event_string) + gene_event_types[father] = "D"; + else + gene_event_types[father] = ""; + + population_of_genes[species][gene] = daugther; + population_of_genes[species].push_back(son); + gene_count++; + // cout << "D."< family; + long int father = population_of_genes[species][gene]; + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + vector> fam_vec; + fam_vec.push_back(family); + gene_families.push_back(fam_vec); + gene_event++; + + gene_event_times[father] = t; + if (event_string) + gene_event_types[father] = "T"; + else + gene_event_types[father] = ""; + + population_of_genes[species][gene] = daugther; + population_of_genes[T_to].push_back(son); + gene_count++; + // cout << "T."< genes_in_daugther_species; + vector genes_in_son_species; + vector> fam_vec; + + for (vector::iterator gene = + population_of_genes[births[species_event]].begin(); + gene != population_of_genes[births[species_event]].end(); gene++) { + vector family; + long int father = (*gene); + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + fam_vec.push_back(family); + + genes_in_daugther_species.push_back(daugther); + genes_in_son_species.push_back(son); + gene_count++; + if (event_string) + gene_event_types[father] = "S"; + else + gene_event_types[father] = ""; + gene_event_times[father] = t; + } - // cout << "#gene simulation ends "<< Ds << " Ds; "<< Ts << " Ts; " << Ls << " Ls; " << Os <<" Os; "<< Ss << " Ss."< sampled_gene_counts; - map gene_strings; - map gene_age; - int j=0; - for (int i=0;i::iterator git=population_of_genes[ sampled_i ].begin();git!=population_of_genes[ sampled_i ].end();git++) - { - long int extant_gene=(*git); - stringstream extant_gene_name; - extant_gene_name << i << "_" << j;// extant_species << "_" << extant_gene; - gene_strings[extant_gene]=extant_gene_name.str(); - sampled_gene_counts[extant_gene]=1; - gene_age[extant_gene]=0; - j++; - } - } - int gene_rank=0; + // cout << "S."< suffix; - for(vector > > ::reverse_iterator event_vec=gene_families.rbegin();event_vec!=gene_families.rend();event_vec++) - { + // XX oh my, what a waste of time .. + gene_families.push_back(fam_vec); + gene_event++; - for(vector< vector > ::iterator event=(*event_vec).begin();event!=(*event_vec).end();event++) - { - - long int father=(*event)[0]; - long int daugther=(*event)[1]; - long int son=(*event)[2]; - scalar_type t_event=gene_event_times[father]; - - if (sampled_gene_counts[daugther]==1 and sampled_gene_counts[son]==1) - { - gene_rank++; - sampled_gene_counts[daugther]=0; - sampled_gene_counts[son]=0; - sampled_gene_counts[father]=1; - stringstream sons_bl; - stringstream daugthers_bl; - stringstream gene_rank_bs; - gene_rank_bs << gene_event_types[father] << suffix[son] << suffix[daugther]; - suffix[father]=""; - sons_bl << t_event - gene_age[son]; - daugthers_bl << t_event - gene_age[daugther]; - gene_strings[father]="("+gene_strings[daugther]+":"+daugthers_bl.str()+","+gene_strings[son]+":"+sons_bl.str()+")"+gene_rank_bs.str(); - gene_age[father]=t_event; - //lca=father; - } - else if (sampled_gene_counts[daugther]==1) - { - sampled_gene_counts[daugther]=0; - sampled_gene_counts[father]=1; - gene_strings[father]=gene_strings[daugther]; - suffix[father]=suffix[daugther]; - gene_age[father]=gene_age[daugther]; - - } - else if (sampled_gene_counts[son]==1) - { - sampled_gene_counts[son]=0; - sampled_gene_counts[father]=1; - gene_strings[father]=gene_strings[son]; - suffix[father]=suffix[son]; - if (gene_event_types[father]=="T") suffix[father]+="T"; - gene_age[father]=gene_age[son]; - } - } - gene_event--; - ++show_trace_progress; + species_event++; + ++show_progress; } - - //int i=0; - for (map::iterator git=sampled_gene_counts.begin(); git!=sampled_gene_counts.end(); git++) - if ((*git).second==1 and gene_strings[(*git).first].find("(")!=string::npos ) - { - /* - stringstream fname; - fname << "G_" << S_seed << "_" << G_seed << "_" << i << ".tree"; - ofstream gene_out( fname.str().c_str() ); - gene_out << gene_strings[(*git).first] << ";" << endl; - i++; - */ - gene_trees.push_back(gene_strings[(*git).first] + ";"); - + } + + // cout << "#gene simulation ends "<< Ds << " Ds; "<< Ts << " Ts; " << Ls << " + // Ls; " << Os <<" Os; "<< Ss << " Ss."< sampled_gene_counts; + map gene_strings; + map gene_age; + int j = 0; + for (int i = 0; i < n; i++) { + int sampled_i = sampled_population_indicies[i]; + long int extant_species = sampled_population[i]; + if (extant_species == population[sampled_i]) + for (vector::iterator git = + population_of_genes[sampled_i].begin(); + git != population_of_genes[sampled_i].end(); git++) { + long int extant_gene = (*git); + stringstream extant_gene_name; + extant_gene_name << i << "_" + << j; // extant_species << "_" << extant_gene; + gene_strings[extant_gene] = extant_gene_name.str(); + sampled_gene_counts[extant_gene] = 1; + gene_age[extant_gene] = 0; + j++; + } + } + int gene_rank = 0; + + cout << "#traceback begins.." << endl; + boost::progress_display show_trace_progress(gene_event); + map suffix; + for (vector>>::reverse_iterator event_vec = + gene_families.rbegin(); + event_vec != gene_families.rend(); event_vec++) { + + for (vector>::iterator event = (*event_vec).begin(); + event != (*event_vec).end(); event++) { + + long int father = (*event)[0]; + long int daugther = (*event)[1]; + long int son = (*event)[2]; + scalar_type t_event = gene_event_times[father]; + + if (sampled_gene_counts[daugther] == 1 and + sampled_gene_counts[son] == 1) { + gene_rank++; + sampled_gene_counts[daugther] = 0; + sampled_gene_counts[son] = 0; + sampled_gene_counts[father] = 1; + stringstream sons_bl; + stringstream daugthers_bl; + stringstream gene_rank_bs; + gene_rank_bs << gene_event_types[father] << suffix[son] + << suffix[daugther]; + suffix[father] = ""; + sons_bl << t_event - gene_age[son]; + daugthers_bl << t_event - gene_age[daugther]; + gene_strings[father] = "(" + gene_strings[daugther] + ":" + + daugthers_bl.str() + "," + gene_strings[son] + + ":" + sons_bl.str() + ")" + gene_rank_bs.str(); + gene_age[father] = t_event; + // lca=father; + } else if (sampled_gene_counts[daugther] == 1) { + sampled_gene_counts[daugther] = 0; + sampled_gene_counts[father] = 1; + gene_strings[father] = gene_strings[daugther]; + suffix[father] = suffix[daugther]; + gene_age[father] = gene_age[daugther]; + + } else if (sampled_gene_counts[son] == 1) { + sampled_gene_counts[son] = 0; + sampled_gene_counts[father] = 1; + gene_strings[father] = gene_strings[son]; + suffix[father] = suffix[son]; + if (gene_event_types[father] == "T") + suffix[father] += "T"; + gene_age[father] = gene_age[son]; } + } + gene_event--; + ++show_trace_progress; + } + + // int i=0; + for (map::iterator git = sampled_gene_counts.begin(); + git != sampled_gene_counts.end(); git++) + if ((*git).second == 1 and + gene_strings[(*git).first].find("(") != string::npos) { + /* + stringstream fname; + fname << "G_" << S_seed << "_" << G_seed << "_" << i << ".tree"; + ofstream gene_out( fname.str().c_str() ); + gene_out << gene_strings[(*git).first] << ";" << endl; + i++; + */ + gene_trees.push_back(gene_strings[(*git).first] + ";"); + } return gene_trees; } diff --git a/src/exODT_sim.h b/src/exODT_sim.h index 0ee324a..7d8f424 100644 --- a/src/exODT_sim.h +++ b/src/exODT_sim.h @@ -1,86 +1,83 @@ -//all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; +// all code by Szollosi GJ et al.; ssolo@elte.hu; GNU GPL 3.0; #include "ALE.h" #include -class exODT_sim -{ - public: - long int S_seed,G_seed; - int N,n; +class exODT_sim { +public: + long int S_seed, G_seed; + int N, n; - scalar_type init_t,sigma,lca_age; + scalar_type init_t, sigma, lca_age; std::vector gene_trees; - std::string S_string,R_string; + std::string S_string, R_string; + + exODT_sim(int N_in, long int S_seed_in = -1, scalar_type init_t_in = 2, + scalar_type sigma = -1); - exODT_sim( int N_in, long int S_seed_in=-1, scalar_type init_t_in=2, scalar_type sigma=-1); - std::string sample_species(int n_in); - std::vector simulate_gene_trees(int G_n,scalar_type delta,scalar_type tau,scalar_type lambda,scalar_type omega=0,bool only_root=true, long int G_seed_in=-1,bool event_string=false); - - ~exODT_sim() - { - population.clear(); + std::vector + simulate_gene_trees(int G_n, scalar_type delta, scalar_type tau, + scalar_type lambda, scalar_type omega = 0, + bool only_root = true, long int G_seed_in = -1, + bool event_string = false); - for (std::vector >::iterator it=families.begin();it!=families.end();it++) - (*it).clear(); - families.clear(); + ~exODT_sim() { + population.clear(); - event_times.clear(); + for (std::vector>::iterator it = families.begin(); + it != families.end(); it++) + (*it).clear(); + families.clear(); - births.clear(); - - deaths.clear(); + event_times.clear(); - sampled_population.clear(); + births.clear(); - sampled_population_indicies.clear(); - - } + deaths.clear(); + sampled_population.clear(); - private: - std::vector population;//del-loc + sampled_population_indicies.clear(); + } + +private: + std::vector population; // del-loc long int next_index; long long species_event; long long number_of_species_events; - std::vector > families;//del-loc - std::map event_times;//del-loc + std::vector> families; // del-loc + std::map event_times; // del-loc - std::vector < int > births;//del-loc - std::vector < int > deaths;//del-loc + std::vector births; // del-loc + std::vector deaths; // del-loc - std::vector sampled_population;//del-loc - std::vector sampled_population_indicies;//del-loc + std::vector sampled_population; // del-loc + std::vector sampled_population_indicies; // del-loc long int lca; int fca; - - unsigned int good_seed() - { - unsigned int random_seed, random_seed_a, random_seed_b; - std::ifstream file ("/dev/random", std::ios::binary); - if (file.is_open()) - { - char * memblock; - int size = sizeof(int); - memblock = new char [size]; - file.read (memblock, size); - file.close(); - random_seed_a = long(memblock); - delete[] memblock; - }// end if - else - { - random_seed_a = 0; + unsigned int good_seed() { + unsigned int random_seed, random_seed_a, random_seed_b; + std::ifstream file("/dev/random", std::ios::binary); + if (file.is_open()) { + char *memblock; + int size = sizeof(int); + memblock = new char[size]; + file.read(memblock, size); + file.close(); + random_seed_a = long(memblock); + delete[] memblock; + } // end if + else { + random_seed_a = 0; } random_seed_b = std::time(0); random_seed = random_seed_a xor random_seed_b; return random_seed; } - }; diff --git a/src/fractionMissing.cpp b/src/fractionMissing.cpp index 5f873e0..f5b15b4 100644 --- a/src/fractionMissing.cpp +++ b/src/fractionMissing.cpp @@ -2,32 +2,31 @@ #include #include -std::map readFractionMissingFile(std::string fractionMissingFile) { +std::map +readFractionMissingFile(std::string fractionMissingFile) { std::map toReturn; - if (fractionMissingFile=="" ) - { - std::cout << "No file providing the fraction of missing genes per species, we assume that all species have 100% of their genes."< listCoverages; - std::string line; - - while(getline(inCoverage,line)) - { - listCoverages.push_back(line); - } - for(std::vector::iterator it = listCoverages.begin(); it != listCoverages.end(); it++) - { - bpp::StringTokenizer st1(*it, ":", true); - toReturn[st1.getToken(0)] = bpp::TextTools::toDouble ( st1.getToken( 1 ) ) ; - } - } - return (toReturn); + if (fractionMissingFile == "") { + std::cout << "No file providing the fraction of missing genes per species, " + "we assume that all species have 100% of their genes." + << std::endl; + } else { + if (!fexists(fractionMissingFile)) { + std::cout << "Error, file " << fractionMissingFile + << " does not seem accessible." << std::endl; + exit(1); + } + std::ifstream inCoverage(fractionMissingFile.c_str()); + std::vector listCoverages; + std::string line; + while (getline(inCoverage, line)) { + listCoverages.push_back(line); + } + for (std::vector::iterator it = listCoverages.begin(); + it != listCoverages.end(); it++) { + bpp::StringTokenizer st1(*it, ":", true); + toReturn[st1.getToken(0)] = bpp::TextTools::toDouble(st1.getToken(1)); + } + } + return (toReturn); } diff --git a/src/fractionMissing.h b/src/fractionMissing.h index ba44800..1a0dab2 100644 --- a/src/fractionMissing.h +++ b/src/fractionMissing.h @@ -1,5 +1,5 @@ #include "ALE.h" #include "ALE_util.h" - -std::map readFractionMissingFile(std::string fractionMissingFile); +std::map +readFractionMissingFile(std::string fractionMissingFile); diff --git a/src/ls_leaves.cpp b/src/ls_leaves.cpp index e24310d..cd1f7fb 100644 --- a/src/ls_leaves.cpp +++ b/src/ls_leaves.cpp @@ -2,29 +2,27 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ +int main(int argc, char **argv) { - map names; - for (int i=1;i leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - //name=tokens[0]; + map names; + for (int i = 1; i < argc; i++) { + ifstream file_stream1(argv[i]); + string tree; + getline(file_stream1, tree); + tree_type *T = TreeTemplateTools::parenthesisToTree(tree, false); + vector leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + // name=tokens[0]; - names[name]++; - } - } - for (map ::iterator it=names.begin();it!=names.end();it++) - { - cout << (*it).first << " " << (*it).second << endl; + names[name]++; } + } + for (map::iterator it = names.begin(); it != names.end(); it++) { + cout << (*it).first << " " << (*it).second << endl; + } } diff --git a/src/mlresampler.cpp b/src/mlresampler.cpp index ae4c56d..cf5c9db 100644 --- a/src/mlresampler.cpp +++ b/src/mlresampler.cpp @@ -6,144 +6,142 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree +int main(int argc, char **argv) { + // we need a species tree - string ml_rec_file=argv[1]; + string ml_rec_file = argv[1]; string Sstring; - vector tokens; - boost::split(tokens,ml_rec_file,boost::is_any_of("."),boost::token_compress_on); - string ale_file=argv[2]; - - approx_posterior * ale; - ale=load_ALE_from_file(ale_file); - - scalar_type delta,tau,lambda; - ifstream ml_file_stream (ml_rec_file.c_str()); - while(! ml_file_stream.eof()) - { - string line; - getline (ml_file_stream,line); - if (line.find("S:")!=line.npos ) - { - vector tokens; - boost::split(tokens,line,boost::is_any_of(" \t"),boost::token_compress_on); - Sstring=tokens[1]; - } - if (line.find("ML")!=line.npos ) - { - vector tokens; - boost::split(tokens,line,boost::is_any_of(" \t"),boost::token_compress_on); - delta=atof(tokens[1].c_str()); - tau=atof(tokens[2].c_str()); - lambda=atof(tokens[3].c_str()); - - } + vector tokens; + boost::split(tokens, ml_rec_file, boost::is_any_of("."), + boost::token_compress_on); + string ale_file = argv[2]; + + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + + scalar_type delta, tau, lambda; + ifstream ml_file_stream(ml_rec_file.c_str()); + while (!ml_file_stream.eof()) { + string line; + getline(ml_file_stream, line); + if (line.find("S:") != line.npos) { + vector tokens; + boost::split(tokens, line, boost::is_any_of(" \t"), + boost::token_compress_on); + Sstring = tokens[1]; } - cout <<"# S: "<< Sstring << endl; - cout <<"# rates: "<< delta << " " << tau << " " << lambda << endl; - exODT_model* model=new exODT_model(); + if (line.find("ML") != line.npos) { + vector tokens; + boost::split(tokens, line, boost::is_any_of(" \t"), + boost::token_compress_on); + delta = atof(tokens[1].c_str()); + tau = atof(tokens[2].c_str()); + lambda = atof(tokens[3].c_str()); + } + } + cout << "# S: " << Sstring << endl; + cout << "# rates: " << delta << " " << tau << " " << lambda << endl; + exODT_model *model = new exODT_model(); - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); - model->set_model_parameter("min_D",3); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", 3); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); - model->set_model_parameter("delta",delta); + model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - cout << "." << endl; + cout << "." << endl; model->calculate_EGb(); - cout << ".." << endl; + cout << ".." << endl; - boost::timer * t = new boost::timer(); - cout <<"LL: " << log(model->p(ale)) << endl; - cout <<"time: " << t->elapsed() << endl; - cout << ".."< sample_trees; + boost::timer *t = new boost::timer(); + cout << "LL: " << log(model->p(ale)) << endl; + cout << "time: " << t->elapsed() << endl; + cout << ".." << endl; + vector sample_trees; <<<<<<< HEAD - string outname=ml_rec_file+".rate_resample.samples"; - ofstream fout( outname.c_str() ); - string outname2=ml_rec_file+".rate_resample.Ttokens"; + string outname = ml_rec_file + ".rate_resample.samples"; + ofstream fout(outname.c_str()); + string outname2 = ml_rec_file + ".rate_resample.Ttokens"; ======= - string outname=ale_file+".rate_resample.samples"; - ofstream fout( outname.c_str() ); - string outname2=ale_file+".rate_resample.Ttokens"; + string outname = ale_file + ".rate_resample.samples"; + ofstream fout(outname.c_str()); + string outname2 = ale_file + ".rate_resample.Ttokens"; >>>>>>> 474d82cc0794533095baad28e4c97d06f7545120 - ofstream fout2( outname2.c_str() ); - - int samples=atoi(argv[3]); - //boost::progress_display pd( samples ); - for (int i=0;isample(false); - fout << sample_tree << endl; - for (vector::iterator it=model->Ttokens.begin();it!=model->Ttokens.end();it++) fout << i << " " <<(*it)< leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); - } - - scalar_type cumsum=0; - map cum_copies; - for (int branch=model->last_branch-1;branch>=0;branch--) - { - scalar_type extant; - if (model->id_ranks[branch]==0) - extant=model->last_branch; - else - extant=model->id_ranks[branch]-model->last_branch; - cumsum+=model->branch_counts["copies"][branch]; - //cum_copies[model->branch_ts[branch]]=cumsum; - cum_copies[model->id_ranks[branch]]=cumsum; + ofstream fout2(outname2.c_str()); + + int samples = atoi(argv[3]); + // boost::progress_display pd( samples ); + for (int i = 0; i < samples; i++) { + //++pd; + string sample_tree = model->sample(false); + fout << sample_tree << endl; + for (vector::iterator it = model->Ttokens.begin(); + it != model->Ttokens.end(); it++) + fout << i << " " << (*it) << endl; + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); } - for (map::iterator it=cum_copies.begin();it!=cum_copies.end();it++) + leaves.clear(); + sample_trees.push_back(G); + } + + scalar_type cumsum = 0; + map cum_copies; + for (int branch = model->last_branch - 1; branch >= 0; branch--) { + scalar_type extant; + if (model->id_ranks[branch] == 0) + extant = model->last_branch; + else + extant = model->id_ranks[branch] - model->last_branch; + cumsum += model->branch_counts["copies"][branch]; + // cum_copies[model->branch_ts[branch]]=cumsum; + cum_copies[model->id_ranks[branch]] = cumsum; + } + for (map::iterator it = cum_copies.begin(); + it != cum_copies.end(); it++) cout << (*it).first << " " << (*it).second << endl; cout << model->counts_string(); - cout << "Os" <show_counts("Os"); - cout << "Ds" <show_counts("Ds"); - cout << "Ts" <show_counts("Ts"); - cout << "Ts from" <show_counts("Tfroms"); - cout << "Ls" <show_counts("Ls"); - cout << "copies" <show_counts("copies"); - Tree* con_tree= TreeTools::thresholdConsensus(sample_trees,0.5); - TreeTools::computeBootstrapValues(*con_tree,sample_trees); + Tree *con_tree = TreeTools::thresholdConsensus(sample_trees, 0.5); + TreeTools::computeBootstrapValues(*con_tree, sample_trees); cout << endl; - cout << "thcon: "< mpp_res=sale->mpp_tree(); Tree* mpp_T = TreeTemplateTools::parenthesisToTree(mpp_res.first,false); @@ -156,28 +154,25 @@ int main(int argc, char ** argv) approx_posterior * cale=observe_ALE_from_string(con_str); */ return 1; - pair res = model->p_MLRec(ale); + pair res = model->p_MLRec(ale); cout << endl; - cout << "ML: "<< endl; + cout << "ML: " << endl; cout << res.first << endl; cout << endl; <<<<<<< HEAD - string voutname=ml_rec_file+".rate_resample.vstrings"; + string voutname = ml_rec_file + ".rate_resample.vstrings"; ======= - string voutname=ale_file+".rate_resample.vstrings"; + string voutname = ale_file + ".rate_resample.vstrings"; >>>>>>> 474d82cc0794533095baad28e4c97d06f7545120 - ofstream vout( voutname.c_str() ); - - for (std::map >::iterator it=model->gid_branches.begin();it!=model->gid_branches.end();it++) - { - long int g_id=(*it).first; - vout << g_id << " " << model->vertical_string(g_id) << endl; + ofstream vout(voutname.c_str()); - }; + for (std::map>::iterator it = + model->gid_branches.begin(); + it != model->gid_branches.end(); it++) { + long int g_id = (*it).first; + vout << g_id << " " << model->vertical_string(g_id) << endl; + }; return 1; - - } - diff --git a/src/mlresampler_undated.cpp b/src/mlresampler_undated.cpp index c61d02b..75e0fb9 100644 --- a/src/mlresampler_undated.cpp +++ b/src/mlresampler_undated.cpp @@ -7,132 +7,130 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree +int main(int argc, char **argv) { + // we need a species tree - string ml_rec_file=argv[1]; + string ml_rec_file = argv[1]; string Sstring; - vector tokens; - boost::split(tokens,ml_rec_file,boost::is_any_of("."),boost::token_compress_on); - string ale_file=argv[2]; - - approx_posterior * ale; - ale=load_ALE_from_file(ale_file); - - scalar_type delta,tau,lambda; - ifstream ml_file_stream (ml_rec_file.c_str()); - while(! ml_file_stream.eof()) - { - string line; - getline (ml_file_stream,line); - if (line.find("S:")!=line.npos ) - { - vector tokens; - boost::split(tokens,line,boost::is_any_of(" \t"),boost::token_compress_on); - Sstring=tokens[1]; - } - if (line.find("ML")!=line.npos ) - { - vector tokens; - boost::split(tokens,line,boost::is_any_of(" \t"),boost::token_compress_on); - delta=atof(tokens[1].c_str()); - tau=atof(tokens[2].c_str()); - lambda=atof(tokens[3].c_str()); - - } + vector tokens; + boost::split(tokens, ml_rec_file, boost::is_any_of("."), + boost::token_compress_on); + string ale_file = argv[2]; + + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + + scalar_type delta, tau, lambda; + ifstream ml_file_stream(ml_rec_file.c_str()); + while (!ml_file_stream.eof()) { + string line; + getline(ml_file_stream, line); + if (line.find("S:") != line.npos) { + vector tokens; + boost::split(tokens, line, boost::is_any_of(" \t"), + boost::token_compress_on); + Sstring = tokens[1]; } - cout <<"# S: "<< Sstring << endl; - cout <<"# rates: "<< delta << " " << tau << " " << lambda << endl; - exODT_model* model=new exODT_model(); + if (line.find("ML") != line.npos) { + vector tokens; + boost::split(tokens, line, boost::is_any_of(" \t"), + boost::token_compress_on); + delta = atof(tokens[1].c_str()); + tau = atof(tokens[2].c_str()); + lambda = atof(tokens[3].c_str()); + } + } + cout << "# S: " << Sstring << endl; + cout << "# rates: " << delta << " " << tau << " " << lambda << endl; + exODT_model *model = new exODT_model(); - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); model->construct_undated(Sstring); - - model->set_model_parameter("delta",delta); + model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - cout << "." << endl; + cout << "." << endl; model->calculate_undatedEs(); - cout << ".." << endl; - - boost::timer * t = new boost::timer(); - cout <<"LL: " << log(model->pun(ale)) << endl; - cout <<"time: " << t->elapsed() << endl; - cout << ".."< sample_trees; - string outname=ml_rec_file+".rate_resample.usamples"; - ofstream fout( outname.c_str() ); - string outname2=ml_rec_file+".rate_resample.uTtokens"; - ofstream fout2( outname2.c_str() ); - - int samples=atoi(argv[3]); - //boost::progress_display pd( samples ); - for (int i=0;iTtokens.clear(); - string sample_tree=model->sample_undated(); - fout << sample_tree << endl; - for (vector::iterator it=model->Ttokens.begin();it!=model->Ttokens.end();it++) fout2 << i << " " <<(*it)< leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); - } - - scalar_type cumsum=0; - map cum_copies; - for (int branch=model->last_branch-1;branch>=0;branch--) - { - scalar_type extant; - if (model->id_ranks[branch]==0) - extant=model->last_branch; - else - extant=model->id_ranks[branch]-model->last_branch; - cumsum+=model->branch_counts["copies"][branch]; - //cum_copies[model->branch_ts[branch]]=cumsum; - cum_copies[model->id_ranks[branch]]=cumsum; + cout << ".." << endl; + + boost::timer *t = new boost::timer(); + cout << "LL: " << log(model->pun(ale)) << endl; + cout << "time: " << t->elapsed() << endl; + cout << ".." << endl; + vector sample_trees; + string outname = ml_rec_file + ".rate_resample.usamples"; + ofstream fout(outname.c_str()); + string outname2 = ml_rec_file + ".rate_resample.uTtokens"; + ofstream fout2(outname2.c_str()); + + int samples = atoi(argv[3]); + // boost::progress_display pd( samples ); + for (int i = 0; i < samples; i++) { + //++pd; + model->Ttokens.clear(); + string sample_tree = model->sample_undated(); + fout << sample_tree << endl; + for (vector::iterator it = model->Ttokens.begin(); + it != model->Ttokens.end(); it++) + fout2 << i << " " << (*it) << endl; + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); } - for (map::iterator it=cum_copies.begin();it!=cum_copies.end();it++) + leaves.clear(); + sample_trees.push_back(G); + } + + scalar_type cumsum = 0; + map cum_copies; + for (int branch = model->last_branch - 1; branch >= 0; branch--) { + scalar_type extant; + if (model->id_ranks[branch] == 0) + extant = model->last_branch; + else + extant = model->id_ranks[branch] - model->last_branch; + cumsum += model->branch_counts["copies"][branch]; + // cum_copies[model->branch_ts[branch]]=cumsum; + cum_copies[model->id_ranks[branch]] = cumsum; + } + for (map::iterator it = cum_copies.begin(); + it != cum_copies.end(); it++) cout << (*it).first << " " << (*it).second << endl; cout << model->counts_string(); - cout << "Os" <show_counts("Os"); - cout << "Ds" <show_counts("Ds"); - cout << "Ts" <show_counts("Ts"); - cout << "Ts from" <show_counts("Tfroms"); - cout << "Ls" <show_counts("Ls"); - cout << "copies" <show_counts("copies"); - Tree* con_tree= TreeTools::thresholdConsensus(sample_trees,0.5); - TreeTools::computeBootstrapValues(*con_tree,sample_trees); + Tree *con_tree = TreeTools::thresholdConsensus(sample_trees, 0.5); + TreeTools::computeBootstrapValues(*con_tree, sample_trees); cout << endl; - cout << "thcon: "< mpp_res=sale->mpp_tree(); Tree* mpp_T = TreeTemplateTools::parenthesisToTree(mpp_res.first,false); @@ -145,24 +143,21 @@ int main(int argc, char ** argv) approx_posterior * cale=observe_ALE_from_string(con_str); */ return 1; - pair res = model->p_MLRec(ale); + pair res = model->p_MLRec(ale); cout << endl; - cout << "ML: "<< endl; + cout << "ML: " << endl; cout << res.first << endl; cout << endl; - string voutname=ml_rec_file+".rate_resample.vstrings"; - ofstream vout( voutname.c_str() ); - - for (std::map >::iterator it=model->gid_branches.begin();it!=model->gid_branches.end();it++) - { - long int g_id=(*it).first; - vout << g_id << " " << model->vertical_string(g_id) << endl; + string voutname = ml_rec_file + ".rate_resample.vstrings"; + ofstream vout(voutname.c_str()); - }; + for (std::map>::iterator it = + model->gid_branches.begin(); + it != model->gid_branches.end(); it++) { + long int g_id = (*it).first; + vout << g_id << " " << model->vertical_string(g_id) << endl; + }; return 1; - - } - diff --git a/src/mlsampler.cpp b/src/mlsampler.cpp index 73977b4..a84b11e 100644 --- a/src/mlsampler.cpp +++ b/src/mlsampler.cpp @@ -2,207 +2,228 @@ #include "exODT_sim.h" #include "ALE_util.h" -#include #include +#include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree +int main(int argc, char **argv) { + // we need a species tree - string sname=argv[1]; + string sname = argv[1]; string Sstring; - ifstream file_stream (sname.c_str()); - getline (file_stream,Sstring); + ifstream file_stream(sname.c_str()); + getline(file_stream, Sstring); - string ml_rec_file=argv[2]; + string ml_rec_file = argv[2]; - vector tokens; - boost::split(tokens,ml_rec_file,boost::is_any_of("."),boost::token_compress_on); + vector tokens; + boost::split(tokens, ml_rec_file, boost::is_any_of("."), + boost::token_compress_on); - string ale_file=tokens[0]+".ale"; + string ale_file = tokens[0] + ".ale"; cout << ale_file << endl; - approx_posterior * ale; - ale=load_ALE_from_file(ale_file); - - scalar_type delta,tau,lambda; - ifstream ml_file_stream (ml_rec_file.c_str()); - while(! ml_file_stream.eof()) - { - string line; - getline (ml_file_stream,line); - if (line.find("ML")!=line.npos ) - { - vector tokens; - boost::split(tokens,line,boost::is_any_of(" \t"),boost::token_compress_on); - delta=atof(tokens[1].c_str()); - tau=atof(tokens[2].c_str()); - lambda=atof(tokens[3].c_str()); - - } + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + + scalar_type delta, tau, lambda; + ifstream ml_file_stream(ml_rec_file.c_str()); + while (!ml_file_stream.eof()) { + string line; + getline(ml_file_stream, line); + if (line.find("ML") != line.npos) { + vector tokens; + boost::split(tokens, line, boost::is_any_of(" \t"), + boost::token_compress_on); + delta = atof(tokens[1].c_str()); + tau = atof(tokens[2].c_str()); + lambda = atof(tokens[3].c_str()); } + } cout << delta << " " << tau << " " << lambda << endl; - exODT_model* model=new exODT_model(); - + exODT_model *model = new exODT_model(); - model->set_model_parameter("min_D",3); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", 3); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); model->construct(Sstring); - model->set_model_parameter("delta",delta); + model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - model->set_model_parameter("leaf_events",1); + model->set_model_parameter("leaf_events", 1); model->calculate_EGb(); - scalar_type old_p=model->p(ale); - - boost::timer * t = new boost::timer(); - cout <<"LL: " << log(model->p(ale)) << endl; - cout <<"time: " << t->elapsed() << endl; - cout << ".."< sample_trees; - string outname=ale_file+".rate_sample.samples"; - ofstream fout( outname.c_str() ); - string outname2=ale_file+".rate_sample.Ttokens"; - ofstream fout2( outname2.c_str() ); - - int subsamples=atoi(argv[3]); - boost::progress_display pd( subsamples ); - - for (int i=0;i<90;i++) - { + scalar_type old_p = model->p(ale); + + boost::timer *t = new boost::timer(); + cout << "LL: " << log(model->p(ale)) << endl; + cout << "time: " << t->elapsed() << endl; + cout << ".." << endl; + vector sample_trees; + string outname = ale_file + ".rate_sample.samples"; + ofstream fout(outname.c_str()); + string outname2 = ale_file + ".rate_sample.Ttokens"; + ofstream fout2(outname2.c_str()); + + int subsamples = atoi(argv[3]); + boost::progress_display pd(subsamples); + + for (int i = 0; i < 90; i++) { + vector ds; + // rate proposal + for (int i = 0; i < 3; i++) { + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type d; + if (r < 1. / 3.) + d = RandomTools::randExponential(0.001) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else if (r < 2. / 3.) + d = RandomTools::randExponential(0.01) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else + d = RandomTools::randExponential(0.1) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + ds.push_back(d); + } + scalar_type new_delta = delta + ds[0]; + scalar_type new_tau = tau + ds[1]; + scalar_type new_lambda = lambda + ds[2]; + + // boundaries + if (new_delta < 1e-6) + new_delta = 1e-6; + if (new_delta > 10 - 1e-6) + new_delta = 10 - 1e-6; + if (new_tau < 1e-6) + new_tau = 1e-6; + if (new_tau > 10 - 1e-6) + new_tau = 10 - 1e-6; + if (new_lambda < 1e-6) + new_lambda = 1e-6; + if (new_lambda > 10 - 1e-6) + new_lambda = 10 - 1e-6; + + // likelihood + model->set_model_parameter("delta", new_delta); + model->set_model_parameter("tau", new_tau); + model->set_model_parameter("lambda", new_lambda); + model->calculate_EGb(); + scalar_type new_p = model->p(ale); + if (new_p >= old_p or + new_p / old_p > RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) { + old_p = new_p; + delta = new_delta; + tau = new_tau; + lambda = new_lambda; + } + cout << delta << " " << tau << " " << lambda << " " << log(old_p) << endl; + } + for (int i = 0; i < subsamples; i++) { + + for (int i = 0; i < 10; i++) { vector ds; - //rate proposal - for (int i=0;i<3;i++) - { - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type d; - if (r<1./3.) d=RandomTools::randExponential(0.001)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else if (r<2./3.) d=RandomTools::randExponential(0.01)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else d=RandomTools::randExponential(0.1)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - ds.push_back(d); - } - scalar_type new_delta=delta+ds[0]; - scalar_type new_tau=tau+ds[1]; - scalar_type new_lambda=lambda+ds[2]; - - //boundaries - if (new_delta<1e-6) new_delta=1e-6; - if (new_delta>10-1e-6) new_delta=10-1e-6; - if (new_tau<1e-6) new_tau=1e-6; - if (new_tau>10-1e-6) new_tau=10-1e-6; - if (new_lambda<1e-6) new_lambda=1e-6; - if (new_lambda>10-1e-6) new_lambda=10-1e-6; - - //likelihood - model->set_model_parameter("delta",new_delta); - model->set_model_parameter("tau",new_tau); - model->set_model_parameter("lambda",new_lambda); + // rate proposal + for (int i = 0; i < 3; i++) { + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type d; + if (r < 1. / 3.) + d = RandomTools::randExponential(0.001) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else if (r < 2. / 3.) + d = RandomTools::randExponential(0.01) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + else + d = RandomTools::randExponential(0.1) * 2 * + (0.5 - RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); + ds.push_back(d); + } + scalar_type new_delta = delta + ds[0]; + scalar_type new_tau = tau + ds[1]; + scalar_type new_lambda = lambda + ds[2]; + + // boundaries + if (new_delta < 1e-6) + new_delta = 1e-6; + if (new_delta > 10 - 1e-6) + new_delta = 10 - 1e-6; + if (new_tau < 1e-6) + new_tau = 1e-6; + if (new_tau > 10 - 1e-6) + new_tau = 10 - 1e-6; + if (new_lambda < 1e-6) + new_lambda = 1e-6; + if (new_lambda > 10 - 1e-6) + new_lambda = 10 - 1e-6; + + // likelihood + model->set_model_parameter("delta", new_delta); + model->set_model_parameter("tau", new_tau); + model->set_model_parameter("lambda", new_lambda); model->calculate_EGb(); - scalar_type new_p=model->p(ale); - if (new_p>=old_p or new_p/old_p>RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) - { - old_p=new_p; - delta=new_delta; tau=new_tau; lambda=new_lambda; - } + scalar_type new_p = model->p(ale); + if (new_p >= old_p or + new_p / old_p > RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) { + old_p = new_p; + delta = new_delta; + tau = new_tau; + lambda = new_lambda; + } cout << delta << " " << tau << " " << lambda << " " << log(old_p) << endl; - } - for (int i=0;i ds; - //rate proposal - for (int i=0;i<3;i++) - { - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type d; - if (r<1./3.) d=RandomTools::randExponential(0.001)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else if (r<2./3.) d=RandomTools::randExponential(0.01)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - else d=RandomTools::randExponential(0.1)*2*(0.5-RandomTools::giveRandomNumberBetweenZeroAndEntry(1)); - ds.push_back(d); - } - scalar_type new_delta=delta+ds[0]; - scalar_type new_tau=tau+ds[1]; - scalar_type new_lambda=lambda+ds[2]; - - //boundaries - if (new_delta<1e-6) new_delta=1e-6; - if (new_delta>10-1e-6) new_delta=10-1e-6; - if (new_tau<1e-6) new_tau=1e-6; - if (new_tau>10-1e-6) new_tau=10-1e-6; - if (new_lambda<1e-6) new_lambda=1e-6; - if (new_lambda>10-1e-6) new_lambda=10-1e-6; - - //likelihood - model->set_model_parameter("delta",new_delta); - model->set_model_parameter("tau",new_tau); - model->set_model_parameter("lambda",new_lambda); - model->calculate_EGb(); - scalar_type new_p=model->p(ale); - if (new_p>=old_p or new_p/old_p>RandomTools::giveRandomNumberBetweenZeroAndEntry(1)) - { - old_p=new_p; - delta=new_delta; tau=new_tau; lambda=new_lambda; - } - cout << delta << " " << tau << " " << lambda << " " << log(old_p) << endl; - } - ++pd; - string sample_tree=model->sample(false); - fout << sample_tree << endl; - for (vector::iterator it=model->Ttokens.begin();it!=model->Ttokens.end();it++) fout << i << " " <<(*it)< leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); + ++pd; + string sample_tree = model->sample(false); + fout << sample_tree << endl; + for (vector::iterator it = model->Ttokens.begin(); + it != model->Ttokens.end(); it++) + fout << i << " " << (*it) << endl; + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); } - + leaves.clear(); + sample_trees.push_back(G); + } + cout << model->counts_string(); - cout << "Os" <show_counts("Os"); - cout << "Ds" <show_counts("Ds"); - cout << "Ts" <show_counts("Ts"); - cout << "Ts from" <show_counts("Tfroms"); - cout << "Ls" <show_counts("Ls"); - cout << "copies" <show_counts("copies"); - Tree* con_tree= TreeTools::thresholdConsensus(sample_trees,0.5); - TreeTools::computeBootstrapValues(*con_tree,sample_trees); + Tree *con_tree = TreeTools::thresholdConsensus(sample_trees, 0.5); + TreeTools::computeBootstrapValues(*con_tree, sample_trees); cout << endl; - cout << "thcon: "< mpp_res=sale->mpp_tree(); Tree* mpp_T = TreeTemplateTools::parenthesisToTree(mpp_res.first,false); @@ -215,24 +236,21 @@ int main(int argc, char ** argv) approx_posterior * cale=observe_ALE_from_string(con_str); */ - pair res = model->p_MLRec(ale); + pair res = model->p_MLRec(ale); cout << endl; - cout << "ML: "<< endl; + cout << "ML: " << endl; cout << res.first << endl; cout << endl; - string voutname=ale_file+".rate_sample.vstrings"; - ofstream vout( voutname.c_str() ); + string voutname = ale_file + ".rate_sample.vstrings"; + ofstream vout(voutname.c_str()); - for (std::map >::iterator it=model->gid_branches.begin();it!=model->gid_branches.end();it++) - { - long int g_id=(*it).first; - vout << g_id << " " << model->vertical_string(g_id) << endl; - - }; + for (std::map>::iterator it = + model->gid_branches.begin(); + it != model->gid_branches.end(); it++) { + long int g_id = (*it).first; + vout << g_id << " " << model->vertical_string(g_id) << endl; + }; return 1; - - } - diff --git a/src/model.cpp b/src/model.cpp index 74593fc..8e3986a 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -4,764 +4,813 @@ using namespace bpp; #include +// p(ale) calculates Pi(Gamma) cf. ALEPAPER +scalar_type exODT_model::p(approx_posterior *ale) { + ale_pointer = ale; + + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } + q.clear(); -//p(ale) calculates Pi(Gamma) cf. ALEPAPER -scalar_type exODT_model::p(approx_posterior *ale) -{ - ale_pointer=ale; + // directed partitions and their sizes + vector g_ids; // del-loc + vector g_id_sizes; // del-loc + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately + g_ids.push_back(-1); + g_id_sizes.push_back(ale->Gamma_size); - for (std::map > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); + // gene<->species mapping + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + scalar_type t = time_slice_times[rank][t_i]; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + q[g_id][t][e] = 0; + } + q[g_id][t][alpha] = 0; + } } - q.clear(); - //directed partitions and their sizes - vector g_ids;//del-loc - vector g_id_sizes;//del-loc - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); + if (g_id_sizes[i] == 1) { + /* int id = 0; + boost::dynamic_bitset<> temp = ale->id_sets[g_id]; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit ( temp, i) ) { + if ( temp[i] ) { + id = i; + break; + } + }*/ + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; + } } - //root bipartition needs to be handled separately - g_ids.push_back(-1); - g_id_sizes.push_back(ale->Gamma_size); - // gene<->species mapping - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; - for (int rank=0;rank temp = ale->id_sets[g_id]; - for (auto i = 0; i < ale->Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { - id = i; - break; + string gene_name = ale->id_leaves[id /*g_id*/]; + // string gene_name=ale->id_leaves[ (* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } + else { + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + long int gpp_id = ale->set_ids.at(not_gamma); + + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { + parts.push_back((*sit)); + } + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt, tpdt_nl; + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; + + if (scalar_parameter["event_node"] == 1 and false) + tpdt_nl = t; + else + tpdt_nl = tpdt; + + // root + scalar_type Delta_t = (tpdt - t) * 1; + + // Delat_bar corresponds to sigma in ALEPAPER + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + // scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; + // scalar_type tmp + scalar_type p_Delta_bar = Delta_bar * Delta_t; + scalar_type Ebar = Ee[-1][t]; + + // boundaries for branch alpha virtual branch + + // boundary at present + if (t == 0) + q[g_id][t][alpha] = 0; + + // boundary between slice rank and rank-1 slice is trivial + ; // q[g_id][t][alpha]=q[g_id][t][alpha]; + + // boundaries for branch alpha virtual branch. + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) { + q[g_id][t][e] = 1; + } else + q[g_id][t][e] = 0; } - }*/ - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; + // boundary between slice rank and rank-1 + else if (t_i == 0) { + // terminating branch is last in time_slices and defines a + // represented speciation + if (branch_i == n - 1 && rank > 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // q[g_id][t][e]=0; + + scalar_type SL_fLg = q[g_id][t][f] * Egt; + scalar_type SL_Lfg = q[g_id][t][g] * Eft; + // SL EVENT, events #3 and #4 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; + q_sum += SL_fLg + SL_Lfg; + // SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = + q[gp_id][t][f] * q[gpp_id][t][g] * pp; + scalar_type S_ppf_pg = + q[gpp_id][t][f] * q[gp_id][t][g] * pp; + // S EVENT, events #1 and #2 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. + } + q[g_id][t][e] = q_sum; + + } + + // branches that cross to next time slice + else { + // trivial + ; // q[g_id][t][e]=q[g_id][t][e]; + } } + // boundaries for branch e. + } } - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - // string gene_name=ale->id_leaves[ (* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - long int gpp_id = ale->set_ids.at(not_gamma); - - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { - parts.push_back((*sit)); + if (1) { + + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = Ge[-1][t]; + // note that the coalescent approximation in + // http://arxiv.org/abs/1211.4606 is + // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + + q[g_id][tpdt][alpha] = 0; + scalar_type q_sum = 0; + scalar_type q_sum_nl = 0; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type T_ep_app = + p_Ntau_e * q[gp_id][t][e] * q[gpp_id][t][alpha] * pp; + scalar_type T_ap_epp = + p_Ntau_e * q[gp_id][t][alpha] * q[gpp_id][t][e] * pp; + // T EVENT, events #3 and #4 in part b of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // q[g_id][tpdt][alpha]+=p_Ntau_e*(q[gp_id][t][e]*q[gpp_id][t][alpha]+q[gp_id][t][alpha]*q[gpp_id][t][e]); + q_sum_nl += T_ep_app + T_ap_epp; + // T. + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + + scalar_type Sb = p_Delta_bar * + (2 * q[gp_id][t][alpha] * q[gpp_id][t][alpha]) * + pp; + // S_bar EVENT, event #2 in part b of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (note that Delta_bar corresponds + //to sigma, the Delta_bar,Lambda_bar distinction keeps track of + //speciaiton (birth) vs extiction (death), + // but for the Moran process Delta_bar=Lambda_bar=sigma ) + // q[g_id][tpdt][alpha]+=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha]); + q_sum_nl += Sb; + // S_bar. } - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //q[g_id][t][e]=0; - - scalar_type SL_fLg=q[g_id][t][f]*Egt; - scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT, events #3 and #4 in part c of Fig.A1 in http://arxiv.org/abs/1211.4606 - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - q_sum+=SL_fLg+SL_Lfg; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i1) q[g_id][tpdt_nl][e]=1;//XX - - scalar_type empty=Get*q[g_id][t][e]; - //0 EVENT, event #1 in part a of Fig.A1 in http://arxiv.org/abs/1211.4606 - //q[g_id][tpdt][e]=Get*q[g_id][t][e]; - q_sum+=empty; - //0. - - q[g_id][tpdt][e]+=q_sum; - //if (q[g_id][tpdt][e]>1) q[g_id][tpdt][e]=1; - //events within slice rank at time t on branch e. - } - } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - } - } - gp_ids.clear(); - gpp_ids.clear(); - p_part.clear(); - } - scalar_type root_norm=0; - for (int rank=0;rank1) q[g_id][tpdt_nl][e]=1;//XX + + scalar_type empty = Get * q[g_id][t][e]; + // 0 EVENT, event #1 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // q[g_id][tpdt][e]=Get*q[g_id][t][e]; + q_sum += empty; + // 0. + + q[g_id][tpdt][e] += q_sum; + // if (q[g_id][tpdt][e]>1) q[g_id][tpdt][e]=1; + // events within slice rank at time t on branch e. + } + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + } } - - scalar_type root_sum=0; - for (int rank=0;rank >::iterator it=Ee.begin();it!=Ee.end();it++)//del_loc + for (std::map>::iterator it = + Ee.begin(); + it != Ee.end(); it++) // del_loc (*it).second.clear(); Ee.clear(); - for (std::map >::iterator it=Ge.begin();it!=Ge.end();it++)//del_loc + for (std::map>::iterator it = + Ge.begin(); + it != Ge.end(); it++) // del_loc (*it).second.clear(); Ge.clear(); + map Ee_y; // del-loc + map Ge_y; // del-loc + map E_k1, E_k2, E_k3, E_k4; // del-loc + map G_k1, G_k2, G_k3, G_k4; // del-loc - map Ee_y;//del-loc - map Ge_y;//del-loc - map E_k1,E_k2,E_k3,E_k4;//del-loc - map G_k1,G_k2,G_k3,G_k4;//del-loc - - map tmp; //XX - tmp[0]=1; - tmp[1]=1; - - for (int rank=0;rank > y_E,y_G;//del-loc - map > iy_E,iy_G;//del-loc - - scalar_type t_b; - if (tsi==(int)time_slice_times[rank].size()-1) - t_b = time_slice_begins[rank]; - else - t_b = time_slice_times[rank][tsi+1]; - scalar_type t_e; - if (tsi==0) - { - if (rank>0 ) - t_e = time_slice_begins[rank-1]; - else - t_e = 0; - } - else - { - t_e=time_slice_times[rank][tsi]; - } - scalar_type N=vector_parameter["N"][rank]; - - scalar_type ni=time_slices[rank].size(); - scalar_type Delta_bar=vector_parameter["Delta_bar"][rank];//1 - scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]*N/(N-ni);; - scalar_type t=t_e; - scalar_type tpdt=t_b; - scalar_type h=(tpdt-t)/scalar_parameter["DD"]; - //scalar_type ti=t; - scalar_type h_lambda_avg=h*scalar_parameter["lambda_avg"]; - scalar_type h_delta_avg=h*scalar_parameter["delta_avg"]; - scalar_type h_tau_avg=h*scalar_parameter["tau_avg"]*(N-ni)/(N-1)*N; - scalar_type h_Delta_bar=h*Delta_bar; - scalar_type h_Lambda_bar=h*Lambda_bar; - - - for (int ii=0;ii tmp; // XX + tmp[0] = 1; + tmp[1] = 1; + + for (int rank = 0; rank < last_rank; rank++) + for (int tsi = 0; tsi < (int)time_slice_times[rank].size(); tsi++) { + map> y_E, y_G; // del-loc + map> iy_E, iy_G; // del-loc + + scalar_type t_b; + if (tsi == (int)time_slice_times[rank].size() - 1) + t_b = time_slice_begins[rank]; + else + t_b = time_slice_times[rank][tsi + 1]; + scalar_type t_e; + if (tsi == 0) { + if (rank > 0) + t_e = time_slice_begins[rank - 1]; + else + t_e = 0; + } else { + t_e = time_slice_times[rank][tsi]; } - //del-locs + scalar_type N = vector_parameter["N"][rank]; + + scalar_type ni = time_slices[rank].size(); + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; // 1 + scalar_type Lambda_bar = + vector_parameter["Lambda_bar"][rank] * N / (N - ni); + ; + scalar_type t = t_e; + scalar_type tpdt = t_b; + scalar_type h = (tpdt - t) / scalar_parameter["DD"]; + // scalar_type ti=t; + scalar_type h_lambda_avg = h * scalar_parameter["lambda_avg"]; + scalar_type h_delta_avg = h * scalar_parameter["delta_avg"]; + scalar_type h_tau_avg = + h * scalar_parameter["tau_avg"] * (N - ni) / (N - 1) * N; + scalar_type h_Delta_bar = h * Delta_bar; + scalar_type h_Lambda_bar = h * Lambda_bar; + + for (int ii = 0; ii < scalar_parameter["DD"]; ii++) { + + // intial conditions + if (ii == 0) { + if (t == 0) + Ee[-1][t] = 1; + // trivial else Ee[-1][t]=Ee[-1][t]; + + // y_E[-1][t]=Ee[-1][t]; + iy_E[-1][ii] = Ee[-1][t]; + + // Ee_y[-1]=y_E[-1][t]; + Ee_y[-1] = iy_E[-1][ii]; + + Ge_y[-1] = 1; + + // y_G[-1][t]=1; + iy_G[-1][ii] = 1; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + if (ii == 0) { + if (t == 0) { + Ee[e][t] = vector_parameter["fraction_missing"][e]; // 0; + } else if (t == t_end[e]) { + int f = daughters[e][0]; + int g = daughters[e][1]; + Ee[e][t] = Ee[f][t] * Ee[g][t]; + } + // trivial else{Ee[e][t]=Ee[e][t];} + // y_E[e][t]=Ee[e][t]; + iy_E[e][ii] = Ee[e][t]; + + // Ee_y[e]=y_E[e][t]; + Ee_y[e] = iy_E[e][ii]; + + Ge_y[e] = 1; + + // y_G[e][t]=1; + iy_G[e][ii] = 1; + } + } + // RK4: 4th order Runge-Kutta for y'=f(y) + // k1 = f(y[n]) + E_k1[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k1[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k1[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k1[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k1 = f(y[n]) + E_k1[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k1[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + // k2 = f(y[n]+h/2 k1) + + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k1[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k1[-1]; + // Ge_y[-1]=y_G[-1][ti]+1/2.* G_k1[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 / 2. * G_k1[-1]; + + E_k2[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k2[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k2[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k2[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k2 = f(y[n]+h/2 k1) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k1[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k1[e]; + // Ge_y[e] =y_G[e][ti]+1/2. * G_k1[e]; + Ge_y[e] = iy_G[e][ii] + 1 / 2. * G_k1[e]; + + E_k2[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k2[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + + // k3 = f(y[n]+h/2 k2) + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k2[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k2[-1]; + // Ge_y[-1]=y_G[-1][ti]+1/2.* G_k2[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 / 2. * G_k2[-1]; + + E_k3[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k3[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k3[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k3[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k3 = f(y[n]+h/2 k2) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k2[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k2[e]; + // Ge_y[e] =y_G[e][ti]+1/2. * G_k2[e]; + Ge_y[e] = iy_G[e][ii] + 1 / 2. * G_k2[e]; + + E_k3[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k3[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + + // k4 = f(y[n]+h k3) + // Ee_y[-1]=y_E[-1][ti]+1* E_k3[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 * E_k3[-1]; + // Ge_y[-1]=y_G[-1][ti]+1* G_k3[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 * G_k3[-1]; + + E_k4[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k4[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k4[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k4[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k4 = f(y[n]+h k3) + // Ee_y[e] =y_E[e][ti]+1 * E_k3[e]; + Ee_y[e] = iy_E[e][ii] + 1 * E_k3[e]; + + // Ge_y[e] =y_G[e][ti]+1 * G_k3[e]; + Ge_y[e] = iy_G[e][ii] + 1 * G_k3[e]; + + E_k4[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k4[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + // y[n+1] = y[n] + h/6 (k1 + 2 k2 + 2 k3 + k4) + // y_E[-1][ti+h]=Ee_y[-1] + 1/6. * (E_k1[-1] + 2*E_k2[-1] + 2*E_k3[-1] + + // E_k4[-1]); iy_E[-1][ii+1]=Ee_y[-1] + 1/6. * (E_k1[-1] + 2*E_k2[-1] + + // 2*E_k3[-1] + E_k4[-1]); + + ///* + if (ii == 0) + iy_E[-1][ii + 1] = + Ee[-1][t] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + else + iy_E[-1][ii + 1] = + iy_E[-1][ii] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + //*/ + // y_G[-1][ti+h]=Ge_y[-1] + 1/6. * (G_k1[-1] + 2*G_k2[-1] + 2*G_k3[-1] + + // G_k4[-1]); iy_G[-1][ii+1]=Ge_y[-1] + 1/6. * (G_k1[-1] + 2*G_k2[-1] + + // 2*G_k3[-1] + G_k4[-1]); + + if (ii == 0) + iy_G[-1][ii + 1] = + 1 + 1 / 6. * (G_k1[-1] + 2 * G_k2[-1] + 2 * G_k3[-1] + G_k4[-1]); + else + iy_G[-1][ii + 1] = + iy_G[-1][ii] + + 1 / 6. * (G_k1[-1] + 2 * G_k2[-1] + 2 * G_k3[-1] + G_k4[-1]); + + if (ii == scalar_parameter["DD"] - 1) { + // Ee[-1][tpdt]=y_E[-1][ti+h]; + Ee[-1][tpdt] = iy_E[-1][ii + 1]; + + // Ge[-1][t]=y_G[-1][ti+h]; + Ge[-1][t] = iy_G[-1][ii + 1]; + + // cout << -1 << " " << t << " " << Ee[-1][tpdt] << " " << + // Ge[-1][t]< >::iterator it=y_E.begin();it!=y_E.end();it++) + for (map >::iterator + it=y_E.begin();it!=y_E.end();it++) (*it).second.clear(); y_E.clear(); - for (map >::iterator it=y_G.begin();it!=y_G.end();it++) + for (map >::iterator + it=y_G.begin();it!=y_G.end();it++) (*it).second.clear(); y_G.clear(); */ diff --git a/src/model_omp.cpp b/src/model_omp.cpp index 5df0312..042184c 100644 --- a/src/model_omp.cpp +++ b/src/model_omp.cpp @@ -3,74 +3,83 @@ using namespace std; using namespace bpp; -//#include - -//oMP// -//oMP// openMP added into this function -//oMP// - -scalar_type exODT_model::p(approx_posterior *ale) -{ - ale_pointer=ale; - //directed partitions and their sizes - vector g_ids; //del-loc. Vector of leaf set (=clade) ids, ordered by their size, small to large. - vector g_id_sizes; //del-loc. Numbers of leaves in the above sets. - - //First, cleaning q. - for (std::map > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } +// #include + +// oMP// +// oMP// openMP added into this function +// oMP// + +scalar_type exODT_model::p(approx_posterior *ale) { + ale_pointer = ale; + // directed partitions and their sizes + vector g_ids; // del-loc. Vector of leaf set (=clade) ids, ordered + // by their size, small to large. + vector g_id_sizes; // del-loc. Numbers of leaves in the above sets. + + // First, cleaning q. + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } q.clear(); - //cout << "start" << endl; - //iterate over directed partitions (i.e. clades) ordered by the number of leaves - //cout << "start loop" << endl; - //test - //long int tmp_g_id=-1; - //cout << ale->set2name(ale->id_sets[tmp_g_id]) < > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root bipartition needs to be handled separately (and last, given it's the largest) + // cout << "start" << endl; + // iterate over directed partitions (i.e. clades) ordered by the number of + // leaves cout << "start loop" << endl; test long int tmp_g_id=-1; cout << + // ale->set2name(ale->id_sets[tmp_g_id]) <>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately (and last, given it's the + // largest) g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); /* // gene<->species mapping - for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + approx_posterior { long int g_id=g_ids[i]; cerr<<"g_id: "<id_leaves[(* (ale->id_sets[g_id].begin()) )]; @@ -86,61 +95,66 @@ scalar_type exODT_model::p(approx_posterior *ale) } */ - vector > > > qvec; - + vector>>> qvec; + // gene<->species mapping - // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + // approx_posterior // { // long int g_id=g_ids[i]; // cerr<<"i: "< case vide - vector > > vrank; - vector > vt_i; - map vbranch; - vt_i.push_back(vbranch); - vrank.push_back(vt_i); - qvec.push_back(vrank); - } - else{ - //vector > > vrank; - vector > > vrank; - for (int rank=0;rank > vt_i; - vector > vt_i; - for (int t_i=0;t_i<(int)time_slice_times[rank].size();t_i++) //Going through the subslices - { - //cerr<<"\t\tt_i: "< vbranch(n, 0.); - map vbranch; - for (int branch_i=0;branch_i case vide + vector>> vrank; + vector> vt_i; + map vbranch; + vt_i.push_back(vbranch); + vrank.push_back(vt_i); + qvec.push_back(vrank); + } else { + // vector > > vrank; + vector>> vrank; + for (int rank = 0; rank < last_rank; + rank++) // Going through time slices, from leaves to root + { + // cerr<<"\trank: "< > vt_i; + vector> vt_i; + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); + t_i++) // Going through the subslices + { + // cerr<<"\t\tt_i: "< vbranch(n, 0.); + map vbranch; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + // cerr<<"\t\t\te: "< temp = ale->id_sets[g_id]; for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { @@ -150,1257 +164,1300 @@ scalar_type exODT_model::p(approx_posterior *ale) break; } }*/ - - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; - } + + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; } - - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - -// string gene_name=ale->id_leaves[ g_id ]; - // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - //oMP// - //oMP// below is the loop that iterates over the sorted g_ids, it is this one that should be amicable to openMP - //oMP// the important thing is that we can only do the g_ids in parallel that have the same number of leaves - //oMP// hence the sorting above.. - //oMP// - //oMP// the calculation fills out the global q, cf. exODT.h, this is later needed for sampling reconciliations! - //oMP// - //oMP// - - - std::map > size2i; //Map between clade size and vector of ids of the clades of that size. Seems to me this size2i could be built once and for all, and saved in the approx_posterior object, and not reconstructed every time? - - for (int i=0;i<(int)g_ids.size();i++) { //Filling up size2i + } + + string gene_name = ale->id_leaves[id /*g_id*/]; + + // string gene_name=ale->id_leaves[ g_id ]; + // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + // oMP// + // oMP// below is the loop that iterates over the sorted g_ids, it is this one + // that should be amicable to openMP oMP// the important thing is that we can + // only do the g_ids in parallel that have the same number of leaves oMP// + // hence the sorting above.. oMP// oMP// the calculation fills out the global + // q, cf. exODT.h, this is later needed for sampling reconciliations! oMP// + // oMP// + + std::map> + size2i; // Map between clade size and vector of ids of the clades of that + // size. Seems to me this size2i could be built once and for all, + // and saved in the approx_posterior object, and not reconstructed + // every time? + + for (int i = 0; i < (int)g_ids.size(); i++) { // Filling up size2i if (size2i.count(g_id_sizes[i]) == 0) - size2i[g_id_sizes[i]] = vector (); + size2i[g_id_sizes[i]] = vector(); size2i[g_id_sizes[i]].push_back(i); } - - for (map > :: iterator it2 = size2i.begin(); it2 != size2i.end(); it2++) - { // - int j=0; - int siz = (int)it2->second.size(); //Number of clades with that size - if (siz <= 4 )// num_threads ) //If few clades: inside loop parallelization - { - for ( j=0 ; j < siz ;j++) - { - int i = it2->second.at(j); - - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - { - { - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - /* vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; */ - pair parts = (*kt).first; - long int gp_id = parts.first; - long int gpp_id = parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - } - } - else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - /*for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - long int gpp_id = ale->set_ids.at(not_gamma); - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - /*gamma.clear(); - not_gamma.clear();*/ - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - - int N_parts=gp_ids.size(); - - //iterate over all positions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //q[g_id][t][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //scalar_type SL_fLg=q[g_id][t][f]*Egt; - //scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT - q_sum+=SL_fLg+SL_Lfg; - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;ielapsed(); - //n cout << "Inside loop parallelization: Nb Partitions: " << N_parts << " iteration duration: " << (tnow-tatom) << endl; ; - //tatom=tnow; + + for (map>::iterator it2 = size2i.begin(); + it2 != size2i.end(); it2++) { // + int j = 0; + int siz = (int)it2->second.size(); // Number of clades with that size + if (siz <= 4) // num_threads ) //If few clades: inside loop parallelization + { + for (j = 0; j < siz; j++) { + int i = it2->second.at(j); + + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) { + { + for (unordered_map, scalar_type>::iterator + kt = ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + /* vector parts; + for (set::iterator + sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) + parts.push_back((*sit)); long int gp_id=parts[0]; long int + gpp_id=parts[1]; */ + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); } - } - else { ////If more clades of a given size than number of threads: outside loop parallelization -#pragma omp parallel //num_threads(8) //p6 - { -#pragma omp for schedule(dynamic,1) //p7 - for ( j=0 ; jfirst: "<first << std::endl; - - // std::cout << " and : "<< it2->second.at(j) <second.at(j); //working on clade i - - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; //clade i has id g_id - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc. All clades that are left daughters of clade g_id. - vector gpp_ids;//del-loc. All clades that are rigt daughters of clade g_id. - vector p_part;//del-loc. Stores all probabilities of the observed resolutions of clade g_id. - if (g_id!=-1) //Not at the root - { -#pragma omp critical - { - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) //Going through all resolutions of the clade g_id - { - /*vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; */ - pair parts = (*kt).first; - long int gp_id = parts.first; - long int gpp_id = parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - //cout << p_part.size() << " " ; - } - } - } - else //at the root - { - //root bipartition needs to be handled separately - map,int> bip_parts; // the map is here just for ordering the sets of clade ids. Each set only has 2 elements. - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { //Going through all possible roots - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma=ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - /*for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st); //Building a function for constructing not_gamma would be useful! - */ - long int gpp_id = ale->set_ids.at(not_gamma); - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; //1 is a default value of no interest - /*gamma.clear(); - not_gamma.clear();*/ - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts.at(gp_id)<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - //Now we have filled the vectors gp_ids, gpp_ids and p_part: we know all resolutions of clade g_id with the associated probability. - - int N_parts=gp_ids.size(); //Number of resolutions of clade g_id. - if (!1) { //N_parts >= num_threads) { // It makes sense to do further parallelization but that slows things down! - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //q[g_id][t][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //scalar_type SL_fLg=q[g_id][t][f]*Egt; - //scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT - q_sum+=SL_fLg+SL_Lfg; - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //q[g_id][t][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //scalar_type SL_fLg=q[g_id][t][f]*Egt; - //scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT - q_sum+=SL_fLg+SL_Lfg; - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;ielapsed(); - //n cout << "Outer loop parallelization: Nb Partitions: "<, int> bip_parts; + for (map::iterator it = + ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /*for (set::iterator + st=ale->Gamma.begin();st!=ale->Gamma.end();st++) if + (gamma.count(*st)==0) not_gamma.insert(*st);*/ + long int gpp_id = ale->set_ids.at(not_gamma); + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + /*gamma.clear(); + not_gamma.clear();*/ + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + // Here we can create a new ale->Bip_counts[gp_id], in particular + // for leaves. We may want to add the leaf entries for Bip_counts + // when Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= + scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + + int N_parts = gp_ids.size(); + + // iterate over all positions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + + // ###################################################################################################################### + // #########################################INNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; //,tpdt_nl; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else { + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; // PBM PBM PBM + tpdt_rank = rank; // PBM PBM PBM + tpdt_t_i = 0; // PBM PBM PBM + } + + bool tpdt_nl_is_t = true; + if (scalar_parameter["event_node"] == 1 and false) + ; // tpdt_nl=t; + else { + // tpdt_nl=tpdt; + tpdt_nl_is_t = false; + } + + // root + scalar_type Delta_t = tpdt - t; + // scalar_type N=vector_parameter["N"][rank]; + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + // scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; + // OMG + // scalar_type p_Delta_bar=1-exp(-Delta_bar/N*Delta_t); + scalar_type p_Delta_bar = Delta_bar * Delta_t; + scalar_type Ebar = Ee[-1][t]; + + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) { + // #pragma omp critical + { + // q[g_id][t][alpha]=0; + qvec[g_id + 1][rank][t_i][alpha] = 0; + } + } // boundary between slice rank and rank-1 slice is trivial + ; // q[g_id][t][alpha]=q[g_id][t][alpha]; + // boundaries for branch alpha virtual branch. + if (1) { +#pragma omp parallel for schedule(dynamic, 1) // p2 + for (int branch_i = 0; branch_i < n; branch_i++) { + { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) { + // #pragma omp critical + { + // q[g_id][t][e]=1; + qvec[g_id + 1][rank][t_i][e] = 1; + } + } else { + // #pragma omp critical + { + //[i][rank][t_i][branch_i]=0; + // q[g_id][t][e]=0; + qvec[g_id + 1][rank][t_i][e] = 0; + } + } + } + // boundary between slice rank and rank-1 + else if (t_i == 0) { + // terminating branch is last in time_slices and defines a + // represented speciation + if (branch_i == n - 1 && rank > 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // q[g_id][t][e]=0; + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // scalar_type SL_fLg=q[g_id][t][f]*Egt; + // scalar_type SL_Lfg=q[g_id][t][g]*Eft; + // SL EVENT + q_sum += SL_fLg + SL_Lfg; + // q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; + // SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = + qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = + qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // scalar_type + // S_pf_ppg=q[gp_id][t][f]*q[gpp_id][t][g]*pp; + // scalar_type + // S_ppf_pg=q[gpp_id][t][f]*q[gp_id][t][g]*pp; S EVENT + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. + } + // #pragma omp critical + { + qvec[g_id + 1][rank][t_i][e] = q_sum; + // q[g_id][t][e]=q_sum; + } + + } + // branches that cross to next time slice + else { + // trivial + ; // q[g_id][t][e]=q[g_id][t][e]; + } + } + // boundaries for branch e. + } + } + } + + if (1) { + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = + Ge[-1][t]; // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + // #pragma omp critical + { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + // q[g_id][tpdt][alpha]=0; + } + scalar_type q_sum = 0; + scalar_type q_sum_nl = 0; + // #pragma omp parallel for schedule(dynamic,1) + // reduction(+:q_sum_nl) //p3 rerunning p(ale) after sample() + // crashes if this active !? + for (int branch_i = 0; branch_i < n; branch_i++) { + { + + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // G_bar*=exp(- tau_e*Delta_t); + + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + // OMG + scalar_type p_Ntau_e = tau_e * Delta_t; + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type T_ep_app = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][e] * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + scalar_type T_ap_epp = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][e] * pp; + // scalar_type + // T_ep_app=p_Ntau_e*q[gp_id][t][e]*q[gpp_id][t][alpha]*pp; + // scalar_type + // T_ap_epp=p_Ntau_e*q[gp_id][t][alpha]*q[gpp_id][t][e]*pp; + // T EVENT + q_sum_nl += T_ep_app + T_ap_epp; + // q[g_id][tpdt][alpha]+=p_Ntau_e*(q[gp_id][t][e]*q[gpp_id][t][alpha]+q[gp_id][t][alpha]*q[gpp_id][t][e]); + // T. + } + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type Sb = p_Delta_bar * + (2 * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][alpha]) * + pp; + // scalar_type + // Sb=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha])*pp; + // S_bar EVENT + q_sum_nl += Sb; + // q[g_id][tpdt][alpha]+=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha]); + // S_bar. + } + if (tpdt_nl_is_t) + qvec[g_id + 1][rank][t_i][alpha] += q_sum_nl; + else + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] += q_sum_nl; + // q[g_id][tpdt_nl][alpha]+=q_sum_nl; +#pragma omp parallel for schedule(dynamic, 1) reduction(+ : q_sum) // p4 + // #pragma omp task untied + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // OMG + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = + p_Ntau_e * Ebar * qvec[g_id + 1][rank][t_i][e]; + // scalar_type TLb=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar EVENT + q_sum += TLb; + // q[g_id][tpdt][alpha]+=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar. + } + // 0 EVENT + scalar_type empty = G_bar * qvec[g_id + 1][rank][t_i][alpha]; + // scalar_type empty=G_bar*q[g_id][t][alpha]; + q_sum += empty; + + // q[g_id][tpdt][alpha]+=G_bar*q[g_id][t][alpha]; + // 0. + // max + /* + if (max_termelapsed(); + // n cout << "Inside loop parallelization: Nb Partitions: " << N_parts + // << " iteration duration: " << (tnow-tatom) << endl; ; tatom=tnow; } + } else { ////If more clades of a given size than number of threads: outside + ///loop parallelization +#pragma omp parallel // num_threads(8) //p6 + { +#pragma omp for schedule(dynamic, 1) // p7 + for (j = 0; j < siz; j++) { + // std::cout << "Num Thread: "<< omp_get_thread_num()<first: "<first << std::endl; + + // std::cout << " and : "<< it2->second.at(j) <second.at(j); // working on clade i + + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; // clade i has id g_id + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc. All clades that are left + // daughters of clade g_id. + vector gpp_ids; // del-loc. All clades that are rigt + // daughters of clade g_id. + vector p_part; // del-loc. Stores all probabilities of + // the observed resolutions of clade g_id. + if (g_id != -1) // Not at the root + { +#pragma omp critical + { + for (unordered_map, + scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); + kt++) // Going through all resolutions of the clade g_id + { + /*vector parts; + for (set::iterator + sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) + parts.push_back((*sit)); long int gp_id=parts[0]; long int + gpp_id=parts[1]; */ + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + // cout << p_part.size() << " " ; + } + } + } else // at the root + { + // root bipartition needs to be handled separately + map, int> + bip_parts; // the map is here just for ordering the sets of + // clade ids. Each set only has 2 elements. + for (map::iterator it = + ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); + it++) { // Going through all possible roots + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /*for (set::iterator + st=ale->Gamma.begin();st!=ale->Gamma.end();st++) if + (gamma.count(*st)==0) not_gamma.insert(*st); //Building a + function for constructing not_gamma would be useful! +*/ + long int gpp_id = ale->set_ids.at(not_gamma); + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; // 1 is a default value of no interest + /*gamma.clear(); + not_gamma.clear();*/ + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts.at(gp_id) <= + scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + // Now we have filled the vectors gp_ids, gpp_ids and p_part: we know + // all resolutions of clade g_id with the associated probability. + + int N_parts = gp_ids.size(); // Number of resolutions of clade g_id. + if (!1) { // N_parts >= num_threads) { // It makes sense to do further + // parallelization but that slows things down! + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); + t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; //,tpdt_nl; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else { + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; // PBM PBM PBM + tpdt_rank = rank; // PBM PBM PBM + tpdt_t_i = 0; // PBM PBM PBM + } + + bool tpdt_nl_is_t = true; + if (scalar_parameter["event_node"] == 1 and false) + ; // tpdt_nl=t; + else { + // tpdt_nl=tpdt; + tpdt_nl_is_t = false; + } + + // root + scalar_type Delta_t = tpdt - t; + // scalar_type N=vector_parameter["N"][rank]; + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + // scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; + // OMG + // scalar_type p_Delta_bar=1-exp(-Delta_bar/N*Delta_t); + scalar_type p_Delta_bar = Delta_bar * Delta_t; + scalar_type Ebar = Ee[-1][t]; + + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) { + qvec[g_id + 1][rank][t_i][alpha] = 0; + // q[g_id][t][alpha]=0; + } + // boundary between slice rank and rank-1 slice is trivial + ; // q[g_id][t][alpha]=q[g_id][t][alpha]; + // boundaries for branch alpha virtual branch. + if (1) { +#pragma omp parallel for schedule(dynamic, 1) // p9 + for (int branch_i = 0; branch_i < n; branch_i++) { + { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) { + qvec[g_id + 1][rank][t_i][e] = 1; + // q[g_id][t][e]=1; + } else { + qvec[g_id + 1][rank][t_i][e] = 0; + // q[g_id][t][e]=0; + } + } + // boundary between slice rank and rank-1 + else if (t_i == 0) { + // terminating branch is last in time_slices and defines + // a represented speciation + if (branch_i == n - 1 && rank > 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // q[g_id][t][e]=0; + + scalar_type SL_fLg = + qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = + qvec[g_id + 1][rank][t_i][g] * Eft; + // scalar_type SL_fLg=q[g_id][t][f]*Egt; + // scalar_type SL_Lfg=q[g_id][t][g]*Eft; + // SL EVENT + q_sum += SL_fLg + SL_Lfg; + // q[g_id][t][e]=q[g_id][t][f]*Egt + + // q[g_id][t][g]*Eft; SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = + qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = + qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // scalar_type + // S_pf_ppg=q[gp_id][t][f]*q[gpp_id][t][g]*pp; + // scalar_type + // S_ppf_pg=q[gpp_id][t][f]*q[gp_id][t][g]*pp; S + // EVENT + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. + } + qvec[g_id + 1][rank][t_i][e] = q_sum; + // q[g_id][t][e]=q_sum; + + } + // branches that cross to next time slice + else { + // trivial + ; // q[g_id][t][e]=q[g_id][t][e]; + } + } + // boundaries for branch e. + } + } + } + + if (1) { + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = + Ge[-1] + [t]; // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + // q[g_id][tpdt][alpha]=0; + scalar_type q_sum = 0; + scalar_type q_sum_nl = 0; +#pragma omp parallel for schedule(dynamic, 1) reduction(+ : q_sum_nl) // p10 + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // G_bar*=exp(- tau_e*Delta_t); + + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + // OMG + scalar_type p_Ntau_e = tau_e * Delta_t; + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type T_ep_app = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][e] * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + scalar_type T_ap_epp = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][e] * pp; + // scalar_type + // T_ep_app=p_Ntau_e*q[gp_id][t][e]*q[gpp_id][t][alpha]*pp; + // scalar_type + // T_ap_epp=p_Ntau_e*q[gp_id][t][alpha]*q[gpp_id][t][e]*pp; + // T EVENT + q_sum_nl += T_ep_app + T_ap_epp; + // q[g_id][tpdt][alpha]+=p_Ntau_e*(q[gp_id][t][e]*q[gpp_id][t][alpha]+q[gp_id][t][alpha]*q[gpp_id][t][e]); + // T. + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type Sb = p_Delta_bar * + (2 * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][alpha]) * + pp; + // scalar_type + // Sb=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha])*pp; + // S_bar EVENT + q_sum_nl += Sb; + // q[g_id][tpdt][alpha]+=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha]); + // S_bar. + } + if (tpdt_nl_is_t) + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] += q_sum_nl; + else + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] += q_sum_nl; + // q[g_id][tpdt_nl][alpha]+=q_sum_nl; + for (int branch_i = 0; branch_i < n; branch_i++) { + + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // OMG + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = + p_Ntau_e * Ebar * qvec[g_id + 1][rank][t_i][e]; + // scalar_type TLb=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar EVENT + q_sum += TLb; + // q[g_id][tpdt][alpha]+=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar. + } + // 0 EVENT + scalar_type empty = G_bar * qvec[g_id + 1][rank][t_i][alpha]; + // scalar_type empty=G_bar*q[g_id][t][alpha]; + q_sum += empty; + + // q[g_id][tpdt][alpha]+=G_bar*q[g_id][t][alpha]; + // 0. + // max + /* + if (max_termelapsed(); - //n cout << endl << it2->first << " "<< siz << " "< 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // q[g_id][t][e]=0; + + scalar_type SL_fLg = + qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = + qvec[g_id + 1][rank][t_i][g] * Eft; + // scalar_type SL_fLg=q[g_id][t][f]*Egt; + // scalar_type SL_Lfg=q[g_id][t][g]*Eft; + // SL EVENT + q_sum += SL_fLg + SL_Lfg; + // q[g_id][t][e]=q[g_id][t][f]*Egt + + // q[g_id][t][g]*Eft; SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = + qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = + qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // scalar_type + // S_pf_ppg=q[gp_id][t][f]*q[gpp_id][t][g]*pp; + // scalar_type + // S_ppf_pg=q[gpp_id][t][f]*q[gp_id][t][g]*pp; S + // EVENT + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. + } + qvec[g_id + 1][rank][t_i][e] = q_sum; + // q[g_id][t][e]=q_sum; + + } + // branches that cross to next time slice + else { + // trivial + ; // q[g_id][t][e]=q[g_id][t][e]; + } + } + // boundaries for branch e. + } + } + } + if (1) // Only dealing with T events, S_bar, TL_bar and naught + // events? + { + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = + Ge[-1] + [t]; // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + // #pragma omp critical + { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + // q[g_id][tpdt][alpha]=0; + } + scalar_type q_sum = 0; + scalar_type q_sum_nl = 0; + for (int branch_i = 0; branch_i < n; branch_i++) { + + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // G_bar*=exp(- tau_e*Delta_t); + + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + // OMG + scalar_type p_Ntau_e = tau_e * Delta_t; + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) // For each partition + { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type T_ep_app = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][e] * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + scalar_type T_ap_epp = + p_Ntau_e * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][e] * pp; + // scalar_type + // T_ep_app=p_Ntau_e*q[gp_id][t][e]*q[gpp_id][t][alpha]*pp; + // scalar_type + // T_ap_epp=p_Ntau_e*q[gp_id][t][alpha]*q[gpp_id][t][e]*pp; + // T EVENT + q_sum_nl += T_ep_app + T_ap_epp; + // q[g_id][tpdt][alpha]+=p_Ntau_e*(q[gp_id][t][e]*q[gpp_id][t][alpha]+q[gp_id][t][alpha]*q[gpp_id][t][e]); + // T. + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type Sb = p_Delta_bar * + (2 * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][alpha]) * + pp; + // scalar_type + // Sb=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha])*pp; + // S_bar EVENT Speciation in an unseen lineage? + q_sum_nl += Sb; + // q[g_id][tpdt][alpha]+=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha]); + // S_bar. + } + if (tpdt_nl_is_t) + qvec[g_id + 1][rank][t_i][alpha] += q_sum_nl; + else + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] += q_sum_nl; + // q[g_id][tpdt_nl][alpha]+=q_sum_nl; + for (int branch_i = 0; branch_i < n; branch_i++) { + + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + // OMG + // scalar_type p_Ntau_e=1-exp(-N*tau_e*Delta_t); + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = + p_Ntau_e * Ebar * qvec[g_id + 1][rank][t_i][e]; + // scalar_type TLb=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar EVENT + q_sum += TLb; + // q[g_id][tpdt][alpha]+=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar. + } + // 0 EVENT + scalar_type empty = G_bar * qvec[g_id + 1][rank][t_i][alpha]; + // scalar_type empty=G_bar*q[g_id][t][alpha]; + q_sum += empty; + + // q[g_id][tpdt][alpha]+=G_bar*q[g_id][t][alpha]; + // 0. + // max + /* + if (max_termelapsed(); + // n cout << "Outer loop parallelization: Nb Partitions: "<elapsed(); + // n cout << endl << it2->first << " "<< siz << " "<(tmp_node->getBranchProperty("ID")))).toSTL(); - if (tmp_q>0) - out << log(tmp_q) ; + string name = (* (dynamic_cast(tmp_node->getBranchProperty("ID")))).toSTL(); if (tmp_q>0) out << + log(tmp_q) ; //out << Ge[branch][t]; - tmp_node->setBranchProperty("ID",BppString(name+out.str().substr(0,4)+"|")); + tmp_node->setBranchProperty("ID",BppString(name+out.str().substr(0,4)+"|")); //if (tmp_node->isLeaf()) //tmp_node->setName(tmp_node->getName()+out.str().substr(0,4)+"|"); } } } cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) + for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ + ) (*it).first->setBranchProperty("ID",BppString("")); */ - //test + // test - //del-locs + // del-locs g_ids.clear(); g_id_sizes.clear(); - return root_sum; + return root_sum; } -void exODT_model::calculate_EGb() -{ +void exODT_model::calculate_EGb() { - for (std::map >::iterator it=Ee.begin();it!=Ee.end();it++)//del_loc + for (std::map>::iterator it = + Ee.begin(); + it != Ee.end(); it++) // del_loc (*it).second.clear(); Ee.clear(); - for (std::map >::iterator it=Ge.begin();it!=Ge.end();it++)//del_loc + for (std::map>::iterator it = + Ge.begin(); + it != Ge.end(); it++) // del_loc (*it).second.clear(); Ge.clear(); + map Ee_y; // del-loc + map Ge_y; // del-loc + map E_k1, E_k2, E_k3, E_k4; // del-loc + map G_k1, G_k2, G_k3, G_k4; // del-loc + + for (int rank = 0; rank < last_rank; rank++) + for (int tsi = 0; tsi < (int)time_slice_times[rank].size(); tsi++) { + map> y_E, y_G; // del-loc + map> iy_E, iy_G; // del-loc + + scalar_type t_b; + if (tsi == (int)time_slice_times[rank].size() - 1) + t_b = time_slice_begins[rank]; + else + t_b = time_slice_times[rank][tsi + 1]; + scalar_type t_e; + if (tsi == 0) { + if (rank > 0) + t_e = time_slice_begins[rank - 1]; + else + t_e = 0; + } else { + t_e = time_slice_times[rank][tsi]; + } + scalar_type N = vector_parameter["N"][rank]; + + scalar_type ni = time_slices[rank].size(); + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; // 1 + scalar_type Lambda_bar = + vector_parameter["Lambda_bar"][rank] * N / (N - ni); + ; - map Ee_y;//del-loc - map Ge_y;//del-loc - map E_k1,E_k2,E_k3,E_k4;//del-loc - map G_k1,G_k2,G_k3,G_k4;//del-loc + scalar_type t = t_e; + scalar_type tpdt = t_b; + scalar_type h = (tpdt - t) / scalar_parameter["DD"]; + // scalar_type ti=t; - for (int rank=0;rank > y_E,y_G;//del-loc - map > iy_E,iy_G;//del-loc - - scalar_type t_b; - if (tsi==(int)time_slice_times[rank].size()-1) - t_b = time_slice_begins[rank]; - else - t_b = time_slice_times[rank][tsi+1]; - scalar_type t_e; - if (tsi==0) - { - if (rank>0 ) - t_e = time_slice_begins[rank-1]; - else - t_e = 0; - } - else - { - t_e=time_slice_times[rank][tsi]; - } - scalar_type N=vector_parameter["N"][rank]; - - scalar_type ni=time_slices[rank].size(); - scalar_type Delta_bar=vector_parameter["Delta_bar"][rank];//1 - scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]*N/(N-ni);; - - scalar_type t=t_e; - scalar_type tpdt=t_b; - scalar_type h=(tpdt-t)/scalar_parameter["DD"]; - //scalar_type ti=t; - - scalar_type h_lambda_avg=h*scalar_parameter["lambda_avg"]; - scalar_type h_delta_avg=h*scalar_parameter["delta_avg"]; - scalar_type h_tau_avg=h*scalar_parameter["tau_avg"]*(N-ni)/(N-1)*N; - scalar_type h_Delta_bar=h*Delta_bar; - scalar_type h_Lambda_bar=h*Lambda_bar; - - - for (int ii=0;ii >::iterator it=y_E.begin();it!=y_E.end();it++) + E_k1.clear(); + E_k2.clear(); + E_k3.clear(); + E_k4.clear(); + G_k1.clear(); + G_k2.clear(); + G_k3.clear(); + G_k4.clear(); + /* + for (map >::iterator + it=y_E.begin();it!=y_E.end();it++) (*it).second.clear(); y_E.clear(); - for (map >::iterator it=y_G.begin();it!=y_G.end();it++) + for (map >::iterator + it=y_G.begin();it!=y_G.end();it++) (*it).second.clear(); - y_G.clear(); + y_G.clear(); */ } diff --git a/src/model_qvec.cpp b/src/model_qvec.cpp index 014af5f..d102b60 100644 --- a/src/model_qvec.cpp +++ b/src/model_qvec.cpp @@ -4,87 +4,92 @@ using namespace bpp; #include - -static double EPSILON = numeric_limits< double >::min(); - -//static double EPSILON = 10^-300; - -//p(ale) calculates Pi(Gamma) cf. ALEPAPER -scalar_type exODT_model::p(approx_posterior *ale) -{ - ale_pointer=ale; - //directed partitions and their sizes - vector g_ids;//del-loc - vector g_id_sizes;//del-loc - - //We sort the directed partitions by size (number of gene tree leaves) to ensure that we calculate things in the proper order (smaller to larger) - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root bipartition needs to be handled separately +static double EPSILON = numeric_limits::min(); + +// static double EPSILON = 10^-300; + +// p(ale) calculates Pi(Gamma) cf. ALEPAPER +scalar_type exODT_model::p(approx_posterior *ale) { + ale_pointer = ale; + // directed partitions and their sizes + vector g_ids; // del-loc + vector g_id_sizes; // del-loc + + // We sort the directed partitions by size (number of gene tree leaves) to + // ensure that we calculate things in the proper order (smaller to larger) + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); // gene<->species mapping - //vector > > > qvec; - qvec.clear();//hope this doesn't leak.. + // vector > > > qvec; + qvec.clear(); // hope this doesn't leak.. // gene<->species mapping - // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + // approx_posterior // { // long int g_id=g_ids[i]; // cerr<<"i: "< case vide - vector > > vrank; - vector > vt_i; - map vbranch; - vt_i.push_back(vbranch); - vrank.push_back(vt_i); - qvec.push_back(vrank); - } - else{ - //vector > > vrank; - vector > > vrank; - for (int rank=0;rank > vt_i; - vector > vt_i; - for (int t_i=0;t_i<(int)time_slice_times[rank].size();t_i++) //Going through the subslices - { - //cerr<<"\t\tt_i: "< vbranch(n, 0.); - map vbranch; - for (int branch_i=0;branch_i case vide + vector>> vrank; + vector> vt_i; + map vbranch; + vt_i.push_back(vbranch); + vrank.push_back(vt_i); + qvec.push_back(vrank); + } else { + // vector > > vrank; + vector>> vrank; + for (int rank = 0; rank < last_rank; + rank++) // Going through time slices, from leaves to root + { + // cerr<<"\trank: "< > vt_i; + vector> vt_i; + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); + t_i++) // Going through the subslices + { + // cerr<<"\t\tt_i: "< vbranch(n, 0.); + map vbranch; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + // cerr<<"\t\t\te: "< temp = ale->id_sets[g_id]; for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { @@ -94,731 +99,780 @@ scalar_type exODT_model::p(approx_posterior *ale) break; } }*/ - - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; - } + + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; } - - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - -// string gene_name=ale->id_leaves[ g_id ]; - // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - //p_parts is filled up with CCPs - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } + } + + string gene_name = ale->id_leaves[id /*g_id*/]; + + // string gene_name=ale->id_leaves[ g_id ]; + // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - long int gpp_id = ale->set_ids.at(not_gamma); - - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { - parts.push_back((*sit)); + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + // p_parts is filled up with CCPs + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } + else { + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + long int gpp_id = ale->set_ids.at(not_gamma); + + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { + parts.push_back((*sit)); + } + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else + // top of root ste + { + tpdt = t_begin[time_slices[rank][0]]; + tpdt_rank = rank; + tpdt_t_i = 0; + } + + // root + scalar_type Delta_t = (tpdt - t) * 1; + + // Delat_bar corresponds to sigma in ALEPAPER + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + // scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; + // scalar_type tmp + scalar_type p_Delta_bar = Delta_bar * Delta_t; + scalar_type Ebar = Ee[-1][t]; + + // boundaries for branch alpha virtual branch + + // boundary at present + if (t == 0) + qvec[g_id + 1][rank][t_i][alpha] = 0; + + // boundary between slice rank and rank-1 slice is trivial + ; // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + + // boundaries for branch alpha virtual branch. + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) { + qvec[g_id + 1][rank][t_i][e] = 1; + } else + qvec[g_id + 1][rank][t_i][e] = 0; } - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT, events #3 and #4 in part c of Fig.A1 in http://arxiv.org/abs/1211.4606 - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - q_sum+=SL_fLg+SL_Lfg; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // qvec[g_id+1][rank][t_i][e]=0; + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT, events #3 and #4 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; + q_sum += SL_fLg + SL_Lfg; + // SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // S EVENT, events #1 and #2 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. } - } - - //branches that cross to next time slice - else - { - //trivial - ;//qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; - } - } - //boundaries for branch e. - } - } - - if(1) - { - - //events within slice rank at time t on alpha virtual branch - scalar_type G_bar=Ge[-1][t]; - //note that the coalescent approximation in http://arxiv.org/abs/1211.4606 is exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); - - qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]=0; - scalar_type q_sum=0; - for (int branch_i=0;branch_i1) qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=1; - //events within slice rank at time t on branch e. - } - } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - } - } - gp_ids.clear(); - gpp_ids.clear(); - p_part.clear(); + } + // boundaries for branch e. + } + } + + if (1) { + + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = Ge[-1][t]; + // note that the coalescent approximation in + // http://arxiv.org/abs/1211.4606 is + // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + scalar_type q_sum = 0; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type T_ep_app = p_Ntau_e * + qvec[gp_id + 1][rank][t_i][e] * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + scalar_type T_ap_epp = p_Ntau_e * + qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][e] * pp; + // T EVENT, events #3 and #4 in part b of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Ntau_e*(qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][alpha]+qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][e]); + q_sum += T_ep_app + T_ap_epp; + // T. + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + + scalar_type Sb = p_Delta_bar * + (2 * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][alpha]) * + pp; + // S_bar EVENT, event #2 in part b of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (note that Delta_bar corresponds + //to sigma, the Delta_bar,Lambda_bar distinction keeps track of + //speciaiton (birth) vs extiction (death), + // but for the Moran process Delta_bar=Lambda_bar=sigma ) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Delta_bar*(2*qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][alpha]); + q_sum += Sb; + // S_bar. + } + + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = p_Ntau_e * Ebar * qvec[g_id + 1][rank][t_i][e]; + // TL_bar EVENT, event #5 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (note that since Ebar ~ 1, most + //transfers are expected to involve the TL evenet not the T event, + // this should not be confused with the TL event of the + // Tofigh/Doyon/ODTL models, which here corresponds + // to SL_bar + TL ..) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Ntau_e*Ebar*qvec[g_id+1][rank][t_i][e]; + q_sum += TLb; + // TL_bar. + } + scalar_type empty = G_bar * qvec[g_id + 1][rank][t_i][alpha]; + // 0 EVENT, event #1 in part b of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=G_bar*qvec[g_id+1][rank][t_i][alpha]; + q_sum += empty; + // 0. + + // UNDERFLOW ? + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] += q_sum; + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] < EPSILON) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = EPSILON; + } + + // events within slice rank at time t on alpha virtual branch. + } + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type Get = Ge[e][t]; + scalar_type Eet = Ee[e][t]; + scalar_type delta_e = vector_parameter["delta"][e]; + scalar_type p_delta_e = delta_e * Delta_t; + + // events within slice rank at time t on branch e + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = 0; + scalar_type q_sum = 0; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type qpe = qvec[gp_id + 1][rank][t_i][e]; + scalar_type qppe = qvec[gpp_id + 1][rank][t_i][e]; + scalar_type Sb_pa_ppe = + p_Delta_bar * qvec[gp_id + 1][rank][t_i][alpha] * qppe * pp; + scalar_type Sb_pe_ppa = + p_Delta_bar * qpe * qvec[gpp_id + 1][rank][t_i][alpha] * pp; + + // S_bar EVENT, events #3 and #4 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (The majority of transfer + //events involve this event.) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*(qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][e]+qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][alpha]); + q_sum += Sb_pa_ppe + Sb_pe_ppa; + // S_bar. + + scalar_type D = 2 * p_delta_e * qpe * qppe * pp; + // D EVENT, event #2 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_delta_e*qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][e]; + q_sum += D; + // D. + } + + scalar_type SLb = + p_Delta_bar * Eet * qvec[g_id + 1][rank][t_i][alpha]; + // SL_bar EVENT, event #5 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (Transfer events where the donor + //copy is lost involve this event.) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*Eet*qvec[g_id+1][rank][t_i][alpha]; + q_sum += SLb; + // SL_bar. + + scalar_type empty = Get * qvec[g_id + 1][rank][t_i][e]; + // 0 EVENT, event #1 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=Get*qvec[g_id+1][rank][t_i][e]; + q_sum += empty; + // 0. + + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] += q_sum; + // UNDERFLOW? + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] < EPSILON) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = EPSILON; + } + + // if (qvec[g_id+1][tpdt_rank][tpdt_t_i][e]>1) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=1; events within slice rank + // at time t on branch e. + } + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + } } - - scalar_type root_norm=0; - for (int rank=0;rank >::iterator it=Ee.begin();it!=Ee.end();it++)//del_loc + for (std::map>::iterator it = + Ee.begin(); + it != Ee.end(); it++) // del_loc (*it).second.clear(); Ee.clear(); - for (std::map >::iterator it=Ge.begin();it!=Ge.end();it++)//del_loc + for (std::map>::iterator it = + Ge.begin(); + it != Ge.end(); it++) // del_loc (*it).second.clear(); Ge.clear(); + map Ee_y; // del-loc + map Ge_y; // del-loc + map E_k1, E_k2, E_k3, E_k4; // del-loc + map G_k1, G_k2, G_k3, G_k4; // del-loc - map Ee_y;//del-loc - map Ge_y;//del-loc - map E_k1,E_k2,E_k3,E_k4;//del-loc - map G_k1,G_k2,G_k3,G_k4;//del-loc - - map tmp; //XX - tmp[0]=1; - tmp[1]=1; + map tmp; // XX + tmp[0] = 1; + tmp[1] = 1; - for (int rank=0;rank > y_E,y_G;//del-loc - map > iy_E,iy_G;//del-loc - - scalar_type t_b; - if (tsi==(int)time_slice_times[rank].size()-1) - t_b = time_slice_begins[rank]; - else - t_b = time_slice_times[rank][tsi+1]; - scalar_type t_e; - if (tsi==0) - { - if (rank>0 ) - t_e = time_slice_begins[rank-1]; - else - t_e = 0; - } - else - { - t_e=time_slice_times[rank][tsi]; - } - scalar_type N=vector_parameter["N"][rank]; - - scalar_type ni=time_slices[rank].size(); - scalar_type Delta_bar=vector_parameter["Delta_bar"][rank];//1 - scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]*N/(N-ni);; - scalar_type t=t_e; - scalar_type tpdt=t_b; - scalar_type h=(tpdt-t)/scalar_parameter["DD"]; - //scalar_type ti=t; - scalar_type h_lambda_avg=h*scalar_parameter["lambda_avg"]; - scalar_type h_delta_avg=h*scalar_parameter["delta_avg"]; - scalar_type h_tau_avg=h*scalar_parameter["tau_avg"]*(N-ni)/(N-1)*N; - scalar_type h_Delta_bar=h*Delta_bar; - scalar_type h_Lambda_bar=h*Lambda_bar; - - - for (int ii=0;ii> y_E, y_G; // del-loc + map> iy_E, iy_G; // del-loc + + scalar_type t_b; + if (tsi == (int)time_slice_times[rank].size() - 1) + t_b = time_slice_begins[rank]; + else + t_b = time_slice_times[rank][tsi + 1]; + scalar_type t_e; + if (tsi == 0) { + if (rank > 0) + t_e = time_slice_begins[rank - 1]; + else + t_e = 0; + } else { + t_e = time_slice_times[rank][tsi]; + } + scalar_type N = vector_parameter["N"][rank]; + + scalar_type ni = time_slices[rank].size(); + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; // 1 + scalar_type Lambda_bar = + vector_parameter["Lambda_bar"][rank] * N / (N - ni); + ; + scalar_type t = t_e; + scalar_type tpdt = t_b; + scalar_type h = (tpdt - t) / scalar_parameter["DD"]; + // scalar_type ti=t; + scalar_type h_lambda_avg = h * scalar_parameter["lambda_avg"]; + scalar_type h_delta_avg = h * scalar_parameter["delta_avg"]; + scalar_type h_tau_avg = + h * scalar_parameter["tau_avg"] * (N - ni) / (N - 1) * N; + scalar_type h_Delta_bar = h * Delta_bar; + scalar_type h_Lambda_bar = h * Lambda_bar; + + for (int ii = 0; ii < scalar_parameter["DD"]; ii++) { + + // intial conditions + if (ii == 0) { + if (t == 0) + Ee[-1][t] = 1; + // trivial else Ee[-1][t]=Ee[-1][t]; + + // y_E[-1][t]=Ee[-1][t]; + iy_E[-1][ii] = Ee[-1][t]; + + // Ee_y[-1]=y_E[-1][t]; + Ee_y[-1] = iy_E[-1][ii]; + + Ge_y[-1] = 1; + + // y_G[-1][t]=1; + iy_G[-1][ii] = 1; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + if (ii == 0) { + if (t == 0) { + Ee[e][t] = 0; + } else if (t == t_end[e]) { + int f = daughters[e][0]; + int g = daughters[e][1]; + Ee[e][t] = Ee[f][t] * Ee[g][t]; + } + // trivial else{Ee[e][t]=Ee[e][t];} + // y_E[e][t]=Ee[e][t]; + iy_E[e][ii] = Ee[e][t]; + + // Ee_y[e]=y_E[e][t]; + Ee_y[e] = iy_E[e][ii]; + + Ge_y[e] = 1; + + // y_G[e][t]=1; + iy_G[e][ii] = 1; + } + } + // RK4: 4th order Runge-Kutta for y'=f(y) + // k1 = f(y[n]) + E_k1[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k1[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k1[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k1[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k1 = f(y[n]) + E_k1[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k1[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + // k2 = f(y[n]+h/2 k1) + + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k1[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k1[-1]; + // Ge_y[-1]=y_G[-1][ti]+1/2.* G_k1[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 / 2. * G_k1[-1]; + + E_k2[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k2[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k2[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k2[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k2 = f(y[n]+h/2 k1) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k1[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k1[e]; + // Ge_y[e] =y_G[e][ti]+1/2. * G_k1[e]; + Ge_y[e] = iy_G[e][ii] + 1 / 2. * G_k1[e]; + + E_k2[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k2[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + + // k3 = f(y[n]+h/2 k2) + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k2[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k2[-1]; + // Ge_y[-1]=y_G[-1][ti]+1/2.* G_k2[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 / 2. * G_k2[-1]; + + E_k3[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k3[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k3[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k3[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k3 = f(y[n]+h/2 k2) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k2[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k2[e]; + // Ge_y[e] =y_G[e][ti]+1/2. * G_k2[e]; + Ge_y[e] = iy_G[e][ii] + 1 / 2. * G_k2[e]; + + E_k3[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k3[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + + // k4 = f(y[n]+h k3) + // Ee_y[-1]=y_E[-1][ti]+1* E_k3[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 * E_k3[-1]; + // Ge_y[-1]=y_G[-1][ti]+1* G_k3[-1]; + Ge_y[-1] = iy_G[-1][ii] + 1 * G_k3[-1]; + + E_k4[-1] = + (h_Lambda_bar + h_lambda_avg) * (1 - Ee_y[-1]) - + (h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - Ee_y[-1]) * Ee_y[-1]; + G_k4[-1] = + -((h_Delta_bar + h_delta_avg + h_tau_avg) * (1 - 2 * Ee_y[-1]) + + (h_Lambda_bar + h_lambda_avg)) * + Ge_y[-1]; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type h_tau_f = h * vector_parameter["tau"][f]; + E_k4[-1] -= h_tau_f * (1 - Ee_y[f]) * Ee_y[-1]; + G_k4[-1] -= h_tau_f * (1 - Ee_y[f]) * Ge_y[-1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + scalar_type h_lambda = h * lambda; + scalar_type h_delta = h * delta; + + // k4 = f(y[n]+h k3) + // Ee_y[e] =y_E[e][ti]+1 * E_k3[e]; + Ee_y[e] = iy_E[e][ii] + 1 * E_k3[e]; + + // Ge_y[e] =y_G[e][ti]+1 * G_k3[e]; + Ge_y[e] = iy_G[e][ii] + 1 * G_k3[e]; + + E_k4[e] = h_lambda * (1 - Ee_y[e]) - + (h_delta * (1 - Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ee_y[e]; + G_k4[e] = -1 * + (h_lambda + h_delta * (1 - 2 * Ee_y[e]) + + (h_Delta_bar + h_tau_avg) * (1 - Ee_y[-1])) * + Ge_y[e]; + } + // y[n+1] = y[n] + h/6 (k1 + 2 k2 + 2 k3 + k4) + // y_E[-1][ti+h]=Ee_y[-1] + 1/6. * (E_k1[-1] + 2*E_k2[-1] + 2*E_k3[-1] + + // E_k4[-1]); iy_E[-1][ii+1]=Ee_y[-1] + 1/6. * (E_k1[-1] + 2*E_k2[-1] + + // 2*E_k3[-1] + E_k4[-1]); + + ///* + if (ii == 0) + iy_E[-1][ii + 1] = + Ee[-1][t] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + else + iy_E[-1][ii + 1] = + iy_E[-1][ii] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + //*/ + // y_G[-1][ti+h]=Ge_y[-1] + 1/6. * (G_k1[-1] + 2*G_k2[-1] + 2*G_k3[-1] + + // G_k4[-1]); iy_G[-1][ii+1]=Ge_y[-1] + 1/6. * (G_k1[-1] + 2*G_k2[-1] + + // 2*G_k3[-1] + G_k4[-1]); + + if (ii == 0) + iy_G[-1][ii + 1] = + 1 + 1 / 6. * (G_k1[-1] + 2 * G_k2[-1] + 2 * G_k3[-1] + G_k4[-1]); + else + iy_G[-1][ii + 1] = + iy_G[-1][ii] + + 1 / 6. * (G_k1[-1] + 2 * G_k2[-1] + 2 * G_k3[-1] + G_k4[-1]); + + if (ii == scalar_parameter["DD"] - 1) { + // Ee[-1][tpdt]=y_E[-1][ti+h]; + Ee[-1][tpdt] = iy_E[-1][ii + 1]; + + // Ge[-1][t]=y_G[-1][ti+h]; + Ge[-1][t] = iy_G[-1][ii + 1]; + + // cout << -1 << " " << t << " " << Ee[-1][tpdt] << " " << + // Ge[-1][t]< >::iterator it=y_E.begin();it!=y_E.end();it++) + E_k1.clear(); + E_k2.clear(); + E_k3.clear(); + E_k4.clear(); + G_k1.clear(); + G_k2.clear(); + G_k3.clear(); + G_k4.clear(); + /* + for (map >::iterator + it=y_E.begin();it!=y_E.end();it++) (*it).second.clear(); y_E.clear(); - for (map >::iterator it=y_G.begin();it!=y_G.end();it++) + for (map >::iterator + it=y_G.begin();it!=y_G.end();it++) (*it).second.clear(); - y_G.clear(); + y_G.clear(); */ } diff --git a/src/model_scaled.cpp b/src/model_scaled.cpp index 4d7a8ad..56b72fd 100644 --- a/src/model_scaled.cpp +++ b/src/model_scaled.cpp @@ -4,87 +4,93 @@ using namespace bpp; #include - -static double EPSILON = numeric_limits< double >::min(); - -//static double EPSILON = 10^-300; - -//p(ale) calculates Pi(Gamma) cf. ALEPAPER -scalar_type exODT_model::p(approx_posterior *ale) -{ - ale_pointer=ale; - //directed partitions and their sizes - vector g_ids;//del-loc - vector g_id_sizes;//del-loc - - //We sort the directed partitions by size (number of gene tree leaves) to ensure that we calculate things in the proper order (smaller to larger) - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root bipartition needs to be handled separately +static double EPSILON = numeric_limits::min(); + +// static double EPSILON = 10^-300; + +// p(ale) calculates Pi(Gamma) cf. ALEPAPER +scalar_type exODT_model::p(approx_posterior *ale) { + ale_pointer = ale; + // directed partitions and their sizes + vector g_ids; // del-loc + vector g_id_sizes; // del-loc + + // We sort the directed partitions by size (number of gene tree leaves) to + // ensure that we calculate things in the proper order (smaller to larger) + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); // gene<->species mapping - //vector > > > qvec; - qvec.clear();//hope this doesn't leak.. + // vector > > > qvec; + qvec.clear(); // hope this doesn't leak.. // gene<->species mapping - // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + // approx_posterior // { // long int g_id=g_ids[i]; // cerr<<"i: "< case vide - vector > > vrank; - vector > vt_i; - map vbranch; - vt_i.push_back(vbranch); - vrank.push_back(vt_i); - qvec.push_back(vrank); - } - else{ - //vector > > vrank; - vector > > vrank; - for (int rank=0;rank > vt_i; - vector > vt_i; - for (int t_i=0;t_i<(int)time_slice_times[rank].size()+(rank==last_rank-1) ;t_i++) //Going through the subslices - { - //cerr<<"\t\tt_i: "< vbranch(n, 0.); - map vbranch; - for (int branch_i=0;branch_i case vide + vector>> vrank; + vector> vt_i; + map vbranch; + vt_i.push_back(vbranch); + vrank.push_back(vt_i); + qvec.push_back(vrank); + } else { + // vector > > vrank; + vector>> vrank; + for (int rank = 0; rank < last_rank; + rank++) // Going through time slices, from leaves to root + { + // cerr<<"\trank: "< > vt_i; + vector> vt_i; + for (int t_i = 0; + t_i < (int)time_slice_times[rank].size() + (rank == last_rank - 1); + t_i++) // Going through the subslices + { + // cerr<<"\t\tt_i: "< vbranch(n, 0.); + map vbranch; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + // cerr<<"\t\t\te: "< temp = ale->id_sets[g_id]; for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { @@ -94,636 +100,657 @@ scalar_type exODT_model::p(approx_posterior *ale) break; } }*/ - - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; - } + + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; } - - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - -// string gene_name=ale->id_leaves[ g_id ]; - // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - //p_parts is filled up with CCPs - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } + } + + string gene_name = ale->id_leaves[id /*g_id*/]; + + // string gene_name=ale->id_leaves[ g_id ]; + // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale->id_sets.at(gp_id); - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - long int gpp_id = ale->set_ids.at(not_gamma); - - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { - parts.push_back((*sit)); + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + // p_parts is filled up with CCPs + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } + else { + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + long int gpp_id = ale->set_ids.at(not_gamma); + + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { + parts.push_back((*sit)); + } + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else + // top of root stem + { + tpdt = t_begin[time_slices[rank][0]]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } + + // root + scalar_type Delta_t = (tpdt - t) * 1; + // Delat_bar corresponds to \hat \sigma + scalar_type ni = time_slices[rank].size(); + scalar_type delta_avg = scalar_parameter["delta_avg"]; + scalar_type tau_avg = scalar_parameter["tau_avg"]; + scalar_type lambda_avg = scalar_parameter["lambda_avg"]; + scalar_type sigma_hat = scalar_parameter["sigma_hat"]; + scalar_type H_hat = Ee[-1][t]; + + // boundaries for branch alpha virtual branch + + // boundary at present + if (t == 0) + qvec[g_id + 1][rank][t_i][alpha] = 0; + + // boundary between slice rank and rank-1 slice is trivial + ; // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + + // boundaries for branch alpha virtual branch. + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) + qvec[g_id + 1][rank][t_i][e] = 1; + else + qvec[g_id + 1][rank][t_i][e] = 0; } - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT, events #3 and #4 in part c of Fig.A1 in http://arxiv.org/abs/1211.4606 - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - q_sum+=SL_fLg+SL_Lfg; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + scalar_type q_sum = 0; + // qvec[g_id+1][rank][t_i][e]=0; + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT, events #3 and #4 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; + q_sum += SL_fLg + SL_Lfg; + // SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // S EVENT, events #1 and #2 in part c of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + q_sum += S_pf_ppg + S_ppf_pg; + // S. } - else { - qvec[g_id+1][rank][t_i][e] = q_sum; + // UNDERFLOW ? + if (q_sum < EPSILON) { + qvec[g_id + 1][rank][t_i][e] = EPSILON; + } else { + qvec[g_id + 1][rank][t_i][e] = q_sum; + } + } + + // branches that cross to next time slice + else { + // trivial + ; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + } + // boundaries for branch e. + } + } + + if (1) { + + // events within slice rank at time t on alpha virtual branch + // scalar_type G_bar=Ge[-1][t]; + // note that the coalescent approximation in + // http://arxiv.org/abs/1211.4606 is + // exp(-(Delta_bar*(n-N)/N+Lambda_bar)*Delta_t ); + + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + scalar_type q_sum = 0; + /* vanishes in the scaling limit + for (int branch_i=0;branch_i1) qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=1; - //events within slice rank at time t on branch e. - } - } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - } - } - gp_ids.clear(); - gpp_ids.clear(); - p_part.clear(); + + scalar_type SLb = + sigma_hat * Delta_t * Eet * qvec[g_id + 1][rank][t_i][alpha]; + // SL_bar EVENT, event #5 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 (Transfer events where the donor + //copy is lost involve this event.) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*Eet*qvec[g_id+1][rank][t_i][alpha]; + q_sum += SLb; + // SL_bar. + + scalar_type empty = (1 + (2 * delta_e * Eet - sigma_hat * H_hat - + delta_e - lambda_e) * + Delta_t) * + qvec[g_id + 1][rank][t_i][e]; + // 0 EVENT, event #1 in part a of Fig.A1 in + // http://arxiv.org/abs/1211.4606 + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=Get*qvec[g_id+1][rank][t_i][e]; + q_sum += empty; + // 0. + + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] += q_sum; + // UNDERFLOW? + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] < EPSILON) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = EPSILON; + } + + // if (qvec[g_id+1][tpdt_rank][tpdt_t_i][e]>1) + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=1; events within slice rank + // at time t on branch e. + } + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + } } - - scalar_type survive=0; - scalar_type root_sum=0; - for (int rank=0;rank >::iterator it=Ee.begin();it!=Ee.end();it++)//del_loc + for (std::map>::iterator it = + Ee.begin(); + it != Ee.end(); it++) // del_loc (*it).second.clear(); Ee.clear(); - for (std::map >::iterator it=Ge.begin();it!=Ge.end();it++)//del_loc + for (std::map>::iterator it = + Ge.begin(); + it != Ge.end(); it++) // del_loc (*it).second.clear(); Ge.clear(); + map Ee_y; // del-loc + map E_k1, E_k2, E_k3, E_k4; // del-loc - map Ee_y;//del-loc - map E_k1,E_k2,E_k3,E_k4;//del-loc - + for (int rank = 0; rank < last_rank; rank++) + for (int tsi = 0; tsi < (int)time_slice_times[rank].size(); tsi++) { + map> y_E; // del-loc + map> iy_E; // del-loc - for (int rank=0;rank > y_E;//del-loc - map > iy_E;//del-loc - - scalar_type t_b; - if (tsi==(int)time_slice_times[rank].size()-1) - t_b = time_slice_begins[rank]; - else - t_b = time_slice_times[rank][tsi+1]; - scalar_type t_e; - if (tsi==0) - { - if (rank>0 ) - t_e = time_slice_begins[rank-1]; - else - t_e = 0; - } - else - { - t_e=time_slice_times[rank][tsi]; - } - scalar_type sigma_hat=scalar_parameter["sigma_hat"];//vector_parameter["sigma_hat"][rank];//this we usually set to 1 - scalar_type t=t_e; - scalar_type tpdt=t_b; - scalar_type h=(tpdt-t)/scalar_parameter["DD"]; - //scalar_type ti=t; - scalar_type ni=time_slices[rank].size(); - scalar_type delta_avg=scalar_parameter["delta_avg"]; - scalar_type tau_avg=scalar_parameter["tau_avg"]; - scalar_type lambda_avg=scalar_parameter["lambda_avg"]; - scalar_type Delta_bar=Delta_bar; - scalar_type Lambda_bar=Lambda_bar; - - - for (int ii=0;ii 0) + t_e = time_slice_begins[rank - 1]; + else + t_e = 0; + } else { + t_e = time_slice_times[rank][tsi]; + } + scalar_type sigma_hat = + scalar_parameter["sigma_hat"]; // vector_parameter["sigma_hat"][rank];//this + // we usually set to 1 + scalar_type t = t_e; + scalar_type tpdt = t_b; + scalar_type h = (tpdt - t) / scalar_parameter["DD"]; + // scalar_type ti=t; + scalar_type ni = time_slices[rank].size(); + scalar_type delta_avg = scalar_parameter["delta_avg"]; + scalar_type tau_avg = scalar_parameter["tau_avg"]; + scalar_type lambda_avg = scalar_parameter["lambda_avg"]; + scalar_type Delta_bar = Delta_bar; + scalar_type Lambda_bar = Lambda_bar; + + for (int ii = 0; ii < scalar_parameter["DD"]; ii++) { + + // intial conditions + if (ii == 0) { + if (t == 0) + Ee[-1][t] = 0; // this is \bar H + // trivial else Ee[-1][t]=Ee[-1][t]; + // y_E[-1][t]=Ee[-1][t]; + iy_E[-1][ii] = Ee[-1][t]; + + // Ee_y[-1]=y_E[-1][t]; + Ee_y[-1] = iy_E[-1][ii]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + if (ii == 0) { + if (t == 0) { + Ee[e][t] = 0; + } else if (t == t_end[e]) { + int f = daughters[e][0]; + int g = daughters[e][1]; + Ee[e][t] = Ee[f][t] * Ee[g][t]; + } + // trivial else{Ee[e][t]=Ee[e][t];} + // y_E[e][t]=Ee[e][t]; + iy_E[e][ii] = Ee[e][t]; + + // Ee_y[e]=y_E[e][t]; + Ee_y[e] = iy_E[e][ii]; + } + } + // RK4: 4th order Runge-Kutta for y'=f(y) + // k1 = f(y[n]) + E_k1[-1] = + (-sigma_hat) * Ee_y[-1] * Ee_y[-1] * h + + (delta_avg + tau_avg - lambda_avg - sigma_hat * ni) * Ee_y[-1] * h; + + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type tau_f = vector_parameter["tau"][f]; + E_k1[-1] += tau_f * (1 - Ee_y[f]) * h; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + + // k1 = f(y[n]) + E_k1[e] = delta * Ee_y[e] * Ee_y[e] * h + + (-sigma_hat * Ee_y[-1] - delta - lambda) * Ee_y[e] * h + + lambda * h; + } + // k2 = f(y[n]+h/2 k1) + + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k1[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k1[-1]; + + E_k2[-1] = + (-sigma_hat) * Ee_y[-1] * Ee_y[-1] * h + + (delta_avg + tau_avg - lambda_avg - sigma_hat * ni) * Ee_y[-1] * h; + + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type tau_f = vector_parameter["tau"][f]; + E_k2[-1] += tau_f * (1 - Ee_y[f]) * h; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + + // k2 = f(y[n]+h/2 k1) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k1[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k1[e]; + + E_k2[e] = delta * Ee_y[e] * Ee_y[e] * h + + (-sigma_hat * Ee_y[-1] - delta - lambda) * Ee_y[e] * h + + lambda * h; + } + + // k3 = f(y[n]+h/2 k2) + // Ee_y[-1]=y_E[-1][ti]+1/2.* E_k2[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 / 2. * E_k2[-1]; + + E_k3[-1] = + (-sigma_hat) * Ee_y[-1] * Ee_y[-1] * h + + (delta_avg + tau_avg - lambda_avg - sigma_hat * ni) * Ee_y[-1] * h; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type tau_f = vector_parameter["tau"][f]; + E_k3[-1] += tau_f * (1 - Ee_y[f]) * h; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + + // k3 = f(y[n]+h/2 k2) + // Ee_y[e] =y_E[e][ti]+1/2. * E_k2[e]; + Ee_y[e] = iy_E[e][ii] + 1 / 2. * E_k2[e]; + + E_k3[e] = delta * Ee_y[e] * Ee_y[e] * h + + (-sigma_hat * Ee_y[-1] - delta - lambda) * Ee_y[e] * h + + lambda * h; + } + + // k4 = f(y[n]+h k3) + // Ee_y[-1]=y_E[-1][ti]+1* E_k3[-1]; + Ee_y[-1] = iy_E[-1][ii] + 1 * E_k3[-1]; + + E_k4[-1] = + (-sigma_hat) * Ee_y[-1] * Ee_y[-1] * h + + (delta_avg + tau_avg - lambda_avg - sigma_hat * ni) * Ee_y[-1] * h; + for (int j = 0; j < (int)time_slices[rank].size(); j++) { + int f = time_slices[rank][j]; + scalar_type tau_f = vector_parameter["tau"][f]; + E_k4[-1] += tau_f * (1 - Ee_y[f]) * h; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + scalar_type delta = vector_parameter["delta"][e]; + scalar_type lambda = vector_parameter["lambda"][e]; + + // k4 = f(y[n]+h k3) + // Ee_y[e] =y_E[e][ti]+1 * E_k3[e]; + Ee_y[e] = iy_E[e][ii] + 1 * E_k3[e]; + + E_k4[e] = delta * Ee_y[e] * Ee_y[e] * h + + (-sigma_hat * Ee_y[-1] - delta - lambda) * Ee_y[e] * h + + lambda * h; + } + if (ii == 0) + iy_E[-1][ii + 1] = + Ee[-1][t] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + else + iy_E[-1][ii + 1] = + iy_E[-1][ii] + + 1 / 6. * (E_k1[-1] + 2 * E_k2[-1] + 2 * E_k3[-1] + E_k4[-1]); + + if (ii == scalar_parameter["DD"] - 1) { + // Ee[-1][tpdt]=y_E[-1][ti+h]; + Ee[-1][tpdt] = iy_E[-1][ii + 1]; + } + + for (int i = 0; i < (int)time_slices[rank].size(); i++) { + int e = time_slices[rank][i]; + // y[n+1] = y[n] + h/6 (k1 + 2 k2 + 2 k3 + k4) + if (ii == 0) + iy_E[e][ii + 1] = + Ee[e][t] + + 1 / 6. * (E_k1[e] + 2 * E_k2[e] + 2 * E_k3[e] + E_k4[e]); + else + iy_E[e][ii + 1] = + iy_E[e][ii] + + 1 / 6. * (E_k1[e] + 2 * E_k2[e] + 2 * E_k3[e] + E_k4[e]); + + if (ii == scalar_parameter["DD"] - 1) { + // Ee[e][tpdt]=y_E[e][ti+h]; + Ee[e][tpdt] = iy_E[e][ii + 1]; + // if (e<1) cout << e << " " << t << " " << Ee[e][tpdt] << " " << + // Ee[-1][tpdt] < -#include #include #include +#include +#include using namespace std; using namespace bpp; using namespace boost::mpi; -string random_tree_newick(string Sstring) -{ - tree_type * T=TreeTemplateTools::parenthesisToTree(Sstring,false); - vector random_tree_population; - map random_tree_ages; - vector leaf_names=T->getLeavesNames(); - for (vector::iterator it=leaf_names.begin();it!=leaf_names.end();it++) - { - random_tree_population.push_back((*it)); - random_tree_ages[(*it)]=0; - } - scalar_type t=0; - while(random_tree_population.size()>1) - { - int Nr=random_tree_population.size(); - scalar_type t_next=RandomTools::randExponential(1./(2*Nr)); - t+=t_next; - int i=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - int j=i; - while (i==j) j=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - stringstream tmp; - tmp<<"("<i) - { - random_tree_population.erase(random_tree_population.begin()+j); - random_tree_population.erase(random_tree_population.begin()+i); - } - else - { - random_tree_population.erase(random_tree_population.begin()+i); - random_tree_population.erase(random_tree_population.begin()+j); - } - random_tree_population.push_back(tmp.str()); +string random_tree_newick(string Sstring) { + tree_type *T = TreeTemplateTools::parenthesisToTree(Sstring, false); + vector random_tree_population; + map random_tree_ages; + vector leaf_names = T->getLeavesNames(); + for (vector::iterator it = leaf_names.begin(); it != leaf_names.end(); + it++) { + random_tree_population.push_back((*it)); + random_tree_ages[(*it)] = 0; + } + scalar_type t = 0; + while (random_tree_population.size() > 1) { + int Nr = random_tree_population.size(); + scalar_type t_next = RandomTools::randExponential(1. / (2 * Nr)); + t += t_next; + int i = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + int j = i; + while (i == j) + j = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + stringstream tmp; + tmp << "(" << random_tree_population[i] << ":" + << t - random_tree_ages[random_tree_population[i]] << "," + << random_tree_population[j] << ":" + << t - random_tree_ages[random_tree_population[j]] << ")"; + random_tree_ages[tmp.str()] = t; + if (j > i) { + random_tree_population.erase(random_tree_population.begin() + j); + random_tree_population.erase(random_tree_population.begin() + i); + } else { + random_tree_population.erase(random_tree_population.begin() + i); + random_tree_population.erase(random_tree_population.begin() + j); } + random_tree_population.push_back(tmp.str()); + } - //cout << random_tree_population[0]<<";" << endl; + // cout << random_tree_population[0]<<";" << endl; - return random_tree_population[0]+";"; + return random_tree_population[0] + ";"; } - -int main(int argc, char ** argv) -{ - map ll_cache; +int main(int argc, char **argv) { + map ll_cache; environment env(argc, argv); communicator world; - int done=1; - int it_num=2; - bool bw=1; - int N_SPR=50; - ifstream file_stream_S (argv[1]); + int done = 1; + int it_num = 2; + bool bw = 1; + int N_SPR = 50; + ifstream file_stream_S(argv[1]); string Sstring; - getline (file_stream_S,Sstring); - string Rstring=Sstring; - if (atoi(argv[3])==1) Rstring=random_tree_newick(Sstring); - map parameters; - mpi_tree * infer_tree = new mpi_tree(Sstring,world,parameters,true); + getline(file_stream_S, Sstring); + string Rstring = Sstring; + if (atoi(argv[3]) == 1) + Rstring = random_tree_newick(Sstring); + map parameters; + mpi_tree *infer_tree = new mpi_tree(Sstring, world, parameters, true); infer_tree->load_distributed_ales(argv[2]); scalar_type Sll = infer_tree->calculate_pun(it_num); - - string old_S=Rstring; - string max_S=Rstring; + string old_S = Rstring; + string max_S = Rstring; infer_tree->model->construct_undated(max_S); - scalar_type max_ll = infer_tree->calculate_pun(it_num,bw); + scalar_type max_ll = infer_tree->calculate_pun(it_num, bw); scalar_type new_ll; infer_tree->gather_T_to_from(); - //infer_tree->model->construct_undated(max_S); - //infer_tree->calculate_pun(it_num,1); - //infer_tree->gather_T_to_from(); - - bool changed=true; - bool last=false; - while (changed) - { - changed=false; - infer_tree->model->construct_undated(max_S); - - if (world.rank()==0) cout <<"@ll "<< max_ll << " " << Sll << " " << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(max_S),*TreeTemplateTools::parenthesisToTree(Sstring)) << endl; - - - string old_S=max_S; - - vector sorted_e; - vector sorted_f; - vector sorted_Ts; - - if (world.rank()==0) - for (map >::iterator it=infer_tree->sort_e.begin();it!=infer_tree->sort_e.end();it++) - { - scalar_type Ts=(*it).first; - for (int i=0;i<(*it).second.size();i++) - { - int e=infer_tree->sort_e[Ts][i]; - int f=infer_tree->sort_f[Ts][i]; - sorted_e.push_back(e); - sorted_f.push_back(f); - sorted_Ts.push_back(Ts); - } - } - broadcast(world,sorted_e,0); - broadcast(world,sorted_f,0); - for (int i=0;imodel->construct_undated(old_S); - - string new_S=infer_tree->model->feSPR(sorted_e[i],sorted_f[i]); - if (ll_cache.count(new_S)==0) - { - infer_tree->model->construct_undated(new_S); - new_ll= infer_tree->calculate_pun(it_num,bw); - ll_cache[new_S]=new_ll; - } - else - { - new_ll=ll_cache[new_S]; - } - if (world.rank()==0) - { - if (sorted_e[i]model->last_leaf) - cout << " " << infer_tree->model->node_name[infer_tree->model->id_nodes[sorted_e[i]]]; - else - cout << " " << sorted_e[i]; - if (sorted_f[i]model->last_leaf) - cout << "->" << infer_tree->model->node_name[infer_tree->model->id_nodes[sorted_f[i]]]; - else - cout << "->" << sorted_f[i]; - cout << " with: " << -sorted_Ts[i]<< " Ts " ; //" " << new_S << endl; - } - if (world.rank()==0) cout << new_ll << " " << max_ll << " " << Sll << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Sstring)) <<" " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Rstring)) << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(old_S)) << endl; - if ( (new_ll>max_ll) ) - { - infer_tree->gather_T_to_from(); - max_S=new_S; - changed=true; - max_ll=new_ll; - break; - } - } - if (not changed) - { - N_SPR=0; - infer_tree->model->construct_undated(old_S); - if (world.rank()==0) cout<< " .. new roots .. " << infer_tree->model->string_parameter["S_with_ranks"] << endl; - int e=infer_tree->model->last_branch-1; - { - infer_tree->model->construct_undated(old_S); - vector NNIs=infer_tree->model->NNIs(e); - for (int nni=0;nnimodel->construct_undated(new_S); - new_ll= infer_tree->calculate_pun(it_num,bw); - ll_cache[new_S]=new_ll; - } - else - { - new_ll=ll_cache[new_S]; - } - if (world.rank()==0) cout << new_ll << " " << max_ll << " " << Sll << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Sstring)) <<" " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Rstring)) << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(old_S)) << endl; - if ( (new_ll>max_ll) ) - { - max_S=new_S; - max_ll=new_ll; - changed=true; - break; - } - } - if (changed) e=0; - } - } - if (not changed) - { - infer_tree->model->construct_undated(old_S); - if (world.rank()==0) cout<< " .. all NNIs .. " << infer_tree->model->string_parameter["S_with_ranks"] << endl; - for (int e=infer_tree->model->last_branch-2;e>infer_tree->model->last_leaf-1;e--) - { - infer_tree->model->construct_undated(old_S); - vector NNIs=infer_tree->model->NNIs(e); - for (int nni=0;nnimodel->construct_undated(new_S); - new_ll= infer_tree->calculate_pun(it_num,bw); - ll_cache[new_S]=new_ll; - } - else - { - new_ll=ll_cache[new_S]; - } - if (world.rank()==0) cout << new_ll << " " << max_ll << " " << Sll << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Sstring)) <<" " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(Rstring)) << " " - << TreeTools::robinsonFouldsDistance(*TreeTemplateTools::parenthesisToTree(new_S),*TreeTemplateTools::parenthesisToTree(old_S)) << endl; - if ( (new_ll>max_ll) ) - { - max_S=new_S; - max_ll=new_ll; - changed=true; - break; - } - } - if (changed) e=0; - } - } - if (world.rank()==0) cout<< "@ " << infer_tree->model->string_parameter["S_with_ranks"] << endl; - if (not changed and not last) - { - last=true; - changed=true; - N_SPR=200; - } - broadcast(world,done,0); - } - if (world.rank()==0) cout<< ". " << infer_tree->model->string_parameter["S_with_ranks"] << endl; + // infer_tree->model->construct_undated(max_S); + // infer_tree->calculate_pun(it_num,1); + // infer_tree->gather_T_to_from(); -} + bool changed = true; + bool last = false; + while (changed) { + changed = false; + infer_tree->model->construct_undated(max_S); + + if (world.rank() == 0) + cout << "@ll " << max_ll << " " << Sll << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(max_S), + *TreeTemplateTools::parenthesisToTree(Sstring)) + << endl; + string old_S = max_S; + + vector sorted_e; + vector sorted_f; + vector sorted_Ts; + + if (world.rank() == 0) + for (map>::iterator it = + infer_tree->sort_e.begin(); + it != infer_tree->sort_e.end(); it++) { + scalar_type Ts = (*it).first; + for (int i = 0; i < (*it).second.size(); i++) { + int e = infer_tree->sort_e[Ts][i]; + int f = infer_tree->sort_f[Ts][i]; + sorted_e.push_back(e); + sorted_f.push_back(f); + sorted_Ts.push_back(Ts); + } + } + broadcast(world, sorted_e, 0); + broadcast(world, sorted_f, 0); + for (int i = 0; i < N_SPR; i++) { + infer_tree->model->construct_undated(old_S); + + string new_S = infer_tree->model->feSPR(sorted_e[i], sorted_f[i]); + if (ll_cache.count(new_S) == 0) { + infer_tree->model->construct_undated(new_S); + new_ll = infer_tree->calculate_pun(it_num, bw); + ll_cache[new_S] = new_ll; + } else { + new_ll = ll_cache[new_S]; + } + if (world.rank() == 0) { + if (sorted_e[i] < infer_tree->model->last_leaf) + cout << " " + << infer_tree->model + ->node_name[infer_tree->model->id_nodes[sorted_e[i]]]; + else + cout << " " << sorted_e[i]; + if (sorted_f[i] < infer_tree->model->last_leaf) + cout << "->" + << infer_tree->model + ->node_name[infer_tree->model->id_nodes[sorted_f[i]]]; + else + cout << "->" << sorted_f[i]; + cout << " with: " << -sorted_Ts[i] << " Ts "; //" " << new_S << endl; + } + if (world.rank() == 0) + cout << new_ll << " " << max_ll << " " << Sll << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Sstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Rstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(old_S)) + << endl; + if ((new_ll > max_ll)) { + infer_tree->gather_T_to_from(); + max_S = new_S; + changed = true; + max_ll = new_ll; + break; + } + } + if (not changed) { + N_SPR = 0; + infer_tree->model->construct_undated(old_S); + if (world.rank() == 0) + cout << " .. new roots .. " + << infer_tree->model->string_parameter["S_with_ranks"] << endl; + int e = infer_tree->model->last_branch - 1; + { + infer_tree->model->construct_undated(old_S); + vector NNIs = infer_tree->model->NNIs(e); + for (int nni = 0; nni < NNIs.size(); nni++) { + string new_S = NNIs[nni]; + if (ll_cache.count(new_S) == 0) { + infer_tree->model->construct_undated(new_S); + new_ll = infer_tree->calculate_pun(it_num, bw); + ll_cache[new_S] = new_ll; + } else { + new_ll = ll_cache[new_S]; + } + if (world.rank() == 0) + cout << new_ll << " " << max_ll << " " << Sll << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Sstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Rstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(old_S)) + << endl; + if ((new_ll > max_ll)) { + max_S = new_S; + max_ll = new_ll; + changed = true; + break; + } + } + if (changed) + e = 0; + } + } + if (not changed) { + infer_tree->model->construct_undated(old_S); + if (world.rank() == 0) + cout << " .. all NNIs .. " + << infer_tree->model->string_parameter["S_with_ranks"] << endl; + for (int e = infer_tree->model->last_branch - 2; + e > infer_tree->model->last_leaf - 1; e--) { + infer_tree->model->construct_undated(old_S); + vector NNIs = infer_tree->model->NNIs(e); + for (int nni = 0; nni < NNIs.size(); nni++) { + string new_S = NNIs[nni]; + if (ll_cache.count(new_S) == 0) { + infer_tree->model->construct_undated(new_S); + new_ll = infer_tree->calculate_pun(it_num, bw); + ll_cache[new_S] = new_ll; + } else { + new_ll = ll_cache[new_S]; + } + if (world.rank() == 0) + cout << new_ll << " " << max_ll << " " << Sll << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Sstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(Rstring)) + << " " + << TreeTools::robinsonFouldsDistance( + *TreeTemplateTools::parenthesisToTree(new_S), + *TreeTemplateTools::parenthesisToTree(old_S)) + << endl; + if ((new_ll > max_ll)) { + max_S = new_S; + max_ll = new_ll; + changed = true; + break; + } + } + if (changed) + e = 0; + } + } + if (world.rank() == 0) + cout << "@ " << infer_tree->model->string_parameter["S_with_ranks"] + << endl; + if (not changed and not last) { + last = true; + changed = true; + N_SPR = 200; + } + broadcast(world, done, 0); + } + if (world.rank() == 0) + cout << ". " << infer_tree->model->string_parameter["S_with_ranks"] << endl; +} diff --git a/src/mpi_ml-bw_undated.cpp b/src/mpi_ml-bw_undated.cpp index b7b8f8b..4c36536 100644 --- a/src/mpi_ml-bw_undated.cpp +++ b/src/mpi_ml-bw_undated.cpp @@ -1,246 +1,255 @@ #include "ALE_util.h" #include "mpi_tree.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; using namespace boost::mpi; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - mpi_tree* model_pointer; + mpi_tree *model_pointer; int last_branch; communicator world; public: - p_fun(mpi_tree* model, int last_branch_in,communicator world_in ,double delta_start=0.01,double tau_start=0.01,double lambda_start=0.01) : AbstractParametrizable(""), fval_(0), model_pointer(model) - { - last_branch=last_branch_in; - world=world_in; - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-10, 1e3, true, true ); - for (int e=0;emodel->vector_parameter["rate_multiplier_delta"][e]=tmp; - - stringstream tau_to_e; - tau_to_e<<"rm_tau_to"<<"_"<model->vector_parameter["rate_multiplier_tau_to"][e]=tmp; - - stringstream tau_from_e; - tau_from_e<<"rm_tau_from"<<"_"<model->vector_parameter["rate_multiplier_tau_from"][e]=tmp; - - stringstream lambdae; - lambdae<<"rm_lambda"<<"_"<model->vector_parameter["rate_multiplier_lambda"][e]=tmp; - - stringstream Oe; - Oe<<"rm_O"<<"_"<model->vector_parameter["rate_multiplier_O"][e]=tmp; - - } - - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - - - model_pointer->model->set_model_parameter("delta",delta); - model_pointer->model->set_model_parameter("tau",tau); - model_pointer->model->set_model_parameter("lambda",lambda); - - - double y=-(model_pointer->calculate_pun()); - if (world.rank()==0) { - cout <model->vector_parameter["rate_multiplier_delta"][e] = tmp; + + stringstream tau_to_e; + tau_to_e << "rm_tau_to" + << "_" << e; + tmp = getParameterValue(tau_to_e.str()); + model_pointer->model->vector_parameter["rate_multiplier_tau_to"][e] = tmp; + + stringstream tau_from_e; + tau_from_e << "rm_tau_from" + << "_" << e; + tmp = getParameterValue(tau_from_e.str()); + model_pointer->model->vector_parameter["rate_multiplier_tau_from"][e] = + tmp; + + stringstream lambdae; + lambdae << "rm_lambda" + << "_" << e; + tmp = getParameterValue(lambdae.str()); + model_pointer->model->vector_parameter["rate_multiplier_lambda"][e] = tmp; + + stringstream Oe; + Oe << "rm_O" + << "_" << e; + tmp = getParameterValue(Oe.str()); + model_pointer->model->vector_parameter["rate_multiplier_O"][e] = tmp; } -}; + double delta = getParameterValue("delta"); + double tau = getParameterValue("tau"); + double lambda = getParameterValue("lambda"); + + model_pointer->model->set_model_parameter("delta", delta); + model_pointer->model->set_model_parameter("tau", tau); + model_pointer->model->set_model_parameter("lambda", lambda); + + double y = -(model_pointer->calculate_pun()); + if (world.rank() == 0) { + cout << endl + << "delta=" << delta << "\t tau=" << tau << "\t lambda=" << lambda + << "\t ll=" << -y << endl; + }; + fval_ = y; + } +}; -int main(int argc, char ** argv) -{ +int main(int argc, char **argv) { environment env(argc, argv); communicator world; - int done=1; + int done = 1; - ifstream file_stream_S (argv[1]); + ifstream file_stream_S(argv[1]); string Sstring; - - getline (file_stream_S,Sstring); - map parameters; - if (world.rank()==0) cout << Sstring << endl; - mpi_tree * infer_tree = new mpi_tree(Sstring,world,parameters,true); - - if (world.rank()==0) cout << "..construct.. " << endl; + + getline(file_stream_S, Sstring); + map parameters; + if (world.rank() == 0) + cout << Sstring << endl; + mpi_tree *infer_tree = new mpi_tree(Sstring, world, parameters, true); + + if (world.rank() == 0) + cout << "..construct.. " << endl; infer_tree->load_distributed_ales(argv[2]); - if (world.rank()==0) cout << "..load.. " << endl; + if (world.rank() == 0) + cout << "..load.. " << endl; - - if (world.rank()==0) cout << infer_tree->model->string_parameter["S_with_ranks"] << endl; + if (world.rank() == 0) + cout << infer_tree->model->string_parameter["S_with_ranks"] << endl; infer_tree->gather_counts(); infer_tree->gather_T_to_from(); - broadcast(world,done,0); - scalar_type delta=0.1; - scalar_type tau=0.1; - scalar_type lambda=0.1; - scalar_type O_R=1,beta=1; - infer_tree->model->set_model_parameter("O_R",O_R); - infer_tree->model->set_model_parameter("seq_beta",beta); - - if (atoi(argv[3])==1) infer_tree->model->set_model_parameter("undatedBL",true); - else infer_tree->model->set_model_parameter("undatedBL",false); - - infer_tree->model->calculate_undatedEs(); - - scalar_type samples=1;//atoi(argv[4]); - - if (argc<7) - { - - Function* f = new p_fun(infer_tree,infer_tree->model->last_branch,world); - Optimizer* optimizer = new DownhillSimplexMethod(f); - - optimizer->setProfiler(0); - optimizer->setMessageHandler(0); - optimizer->setVerbose(0); - if (world.rank()==0) optimizer->setVerbose(1); - - - optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - cout << "optimizer up to here" <init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - - if (world.rank()==0) cout << "#ML rate optimization.." << endl; - - optimizer->optimize(); - broadcast(world,done,0); - - for (int e=0;emodel->last_branch;e++) - { - stringstream deltae; - deltae<<"rm_delta"<<"_"<getParameterValue(deltae.str()); - infer_tree->model->vector_parameter["rate_multiplier_delta"][e]=tmp; - - stringstream tau_to_e; - tau_to_e<<"rm_tau_to"<<"_"<getParameterValue(tau_to_e.str()); - infer_tree->model->vector_parameter["rate_multiplier_tau_to"][e]=tmp; - - stringstream tau_from_e; - tau_from_e<<"rm_tau_from"<<"_"<getParameterValue(tau_from_e.str()); - infer_tree->model->vector_parameter["rate_multiplier_tau_from"][e]=tmp; - - stringstream lambdae; - lambdae<<"rm_lambda"<<"_"<getParameterValue(lambdae.str()); - infer_tree->model->vector_parameter["rate_multiplier_lambda"][e]=tmp; - - stringstream Oe; - Oe<<"rm_O"<<"_"<getParameterValue(Oe.str()); - infer_tree->model->vector_parameter["rate_multiplier_O"][e]=tmp; - - } - - delta = optimizer->getParameterValue("delta"); - tau = optimizer->getParameterValue("tau"); - lambda = optimizer->getParameterValue("lambda"); - - - infer_tree->model->set_model_parameter("delta",delta); - infer_tree->model->set_model_parameter("tau",tau); - infer_tree->model->set_model_parameter("lambda",lambda); - if (world.rank()==0 ) - { - optimizer->getParameters().printParameters(cout); - cout <model->set_model_parameter("O_R", O_R); + infer_tree->model->set_model_parameter("seq_beta", beta); + + if (atoi(argv[3]) == 1) + infer_tree->model->set_model_parameter("undatedBL", true); else - { - if (world.rank()==0) cout << "#skipping with: delta=" <model->calculate_undatedEs(); + + scalar_type samples = 1; // atoi(argv[4]); + + if (argc < 7) { + + Function *f = new p_fun(infer_tree, infer_tree->model->last_branch, world); + Optimizer *optimizer = new DownhillSimplexMethod(f); + + optimizer->setProfiler(0); + optimizer->setMessageHandler(0); + optimizer->setVerbose(0); + if (world.rank() == 0) + optimizer->setVerbose(1); + + optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); + cout << "optimizer up to here" << endl; + + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. + + if (world.rank() == 0) + cout << "#ML rate optimization.." << endl; + + optimizer->optimize(); + broadcast(world, done, 0); + + for (int e = 0; e < infer_tree->model->last_branch; e++) { + stringstream deltae; + deltae << "rm_delta" + << "_" << e; + double tmp = optimizer->getParameterValue(deltae.str()); + infer_tree->model->vector_parameter["rate_multiplier_delta"][e] = tmp; + + stringstream tau_to_e; + tau_to_e << "rm_tau_to" + << "_" << e; + tmp = optimizer->getParameterValue(tau_to_e.str()); + infer_tree->model->vector_parameter["rate_multiplier_tau_to"][e] = tmp; + + stringstream tau_from_e; + tau_from_e << "rm_tau_from" + << "_" << e; + tmp = optimizer->getParameterValue(tau_from_e.str()); + infer_tree->model->vector_parameter["rate_multiplier_tau_from"][e] = tmp; + + stringstream lambdae; + lambdae << "rm_lambda" + << "_" << e; + tmp = optimizer->getParameterValue(lambdae.str()); + infer_tree->model->vector_parameter["rate_multiplier_lambda"][e] = tmp; + + stringstream Oe; + Oe << "rm_O" + << "_" << e; + tmp = optimizer->getParameterValue(Oe.str()); + infer_tree->model->vector_parameter["rate_multiplier_O"][e] = tmp; } - //optimizer->getParameters().printParameters(cout); - if (argc>7) - delta=atof(argv[5]),tau=atof(argv[6]),lambda=atof(argv[7]);samples=atoi(argv[8]); - if (world.rank()==0) cout << "#rates : delta=" <getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + + infer_tree->model->set_model_parameter("delta", delta); + infer_tree->model->set_model_parameter("tau", tau); + infer_tree->model->set_model_parameter("lambda", lambda); + if (world.rank() == 0) { + optimizer->getParameters().printParameters(cout); + cout << endl + << delta << " " << tau << " " << lambda // << " " << sigma + << endl; + } - - infer_tree->model->set_model_parameter("delta",delta); - infer_tree->model->set_model_parameter("tau",tau); - infer_tree->model->set_model_parameter("lambda",lambda); - + } else { + if (world.rank() == 0) + cout << "#skipping with: delta=" << delta << " lambda=" << lambda + << " tau=" << tau << endl; + } + // optimizer->getParameters().printParameters(cout); + if (argc > 7) + delta = atof(argv[5]), tau = atof(argv[6]), lambda = atof(argv[7]); + samples = atoi(argv[8]); + if (world.rank() == 0) + cout << "#rates : delta=" << delta << " lambda=" << lambda << " tau=" << tau + << endl; + + infer_tree->model->set_model_parameter("delta", delta); + infer_tree->model->set_model_parameter("tau", tau); + infer_tree->model->set_model_parameter("lambda", lambda); - infer_tree->calculate_pun(); infer_tree->calculate_pun(); infer_tree->calculate_pun(); @@ -249,42 +258,53 @@ int main(int argc, char ** argv) infer_tree->calculate_pun(); infer_tree->calculate_pun(); - samples=1; - if (world.rank()==0) cout << "#sampling .." << endl; + samples = 1; + if (world.rank() == 0) + cout << "#sampling .." << endl; scalar_type ll_final = infer_tree->calculate_pun(samples); - if (world.rank()==0) cout << "#sampling done." << endl; + if (world.rank() == 0) + cout << "#sampling done." << endl; infer_tree->gather_counts(samples); - if (world.rank()==0) cout << "#gather done." << endl; + if (world.rank() == 0) + cout << "#gather done." << endl; infer_tree->gather_T_to_from(samples); - if (world.rank()==0) cout << "#gather T_from done." << endl; - - if (world.rank()==0) cout<< ">tree:\t"<< infer_tree->model->string_parameter["S_with_ranks"] << endl; - if (world.rank()==0) cout<< ">logl:\t"<< ll_final << endl; - if (world.rank()==0) cout<< ">Ts:\tfrom\tto"<< endl; - if (world.rank()==0) infer_tree->print_branch_counts(samples); + if (world.rank() == 0) + cout << "#gather T_from done." << endl; + + if (world.rank() == 0) + cout << ">tree:\t" << infer_tree->model->string_parameter["S_with_ranks"] + << endl; + if (world.rank() == 0) + cout << ">logl:\t" << ll_final << endl; + if (world.rank() == 0) + cout << ">Ts:\tfrom\tto" << endl; + if (world.rank() == 0) + infer_tree->print_branch_counts(samples); return 0; - if (world.rank()==0) - for (map >::iterator it=infer_tree->sort_e.begin();it!=infer_tree->sort_e.end();it++) - { - scalar_type Ts=-(*it).first; - if (Ts>0) - for (int i=0;i<(*it).second.size();i++) - { - int e=infer_tree->sort_e[-Ts][i]; - int f=infer_tree->sort_f[-Ts][i]; - if (emodel->last_leaf) - cout << "\t" << infer_tree->model->node_name[infer_tree->model->id_nodes[e]]; - else - cout << "\t" << e; - if (fmodel->last_leaf) - cout << "\t" << infer_tree->model->node_name[infer_tree->model->id_nodes[f]]; - else - cout << "\t" << f; - cout << "\t" << Ts << endl; //" " << new_S << endl; - } - } - - + if (world.rank() == 0) + for (map>::iterator it = + infer_tree->sort_e.begin(); + it != infer_tree->sort_e.end(); it++) { + scalar_type Ts = -(*it).first; + if (Ts > 0) + for (int i = 0; i < (*it).second.size(); i++) { + int e = infer_tree->sort_e[-Ts][i]; + int f = infer_tree->sort_f[-Ts][i]; + if (e < infer_tree->model->last_leaf) + cout + << "\t" + << infer_tree->model->node_name[infer_tree->model->id_nodes[e]]; + else + cout << "\t" << e; + if (f < infer_tree->model->last_leaf) + cout + << "\t" + << infer_tree->model->node_name[infer_tree->model->id_nodes[f]]; + else + cout << "\t" << f; + cout << "\t" << Ts << endl; //" " << new_S << endl; + } + } } diff --git a/src/mpi_ml.cpp b/src/mpi_ml.cpp index e91b64a..e8c5e49 100644 --- a/src/mpi_ml.cpp +++ b/src/mpi_ml.cpp @@ -1,133 +1,128 @@ #include "ALE_util.h" #include "mpi_tree.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; using namespace boost::mpi; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - mpi_tree* model_pointer; -public: - p_fun(mpi_tree* model, double delta_start=0.2,double tau_start=0.2,double lambda_start=0.5//,double sigma_start=2 -) : AbstractParametrizable(""), fval_(0), model_pointer(model) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - //addParameter_( new Parameter("sigma", sigma_start, constraint) ) ; + mpi_tree *model_pointer; +public: + p_fun(mpi_tree *model, double delta_start = 0.2, double tau_start = 0.2, + double lambda_start = 0.5 //,double sigma_start=2 + ) + : AbstractParametrizable(""), fval_(0), model_pointer(model) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); + // addParameter_( new Parameter("sigma", sigma_start, constraint) ) ; } - - p_fun* clone() const { return new p_fun(*this); } - + + p_fun *clone() const { return new p_fun(*this); } + public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); - } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - - //double sigma = getParameterValue("sigma"); - - model_pointer->model->set_model_parameter("delta",delta); - model_pointer->model->set_model_parameter("tau",tau); - model_pointer->model->set_model_parameter("lambda",lambda); - - //model_pointer->model->set_model_parameter("Delta_bar",sigma*1e6); - //model_pointer->model->set_model_parameter("Lambda_bar",sigma*1e6); - - //model_pointer->calculate_EGb(); - double y=-(model_pointer->calculate_p()); - //if (world.rank()==0) cout <model->set_model_parameter("delta", delta); + model_pointer->model->set_model_parameter("tau", tau); + model_pointer->model->set_model_parameter("lambda", lambda); + + // model_pointer->model->set_model_parameter("Delta_bar",sigma*1e6); + // model_pointer->model->set_model_parameter("Lambda_bar",sigma*1e6); + + // model_pointer->calculate_EGb(); + double y = -(model_pointer->calculate_p()); + // if (world.rank()==0) cout < ale_names; + if (world.rank() == 0) { + ifstream file_stream(argv[2]); + while (!file_stream.eof()) { + string fname; + getline(file_stream, fname); + boost::trim(fname); + if (fname.find(".ale") != fname.npos) + ale_names.push_back(fname); } - int done=1; - if (world.rank()==0) cout << "#list of " <distribute_ales(ale_names); scalar_type ll = infer_tree->calculate_p(); - if (world.rank()==0) cout << "LL = " <getParameterValue("tau"); - scalar_type lambda=optimizer->getParameterValue("lambda"); - //scalar_type sigma=optimizer->getParameterValue("sigma"); + if (world.rank() == 0) { + optimizer->getParameters().printParameters(cout); + scalar_type delta = optimizer->getParameterValue("delta"); + scalar_type tau = optimizer->getParameterValue("tau"); + scalar_type lambda = optimizer->getParameterValue("lambda"); + // scalar_type sigma=optimizer->getParameterValue("sigma"); - cout < -#include #include #include +#include +#include using namespace std; using namespace bpp; using namespace boost::mpi; -class p_fun: - public virtual Function, - public AbstractParametrizable -{ +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - mpi_tree* model_pointer; + mpi_tree *model_pointer; communicator world; -public: - p_fun(mpi_tree* model, communicator world_in , double delta_start=0.05,double tau_start=0.,double lambda_start=0.2//,double sigma_start=2 -) : AbstractParametrizable(""), fval_(0), model_pointer(model) - { - world=world_in; - - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 0, 4, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - //addParameter_( new Parameter("sigma", sigma_start, constraint) ) ; +public: + p_fun(mpi_tree *model, communicator world_in, double delta_start = 0.05, + double tau_start = 0., double lambda_start = 0.2 //,double sigma_start=2 + ) + : AbstractParametrizable(""), fval_(0), model_pointer(model) { + world = world_in; + + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = new IntervalConstraint(0, 4, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); + // addParameter_( new Parameter("sigma", sigma_start, constraint) ) ; } - - p_fun* clone() const { return new p_fun(*this); } + + p_fun *clone() const { return new p_fun(*this); } public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); - } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - - //double sigma = getParameterValue("sigma"); - - model_pointer->model->set_model_parameter("delta",delta); - model_pointer->model->set_model_parameter("tau",tau); - model_pointer->model->set_model_parameter("lambda",lambda); - - //model_pointer->model->set_model_parameter("Delta_bar",sigma*1e6); - //model_pointer->model->set_model_parameter("Lambda_bar",sigma*1e6); - - //model_pointer->calculate_EGb(); - double y=-(model_pointer->calculate_pun()); - //model_pointer->gather_counts(); - if (world.rank()==0) { - cout <model->set_model_parameter("delta", delta); + model_pointer->model->set_model_parameter("tau", tau); + model_pointer->model->set_model_parameter("lambda", lambda); + + // model_pointer->model->set_model_parameter("Delta_bar",sigma*1e6); + // model_pointer->model->set_model_parameter("Lambda_bar",sigma*1e6); + + // model_pointer->calculate_EGb(); + double y = -(model_pointer->calculate_pun()); + // model_pointer->gather_counts(); + if (world.rank() == 0) { + cout << endl + << "delta=" << delta << "\t tau=" << tau << "\t lambda=" << lambda + << "\t ll=" << -y << endl; + // model_pointer->print_branch_counts(); + }; + fval_ = y; + } }; - -int main(int argc, char ** argv) -{ +int main(int argc, char **argv) { environment env(argc, argv); communicator world; - int done=1; + int done = 1; - ifstream file_stream_S (argv[1]); + ifstream file_stream_S(argv[1]); string Sstring; - - getline (file_stream_S,Sstring); - map parameters; - if (world.rank()==0) cout << Sstring << endl; - mpi_tree * infer_tree = new mpi_tree(Sstring,world,parameters,true); - if (world.rank()==0) cout << "..construct.. " << endl; + + getline(file_stream_S, Sstring); + map parameters; + if (world.rank() == 0) + cout << Sstring << endl; + mpi_tree *infer_tree = new mpi_tree(Sstring, world, parameters, true); + if (world.rank() == 0) + cout << "..construct.. " << endl; infer_tree->load_distributed_ales(argv[2]); - if (world.rank()==0) cout << "..load.. " << endl; + if (world.rank() == 0) + cout << "..load.. " << endl; - //scalar_type ll = infer_tree->calculate_pun(3); + // scalar_type ll = infer_tree->calculate_pun(3); - //scalar_type ll = infer_tree->calculate_pun(10,1); - - if (world.rank()==0) cout << infer_tree->model->string_parameter["S_with_ranks"] << endl; + // scalar_type ll = infer_tree->calculate_pun(10,1); + + if (world.rank() == 0) + cout << infer_tree->model->string_parameter["S_with_ranks"] << endl; infer_tree->gather_counts(); infer_tree->gather_T_to_from(); - broadcast(world,done,0); - scalar_type delta=0.1; - scalar_type tau=0.1; - scalar_type lambda=0.1; - scalar_type O_R=1,beta=1; - infer_tree->model->set_model_parameter("O_R",O_R); - infer_tree->model->set_model_parameter("seq_beta",beta); + broadcast(world, done, 0); + scalar_type delta = 0.1; + scalar_type tau = 0.1; + scalar_type lambda = 0.1; + scalar_type O_R = 1, beta = 1; + infer_tree->model->set_model_parameter("O_R", O_R); + infer_tree->model->set_model_parameter("seq_beta", beta); infer_tree->model->calculate_undatedEs(); - if (argc<5) - { - - Function* f = new p_fun(infer_tree,world ); - Optimizer* optimizer = new DownhillSimplexMethod(f); - - optimizer->setProfiler(0); - optimizer->setMessageHandler(0); - optimizer->setVerbose(0); - if (world.rank()==0) optimizer->setVerbose(1); - - - optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - if (world.rank()==0) cout << "#ML rate optimization.." << endl; - optimizer->optimize(); - broadcast(world,done,0); - - delta=optimizer->getParameterValue("delta"); - tau=optimizer->getParameterValue("tau"); - lambda=optimizer->getParameterValue("lambda"); - if (world.rank()==0 ) - { - optimizer->getParameters().printParameters(cout); - cout <optimize(); + broadcast(world, done, 0); + + delta = optimizer->getParameterValue("delta"); + tau = optimizer->getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + if (world.rank() == 0) { + optimizer->getParameters().printParameters(cout); + cout << endl + << delta << " " << tau << " " << lambda // << " " << sigma + << endl; } - //optimizer->getParameters().printParameters(cout); - if (argc>5) - delta=atof(argv[3]),tau=atof(argv[4]),lambda=atof(argv[5]); - if (world.rank()==0) cout << "#rates : delta=" <model->set_model_parameter("tau",tau); - infer_tree->model->set_model_parameter("lambda",lambda); + + } else { + if (world.rank() == 0) + cout << "#skipping with: delta=" << delta << " lambda=" << lambda + << " tau=" << tau << endl; + } + // optimizer->getParameters().printParameters(cout); + if (argc > 5) + delta = atof(argv[3]), tau = atof(argv[4]), lambda = atof(argv[5]); + if (world.rank() == 0) + cout << "#rates : delta=" << delta << " lambda=" << lambda << " tau=" << tau + << endl; + + infer_tree->model->set_model_parameter("delta", delta); + infer_tree->model->set_model_parameter("tau", tau); + infer_tree->model->set_model_parameter("lambda", lambda); infer_tree->calculate_pun(); infer_tree->calculate_pun(); infer_tree->calculate_pun(); @@ -155,42 +158,53 @@ int main(int argc, char ** argv) infer_tree->calculate_pun(); infer_tree->calculate_pun(); - - scalar_type samples=1; - if (world.rank()==0) cout << "#sampling .." << endl; + scalar_type samples = 1; + if (world.rank() == 0) + cout << "#sampling .." << endl; scalar_type ll_final = infer_tree->calculate_pun(samples); - if (world.rank()==0) cout << "#sampling done." << endl; + if (world.rank() == 0) + cout << "#sampling done." << endl; infer_tree->gather_counts(samples); - if (world.rank()==0) cout << "#gather done." << endl; + if (world.rank() == 0) + cout << "#gather done." << endl; infer_tree->gather_T_to_from(samples); - if (world.rank()==0) cout << "#gather T_from done." << endl; - - if (world.rank()==0) cout<< ">tree:\t"<< infer_tree->model->string_parameter["S_with_ranks"] << endl; - if (world.rank()==0) cout<< ">logl:\t"<< ll_final << endl; - if (world.rank()==0) cout<< ">Ts:\tfrom\tto"<< endl; - if (world.rank()==0) infer_tree->print_branch_counts(samples); + if (world.rank() == 0) + cout << "#gather T_from done." << endl; + + if (world.rank() == 0) + cout << ">tree:\t" << infer_tree->model->string_parameter["S_with_ranks"] + << endl; + if (world.rank() == 0) + cout << ">logl:\t" << ll_final << endl; + if (world.rank() == 0) + cout << ">Ts:\tfrom\tto" << endl; + if (world.rank() == 0) + infer_tree->print_branch_counts(samples); return 0; - if (world.rank()==0) - for (map >::iterator it=infer_tree->sort_e.begin();it!=infer_tree->sort_e.end();it++) - { - scalar_type Ts=-(*it).first; - if (Ts>0) - for (int i=0;i<(*it).second.size();i++) - { - int e=infer_tree->sort_e[-Ts][i]; - int f=infer_tree->sort_f[-Ts][i]; - if (emodel->last_leaf) - cout << "\t" << infer_tree->model->node_name[infer_tree->model->id_nodes[e]]; - else - cout << "\t" << e; - if (fmodel->last_leaf) - cout << "\t" << infer_tree->model->node_name[infer_tree->model->id_nodes[f]]; - else - cout << "\t" << f; - cout << "\t" << Ts << endl; //" " << new_S << endl; - } - } - + if (world.rank() == 0) + for (map>::iterator it = + infer_tree->sort_e.begin(); + it != infer_tree->sort_e.end(); it++) { + scalar_type Ts = -(*it).first; + if (Ts > 0) + for (int i = 0; i < (*it).second.size(); i++) { + int e = infer_tree->sort_e[-Ts][i]; + int f = infer_tree->sort_f[-Ts][i]; + if (e < infer_tree->model->last_leaf) + cout + << "\t" + << infer_tree->model->node_name[infer_tree->model->id_nodes[e]]; + else + cout << "\t" << e; + if (f < infer_tree->model->last_leaf) + cout + << "\t" + << infer_tree->model->node_name[infer_tree->model->id_nodes[f]]; + else + cout << "\t" << f; + cout << "\t" << Ts << endl; //" " << new_S << endl; + } + } } diff --git a/src/mpi_tree.cpp b/src/mpi_tree.cpp index f7a2de1..8a0505d 100644 --- a/src/mpi_tree.cpp +++ b/src/mpi_tree.cpp @@ -5,978 +5,983 @@ using namespace std; using namespace bpp; using namespace boost::mpi; -void mpi_tree::prune_distributed_ales(string fname,string Sstring) -{ - tree_type * Stree=TreeTemplateTools::parenthesisToTree(Sstring,false); - vector keep_names=Stree->getLeavesNames(); - map keep; - for (vector ::iterator it=keep_names.begin();it!=keep_names.end();it++) - { - string name=(*it); - keep[name]=1; - } - +void mpi_tree::prune_distributed_ales(string fname, string Sstring) { + tree_type *Stree = TreeTemplateTools::parenthesisToTree(Sstring, false); + vector keep_names = Stree->getLeavesNames(); + map keep; + for (vector::iterator it = keep_names.begin(); it != keep_names.end(); + it++) { + string name = (*it); + keep[name] = 1; + } + client_fnames.clear(); - vector > scatter_fnames;//del-loc - if (rank==server) + vector> scatter_fnames; // del-loc + if (rank == server) { + ifstream file_stream(fname.c_str()); + int tree_i = 0; + set verify; + if (file_stream.is_open()) // ########## read trees ############ { - ifstream file_stream (fname.c_str()); - int tree_i=0; - set verify; - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - int client_i=atoi(tokens[1].c_str()); - string ale_file=tokens[0]; - if (ale_file.size()>1) - { - if (not (client_i tmp; scatter_fnames.push_back(tmp);} - scatter_fnames[client_i].push_back(ale_file); - verify.insert(ale_file); - } - } - } - cout << "# Scattering: " << verify.size() << " ale files.."<::iterator it=client_fnames.begin();it!=client_fnames.end();it++) - { - i+=1; - //cout << rank << " has " << (*it) << endl; - approx_posterior * ale;//del-loc - ifstream file_stream((*it)); - vector trees; - string tree; - bool give_up=false; - while(! file_stream.eof() and not give_up) - { - getline (file_stream,tree); - if (tree.find(")")!=tree.npos ) - { - tree_type * T=TreeTemplateTools::parenthesisToTree(tree,false,"ID"); - vector leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - //cout << name << endl; - vector tokens; - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - if ( (not keep[tokens[0]]==1) and (T->getNumberOfLeaves()>1)) - { - TreeTemplateTools::dropLeaf(*T,name); - } - } - if (T->getNumberOfLeaves()>2) - trees.push_back(TreeTemplateTools::treeToParenthesis(*T)); - else - give_up=true; - delete T; - } - } - //ale = load_ALE_from_file((*it)); - if (trees.size()>0) - { - ale=observe_ALE_from_strings(trees); - trees.clear(); - ale_pointers.push_back(ale); - } - cout << rank << " " << i << " of " << client_fnames.size() << endl; - } - - //del-locs - for ( vector >::iterator jt=scatter_fnames.begin();jt!=scatter_fnames.end();jt++) + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + int client_i = atoi(tokens[1].c_str()); + string ale_file = tokens[0]; + if (ale_file.size() > 1) { + if (not(client_i < scatter_fnames.size())) { + vector tmp; + scatter_fnames.push_back(tmp); + } + scatter_fnames[client_i].push_back(ale_file); + verify.insert(ale_file); + } + } + } + cout << "# Scattering: " << verify.size() << " ale files.." << endl; + N_ales = verify.size(); + verify.clear(); + } + scatter(world, scatter_fnames, client_fnames, server); + + if (rank == server) + cout << "#..loading.." << endl; + int i = 0; + for (vector::iterator it = client_fnames.begin(); + it != client_fnames.end(); it++) { + i += 1; + // cout << rank << " has " << (*it) << endl; + approx_posterior *ale; // del-loc + ifstream file_stream((*it)); + vector trees; + string tree; + bool give_up = false; + while (!file_stream.eof() and not give_up) { + getline(file_stream, tree); + if (tree.find(")") != tree.npos) { + tree_type *T = TreeTemplateTools::parenthesisToTree(tree, false, "ID"); + vector leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + // cout << name << endl; + vector tokens; + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + if ((not keep[tokens[0]] == 1) and (T->getNumberOfLeaves() > 1)) { + TreeTemplateTools::dropLeaf(*T, name); + } + } + if (T->getNumberOfLeaves() > 2) + trees.push_back(TreeTemplateTools::treeToParenthesis(*T)); + else + give_up = true; + delete T; + } + } + // ale = load_ALE_from_file((*it)); + if (trees.size() > 0) { + ale = observe_ALE_from_strings(trees); + trees.clear(); + ale_pointers.push_back(ale); + } + cout << rank << " " << i << " of " << client_fnames.size() << endl; + } + + // del-locs + for (vector>::iterator jt = scatter_fnames.begin(); + jt != scatter_fnames.end(); jt++) (*jt).clear(); scatter_fnames.clear(); - scalar_type tmp=N_ales; - broadcast(world,tmp,server); - if (rank==server) cout << "# done."< > scatter_fnames;//del-loc - if (rank==server) - { - ifstream file_stream (fname.c_str()); - int tree_i=0; - set verify; - if (file_stream.is_open()) // ########## read trees ############ - { - while (! file_stream.eof()) - { - string line; - getline (file_stream,line); - vector tokens; - boost::trim(line); - boost::split(tokens,line,boost::is_any_of("\t "),boost::token_compress_on); - if (tokens.size()==3) - { - - int client_i=atoi(tokens[1].c_str()); - string ale_file=tokens[0]; - if (ale_file.size()>0) - { - if (not (client_i tmp; scatter_fnames.push_back(tmp);} - scatter_fnames[client_i].push_back(ale_file); - verify.insert(ale_file); - } - } - } - } - cout << "# Scattering: " << verify.size() << " ale files.."<::iterator it=client_fnames.begin();it!=client_fnames.end();it++) + vector> scatter_fnames; // del-loc + if (rank == server) { + ifstream file_stream(fname.c_str()); + int tree_i = 0; + set verify; + if (file_stream.is_open()) // ########## read trees ############ { - //cout << rank << " has " << (*it) << endl; - approx_posterior * ale;//del-loc - ale = load_ALE_from_file((*it)); - ale_pointers.push_back(ale); + while (!file_stream.eof()) { + string line; + getline(file_stream, line); + vector tokens; + boost::trim(line); + boost::split(tokens, line, boost::is_any_of("\t "), + boost::token_compress_on); + if (tokens.size() == 3) { + + int client_i = atoi(tokens[1].c_str()); + string ale_file = tokens[0]; + if (ale_file.size() > 0) { + if (not(client_i < scatter_fnames.size())) { + vector tmp; + scatter_fnames.push_back(tmp); + } + scatter_fnames[client_i].push_back(ale_file); + verify.insert(ale_file); + } + } + } } - - //del-locs - for ( vector >::iterator jt=scatter_fnames.begin();jt!=scatter_fnames.end();jt++) + cout << "# Scattering: " << verify.size() << " ale files.." << endl; + N_ales = verify.size(); + verify.clear(); + } + scatter(world, scatter_fnames, client_fnames, server); + + if (rank == server) + cout << "#..loading.." << endl; + + for (vector::iterator it = client_fnames.begin(); + it != client_fnames.end(); it++) { + // cout << rank << " has " << (*it) << endl; + approx_posterior *ale; // del-loc + ale = load_ALE_from_file((*it)); + ale_pointers.push_back(ale); + } + + // del-locs + for (vector>::iterator jt = scatter_fnames.begin(); + jt != scatter_fnames.end(); jt++) (*jt).clear(); scatter_fnames.clear(); - if (rank==server) cout << "# done." < fnames,bool list_of_trees) -{ +void mpi_tree::distribute_ales(vector fnames, bool list_of_trees) { client_fnames.clear(); - vector > scatter_fnames;//del-loc - - if (rank==server) - { - //cout << "#rank:" < verify; - for (vector::iterator it=fnames.begin();it!=fnames.end();it++) - verify.insert((*it)); - cout << "# Distributing: " << verify.size() << " ale files.."< tmp; - scatter_fnames.push_back(tmp); - } - map fname_counts;//del-loc - map > count_fnames;//del-loc - vector sorted_fnames;//del-loc - - for (vector::iterator it=fnames.begin();it!=fnames.end();it++) - { - string fname=(*it); - approx_posterior * ale; - if (not list_of_trees or fname.find(".ale")!=fname.npos) - ale = load_ALE_from_file(fname); - else - ale = observe_ALE_from_string(fname); - int gid_count=0; - for (int i=0;i<(int)ale->Dip_counts.size();i++) - gid_count+=ale->Dip_counts[i].size(); - gid_count+=ale->Bip_counts.size(); - if (gid_count<1e5)//we could be carful to fiter big ales ... probably should automate this top 1%? - { - fname_counts[fname]=gid_count; - count_fnames[gid_count].push_back(fname); - } - delete ale; - } - for (map >::iterator jt=count_fnames.begin();jt!=count_fnames.end();jt++) - for (vector ::iterator kt=(*jt).second.begin();kt!=(*jt).second.end();kt++) - sorted_fnames.push_back((*kt)); - for (int i=0;i<(int)sorted_fnames.size();i++) - scatter_fnames[i%size].push_back(sorted_fnames[i]); - - - map gidsum_ranks;//del-loc - //we try to exhange fnames to optimze gid distribution - while (1) - { - int max_rank=-1; - int min_rank=-1; - scalar_type max_sum=0; - scalar_type min_sum=6e23; - - for (int i=0;igidsum) - { - min_sum=gidsum; - min_rank=i; - } - } - //cout << endl; - max_sum=0; - for (int j=0;j<(int)scatter_fnames[max_rank].size();j++) - max_sum+=fname_counts[scatter_fnames[max_rank][j]]; - min_sum=0; - for (int j=0;j<(int)scatter_fnames[min_rank].size();j++) - min_sum+=fname_counts[scatter_fnames[min_rank][j]]; - //cout << max_rank << " and " << min_rank << endl; - bool changed=false; - for (int j=0;j<(int)scatter_fnames[max_rank].size();j++) - for (int k=0;k<(int)scatter_fnames[min_rank].size();k++) - { - scalar_type max_rank_count=fname_counts[scatter_fnames[max_rank][j]]; - scalar_type min_rank_count=fname_counts[scatter_fnames[min_rank][k]]; - if (abs(min_sum-max_sum) > abs( (min_sum-min_rank_count+max_rank_count) - (max_sum+min_rank_count-max_rank_count ) )) - { - string jname=scatter_fnames[max_rank][j]; - string kname=scatter_fnames[min_rank][k]; - scatter_fnames[max_rank].insert(scatter_fnames[max_rank].begin()+j,kname); - scatter_fnames[min_rank].insert(scatter_fnames[min_rank].begin()+k,jname); - scatter_fnames[max_rank].erase(scatter_fnames[max_rank].begin()+j+1); - scatter_fnames[min_rank].erase(scatter_fnames[min_rank].begin()+k+1); - - min_sum=min_sum-min_rank_count+max_rank_count; - max_sum=max_sum+min_rank_count-max_rank_count; - changed=true; - } - } - for (int j=0;j<(int)scatter_fnames[max_rank].size();j++) - { - scalar_type max_rank_count=fname_counts[scatter_fnames[max_rank][j]]; - if (abs(min_sum-max_sum) > abs( (min_sum+max_rank_count) - (max_sum-max_rank_count ) )) - { - scatter_fnames[min_rank].push_back(scatter_fnames[max_rank][j]); - scatter_fnames[max_rank].erase(scatter_fnames[max_rank].begin()+j); - changed=true; - break; - } - } - gidsum_ranks.clear(); - if (not changed) break; - } - - verify.clear(); - for (int i=0;i >::iterator jt=count_fnames.begin();jt!=count_fnames.end();jt++) - (*jt).second.clear(); - count_fnames.clear(); - sorted_fnames.clear(); + vector> scatter_fnames; // del-loc + + if (rank == server) { + // cout << "#rank:" < verify; + for (vector::iterator it = fnames.begin(); it != fnames.end(); it++) + verify.insert((*it)); + cout << "# Distributing: " << verify.size() << " ale files.." << endl; + for (int i = 0; i < size; i++) { + vector tmp; + scatter_fnames.push_back(tmp); + } + map fname_counts; // del-loc + map> count_fnames; // del-loc + vector sorted_fnames; // del-loc + + for (vector::iterator it = fnames.begin(); it != fnames.end(); + it++) { + string fname = (*it); + approx_posterior *ale; + if (not list_of_trees or fname.find(".ale") != fname.npos) + ale = load_ALE_from_file(fname); + else + ale = observe_ALE_from_string(fname); + int gid_count = 0; + for (int i = 0; i < (int)ale->Dip_counts.size(); i++) + gid_count += ale->Dip_counts[i].size(); + gid_count += ale->Bip_counts.size(); + if (gid_count < 1e5) // we could be carful to fiter big ales ... probably + // should automate this top 1%? + { + fname_counts[fname] = gid_count; + count_fnames[gid_count].push_back(fname); + } + delete ale; + } + for (map>::iterator jt = count_fnames.begin(); + jt != count_fnames.end(); jt++) + for (vector::iterator kt = (*jt).second.begin(); + kt != (*jt).second.end(); kt++) + sorted_fnames.push_back((*kt)); + for (int i = 0; i < (int)sorted_fnames.size(); i++) + scatter_fnames[i % size].push_back(sorted_fnames[i]); + + map gidsum_ranks; // del-loc + // we try to exhange fnames to optimze gid distribution + while (1) { + int max_rank = -1; + int min_rank = -1; + scalar_type max_sum = 0; + scalar_type min_sum = 6e23; + + for (int i = 0; i < size; i++) { + scalar_type gidsum = 0; + for (int j = 0; j < (int)scatter_fnames[i].size(); j++) + gidsum += fname_counts[scatter_fnames[i][j]]; + while (gidsum_ranks.count(gidsum) != 0) + gidsum += 0.1; + gidsum_ranks[gidsum] = i; + // cout << i << " has " << gidsum << " " < gidsum) { + min_sum = gidsum; + min_rank = i; + } + } + // cout << endl; + max_sum = 0; + for (int j = 0; j < (int)scatter_fnames[max_rank].size(); j++) + max_sum += fname_counts[scatter_fnames[max_rank][j]]; + min_sum = 0; + for (int j = 0; j < (int)scatter_fnames[min_rank].size(); j++) + min_sum += fname_counts[scatter_fnames[min_rank][j]]; + // cout << max_rank << " and " << min_rank << endl; + bool changed = false; + for (int j = 0; j < (int)scatter_fnames[max_rank].size(); j++) + for (int k = 0; k < (int)scatter_fnames[min_rank].size(); k++) { + scalar_type max_rank_count = + fname_counts[scatter_fnames[max_rank][j]]; + scalar_type min_rank_count = + fname_counts[scatter_fnames[min_rank][k]]; + if (abs(min_sum - max_sum) > + abs((min_sum - min_rank_count + max_rank_count) - + (max_sum + min_rank_count - max_rank_count))) { + string jname = scatter_fnames[max_rank][j]; + string kname = scatter_fnames[min_rank][k]; + scatter_fnames[max_rank].insert( + scatter_fnames[max_rank].begin() + j, kname); + scatter_fnames[min_rank].insert( + scatter_fnames[min_rank].begin() + k, jname); + scatter_fnames[max_rank].erase(scatter_fnames[max_rank].begin() + + j + 1); + scatter_fnames[min_rank].erase(scatter_fnames[min_rank].begin() + + k + 1); + + min_sum = min_sum - min_rank_count + max_rank_count; + max_sum = max_sum + min_rank_count - max_rank_count; + changed = true; + } + } + for (int j = 0; j < (int)scatter_fnames[max_rank].size(); j++) { + scalar_type max_rank_count = fname_counts[scatter_fnames[max_rank][j]]; + if (abs(min_sum - max_sum) > + abs((min_sum + max_rank_count) - (max_sum - max_rank_count))) { + scatter_fnames[min_rank].push_back(scatter_fnames[max_rank][j]); + scatter_fnames[max_rank].erase(scatter_fnames[max_rank].begin() + j); + changed = true; + break; + } + } gidsum_ranks.clear(); - - cout << "# Scattering: " << verify.size() << " ale files.."<::iterator it=client_fnames.begin();it!=client_fnames.end();it++) - { - approx_posterior * ale;//del-loc - if (not list_of_trees or (*it).find(".ale")!=(*it).npos) - ale = load_ALE_from_file((*it)); - else - ale = observe_ALE_from_string((*it)); - - if (scalar_parameter["use_mpp_trees"]==0) - { - ale_pointers.push_back(ale); - } - else - { - cout<< "Using mpp trees..!!" << endl; - vector trees; - trees.push_back(ale->mpp_tree().first); - ale_pointers.push_back(observe_ALE_from_strings(trees)); - delete ale; - } + verify.clear(); + for (int i = 0; i < size; i++) + for (int j = 0; j < (int)scatter_fnames[i].size(); j++) + verify.insert(scatter_fnames[i][j]); + // del-locs + + fname_counts.clear(); + for (map>::iterator jt = count_fnames.begin(); + jt != count_fnames.end(); jt++) + (*jt).second.clear(); + count_fnames.clear(); + sorted_fnames.clear(); + gidsum_ranks.clear(); + + cout << "# Scattering: " << verify.size() << " ale files.." << endl; + N_ales = verify.size(); + verify.clear(); + } + + scatter(world, scatter_fnames, client_fnames, server); + + if (rank == server) + cout << "#..loading.." << endl; + + for (vector::iterator it = client_fnames.begin(); + it != client_fnames.end(); it++) { + approx_posterior *ale; // del-loc + if (not list_of_trees or (*it).find(".ale") != (*it).npos) + ale = load_ALE_from_file((*it)); + else + ale = observe_ALE_from_string((*it)); + + if (scalar_parameter["use_mpp_trees"] == 0) { + ale_pointers.push_back(ale); + } else { + cout << "Using mpp trees..!!" << endl; + vector trees; + trees.push_back(ale->mpp_tree().first); + ale_pointers.push_back(observe_ALE_from_strings(trees)); + delete ale; } + } - //del-locs - for ( vector >::iterator jt=scatter_fnames.begin();jt!=scatter_fnames.end();jt++) + // del-locs + for (vector>::iterator jt = scatter_fnames.begin(); + jt != scatter_fnames.end(); jt++) (*jt).clear(); - scatter_fnames.clear(); - if (rank==server) cout << "# done." < >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - for (int branch=0;branchlast_branch;branch++) - model->branch_counts[count_name][branch]=0; - } - for (map ::iterator it=model->MLRec_events.begin();it!=model->MLRec_events.end();it++) - { - model->MLRec_events[(*it).first]=0; - } +void mpi_tree::clear_counts() { + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + for (int branch = 0; branch < model->last_branch; branch++) + model->branch_counts[count_name][branch] = 0; + } + for (map::iterator it = model->MLRec_events.begin(); + it != model->MLRec_events.end(); it++) { + model->MLRec_events[(*it).first] = 0; + } model->Ttokens.clear(); - int done=1; - broadcast(world,done,server); + int done = 1; + broadcast(world, done, server); } -void mpi_tree::gather_T_to_from(scalar_type samples) -{ - if (rank==server) - { - gathered_T_to_from.clear(); - for (int e=0;elast_branch;e++) - { - vector tmp; - gathered_T_to_from.push_back(tmp); - for (int f=0;flast_branch;f++) - gathered_T_to_from[e].push_back(0); - } - } - vector< vector < vector > > gather_vector; - gather(world,model->T_to_from,gather_vector,server); - if (rank==server) - { - scalar_type Tsum=0; - for (vector< vector < vector > > ::iterator it=gather_vector.begin();it!=gather_vector.end();it++) - for (int e=0;elast_branch;e++) - for (int f=0;flast_branch;f++) - { - gathered_T_to_from[e][f]+=(*it)[e][f]/samples; - Tsum+=(*it)[e][f]/samples; - } - //cout << ">TOTAL Ts = "<< Tsum/100. << endl; - //map >sort_e; - //map >sort_f; - sort_e.clear(); - sort_f.clear(); - for (int e=0;elast_branch;e++) - for (int f=0;flast_branch;f++) - { - sort_e[-gathered_T_to_from[e][f]].push_back(e); - sort_f[-gathered_T_to_from[e][f]].push_back(f); - } - for (map >::iterator it=sort_e.begin();it!=sort_e.end();it++) - { - scalar_type Ts=(*it).first; - for (int i=0;i<(*it).second.size();i++) - { - int e=sort_e[Ts][i]; - int f=sort_f[Ts][i]; - if (Ts< -0.01*Tsum*0.1 and false) - { - cout << Ts; - if (elast_leaf) - cout << " " << model->node_name[model->id_nodes[e]]; - else - cout << " " << e; - if (flast_leaf) - cout << " " << model->node_name[model->id_nodes[f]]; - else - cout << " " << f; - cout << endl; - } - } - } +void mpi_tree::gather_T_to_from(scalar_type samples) { + if (rank == server) { + gathered_T_to_from.clear(); + for (int e = 0; e < model->last_branch; e++) { + vector tmp; + gathered_T_to_from.push_back(tmp); + for (int f = 0; f < model->last_branch; f++) + gathered_T_to_from[e].push_back(0); + } + } + vector>> gather_vector; + gather(world, model->T_to_from, gather_vector, server); + if (rank == server) { + scalar_type Tsum = 0; + for (vector>>::iterator it = + gather_vector.begin(); + it != gather_vector.end(); it++) + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) { + gathered_T_to_from[e][f] += (*it)[e][f] / samples; + Tsum += (*it)[e][f] / samples; + } + // cout << ">TOTAL Ts = "<< Tsum/100. << endl; + // map >sort_e; + // map >sort_f; + sort_e.clear(); + sort_f.clear(); + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) { + sort_e[-gathered_T_to_from[e][f]].push_back(e); + sort_f[-gathered_T_to_from[e][f]].push_back(f); + } + for (map>::iterator it = sort_e.begin(); + it != sort_e.end(); it++) { + scalar_type Ts = (*it).first; + for (int i = 0; i < (*it).second.size(); i++) { + int e = sort_e[Ts][i]; + int f = sort_f[Ts][i]; + if (Ts < -0.01 * Tsum * 0.1 and false) { + cout << Ts; + if (e < model->last_leaf) + cout << " " << model->node_name[model->id_nodes[e]]; + else + cout << " " << e; + if (f < model->last_leaf) + cout << " " << model->node_name[model->id_nodes[f]]; + else + cout << " " << f; + cout << endl; + } + } } + } } -void mpi_tree::gather_counts(scalar_type samples) -{ - map< string ,vector > > gathered_branch_counts;//del-loc - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - if (rank==server) - { - vector > tmp; - gathered_branch_counts[count_name]=tmp; - } - gather(world,model->branch_counts[count_name],gathered_branch_counts[count_name],server); - if (rank==server) - { - for (int branch=0;branchlast_branch;branch++) - { - model->branch_counts[count_name][branch]/=samples; - for (int i=1;ibranch_counts[count_name][branch]+=gathered_branch_counts[count_name][i][branch]/samples; - } - cout << "# Tree for " << count_name << " counts:" << endl; - //model->show_counts(count_name,false); - //model->show_counts(count_name,true); - model->show_counts(count_name,true,true); - - } - } - //del-locs - - for ( map< string ,vector > >::iterator it= gathered_branch_counts.begin();it!= gathered_branch_counts.end();it++) - { - for ( vector >::iterator jt= (*it).second.begin();jt!= (*it).second.end();jt++) - (*jt).clear(); - (*it).second.clear(); +void mpi_tree::gather_counts(scalar_type samples) { + map>> gathered_branch_counts; // del-loc + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + if (rank == server) { + vector> tmp; + gathered_branch_counts[count_name] = tmp; + } + gather(world, model->branch_counts[count_name], + gathered_branch_counts[count_name], server); + if (rank == server) { + for (int branch = 0; branch < model->last_branch; branch++) { + model->branch_counts[count_name][branch] /= samples; + for (int i = 1; i < size; i++) + model->branch_counts[count_name][branch] += + gathered_branch_counts[count_name][i][branch] / samples; + } + cout << "# Tree for " << count_name << " counts:" << endl; + // model->show_counts(count_name,false); + // model->show_counts(count_name,true); + model->show_counts(count_name, true, true); } + } + // del-locs + + for (map>>::iterator it = + gathered_branch_counts.begin(); + it != gathered_branch_counts.end(); it++) { + for (vector>::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).clear(); + (*it).second.clear(); + } gathered_branch_counts.clear(); - //Ttokens - vector > gather_Ttokens; //del-loc - gather(world,model->Ttokens,gather_Ttokens,server); - if (rank==server) for (int i=0;i >::iterator it=gather_Ttokens.begin();it!=gather_Ttokens.end();it++) - (*it).clear(); - gather_Ttokens.clear(); + // Ttokens + vector> gather_Ttokens; // del-loc + gather(world, model->Ttokens, gather_Ttokens, server); + if (rank == server) + for (int i = 0; i < size; i++) + for (int j = 0; j < (int)gather_Ttokens[i].size(); j++) + Ttokens.push_back(gather_Ttokens[i][j]); + // del-locs + for (vector>::iterator it = gather_Ttokens.begin(); + it != gather_Ttokens.end(); it++) + (*it).clear(); + gather_Ttokens.clear(); - int done=1; - broadcast(world,done,server); + int done = 1; + broadcast(world, done, server); } +void mpi_tree::print_branch_counts(scalar_type samples) { + if (rank == server) { + cout << "#\t"; + cout << "name" + << "\t"; + + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + cout << count_name << "\t"; + } + cout << "delta" + << "\t"; + cout << "tau" + << "\t"; + cout << "lambda"; + cout << endl; + map some_sums; + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + some_sums[count_name] = 0; + } + for (int branch = 0; branch < model->last_branch; branch++) { + ; // cout << branch << "\t"; + if (branch < model->last_leaf) + ; // cout << model->node_name[model->id_nodes[branch]]; + else + ; // cout << branch; + ; // cout << "\t"; + + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + some_sums[count_name] += + model->branch_counts[count_name][branch] / samples; + + // cout << model->branch_counts[count_name][branch] << "\t"; + } + // cout << model->vector_parameter["delta"][branch] << "\t"; + // cout << model->vector_parameter["tau"][branch] << "\t"; + // cout << model->vector_parameter["lambda"][branch]; + + // cout << endl; + } + cout << "#SUMS"; -void mpi_tree::print_branch_counts(scalar_type samples) -{ - if (rank==server) - { - cout<<"#\t"; - cout << "name" << "\t"; - - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - cout << count_name << "\t"; - } - cout << "delta" << "\t"; - cout << "tau" << "\t"; - cout << "lambda"; - cout << endl; - map some_sums; - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - some_sums[count_name]=0; - } - for (int branch=0;branchlast_branch;branch++) - { - ;//cout << branch << "\t"; - if (branchlast_leaf) - ;//cout << model->node_name[model->id_nodes[branch]]; - else - ;//cout << branch; - ;//cout << "\t"; - - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - some_sums[count_name]+=model->branch_counts[count_name][branch]/samples; - - //cout << model->branch_counts[count_name][branch] << "\t"; - } - //cout << model->vector_parameter["delta"][branch] << "\t"; - //cout << model->vector_parameter["tau"][branch] << "\t"; - //cout << model->vector_parameter["lambda"][branch]; - - //cout << endl; - } - cout << "#SUMS"; - - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - cout <<"\t"<< some_sums[count_name]/samples; - } - cout << endl; - + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + cout << "\t" << some_sums[count_name] / samples; } + cout << endl; + } } -string mpi_tree::branch_counts_string() -{ +string mpi_tree::branch_counts_string() { string return_string; - if (rank==server) - { - stringstream out; - - scalar_type total_D=0,total_T=0,total_L=0,total_S=0; - for (int branch=0;branchlast_branch;branch++) - { - total_D+=model->branch_counts["Ds"][branch]; - total_T+=model->branch_counts["Ts"][branch]; - total_L+=model->branch_counts["Ls"][branch]; - total_S+=model->branch_counts["copies"][branch]; - } - out << " " <last_branch; branch++) { + total_D += model->branch_counts["Ds"][branch]; + total_T += model->branch_counts["Ts"][branch]; + total_L += model->branch_counts["Ls"][branch]; + total_S += model->branch_counts["copies"][branch]; + } + out << " " << total_D; + out << " " << total_T; + out << " " << total_L; + out << " " << total_S; + return_string = out.str(); + } + broadcast(world, return_string, server); return return_string; } -void mpi_tree::show_branch_counts() -{ - if (rank==server) - { - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - model->show_counts(count_name); - } - scalar_type total_D=0,total_T=0,total_L=0,total_S=0; - for (int branch=0;branchlast_branch;branch++) - { - total_D+=model->branch_counts["Ds"][branch]; - total_T+=model->branch_counts["Ts"][branch]; - total_L+=model->branch_counts["Ls"][branch]; - total_S+=model->branch_counts["copies"][branch]; - - } - cout << "#total D: " << total_D; - cout << " T: " << total_T; - cout << " L: " <last_branch+1)); - - cout << endl; - } - int done=1; - broadcast(world,done,server); +void mpi_tree::show_branch_counts() { + if (rank == server) { + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + model->show_counts(count_name); + } + scalar_type total_D = 0, total_T = 0, total_L = 0, total_S = 0; + for (int branch = 0; branch < model->last_branch; branch++) { + total_D += model->branch_counts["Ds"][branch]; + total_T += model->branch_counts["Ts"][branch]; + total_L += model->branch_counts["Ls"][branch]; + total_S += model->branch_counts["copies"][branch]; + } + cout << "#total D: " << total_D; + cout << " T: " << total_T; + cout << " L: " << total_L; + cout << " S: " << total_S; + + cout << endl; + cout << "#avg. D: " << total_D / N_ales; + cout << " T: " << total_T / N_ales; + cout << " L: " << total_L / N_ales; + cout << " S: " << total_S / N_ales / ((model->last_branch + 1)); + + cout << endl; + } + int done = 1; + broadcast(world, done, server); } -scalar_type mpi_tree::calculate_MLRecs(bool estimate,bool branchwise) -{ +scalar_type mpi_tree::calculate_MLRecs(bool estimate, bool branchwise) { clear_counts(); MLRec_res.clear(); - scalar_type ll=0; + scalar_type ll = 0; vector gather_ll; - //show_rates(); - //boost::timer * t = new boost::timer(); - for (int i=0;i<(int)ale_pointers.size();i++) - { - //cout << rank <<" at " < res=model->p_MLRec(ale_pointers[i]); - if (model->signal==-11) - { - model->signal=0; - cout << "ERERERER "<save_state("error_ale"); - } - cout <<"#ML " << client_fnames[i] << " " << res.first << " " << log(res.second) << endl; - ll+=log(res.second); - MLRec_res.push_back(res); - } - - //cout << rank << " "<elapsed() < res = model->p_MLRec(ale_pointers[i]); + if (model->signal == -11) { + model->signal = 0; + cout << "ERERERER " << rank << " " << i << endl; + ale_pointers[i]->save_state("error_ale"); + } + cout << "#ML " << client_fnames[i] << " " << res.first << " " + << log(res.second) << endl; + ll += log(res.second); + MLRec_res.push_back(res); + } + + // cout << rank << " "<elapsed() <calculate_EGb(); vector gather_ll; - //boost::timer * t = new boost::timer(); - for (int i=0;i<(int)ale_pointers.size();i++) - { - //cout << rank <<" at " <p(ale_pointers[i]); - if (tmpp==0) cout << client_fnames[i] << " is 0 !!"<elapsed() <scalar_parameter["delta_avg"] << " " << model->vector_parameter["N"][0]*model->scalar_parameter["tau_avg"] << " " << model->scalar_parameter["lambda_avg"] << " " << model->vector_parameter["Delta_bar"][0] <<" " << sum_ll<p(ale_pointers[i]); + if (tmpp == 0) + cout << client_fnames[i] << " is 0 !!" << endl; + cout << "#LL " << client_fnames[i] << " " << log(tmpp) << endl; + + ll += log(tmpp); + } + // cout << rank << " "<elapsed() <scalar_parameter["delta_avg"] << " " + << model->vector_parameter["N"][0] * model->scalar_parameter["tau_avg"] + << " " << model->scalar_parameter["lambda_avg"] << " " + << model->vector_parameter["Delta_bar"][0] << " " << sum_ll << endl; return sum_ll; } -void mpi_tree::estimate_rates() -{ - scalar_type delta,tau,lambda; - map< string ,vector > > gathered_branch_counts;//del-loc - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - if (rank==server) - { - vector > tmp; - gathered_branch_counts[count_name]=tmp; - } - gather(world,model->branch_counts[count_name],gathered_branch_counts[count_name],server); - if (rank==server) - { - for (int branch=0;branchlast_branch;branch++) - { - for (int i=1;ibranch_counts[count_name][branch]+=gathered_branch_counts[count_name][i][branch]; - } - //model->show_counts(count_name); - } - if (rank==server) - { - for (int branch=0;branchlast_branch;branch++) - model->branch_counts[count_name][branch]/=100.; - } - } - if (rank==server) - { - - scalar_type Csum=0; - for (int e=0;elast_branch;e++) - { - Csum+=model->branch_counts["copies"][e]; - } - scalar_type P_D_avg=0; - scalar_type P_T_avg=0; - scalar_type P_L_avg=0; - scalar_type w_sum=0; - - for (int e=0;elast_branch;e++) - { - scalar_type N_S=model->branch_counts["count"][e]; - scalar_type Ee=model->branch_counts["Ls"][e]/N_S; - scalar_type Ge=model->branch_counts["singleton"][e]/N_S; - //cout << Ee << " " << Ge << endl; - scalar_type P_D=max((scalar_type)1e-6,-1*((1 - Ee - Ge)/((-1 + Ee)*(-2*Ee + Ge + Ee*Ge)))); - scalar_type P_L=max((scalar_type)1e-6,-1*((Ee - 3*Ee*Ee + Ee*Ee*Ge)/((-1 + Ee)*(-2*Ee + Ge + Ee*Ge)))); - P_D=max((scalar_type)1e-6,model->branch_counts["Ds"][e]/N_S); // this works much better empirically - scalar_type P_T=max((scalar_type)1e-6,model->branch_counts["Ts"][e]/Csum*(float)model->last_branch); - //cout << e<<" "<< P_D << " " << P_L << " " << P_T << " " << " "<< model->branch_counts["count"][e] << endl; - scalar_type w=1.;//model->branch_counts["count"][e]; - P_D_avg+=( P_D ) * w; - P_T_avg+=( P_T ) * w; - P_L_avg+=( P_L ) * w; - w_sum+=w; - } - delta=P_D_avg/w_sum; - tau=P_T_avg/w_sum; - lambda=P_L_avg/w_sum; - //cout << " rate estimates " << delta << " " << tau << " " << lambda << endl; - } - broadcast(world,delta,server); - broadcast(world,tau,server); - broadcast(world,lambda,server); - model->set_model_parameter("delta",delta); - model->set_model_parameter("tau",tau); - model->set_model_parameter("lambda",lambda); +void mpi_tree::estimate_rates() { + scalar_type delta, tau, lambda; + map>> gathered_branch_counts; // del-loc + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + if (rank == server) { + vector> tmp; + gathered_branch_counts[count_name] = tmp; + } + gather(world, model->branch_counts[count_name], + gathered_branch_counts[count_name], server); + if (rank == server) { + for (int branch = 0; branch < model->last_branch; branch++) { + for (int i = 1; i < size; i++) + model->branch_counts[count_name][branch] += + gathered_branch_counts[count_name][i][branch]; + } + // model->show_counts(count_name); + } + if (rank == server) { + for (int branch = 0; branch < model->last_branch; branch++) + model->branch_counts[count_name][branch] /= 100.; + } + } + if (rank == server) { + + scalar_type Csum = 0; + for (int e = 0; e < model->last_branch; e++) { + Csum += model->branch_counts["copies"][e]; + } + scalar_type P_D_avg = 0; + scalar_type P_T_avg = 0; + scalar_type P_L_avg = 0; + scalar_type w_sum = 0; + + for (int e = 0; e < model->last_branch; e++) { + scalar_type N_S = model->branch_counts["count"][e]; + scalar_type Ee = model->branch_counts["Ls"][e] / N_S; + scalar_type Ge = model->branch_counts["singleton"][e] / N_S; + // cout << Ee << " " << Ge << endl; + scalar_type P_D = + max((scalar_type)1e-6, + -1 * ((1 - Ee - Ge) / ((-1 + Ee) * (-2 * Ee + Ge + Ee * Ge)))); + scalar_type P_L = + max((scalar_type)1e-6, -1 * ((Ee - 3 * Ee * Ee + Ee * Ee * Ge) / + ((-1 + Ee) * (-2 * Ee + Ge + Ee * Ge)))); + P_D = + max((scalar_type)1e-6, model->branch_counts["Ds"][e] / + N_S); // this works much better empirically + scalar_type P_T = + max((scalar_type)1e-6, + model->branch_counts["Ts"][e] / Csum * (float)model->last_branch); + // cout << e<<" "<< P_D << " " << P_L << " " << P_T << " " << " "<< + // model->branch_counts["count"][e] << endl; + scalar_type w = 1.; // model->branch_counts["count"][e]; + P_D_avg += (P_D)*w; + P_T_avg += (P_T)*w; + P_L_avg += (P_L)*w; + w_sum += w; + } + delta = P_D_avg / w_sum; + tau = P_T_avg / w_sum; + lambda = P_L_avg / w_sum; + // cout << " rate estimates " << delta << " " << tau << " " << lambda << + // endl; + } + broadcast(world, delta, server); + broadcast(world, tau, server); + broadcast(world, lambda, server); + model->set_model_parameter("delta", delta); + model->set_model_parameter("tau", tau); + model->set_model_parameter("lambda", lambda); } -void mpi_tree::estimate_rates_bw() -{ - vector delta,tau,lambda; - map< string ,vector > > gathered_branch_counts;//del-loc - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - if (rank==server) - { - vector > tmp; - gathered_branch_counts[count_name]=tmp; - } - gather(world,model->branch_counts[count_name],gathered_branch_counts[count_name],server); - if (rank==server) - { - for (int branch=0;branchlast_branch;branch++) - { - for (int i=1;ibranch_counts[count_name][branch]+=gathered_branch_counts[count_name][i][branch]; - } - //model->show_counts(count_name); - } - if (rank==server) - { - for (int branch=0;branchlast_branch;branch++) - model->branch_counts[count_name][branch]/=100.; - } - - } - scalar_type delta_avg=0; - scalar_type tau_avg=0; - scalar_type lambda_avg=0; - - if (rank==server) - { - - scalar_type Csum=0; - scalar_type Tssum=0; - scalar_type Tfromssum=0; - - for (int e=0;elast_branch;e++) - { - Csum+=model->branch_counts["copies"][e]; - Tssum+=model->branch_counts["Ts"][e]; - Tfromssum+=model->branch_counts["Tfroms"][e]/model->branch_counts["count"][e]; - } - scalar_type P_D_avg=0; - scalar_type P_T_avg=0; - scalar_type P_L_avg=0; - scalar_type w_sum=0; - - for (int e=0;elast_branch;e++) - { - scalar_type N_S=model->branch_counts["count"][e]; - scalar_type N_C=model->branch_counts["copies"][e]; - - scalar_type Ee=model->branch_counts["Ls"][e]/N_S; - scalar_type Ge=model->branch_counts["singleton"][e]/N_S; - //cout << Ee << " " << Ge << endl; - scalar_type P_D=max((scalar_type)1e-6,-1*((1 - Ee - Ge)/((-1 + Ee)*(-2*Ee + Ge + Ee*Ge)))); - scalar_type P_L=max((scalar_type)1e-6,-1*((Ee - 3*Ee*Ee + Ee*Ee*Ge)/((-1 + Ee)*(-2*Ee + Ge + Ee*Ge)))); - P_D=max((scalar_type)1e-6,model->branch_counts["Ds"][e]/N_S); // this works much better empirically - scalar_type P_T=max((scalar_type)1e-6,model->branch_counts["Ts"][e]/Csum*(float)model->last_branch); - //P_T= model->branch_counts["Ts"][e]/N_S; - P_D_avg+=P_D; - P_T_avg+=P_T; - P_L_avg+=P_L; - delta.push_back( P_D ); - tau.push_back( P_T); - lambda.push_back( P_L ); - //cout << " rate estimates " << e ; - //cout <<" " << model->vector_parameter["delta"][e] << " " << model->vector_parameter["tau"][e] << " " << model->vector_parameter["lambda"][e]; - //cout <<" " << delta[e] << " " << tau[e] << " " << lambda[e]; - //cout << " "<< model->branch_counts["Ds"][e]/N_S <<" "<< model->branch_counts["Ls"][e]/N_S << " " << model->branch_counts["Ts"][e]/N_S << " " << model->branch_counts["Tfroms"][e]/N_S << " " << N_S <last_branch << " " << P_T_avg/(float)model->last_branch << " " << P_L_avg/(float)model->last_branch << endl; - delta_avg=P_D_avg/(float)model->last_branch; - tau_avg=P_T_avg/(float)model->last_branch; - lambda_avg=P_L_avg/(float)model->last_branch; - - } - broadcast(world,delta,server); - broadcast(world,tau,server); - broadcast(world,lambda,server); - broadcast(world,delta_avg,server); - broadcast(world,tau_avg,server); - broadcast(world,lambda_avg,server); - - model->set_model_parameter("delta",delta); - model->set_model_parameter("tau",tau ); - //model->set_model_parameter("tau",tau); - model->set_model_parameter("lambda",lambda); +void mpi_tree::estimate_rates_bw() { + vector delta, tau, lambda; + map>> gathered_branch_counts; // del-loc + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + if (rank == server) { + vector> tmp; + gathered_branch_counts[count_name] = tmp; + } + gather(world, model->branch_counts[count_name], + gathered_branch_counts[count_name], server); + if (rank == server) { + for (int branch = 0; branch < model->last_branch; branch++) { + for (int i = 1; i < size; i++) + model->branch_counts[count_name][branch] += + gathered_branch_counts[count_name][i][branch]; + } + // model->show_counts(count_name); + } + if (rank == server) { + for (int branch = 0; branch < model->last_branch; branch++) + model->branch_counts[count_name][branch] /= 100.; + } + } + scalar_type delta_avg = 0; + scalar_type tau_avg = 0; + scalar_type lambda_avg = 0; + + if (rank == server) { + + scalar_type Csum = 0; + scalar_type Tssum = 0; + scalar_type Tfromssum = 0; + + for (int e = 0; e < model->last_branch; e++) { + Csum += model->branch_counts["copies"][e]; + Tssum += model->branch_counts["Ts"][e]; + Tfromssum += + model->branch_counts["Tfroms"][e] / model->branch_counts["count"][e]; + } + scalar_type P_D_avg = 0; + scalar_type P_T_avg = 0; + scalar_type P_L_avg = 0; + scalar_type w_sum = 0; + + for (int e = 0; e < model->last_branch; e++) { + scalar_type N_S = model->branch_counts["count"][e]; + scalar_type N_C = model->branch_counts["copies"][e]; + + scalar_type Ee = model->branch_counts["Ls"][e] / N_S; + scalar_type Ge = model->branch_counts["singleton"][e] / N_S; + // cout << Ee << " " << Ge << endl; + scalar_type P_D = + max((scalar_type)1e-6, + -1 * ((1 - Ee - Ge) / ((-1 + Ee) * (-2 * Ee + Ge + Ee * Ge)))); + scalar_type P_L = + max((scalar_type)1e-6, -1 * ((Ee - 3 * Ee * Ee + Ee * Ee * Ge) / + ((-1 + Ee) * (-2 * Ee + Ge + Ee * Ge)))); + P_D = + max((scalar_type)1e-6, model->branch_counts["Ds"][e] / + N_S); // this works much better empirically + scalar_type P_T = + max((scalar_type)1e-6, + model->branch_counts["Ts"][e] / Csum * (float)model->last_branch); + // P_T= model->branch_counts["Ts"][e]/N_S; + P_D_avg += P_D; + P_T_avg += P_T; + P_L_avg += P_L; + delta.push_back(P_D); + tau.push_back(P_T); + lambda.push_back(P_L); + // cout << " rate estimates " << e ; + // cout <<" " << model->vector_parameter["delta"][e] << " " << + // model->vector_parameter["tau"][e] << " " << + // model->vector_parameter["lambda"][e]; cout <<" " << delta[e] << " " << + // tau[e] << " " << lambda[e]; cout << " "<< + // model->branch_counts["Ds"][e]/N_S <<" "<< + // model->branch_counts["Ls"][e]/N_S << " " << + // model->branch_counts["Ts"][e]/N_S << " " << + // model->branch_counts["Tfroms"][e]/N_S << " " << N_S <last_branch << " " << + // P_T_avg/(float)model->last_branch << " " << + // P_L_avg/(float)model->last_branch << endl; + delta_avg = P_D_avg / (float)model->last_branch; + tau_avg = P_T_avg / (float)model->last_branch; + lambda_avg = P_L_avg / (float)model->last_branch; + } + broadcast(world, delta, server); + broadcast(world, tau, server); + broadcast(world, lambda, server); + broadcast(world, delta_avg, server); + broadcast(world, tau_avg, server); + broadcast(world, lambda_avg, server); + + model->set_model_parameter("delta", delta); + model->set_model_parameter("tau", tau); + // model->set_model_parameter("tau",tau); + model->set_model_parameter("lambda", lambda); } -scalar_type mpi_tree::calculate_pun(int n, bool bw) -{ - scalar_type ll=calculate_pun(); - if (n>0) - { +scalar_type mpi_tree::calculate_pun(int n, bool bw) { + scalar_type ll = calculate_pun(); + if (n > 0) { + estimate_rates(); + + if (rank == server) + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + { model->show_counts(count_name); } + } + + ll = calculate_pun(); + } + for (int i = 1; i < n; i++) { + if (bw) + estimate_rates_bw(); + else estimate_rates(); - - if (rank==server) - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - { - model->show_counts(count_name); - } - } - - ll=calculate_pun(); - - } - for (int i=1;i >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - { - model->show_counts(count_name); - } - } - - ll=calculate_pun(); - } + if (rank == server) + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + { model->show_counts(count_name); } + } + + ll = calculate_pun(); + } if (bw) estimate_rates_bw(); else estimate_rates(); - - if (rank==server) - for (map >::iterator it=model->branch_counts.begin();it!=model->branch_counts.end();it++) - { - string count_name=(*it).first; - { - model->show_counts(count_name); - } - } + + if (rank == server) + for (map>::iterator it = + model->branch_counts.begin(); + it != model->branch_counts.end(); it++) { + string count_name = (*it).first; + { model->show_counts(count_name); } + } print_branch_counts(); return ll; } -scalar_type mpi_tree::calculate_pun(int samples) -{ +scalar_type mpi_tree::calculate_pun(int samples) { stringstream outname; - outname << "try."< gather_ll; model->MLRec_events.clear(); - for (int e=0;elast_branch;e++) - { - model->branch_counts["Os"][e]=0; - model->branch_counts["Ds"][e]=0; - model->branch_counts["Ts"][e]=0; - model->branch_counts["Tfroms"][e]=0; - model->branch_counts["Ls"][e]=0; - model->branch_counts["count"][e]=0; - model->branch_counts["copies"][e]=0; - model->branch_counts["singleton"][e]=0; - } - for (int e=0;elast_branch;e++) - for (int f=0;flast_branch;f++) - model->T_to_from[e][f]=0; - - //boost::timer * t = new boost::timer(); + for (int e = 0; e < model->last_branch; e++) { + model->branch_counts["Os"][e] = 0; + model->branch_counts["Ds"][e] = 0; + model->branch_counts["Ts"][e] = 0; + model->branch_counts["Tfroms"][e] = 0; + model->branch_counts["Ls"][e] = 0; + model->branch_counts["count"][e] = 0; + model->branch_counts["copies"][e] = 0; + model->branch_counts["singleton"][e] = 0; + } + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) + model->T_to_from[e][f] = 0; + + // boost::timer * t = new boost::timer(); model->calculate_undatedEs(); - for (int i=0;i<(int)ale_pointers.size();i++) - { - //if (rank==server) cout << rank <<" at " <calculate_undatedEs(); - scalar_type tmpp=model->pun(ale_pointers[i]); - fout << "started "<elapsed() <calculate_undatedEs(); + scalar_type tmpp = model->pun(ale_pointers[i]); + fout << "started " << client_fnames[i] << " ll=" << log(tmpp) << endl; + for (int si = 0; si < samples; si++) { + model->sample_undated(); + } + fout << "finished " << client_fnames[i] << endl; + + if (tmpp == 0) + cout << client_fnames[i] << " is 0 !!" << endl; + // cout <<"#LL " << client_fnames[i] << " " << log(tmpp) << endl; + + ll += log(tmpp); + } + // cout << rank << " "<elapsed() <calculate_undatedEs(); vector gather_ll; model->MLRec_events.clear(); - for (int e=0;elast_branch;e++) - { - model->branch_counts["Os"][e]=0; - model->branch_counts["Ds"][e]=0; - model->branch_counts["Ts"][e]=0; - model->branch_counts["Tfroms"][e]=0; - model->branch_counts["Ls"][e]=0; - model->branch_counts["count"][e]=0; - model->branch_counts["copies"][e]=0; - } - for (int e=0;elast_branch;e++) - for (int f=0;flast_branch;f++) - model->T_to_from[e][f]=0; - - //boost::timer * t = new boost::timer(); - for (int i=0;i<(int)ale_pointers.size();i++) - { - //cout << rank <<" at " <construct_undated(S);//del-loc - alt_model->set_model_parameter("delta",scalar_parameter["inital_delta"]); - alt_model->set_model_parameter("tau",scalar_parameter["inital_tau"]); - alt_model->set_model_parameter("lambda",scalar_parameter["inital_lambda"]); - alt_model->calculate_undatedEs(); - scalar_type tmpp=alt_model->pun(ale_pointers[i]); - for (int i=0;i<100;i++) model->sample_undated(); - if (tmpp==0) cout << client_fnames[i] << " is 0 !!"<elapsed() <last_branch; e++) { + model->branch_counts["Os"][e] = 0; + model->branch_counts["Ds"][e] = 0; + model->branch_counts["Ts"][e] = 0; + model->branch_counts["Tfroms"][e] = 0; + model->branch_counts["Ls"][e] = 0; + model->branch_counts["count"][e] = 0; + model->branch_counts["copies"][e] = 0; + } + for (int e = 0; e < model->last_branch; e++) + for (int f = 0; f < model->last_branch; f++) + model->T_to_from[e][f] = 0; + + // boost::timer * t = new boost::timer(); + for (int i = 0; i < (int)ale_pointers.size(); i++) { + // cout << rank <<" at " <construct_undated(S); // del-loc + alt_model->set_model_parameter("delta", scalar_parameter["inital_delta"]); + alt_model->set_model_parameter("tau", scalar_parameter["inital_tau"]); + alt_model->set_model_parameter("lambda", scalar_parameter["inital_lambda"]); + alt_model->calculate_undatedEs(); + scalar_type tmpp = alt_model->pun(ale_pointers[i]); + for (int i = 0; i < 100; i++) + model->sample_undated(); + if (tmpp == 0) + cout << client_fnames[i] << " is 0 !!" << endl; + // cout <<"#LL " << client_fnames[i] << " " << log(tmpp) << endl; + + ll += log(tmpp); + } + // cout << rank << " "<elapsed() < #include -class mpi_tree -{ - public: - exODT_model * model;//del-loc +class mpi_tree { +public: + exODT_model *model; // del-loc boost::mpi::communicator world; - int server,rank,size; - std::vector ale_pointers;//del-loc + int server, rank, size; + std::vector ale_pointers; // del-loc std::vector client_fnames; scalar_type N_ales; - std::vector > MLRec_res;//del-loc - std::vector Ttokens;//del-loc + std::vector> MLRec_res; // del-loc + std::vector Ttokens; // del-loc - std::map scalar_parameter; - std::map string_parameter; + std::map scalar_parameter; + std::map string_parameter; - void set_parameter(std::string name, scalar_type value) - { - scalar_parameter[name]=value; + void set_parameter(std::string name, scalar_type value) { + scalar_parameter[name] = value; }; - void set_parameter(std::string name, std::string value) - { - string_parameter[name]=value; + void set_parameter(std::string name, std::string value) { + string_parameter[name] = value; }; - - scalar_type delta_avg,tau_avg,lambda_avg; - scalar_type delta_norm,tau_norm,lambda_norm; - std::vector delta_branch_avg,tau_branch_avg,lambda_branch_avg;//del-loc - std::vector delta_branch_norm,tau_branch_norm,lambda_branch_norm;//del-loc + scalar_type delta_avg, tau_avg, lambda_avg; + scalar_type delta_norm, tau_norm, lambda_norm; + std::vector delta_branch_avg, tau_branch_avg, + lambda_branch_avg; // del-loc + std::vector delta_branch_norm, tau_branch_norm, + lambda_branch_norm; // del-loc std::string S_string; - mpi_tree(std::string Sstring,const boost::mpi::communicator mpi_world,std::map set_parameters=std::map(),bool undated=false) - { - if (undated) - { - S_string=Sstring; - set_parameter("min_delta",1e-6); - set_parameter("min_tau",1e-6); - set_parameter("min_lambda",1e-6); - - set_parameter("inital_delta",0.01); - set_parameter("inital_tau",0.02); - set_parameter("inital_lambda",0.1); - model=new exODT_model(); - model->construct_undated(Sstring);//del-loc - - model->set_model_parameter("delta",scalar_parameter["inital_delta"]); - model->set_model_parameter("tau",scalar_parameter["inital_tau"]); - model->set_model_parameter("lambda",scalar_parameter["inital_lambda"]); - for (std::map::iterator it=set_parameters.begin();it!=set_parameters.end();it++) - model->set_model_parameter((*it).first,(*it).second); - - } - else - { - set_parameter("use_mpp_trees",0); - set_parameter("min_delta",1e-6); - set_parameter("min_tau",1e-6); - set_parameter("min_lambda",1e-6); - - set_parameter("inital_delta",0.01); - set_parameter("inital_tau",0.01); - set_parameter("inital_lambda",0.02); - - model=new exODT_model(); - model->set_model_parameter("min_D",3); - model->set_model_parameter("grid_delta_t",0.005); - model->set_model_parameter("event_node",0); - model->set_model_parameter("DD",10); - for (std::map::iterator it=set_parameters.begin();it!=set_parameters.end();it++) - model->set_model_parameter((*it).first,(*it).second); - model->construct(Sstring);//del-loc - scalar_type N=1e6; - model->set_model_parameter("N",1e6);//we can almost scale out N assuming height from coalescent.. - model->set_model_parameter("Delta_bar",N); - model->set_model_parameter("Lambda_bar",N); - model->set_model_parameter("delta",scalar_parameter["inital_delta"]); - model->set_model_parameter("tau",scalar_parameter["inital_tau"]); - model->set_model_parameter("lambda",scalar_parameter["inital_lambda"]); - - model->calculate_EGb();//with default parameters - } - world = mpi_world; - server=0; - rank = world.rank(); - size = world.size(); - - - }; - ~mpi_tree() - { - for (std::vector::iterator it=ale_pointers.begin();it!=ale_pointers.end();it++) - delete (*it); - ale_pointers.clear(); - MLRec_res.clear(); - client_fnames.clear(); - delta_branch_avg.clear(),tau_branch_avg.clear(),lambda_branch_avg.clear();//del-loc - delta_branch_norm.clear(),tau_branch_norm.clear(),lambda_branch_norm.clear();//del-loc - - delete model; - }; - - //implimented in mpi_tree.cpp - void distribute_ales(std::vector,bool list_of_trees=false); + mpi_tree(std::string Sstring, const boost::mpi::communicator mpi_world, + std::map set_parameters = + std::map(), + bool undated = false) { + if (undated) { + S_string = Sstring; + set_parameter("min_delta", 1e-6); + set_parameter("min_tau", 1e-6); + set_parameter("min_lambda", 1e-6); + + set_parameter("inital_delta", 0.01); + set_parameter("inital_tau", 0.02); + set_parameter("inital_lambda", 0.1); + model = new exODT_model(); + model->construct_undated(Sstring); // del-loc + + model->set_model_parameter("delta", scalar_parameter["inital_delta"]); + model->set_model_parameter("tau", scalar_parameter["inital_tau"]); + model->set_model_parameter("lambda", scalar_parameter["inital_lambda"]); + for (std::map::iterator it = + set_parameters.begin(); + it != set_parameters.end(); it++) + model->set_model_parameter((*it).first, (*it).second); + + } else { + set_parameter("use_mpp_trees", 0); + set_parameter("min_delta", 1e-6); + set_parameter("min_tau", 1e-6); + set_parameter("min_lambda", 1e-6); + + set_parameter("inital_delta", 0.01); + set_parameter("inital_tau", 0.01); + set_parameter("inital_lambda", 0.02); + + model = new exODT_model(); + model->set_model_parameter("min_D", 3); + model->set_model_parameter("grid_delta_t", 0.005); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("DD", 10); + for (std::map::iterator it = + set_parameters.begin(); + it != set_parameters.end(); it++) + model->set_model_parameter((*it).first, (*it).second); + model->construct(Sstring); // del-loc + scalar_type N = 1e6; + model->set_model_parameter( + "N", + 1e6); // we can almost scale out N assuming height from coalescent.. + model->set_model_parameter("Delta_bar", N); + model->set_model_parameter("Lambda_bar", N); + model->set_model_parameter("delta", scalar_parameter["inital_delta"]); + model->set_model_parameter("tau", scalar_parameter["inital_tau"]); + model->set_model_parameter("lambda", scalar_parameter["inital_lambda"]); + + model->calculate_EGb(); // with default parameters + } + world = mpi_world; + server = 0; + rank = world.rank(); + size = world.size(); + }; + ~mpi_tree() { + for (std::vector::iterator it = ale_pointers.begin(); + it != ale_pointers.end(); it++) + delete (*it); + ale_pointers.clear(); + MLRec_res.clear(); + client_fnames.clear(); + delta_branch_avg.clear(), tau_branch_avg.clear(), + lambda_branch_avg.clear(); // del-loc + delta_branch_norm.clear(), tau_branch_norm.clear(), + lambda_branch_norm.clear(); // del-loc + + delete model; + }; + + // implimented in mpi_tree.cpp + void distribute_ales(std::vector, bool list_of_trees = false); void load_distributed_ales(std::string fname); - void prune_distributed_ales(std::string fname,std::string Sstring); + void prune_distributed_ales(std::string fname, std::string Sstring); - std::vector < std::vector > gathered_T_to_from; - void gather_T_to_from(scalar_type samples=1); + std::vector> gathered_T_to_from; + void gather_T_to_from(scalar_type samples = 1); - void gather_counts(scalar_type samples=1); + void gather_counts(scalar_type samples = 1); void clear_counts(); std::string branch_counts_string(); void show_branch_counts(); - void print_branch_counts(scalar_type samples=1); + void print_branch_counts(scalar_type samples = 1); - scalar_type calculate_MLRecs(bool estimate=false,bool branchwise=false); + scalar_type calculate_MLRecs(bool estimate = false, bool branchwise = false); scalar_type calculate_p(); - scalar_type calculate_pun(int n=0); + scalar_type calculate_pun(int n = 0); scalar_type calculate_punt(std::string S); - scalar_type calculate_pun(int n,bool bw); - std::map >sort_e; - std::map >sort_f; + scalar_type calculate_pun(int n, bool bw); + std::map> sort_e; + std::map> sort_f; void estimate_rates(); void estimate_rates_bw(); - - //implimented in rae_estimate.cpp - std::vector dtl_estimate(int branch,scalar_type N_ales_norm); - //scalar_type estimate_rates(std::string mode="uniform"); + + // implimented in rae_estimate.cpp + std::vector dtl_estimate(int branch, scalar_type N_ales_norm); + // scalar_type estimate_rates(std::string mode="uniform"); void show_rates(); - private: + +private: ; }; diff --git a/src/omp_test.cpp b/src/omp_test.cpp index a815281..d7b848b 100644 --- a/src/omp_test.cpp +++ b/src/omp_test.cpp @@ -1,92 +1,93 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" #include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - cout << "changing OMP_NUM_THREADS should leave the results unchanged!" << endl; - //we need a species tree +int main(int argc, char **argv) { + cout << "changing OMP_NUM_THREADS should leave the results unchanged!" + << endl; + // we need a species tree string Sstring; - ifstream file_stream ("example_data/cy36_green.tree"); - getline (file_stream,Sstring); - //we need an ale - string ale_name="example_data/sc_cy36HBG285662.ale"; - if (argc>1) ale_name=argv[1]; - approx_posterior * ale=load_ALE_from_file(ale_name); + ifstream file_stream("example_data/cy36_green.tree"); + getline(file_stream, Sstring); + // we need an ale + string ale_name = "example_data/sc_cy36HBG285662.ale"; + if (argc > 1) + ale_name = argv[1]; + approx_posterior *ale = load_ALE_from_file(ale_name); // initilaize the exODT model using some initial DTL rates - exODT_model* model=new exODT_model(); - model->set_model_parameter("D",3); - model->set_model_parameter("DD",10); + exODT_model *model = new exODT_model(); + model->set_model_parameter("D", 3); + model->set_model_parameter("DD", 10); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("delta",0.1); - model->set_model_parameter("tau",0.1); - model->set_model_parameter("lambda",0.2); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("delta", 0.1); + model->set_model_parameter("tau", 0.1); + model->set_model_parameter("lambda", 0.2); // calculate single gene propagtaion and extinction functions model->calculate_EGb(); - // calculate joint ALE*exODT likelihood summed over all reconcilation and all tree toplopgies - - // the openMP part should be added in model.cpp in exODT_model::p( approx_posterior *) - // I guess you have to add the openMP bit into model.cpp while making sure the above number does not change - // and also make sure that the tables produed during the calculations work .. - // this can be done with the stochastic backtrace, since we fixed the random seed, the above reconciled tree should alos not change.. - // calling p(ale) after sample when currrently unactivated pragma with label //p3 is active crashes - - scalar_type t_0,t_1; + // calculate joint ALE*exODT likelihood summed over all reconcilation and all + // tree toplopgies + + // the openMP part should be added in model.cpp in exODT_model::p( + // approx_posterior *) I guess you have to add the openMP bit into model.cpp + // while making sure the above number does not change and also make sure that + // the tables produed during the calculations work .. this can be done with + // the stochastic backtrace, since we fixed the random seed, the above + // reconciled tree should alos not change.. calling p(ale) after sample when + // currrently unactivated pragma with label //p3 is active crashes + + scalar_type t_0, t_1; omp_set_num_threads(1); - cout << endl << "trying OMP_NUM_THREADS=1" << endl; - t_0=omp_get_wtime(); + cout << endl << "trying OMP_NUM_THREADS=1" << endl; + t_0 = omp_get_wtime(); cout << model->p(ale) << endl; - t_1=omp_get_wtime(); - cout << t_1-t_0 <<"s"<< endl << endl; + t_1 = omp_get_wtime(); + cout << t_1 - t_0 << "s" << endl << endl; RandomTools::setSeed(20110426); cout << model->sample() << endl; omp_set_num_threads(2); - cout << endl << "trying OMP_NUM_THREADS=2" << endl; - t_0=omp_get_wtime(); + cout << endl << "trying OMP_NUM_THREADS=2" << endl; + t_0 = omp_get_wtime(); cout << model->p(ale) << endl; - t_1=omp_get_wtime(); - cout << t_1-t_0 <<"s"<< endl << endl; + t_1 = omp_get_wtime(); + cout << t_1 - t_0 << "s" << endl << endl; RandomTools::setSeed(20110426); cout << model->sample() << endl; omp_set_num_threads(4); - cout << endl << "trying OMP_NUM_THREADS=4" << endl; - t_0=omp_get_wtime(); + cout << endl << "trying OMP_NUM_THREADS=4" << endl; + t_0 = omp_get_wtime(); cout << model->p(ale) << endl; - t_1=omp_get_wtime(); - cout << t_1-t_0 <<"s"<< endl << endl; + t_1 = omp_get_wtime(); + cout << t_1 - t_0 << "s" << endl << endl; RandomTools::setSeed(20110426); cout << model->sample() << endl; omp_set_num_threads(8); - cout << endl << "trying OMP_NUM_THREADS=8" << endl; - t_0=omp_get_wtime(); + cout << endl << "trying OMP_NUM_THREADS=8" << endl; + t_0 = omp_get_wtime(); cout << model->p(ale) << endl; - t_1=omp_get_wtime(); - cout << t_1-t_0 <<"s"<< endl << endl; + t_1 = omp_get_wtime(); + cout << t_1 - t_0 << "s" << endl << endl; RandomTools::setSeed(20110426); cout << model->sample() << endl; omp_set_num_threads(12); cout << endl << "trying OMP_NUM_THREADS=12" << endl; - t_0=omp_get_wtime(); + t_0 = omp_get_wtime(); cout << model->p(ale) << endl; - t_1=omp_get_wtime(); - cout << t_1-t_0 <<"s"<< endl << endl; + t_1 = omp_get_wtime(); + cout << t_1 - t_0 << "s" << endl << endl; RandomTools::setSeed(20110426); cout << model->sample() << endl; - - } - diff --git a/src/pairHasher.h b/src/pairHasher.h index e850961..61e2704 100644 --- a/src/pairHasher.h +++ b/src/pairHasher.h @@ -1,21 +1,16 @@ -template -inline void hash_combine(std::size_t & seed, const T & v) -{ +template inline void hash_combine(std::size_t &seed, const T &v) { std::hash hasher; seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); } -namespace std -{ - template struct hash> - { - inline size_t operator()(const pair & v) const - { - size_t seed = 0; - ::hash_combine(seed, v.first); - ::hash_combine(seed, v.second); - return seed; - } - }; -} +namespace std { +template struct hash> { + inline size_t operator()(const pair &v) const { + size_t seed = 0; + ::hash_combine(seed, v.first); + ::hash_combine(seed, v.second); + return seed; + } +}; +} // namespace std diff --git a/src/parse_maxtic.cpp b/src/parse_maxtic.cpp index 063297e..84029b4 100644 --- a/src/parse_maxtic.cpp +++ b/src/parse_maxtic.cpp @@ -3,42 +3,37 @@ using namespace std; using namespace bpp; - - -string azname(Node * node) -{ - vector left_aznames = TreeTemplateTools::getLeavesNames(*(node->getSons()[0])); - vector right_aznames = TreeTemplateTools::getLeavesNames(*(node->getSons()[1])); +string azname(Node *node) { + vector left_aznames = + TreeTemplateTools::getLeavesNames(*(node->getSons()[0])); + vector right_aznames = + TreeTemplateTools::getLeavesNames(*(node->getSons()[1])); sort(left_aznames.begin(), left_aznames.end()); sort(right_aznames.begin(), right_aznames.end()); - string azname=left_aznames[0]+"-"+right_aznames[right_aznames.size()-1]; + string azname = + left_aznames[0] + "-" + right_aznames[right_aznames.size() - 1]; return azname; } -int main(int argc, char ** argv) -{ - - ifstream tree_stream (argv[1]); - string fname=argv[1]; - - string tree; - getline (tree_stream,tree); - tree_type * T=TreeTemplateTools::parenthesisToTree(tree,true); - - vector nodes=T->getNodes(); - - - for (auto it=nodes.begin();it!=nodes.end();it++) - if ((*it)->hasFather() and not (*it)->isLeaf()) - { - scalar_type h=TreeTemplateTools::getHeight(*(*it)); - Node * node = (*it); - string name=azname(node); - int id = node->getBootstrapValue(); - cout << id << " " << name < nodes = T->getNodes(); + + for (auto it = nodes.begin(); it != nodes.end(); it++) + if ((*it)->hasFather() and not(*it)->isLeaf()) { + scalar_type h = TreeTemplateTools::getHeight(*(*it)); + Node *node = (*it); + string name = azname(node); + int id = node->getBootstrapValue(); + cout << id << " " << name << endl; + } } diff --git a/src/sample.cpp b/src/sample.cpp index 79d0109..1c4aa82 100644 --- a/src/sample.cpp +++ b/src/sample.cpp @@ -2,1055 +2,998 @@ using namespace std; using namespace bpp; // -//consider reimplemtation for clarity! +// consider reimplemtation for clarity! // -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -//(this could be made more clear) -string exODT_model::sample(bool max_rec) -{ +// The general structure of the calculation, and lot of the code, is the same as +// p(ale) cf. model.cpp. (this could be made more clear) +string exODT_model::sample(bool max_rec) { MLRec_events.clear(); Ttokens.clear(); - - //scalar_type beta=1; - scalar_type root_resum=0; - for (int rank=0;rank temp; // if (g_id!=-1) { //We are not at the root bipartition - temp = ale->id_sets.at( g_id ); + temp = ale->id_sets.at(g_id); for (int i = 0; i < ale->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[ i ] ) { + if (temp[i]) { size++; } } - - + // if ((int)(ale->id_sets[g_id].size())==1) if (size == 1) - is_a_leaf=true; + is_a_leaf = true; // } - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); } - else - { - //root bipartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - /* - for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); } - int N_parts=gp_ids.size(); - int n=time_slices[rank].size(); - //###################################################################################################################### - //######################################### INNER LOOP ################################################################# - //###################################################################################################################### - - vector sample_steps; - vector sample_ps; - scalar_type resum=0; + else { + // root bipartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ + /* + for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[gp_id] <= scalar_parameter["min_bip_count"] and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + int n = time_slices[rank].size(); + // ###################################################################################################################### + // ######################################### INNER LOOP + // ################################################################# + // ###################################################################################################################### + + vector sample_steps; + vector sample_ps; + scalar_type resum = 0; scalar_type t; - //scalar_type t_to=time_slice_times[rank][t_i]; + // scalar_type t_to=time_slice_times[rank][t_i]; - //int rank_to=rank; - //int t_i_to=t_i; - bool set_S_node=false; + // int rank_to=rank; + // int t_i_to=t_i; + bool set_S_node = false; // proceed a single "D" subslice - if(t_i>0) - { - rank=rank; - t_i-=1; + if (t_i > 0) { + rank = rank; + t_i -= 1; + } + // at boundaries + else if (rank > 0) { + if (S_node) { + ; } - // at boundaries - else if (rank>0) - { - if (S_node) - { - ; - } - //if e defines the time slice we have to look at speciaitons - else if (e==time_slices[rank][n-1]) - { - set_S_node=true; - } - else - { - rank-=1; - t_i=time_slice_times[rank].size()-1; - } + // if e defines the time slice we have to look at speciaitons + else if (e == time_slices[rank][n - 1]) { + set_S_node = true; + } else { + rank -= 1; + t_i = time_slice_times[rank].size() - 1; } - else - { - rank=-1; - t_i=-1; - if (is_a_leaf && extant_species[e]==gid_sps[g_id]) - { - resum=1; - sample_ps.push_back(1); - step step; - step.e=e; - step.ep=-1; - step.epp=-1; - step.t=0; - step.rank=0; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="0"; - sample_steps.push_back(step); - }//q[g_id][t][e]=1; - - } - if (rank>-1) - { - - t=time_slice_times[rank][t_i]; - scalar_type tpdt;//,tpdt_nl; - if ( t_i < (int)time_slice_times[rank].size()-1 ) - tpdt=time_slice_times[rank][t_i+1]; - else if (rank -1) { + + t = time_slice_times[rank][t_i]; + scalar_type tpdt; //,tpdt_nl; + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; - /* - if (scalar_parameter["event_node"]==1 and 0) - tpdt_nl=t; - else - tpdt_nl=tpdt; - */ - //root - scalar_type Delta_t=tpdt-t; - //scalar_type N=vector_parameter["N"][rank]; - scalar_type Delta_bar=vector_parameter["Delta_bar"][rank]; - //scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; - scalar_type p_Delta_bar=Delta_bar*Delta_t; - scalar_type Ebar=Ee[-1][t]; - - if(e==alpha) - { - //boundaries for branch alpha virtual branch - //boundary at present - if (t==0) - { - resum+=0; - sample_ps.push_back(0); - step step; - step.e=alpha; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="0"; - sample_steps.push_back(step); - }//q[g_id][t][alpha]=0; - //boundary between slice rank and rank-1 slice is trivial - //trivial - if (S_node )//and 0!? - { - resum=1; - if(1) - { - sample_ps.push_back(1); - step step; - step.e=alpha; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="0"; - sample_steps.push_back(step); - } - ;//q[g_id][t][e]=q[g_id][t][e]; - } - //q[g_id][t][alpha]=q[g_id][t][alpha]; - else - { - //cout << " here " <0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //scalar_type q_sum=0; - //q[g_id][t][e]=0; - - scalar_type SL_fLg=q[g_id][t][f]*Egt; - scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT - resum+=SL_fLg; - if(1) - { - sample_ps.push_back(SL_fLg); - step step; - step.e=f; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - resum+=SL_Lfg; - if(1) - { - sample_ps.push_back(SL_Lfg); - step step; - step.e=g; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - //q_sum+=SL_fLg+SL_Lfg; - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // scalar_type q_sum=0; + // q[g_id][t][e]=0; + + scalar_type SL_fLg = q[g_id][t][f] * Egt; + scalar_type SL_Lfg = q[g_id][t][g] * Eft; + // SL EVENT + resum += SL_fLg; + if (1) { + sample_ps.push_back(SL_fLg); + step step; + step.e = f; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + resum += SL_Lfg; + if (1) { + sample_ps.push_back(SL_Lfg); + step step; + step.e = g; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + // q_sum+=SL_fLg+SL_Lfg; + // q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; + // SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = q[gp_id][t][f] * q[gpp_id][t][g] * pp; + scalar_type S_ppf_pg = q[gpp_id][t][f] * q[gp_id][t][g] * pp; + // S EVENT + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; + resum += S_pf_ppg; + if (1) { + sample_ps.push_back(S_pf_ppg); + step step; + step.e = -1; + step.ep = f; + step.epp = g; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + + resum += S_ppf_pg; + if (1) { + sample_ps.push_back(S_ppf_pg); + step step; + step.e = -1; + step.ep = g; + step.epp = f; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + // q_sum+= S_pf_ppg + S_ppf_pg; + // S. + } + + // q[g_id][t][e]=q_sum; + + } + // branches that cross to next time slice + else { + // trivial + resum = 1; + if (1) { + sample_ps.push_back(1); + step step; + step.e = e; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + }; // q[g_id][t][e]=q[g_id][t][e]; + } + } + } + // boundaries for branch e. + else { + + // events within slice rank at time t on branch e + // q[g_id][tpdt][e]=0; + // scalar_type q_sum=0; + // scalar_type q_sum_nl=0; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type qpe = q[gp_id][t][e]; + scalar_type qppe = q[gpp_id][t][e]; + scalar_type Sb_pa_ppe = + p_Delta_bar * q[gp_id][t][alpha] * qppe * pp; + scalar_type Sb_pe_ppa = + p_Delta_bar * qpe * q[gpp_id][t][alpha] * pp; + // S_bar EVENT + resum += Sb_pa_ppe; + if (1) { + sample_ps.push_back(Sb_pa_ppe); + step step; + step.e = -1; + step.ep = alpha; + step.epp = e; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "Sb"; + sample_steps.push_back(step); + } + + resum += Sb_pe_ppa; + if (1) { + sample_ps.push_back(Sb_pe_ppa); + step step; + step.e = -1; + step.ep = e; + step.epp = alpha; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "Sb"; + + sample_steps.push_back(step); + } + // q_sum_nl+= Sb_pa_ppe + Sb_pe_ppa; + + // q[g_id][tpdt][e]+=p_Delta_bar*(q[gp_id][t][alpha]*q[gpp_id][t][e]+q[gp_id][t][e]*q[gpp_id][t][alpha]); + // S_bar. + + scalar_type D = 2 * p_delta_e * qpe * qppe * pp; + resum += D; + if (1) { + sample_ps.push_back(D); + step step; + step.e = -1; + step.ep = e; + step.epp = e; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "D"; + + sample_steps.push_back(step); + } + // D EVENT + // q_sum_nl+= D; + + // q[g_id][tpdt][e]+=p_delta_e*q[gp_id][t][e]*q[gpp_id][t][e]; + // D. + } + + scalar_type SLb = p_Delta_bar * Eet * q[g_id][t][alpha]; + // SL_bar EVENT + resum += SLb; + if (1) { + sample_ps.push_back(SLb); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SLb"; + sample_steps.push_back(step); + } + // q_sum_nl+=SLb; + + // q[g_id][tpdt][e]+=p_Delta_bar*Eet*q[g_id][t][alpha]; + // SL_bar. + + // q[g_id][tpdt_nl][e]+=q_sum_nl; + + scalar_type empty = Get * q[g_id][t][e]; + // 0 EVENT + resum += empty; + if (1) { + sample_ps.push_back(empty); + step step; + step.e = e; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + } + // q_sum+=empty; + + // q[g_id][tpdt][e]=Get*q[g_id][t][e]; + // 0. + + // q[g_id][tpdt][e]+=q_sum; + + // events within slice rank at time t on branch e. + } } + } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### gp_ids.clear(); gpp_ids.clear(); p_part.clear(); - if (S_node) - { - rank-=1; - t_i=time_slice_times[rank].size()-1; - S_node=false; - } - if (set_S_node) - { - S_node=true; + if (S_node) { + rank -= 1; + t_i = time_slice_times[rank].size() - 1; + S_node = false; + } + if (set_S_node) { + S_node = true; + } + int step_i = -1; + scalar_type reresum = 0; + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type max_resum = 0; + int max_i = 0; + for (int i = 0; i < (int)sample_ps.size(); i++) { + if (max_resum < sample_ps[i]) { + max_resum = sample_ps[i]; + max_i = i; } - int step_i=-1; - scalar_type reresum=0; - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type max_resum=0; - int max_i=0; - for (int i=0;i<(int)sample_ps.size();i++) - { - if (max_resumset2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << endl; - else - cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << id_ranks[back_step.e] << "\text_sp:" << extant_species[back_step.e] << "\te:" << back_step.e<< "\tg_id:" << ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << endl; + cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << + id_ranks[back_step.ep] << "\text_sp:" << extant_species[back_step.ep] << + "\te:" << back_step.ep<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << endl; else cout << back_step.event << "\t" << + back_step.rank << "\tid_rank:" << id_ranks[back_step.e] << "\text_sp:" << + extant_species[back_step.e] << "\te:" << back_step.e<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << endl; */ stringstream toptmp; - if (back_step.e==alpha) - toptmp<<-1; - else if (id_ranks[back_step.e]==0) - toptmp<id_sets[g_id]; for (int i = 0; i < ale_pointer->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { + if (temp[i]) { size++; } } // } - - if (ale_pointer->Bip_counts[g_id]>0) - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->Bip_counts[g_id],(scalar_type)scalar_parameter["min_branch_lenghts"]); - } - else - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->observations,(scalar_type)scalar_parameter["min_branch_lenghts"]); - - } + if (ale_pointer->Bip_counts[g_id] > 0) { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->Bip_counts[g_id], + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } else { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->observations, + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } - if (back_step.t==0 and size == 1 and e!=-1) - { - register_leaf(e); - stringstream branch_string; - if (scalar_parameter["leaf_events"]==1) branch_string<set2name(ale_pointer->id_sets[g_id])+branch_string.str(); - } + if (back_step.t == 0 and size == 1 and e != -1) { + register_leaf(e); + stringstream branch_string; + if (scalar_parameter["leaf_events"] == 1) + branch_string << branch_events; + branch_string << ":" << new_branch_length; + return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + + branch_string.str(); + } - if (back_step.event=="D" or back_step.event=="Tb" or back_step.event=="S" or back_step.event=="Sb") - { - - stringstream transfer_token_stream; - transfer_token_stream<<""; - stringstream branch_string; - if (back_step.event=="S") - { - register_S(e); - branch_string<< branch_events - <<"."<constructor_string <constructor_string << endl; + signal = -11; + } return "error"; } - - diff --git a/src/sample_qvec.cpp b/src/sample_qvec.cpp index 508a3b6..a9734a7 100644 --- a/src/sample_qvec.cpp +++ b/src/sample_qvec.cpp @@ -2,1048 +2,989 @@ using namespace std; using namespace bpp; // -//consider reimplemtation for clarity! +// consider reimplemtation for clarity! // -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -//(this could be made more clear) -string exODT_model::sample(bool max_rec) -{ +// The general structure of the calculation, and lot of the code, is the same as +// p(ale) cf. model.cpp. (this could be made more clear) +string exODT_model::sample(bool max_rec) { MLRec_events.clear(); Ttokens.clear(); - - //scalar_type beta=1; - scalar_type root_resum=0; - for (int rank=0;rank temp; // if (g_id!=-1) { //We are not at the root bipartition - temp = ale->id_sets.at( g_id ); + temp = ale->id_sets.at(g_id); for (int i = 0; i < ale->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[ i ] ) { + if (temp[i]) { size++; } } - - + // if ((int)(ale->id_sets[g_id].size())==1) if (size == 1) - is_a_leaf=true; + is_a_leaf = true; // } - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - //p_part is filled up CCPs - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + // p_part is filled up CCPs + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); } - else - { - //root bipartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - /* - for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - //if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - // p_part.push_back(0); - //else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); } - int N_parts=gp_ids.size(); - int n=time_slices[rank].size(); - //###################################################################################################################### - //######################################### INNER LOOP ################################################################# - //###################################################################################################################### - vector sample_steps; - vector sample_ps; - scalar_type resum=0; + else { + // root bipartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ + /* + for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + // if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not + // ale->Gamma_size<4) + // p_part.push_back(0); + // else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + int n = time_slices[rank].size(); + // ###################################################################################################################### + // ######################################### INNER LOOP + // ################################################################# + // ###################################################################################################################### + vector sample_steps; + vector sample_ps; + scalar_type resum = 0; scalar_type t; - //scalar_type t_to=time_slice_times[rank][t_i]; + // scalar_type t_to=time_slice_times[rank][t_i]; - //int rank_to=rank; - //int t_i_to=t_i; - bool set_S_node=false; + // int rank_to=rank; + // int t_i_to=t_i; + bool set_S_node = false; // proceed a single "D" subslice - if(t_i>0) - { - rank=rank; - t_i-=1; - } - // at boundaries - else if (rank>0) - { - if (S_node) - { - ; - } - //if e defines the time slice we have to look at speciaitons - else if (e==time_slices[rank][n-1]) - { - set_S_node=true; - } - else - { - rank-=1; - t_i=time_slice_times[rank].size()-1; - } + if (t_i > 0) { + rank = rank; + t_i -= 1; + } + // at boundaries + else if (rank > 0) { + if (S_node) { + ; } - else - { - rank=-1; - t_i=-1; - if (is_a_leaf && extant_species[e]==gid_sps[g_id]) - { - resum=1; - sample_ps.push_back(1); - step step; - step.e=e; - step.ep=-1; - step.epp=-1; - step.t=0; - step.rank=0; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="0"; - sample_steps.push_back(step); - }//qvec[g_id+1][rank][t_i][e]=1; - + // if e defines the time slice we have to look at speciaitons + else if (e == time_slices[rank][n - 1]) { + set_S_node = true; + } else { + rank -= 1; + t_i = time_slice_times[rank].size() - 1; } - if (rank>-1) - { - - t=time_slice_times[rank][t_i]; - scalar_type tpdt; - if ( t_i < (int)time_slice_times[rank].size()-1 ) - tpdt=time_slice_times[rank][t_i+1]; - else if (rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT - resum+=SL_fLg; - if(1) - { - sample_ps.push_back(SL_fLg); - step step; - step.e=f; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - resum+=SL_Lfg; - if(1) - { - sample_ps.push_back(SL_Lfg); - step step; - step.e=g; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - //q_sum+=SL_fLg+SL_Lfg; - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i -1) { + + t = time_slice_times[rank][t_i]; + scalar_type tpdt; + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; + + // root + scalar_type Delta_t = tpdt - t; + // scalar_type N=vector_parameter["N"][rank]; + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + // scalar_type Lambda_bar=vector_parameter["Lambda_bar"][rank]; + scalar_type p_Delta_bar = Delta_bar * Delta_t; + scalar_type Ebar = Ee[-1][t]; + + if (e == alpha) { + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) { + resum += 0; + sample_ps.push_back(0); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + } // qvec[g_id+1][rank][t_i][alpha]=0; + // boundary between slice rank and rank-1 slice is trivial + // trivial + if (S_node) // and 0!? + { + resum = 1; + if (1) { + sample_ps.push_back(1); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + }; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + else { + // cout << " here " < 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // scalar_type q_sum=0; + // qvec[g_id+1][rank][t_i][e]=0; + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT + resum += SL_fLg; + if (1) { + sample_ps.push_back(SL_fLg); + step step; + step.e = f; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + resum += SL_Lfg; + if (1) { + sample_ps.push_back(SL_Lfg); + step step; + step.e = g; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + // q_sum+=SL_fLg+SL_Lfg; + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // S EVENT + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + resum += S_pf_ppg; + if (1) { + sample_ps.push_back(S_pf_ppg); + step step; + step.e = -1; + step.ep = f; + step.epp = g; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + + resum += S_ppf_pg; + if (1) { + sample_ps.push_back(S_ppf_pg); + step step; + step.e = -1; + step.ep = g; + step.epp = f; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + // q_sum+= S_pf_ppg + S_ppf_pg; + // S. + } + + // qvec[g_id+1][rank][t_i][e]=q_sum; + + } + // branches that cross to next time slice + else { + + // trivial + resum = 1; + if (1) { + sample_ps.push_back(1); + step step; + step.e = e; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + }; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + } + } + // boundaries for branch e. + else { + + // events within slice rank at time t on branch e + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=0; + // scalar_type q_sum=0; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type qpe = qvec[gp_id + 1][rank][t_i][e]; + scalar_type qppe = qvec[gpp_id + 1][rank][t_i][e]; + scalar_type Sb_pa_ppe = + p_Delta_bar * qvec[gp_id + 1][rank][t_i][alpha] * qppe * pp; + scalar_type Sb_pe_ppa = + p_Delta_bar * qpe * qvec[gpp_id + 1][rank][t_i][alpha] * pp; + // S_bar EVENT + resum += Sb_pa_ppe; + if (1) { + sample_ps.push_back(Sb_pa_ppe); + step step; + step.e = -1; + step.ep = alpha; + step.epp = e; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "Sb"; + sample_steps.push_back(step); + } + + resum += Sb_pe_ppa; + if (1) { + sample_ps.push_back(Sb_pe_ppa); + step step; + step.e = -1; + step.ep = e; + step.epp = alpha; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "Sb"; + + sample_steps.push_back(step); + } + // q_sum+= Sb_pa_ppe + Sb_pe_ppa; + + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*(qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][e]+qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][alpha]); + // S_bar. + + scalar_type D = 2 * p_delta_e * qpe * qppe * pp; + resum += D; + if (1) { + sample_ps.push_back(D); + step step; + step.e = -1; + step.ep = e; + step.epp = e; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "D"; + + sample_steps.push_back(step); + } + // D EVENT + // q_sum+= D; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=2*p_delta_e*qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][e]; + // D. + } + scalar_type SLb = p_Delta_bar * Eet * qvec[g_id + 1][rank][t_i][alpha]; + // SL_bar EVENT + resum += SLb; + if (1) { + sample_ps.push_back(SLb); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SLb"; + sample_steps.push_back(step); + } + // q_suml+=SLb; + + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*Eet*qvec[g_id+1][rank][t_i][alpha]; + // SL_bar. + + scalar_type empty = Get * qvec[g_id + 1][rank][t_i][e]; + // 0 EVENT + resum += empty; + if (1) { + sample_ps.push_back(empty); + step step; + step.e = e; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + } + // q_sum+=empty; + + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=Get*qvec[g_id+1][rank][t_i][e]; + // 0. + + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=q_sum; + + // events within slice rank at time t on branch e. + } } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### gp_ids.clear(); gpp_ids.clear(); p_part.clear(); - if (S_node) - { - rank-=1; - t_i=time_slice_times[rank].size()-1; - S_node=false; - } - if (set_S_node) - { - S_node=true; + if (S_node) { + rank -= 1; + t_i = time_slice_times[rank].size() - 1; + S_node = false; + } + if (set_S_node) { + S_node = true; + } + int step_i = -1; + scalar_type reresum = 0; + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type max_resum = 0; + int max_i = 0; + for (int i = 0; i < (int)sample_ps.size(); i++) { + if (max_resum < sample_ps[i]) { + max_resum = sample_ps[i]; + max_i = i; } - int step_i=-1; - scalar_type reresum=0; - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type max_resum=0; - int max_i=0; - for (int i=0;i<(int)sample_ps.size();i++) - { - if (max_resumset2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << endl; - else - cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << id_ranks[back_step.e] << "\text_sp:" << extant_species[back_step.e] << "\te:" << back_step.e<< "\tg_id:" << ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << endl; + cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << + id_ranks[back_step.ep] << "\text_sp:" << extant_species[back_step.ep] << + "\te:" << back_step.ep<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << endl; else cout << back_step.event << "\t" << + back_step.rank << "\tid_rank:" << id_ranks[back_step.e] << "\text_sp:" << + extant_species[back_step.e] << "\te:" << back_step.e<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << endl; */ stringstream toptmp; - if (back_step.e==alpha) - toptmp<<-1; - else if (id_ranks[back_step.e]==0) - toptmp<id_sets[g_id]; for (int i = 0; i < ale_pointer->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { + if (temp[i]) { size++; } } // } - - if (ale_pointer->Bip_counts.count(g_id) and ale_pointer->Bip_counts[g_id]>0) - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->Bip_counts[g_id],(scalar_type)scalar_parameter["min_branch_lenghts"]); - } - else - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->observations,(scalar_type)scalar_parameter["min_branch_lenghts"]); - - } + if (ale_pointer->Bip_counts.count(g_id) and + ale_pointer->Bip_counts[g_id] > 0) { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->Bip_counts[g_id], + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } else { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->observations, + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } - if (back_step.t==0 and size == 1 and e!=-1) - { - register_leaf(e); - stringstream branch_string; - if (scalar_parameter["leaf_events"]==1) branch_string<set2name(ale_pointer->id_sets[g_id])+branch_string.str(); - } + if (back_step.t == 0 and size == 1 and e != -1) { + register_leaf(e); + stringstream branch_string; + if (scalar_parameter["leaf_events"] == 1) + branch_string << branch_events; + branch_string << ":" << new_branch_length; + return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + + branch_string.str(); + } - if (back_step.event=="D" or back_step.event=="Tb" or back_step.event=="S" or back_step.event=="Sb") - { - - stringstream transfer_token_stream; - transfer_token_stream<<""; - stringstream branch_string; - if (back_step.event=="S") - { - register_S(e); - branch_string<< branch_events - <<"."<constructor_string <constructor_string << endl; + signal = -11; + } return "error"; } - - diff --git a/src/sample_scaled.cpp b/src/sample_scaled.cpp index c631e17..e7fd333 100644 --- a/src/sample_scaled.cpp +++ b/src/sample_scaled.cpp @@ -2,92 +2,79 @@ using namespace std; using namespace bpp; // -//consider reimplemtation for clarity! +// consider reimplemtation for clarity! // -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -//(this could be made more clear) -string exODT_model::sample(bool max_rec) -{ - if (max_rec) - { - MLRec_events.clear(); - Ttokens.clear(); - } - //scalar_type beta=1; - scalar_type root_resum=0; - for (int rank=0;rank temp; // if (g_id!=-1) { //We are not at the root bipartition - temp = ale->id_sets.at( g_id ); + temp = ale->id_sets.at(g_id); for (int i = 0; i < ale->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[ i ] ) { + if (temp[i]) { size++; } } - - + // if ((int)(ale->id_sets[g_id].size())==1) if (size == 1) - is_a_leaf=true; + is_a_leaf = true; // } - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - //p_part is filled up CCPs - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + // p_part is filled up CCPs + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); } - else - { - //root bipartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - /* - for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - // gamma.clear(); - // not_gamma.clear(); - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - //if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - // p_part.push_back(0); - //else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); } - int N_parts=gp_ids.size(); - int n=time_slices[rank].size(); - //###################################################################################################################### - //######################################### INNER LOOP ################################################################# - //###################################################################################################################### - vector sample_steps; - vector sample_ps; - scalar_type resum=0; + else { + // root bipartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ + /* + for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) + if (gamma.count(*st)==0) + not_gamma.insert(*st);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + // gamma.clear(); + // not_gamma.clear(); + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + // if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not + // ale->Gamma_size<4) + // p_part.push_back(0); + // else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + int n = time_slices[rank].size(); + // ###################################################################################################################### + // ######################################### INNER LOOP + // ################################################################# + // ###################################################################################################################### + vector sample_steps; + vector sample_ps; + scalar_type resum = 0; scalar_type t; - //scalar_type t_to=time_slice_times[rank][t_i]; + // scalar_type t_to=time_slice_times[rank][t_i]; - //int rank_to=rank; - //int t_i_to=t_i; - bool set_S_node=false; + // int rank_to=rank; + // int t_i_to=t_i; + bool set_S_node = false; // proceed a single "D" subslice - if(t_i>0) - { - rank=rank; - t_i-=1; - } - // at boundaries - else if (rank>0) - { - if (S_node) - { - ; - } - //if e defines the time slice we have to look at speciaitons - else if (e==time_slices[rank][n-1]) - { - set_S_node=true; - } - else - { - rank-=1; - t_i=time_slice_times[rank].size()-1; - } + if (t_i > 0) { + rank = rank; + t_i -= 1; + } + // at boundaries + else if (rank > 0) { + if (S_node) { + ; } - else - { - rank=-1; - t_i=-1; - if (is_a_leaf && extant_species[e]==gid_sps[g_id]) - { - resum=1; - sample_ps.push_back(1); - step step; - step.e=e; - step.ep=-1; - step.epp=-1; - step.t=0; - step.rank=0; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="0"; - sample_steps.push_back(step); - }//qvec[g_id+1][rank][t_i][e]=1; - + // if e defines the time slice we have to look at speciaitons + else if (e == time_slices[rank][n - 1]) { + set_S_node = true; + } else { + rank -= 1; + t_i = time_slice_times[rank].size() - 1; } - if (rank>-1) - { - - t=time_slice_times[rank][t_i]; - scalar_type tpdt; - if ( t_i < (int)time_slice_times[rank].size()-1 ) - tpdt=time_slice_times[rank][t_i+1]; - else if (rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT - resum+=SL_fLg; - if(1) - { - sample_ps.push_back(SL_fLg); - step step; - step.e=f; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - resum+=SL_Lfg; - if(1) - { - sample_ps.push_back(SL_Lfg); - step step; - step.e=g; - step.ep=-1; - step.epp=-1; - step.t=t; - step.rank=rank; - step.g_id=g_id; - step.gp_id=-1; - step.gpp_id=-1; - step.event="SL"; - sample_steps.push_back(step); - } - - //q_sum+=SL_fLg+SL_Lfg; - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - //SL. - - //non-leaf directed partition - if (not is_a_leaf) - for (int i=0;i -1) { + + t = time_slice_times[rank][t_i]; + scalar_type tpdt; + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; + + // root + scalar_type Delta_t = tpdt - t; + // Delat_bar corresponds to \hat \sigma + scalar_type ni = time_slices[rank].size(); + scalar_type delta_avg = scalar_parameter["delta_avg"]; + scalar_type tau_avg = scalar_parameter["tau_avg"]; + scalar_type lambda_avg = scalar_parameter["lambda_avg"]; + scalar_type sigma_hat = scalar_parameter["sigma_hat"]; + scalar_type H_hat = Ee[-1][t]; + + if (e == alpha) { + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) { + resum += 0; + sample_ps.push_back(0); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + } // qvec[g_id+1][rank][t_i][alpha]=0; + // boundary between slice rank and rank-1 slice is trivial + // trivial + if (S_node) // and 0!? + { + resum = 1; + if (1) { + sample_ps.push_back(1); + step step; + step.e = alpha; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + }; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + else { + // cout << " here " < 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // scalar_type q_sum=0; + // qvec[g_id+1][rank][t_i][e]=0; + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT + resum += SL_fLg; + if (1) { + sample_ps.push_back(SL_fLg); + step step; + step.e = f; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + resum += SL_Lfg; + if (1) { + sample_ps.push_back(SL_Lfg); + step step; + step.e = g; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "SL"; + sample_steps.push_back(step); + } + + // q_sum+=SL_fLg+SL_Lfg; + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; SL. + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // cout << "S: " << S_pf_ppg << " " << S_ppf_pg << " " << endl; + // cout << rank<< " " << t_i << " " << + // time_slice_times[rank][t_i] << endl; S EVENT + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + resum += S_pf_ppg; + if (1) { + sample_ps.push_back(S_pf_ppg); + step step; + step.e = -1; + step.ep = f; + step.epp = g; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + + resum += S_ppf_pg; + if (1) { + sample_ps.push_back(S_ppf_pg); + step step; + step.e = -1; + step.ep = g; + step.epp = f; + step.t = t; + step.rank = rank; + step.g_id = -1; + step.gp_id = gp_id; + step.gpp_id = gpp_id; + step.event = "S"; + sample_steps.push_back(step); + } + // q_sum+= S_pf_ppg + S_ppf_pg; + // S. + } + + // qvec[g_id+1][rank][t_i][e]=q_sum; + + } + // branches that cross to next time slice + else { + + // trivial + resum = 1; + if (1) { + sample_ps.push_back(1); + step step; + step.e = e; + step.ep = -1; + step.epp = -1; + step.t = t; + step.rank = rank; + step.g_id = g_id; + step.gp_id = -1; + step.gpp_id = -1; + step.event = "0"; + sample_steps.push_back(step); + }; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + } + } + // boundaries for branch e. + else { + + // events within slice rank at time t on branch e + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=0; + // scalar_type q_sum=0; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + + long int gp_id = gp_ids[i]; + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type qpe = qvec[gp_id + 1][rank][t_i][e]; + scalar_type qppe = qvec[gpp_id + 1][rank][t_i][e]; + scalar_type Sb_pa_ppe = sigma_hat * Delta_t * + qvec[gp_id + 1][rank][t_i][alpha] * qppe * + pp; + scalar_type Sb_pe_ppa = sigma_hat * Delta_t * qpe * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + // cout << "Sb: " << + // sigma_hat*Delta_t<<"*"<set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << " " << time_slice_times[back_step.rank][t_i]<< endl; - else - cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << id_ranks[back_step.e] << "\text_sp:" << extant_species[back_step.e] << "\te:" << back_step.e<< "\tg_id:" << ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << " t_i:" << t_i << " " << time_slice_times[back_step.rank][t_i]<< endl; + cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << + id_ranks[back_step.ep] << "\text_sp:" << extant_species[back_step.ep] << + "\te:" << back_step.ep<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << " " << time_slice_times[back_step.rank][t_i]<< endl; else + cout << back_step.event << "\t" << back_step.rank << "\tid_rank:" << + id_ranks[back_step.e] << "\text_sp:" << extant_species[back_step.e] << + "\te:" << back_step.e<< "\tg_id:" << + ale_pointer->set2name(ale_pointer->id_sets[g_id]) << "\t" << back_step.t << + " t_i:" << t_i << " " << time_slice_times[back_step.rank][t_i]<< endl; */ stringstream toptmp; - if (back_step.e==alpha) - toptmp<<-1; - else if (id_ranks[back_step.e]==0) - toptmp<id_sets[g_id]; for (int i = 0; i < ale_pointer->Gamma_size + 1; ++i) { // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { + if (temp[i]) { size++; } } // } - - if (ale_pointer->Bip_counts.count(g_id) and ale_pointer->Bip_counts[g_id]>0) - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->Bip_counts[g_id],(scalar_type)scalar_parameter["min_branch_lenghts"]); - } - else - { - new_branch_length=max(ale_pointer->Bip_bls[g_id]/ale_pointer->observations,(scalar_type)scalar_parameter["min_branch_lenghts"]); - - } + if (ale_pointer->Bip_counts.count(g_id) and + ale_pointer->Bip_counts[g_id] > 0) { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->Bip_counts[g_id], + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } else { + new_branch_length = + max(ale_pointer->Bip_bls[g_id] / ale_pointer->observations, + (scalar_type)scalar_parameter["min_branch_lenghts"]); + } - if (back_step.t==0 and size == 1 and e!=-1) - { - //cout <<"c+P "<< back_step.t << " " << 0 << endl; + if (back_step.t == 0 and size == 1 and e != -1) { + // cout <<"c+P "<< back_step.t << " " << 0 << endl; - register_leaf(e); - stringstream branch_string; - if (scalar_parameter["leaf_events"]==1) branch_string<PRESENT"); - gid_times[g_id].push_back(0); - gid_branches[g_id].push_back(e); - gid_gidp[g_id].push_back(g_id); - gid_gidpp[g_id].push_back(g_id); + gid_events[g_id].push_back(">PRESENT"); + gid_times[g_id].push_back(0); + gid_branches[g_id].push_back(e); + gid_gidp[g_id].push_back(g_id); + gid_gidpp[g_id].push_back(g_id); - return ale_pointer->set2name(ale_pointer->id_sets[g_id])+branch_string.str(); - } + return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + + branch_string.str(); + } + + if (back_step.event == "D" or back_step.event == "Tb" or + back_step.event == "S" or back_step.event == "Sb") { + + stringstream transfer_token_stream; + transfer_token_stream << ""; + stringstream branch_string; + if (back_step.event == "S") { + // cout <<"c+S "<< back_step.t << " " << 1 << endl; - if (back_step.event=="D" or back_step.event=="Tb" or back_step.event=="S" or back_step.event=="Sb") - { - - stringstream transfer_token_stream; - transfer_token_stream<<""; - stringstream branch_string; - if (back_step.event=="S") - { - //cout <<"c+S "<< back_step.t << " " << 1 << endl; - - register_S(e); - - gid_events[g_id].push_back(">S"); - gid_times[g_id].push_back(t); - gid_branches[g_id].push_back(e); - gid_gidp[g_id].push_back(back_step.gp_id); - gid_gidpp[g_id].push_back(back_step.gpp_id); - - gid_events[back_step.gp_id].push_back("S<"); - gid_times[back_step.gp_id].push_back(t); - gid_branches[back_step.gp_id].push_back(back_step.ep); - gid_gidp[back_step.gp_id].push_back(back_step.gp_id); - gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); - - gid_events[back_step.gpp_id].push_back("S<"); - gid_times[back_step.gpp_id].push_back(t); - gid_branches[back_step.gpp_id].push_back(back_step.epp); - gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); - gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); - - branch_string<< branch_events - <<"."<T"); - gid_times[g_id].push_back(t); - gid_branches[g_id].push_back(e); - - - if (back_step.ep==alpha) - { - this_e=back_step.epp; - this_gid=back_step.gpp_id; - - gid_gidp[g_id].push_back(back_step.gpp_id); - gid_gidpp[g_id].push_back(back_step.gp_id); - - gid_events[back_step.gp_id].push_back("Tto<"); - gid_times[back_step.gp_id].push_back(t); - gid_branches[back_step.gp_id].push_back(back_step.ep); - gid_gidp[back_step.gp_id].push_back(back_step.gp_id); - gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); - - gid_events[back_step.gpp_id].push_back("Tfrom<"); - gid_times[back_step.gpp_id].push_back(t); - gid_branches[back_step.gpp_id].push_back(back_step.epp); - gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); - gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); - - } - else - { - this_e=back_step.ep; - this_gid=back_step.gp_id; - - gid_gidp[g_id].push_back(back_step.gp_id); - gid_gidpp[g_id].push_back(back_step.gpp_id); - - gid_events[back_step.gp_id].push_back("Tfrom<"); - gid_times[back_step.gp_id].push_back(t); - gid_branches[back_step.gp_id].push_back(back_step.ep); - gid_gidp[back_step.gp_id].push_back(back_step.gp_id); - gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); - - gid_events[back_step.gpp_id].push_back("Tto<"); - gid_times[back_step.gpp_id].push_back(t); - gid_branches[back_step.gpp_id].push_back(back_step.epp); - gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); - gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); - - } - - stringstream named_branch; - if (this_e==alpha) - named_branch<<-1; - else if (id_ranks[this_e]==0) - named_branch<"+tmp.str()); - // Tto - - branch_string<< branch_events<Sfrom"); - gid_times[g_id].push_back(t); - gid_branches[g_id].push_back(this_e); - - // Tfrom - stringstream named_branch; - if (this_e==alpha) - named_branch<<-1; - else if (id_ranks[this_e]==0) - named_branch<D"); - gid_times[g_id].push_back(t); - gid_branches[g_id].push_back(e); - gid_gidp[g_id].push_back(back_step.gp_id); - gid_gidpp[g_id].push_back(back_step.gpp_id); - - gid_events[back_step.gp_id].push_back("Dto<"); - gid_times[back_step.gp_id].push_back(t); - gid_branches[back_step.gp_id].push_back(back_step.ep); - gid_gidp[back_step.gp_id].push_back(back_step.gp_id); - gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); - - gid_events[back_step.gpp_id].push_back("Dfrom<"); - gid_times[back_step.gpp_id].push_back(t); - gid_branches[back_step.gpp_id].push_back(back_step.epp); - gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); - gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); - - - Dtoken_stream << "D|" << rank << "|" <S"); + gid_times[g_id].push_back(t); + gid_branches[g_id].push_back(e); + gid_gidp[g_id].push_back(back_step.gp_id); + gid_gidpp[g_id].push_back(back_step.gpp_id); + + gid_events[back_step.gp_id].push_back("S<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("S<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + + branch_string << branch_events << "." << id_ranks[e] << ":" + << max(new_branch_length, (scalar_type)0.0); + } else { + if (back_step.event == "Tb") { + // cout <<"c+Tb "<< back_step.t << " " << 1 << endl; + + int this_e, this_gid; + + gid_events[g_id].push_back(">T"); + gid_times[g_id].push_back(t); + gid_branches[g_id].push_back(e); + + if (back_step.ep == alpha) { + this_e = back_step.epp; + this_gid = back_step.gpp_id; + + gid_gidp[g_id].push_back(back_step.gpp_id); + gid_gidpp[g_id].push_back(back_step.gp_id); + + gid_events[back_step.gp_id].push_back("Tto<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("Tfrom<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + + } else { + this_e = back_step.ep; + this_gid = back_step.gp_id; + + gid_gidp[g_id].push_back(back_step.gp_id); + gid_gidpp[g_id].push_back(back_step.gpp_id); + + gid_events[back_step.gp_id].push_back("Tfrom<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("Tto<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + } + + stringstream named_branch; + if (this_e == alpha) + named_branch << -1; + else if (id_ranks[this_e] == 0) + named_branch << extant_species[this_e]; + else + named_branch << id_ranks[this_e]; + // Tto + register_Tto(this_e); + + stringstream tmp; + tmp << back_step.rank << "|" << t << "|" << named_branch.str() << "|" + << this_gid; + register_Ttoken(transfer_token + ">" + tmp.str()); + // Tto + + branch_string << branch_events << back_step.event << "@" + << back_step.rank << "|" << named_branch.str() << ":" + << max(new_branch_length, (scalar_type)0.0); + } else if (back_step.event == "Sb") { + int this_e; + if (back_step.ep == alpha) { + this_e = back_step.epp; + + gid_gidp[g_id].push_back(back_step.gpp_id); + gid_gidpp[g_id].push_back(back_step.gp_id); + + gid_events[back_step.gp_id].push_back("Sto<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("Sfrom<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + + } else { + this_e = back_step.ep; + + gid_gidp[g_id].push_back(back_step.gp_id); + gid_gidpp[g_id].push_back(back_step.gpp_id); + + gid_events[back_step.gp_id].push_back("Sfrom<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("Sto<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + } + // Tfrom + register_Tfrom(this_e); + + gid_events[g_id].push_back(">Sfrom"); + gid_times[g_id].push_back(t); + gid_branches[g_id].push_back(this_e); + + // Tfrom + stringstream named_branch; + if (this_e == alpha) + named_branch << -1; + else if (id_ranks[this_e] == 0) + named_branch << extant_species[this_e]; + else + named_branch << id_ranks[this_e]; + if (transfer_token != "") + transfer_token_stream << transfer_token; + else + transfer_token_stream << "T|" << rank << "|" << t << "|" + << named_branch.str() << "|" << g_id; + + branch_string << branch_events << "T@" << rank << "|" + << named_branch.str() << ":" + << max(new_branch_length, (scalar_type)0.0); + } else { + // cout <<"c+D "<< back_step.t << " " << 1 << endl; + register_D(e); + stringstream Dtoken_stream; + stringstream named_branch; + if (e == alpha) + named_branch << -1; + else if (id_ranks[e] == 0) + named_branch << extant_species[e]; + else + named_branch << id_ranks[e]; + + gid_events[g_id].push_back(">D"); + gid_times[g_id].push_back(t); + gid_branches[g_id].push_back(e); + gid_gidp[g_id].push_back(back_step.gp_id); + gid_gidpp[g_id].push_back(back_step.gpp_id); + + gid_events[back_step.gp_id].push_back("Dto<"); + gid_times[back_step.gp_id].push_back(t); + gid_branches[back_step.gp_id].push_back(back_step.ep); + gid_gidp[back_step.gp_id].push_back(back_step.gp_id); + gid_gidpp[back_step.gp_id].push_back(back_step.gp_id); + + gid_events[back_step.gpp_id].push_back("Dfrom<"); + gid_times[back_step.gpp_id].push_back(t); + gid_branches[back_step.gpp_id].push_back(back_step.epp); + gid_gidp[back_step.gpp_id].push_back(back_step.gpp_id); + gid_gidpp[back_step.gpp_id].push_back(back_step.gpp_id); + + Dtoken_stream << "D|" << rank << "|" << named_branch.str() << "|" + << g_id; + register_Ttoken(Dtoken_stream.str()); + + branch_string << branch_events << back_step.event << "@" << rank << "|" + << named_branch.str() << ":" + << max(new_branch_length, (scalar_type)0.0); + } } - else if ( back_step.event=="TLb" or back_step.event=="SL" or back_step.event=="SLb" or back_step.event=="0") - { - - stringstream branch_string; - stringstream transfer_token_stream; - transfer_token_stream <<""; - - branch_string<< branch_events; - if (back_step.event!="0") - { - if (back_step.event=="SL") - { - t_i=time_slice_times[rank].size()-1; - register_S(e); - int f=daughters[e][0]; - int g=daughters[e][1]; - if (back_step.e==f) - register_L(g); - else - register_L(f); - branch_string<<"." - <"+tmp.str()); - transfer_token=""; - - branch_string<<"" - <<"@"<" + tmp.str()); + transfer_token = ""; + + branch_string << "" + << "@" << back_step.rank << "|" << named_branch.str(); + } else if (back_step.event == "SLb") { + // cout <<"c+TLb "<< back_step.t << " " << -1 << endl; + register_L(e); + register_Tfrom(e); + + gid_events[g_id].push_back("SLb"); + gid_times[g_id].push_back(t); + gid_branches[g_id].push_back(e); + gid_gidp[g_id].push_back(g_id); + gid_gidpp[g_id].push_back(g_id); + + stringstream named_branch; + if (e == alpha) + named_branch << -1; + else if (id_ranks[e] == 0) + named_branch << extant_species[e]; + else + named_branch << id_ranks[e]; + + transfer_token_stream << "T|" << rank << "|" << t << "|" + << named_branch.str() << "|" << g_id; + + branch_string << ".T" + << "@" << rank << "|" << named_branch.str(); + } + } } - else - { - cout << "error " <constructor_string <constructor_string << endl; + signal = -11; + } return "error"; } - - diff --git a/src/simulateSpAndGeneTrees.cpp b/src/simulateSpAndGeneTrees.cpp index 43f3463..50ca387 100644 --- a/src/simulateSpAndGeneTrees.cpp +++ b/src/simulateSpAndGeneTrees.cpp @@ -1,127 +1,139 @@ #include "exODT.h" #include - -#include #include +#include #include - #include "exODT_sim.h" using namespace std; using namespace bpp; - - -unsigned int good_seed() -{ - unsigned int random_seed, random_seed_a, random_seed_b; - std::ifstream file ("/dev/random", std::ios::binary); - if (file.is_open()) - { - char * memblock; - int size = sizeof(int); - memblock = new char [size]; - file.read (memblock, size); - file.close(); - random_seed_a = long(memblock); - delete[] memblock; - }// end if - else - { - random_seed_a = 0; - } - random_seed_b = std::time(0); - random_seed = random_seed_a xor random_seed_b; - return random_seed; +unsigned int good_seed() { + unsigned int random_seed, random_seed_a, random_seed_b; + std::ifstream file("/dev/random", std::ios::binary); + if (file.is_open()) { + char *memblock; + int size = sizeof(int); + memblock = new char[size]; + file.read(memblock, size); + file.close(); + random_seed_a = long(memblock); + delete[] memblock; + } // end if + else { + random_seed_a = 0; + } + random_seed_b = std::time(0); + random_seed = random_seed_a xor random_seed_b; + return random_seed; } // end good_seed() +int main(int args, char **argv) { -int main(int args, char ** argv) -{ + BppApplication simulateSpAndGeneTrees(args, argv, "STRALE"); + simulateSpAndGeneTrees.startTimer(); - BppApplication simulateSpAndGeneTrees(args, argv, "STRALE"); - simulateSpAndGeneTrees.startTimer(); + size_t N = ApplicationTools::getIntParameter( + "population.size", simulateSpAndGeneTrees.getParams(), 1000); + size_t n = ApplicationTools::getIntParameter( + "number.of.species", simulateSpAndGeneTrees.getParams(), 10); + size_t N_g = ApplicationTools::getIntParameter( + "number.of.genes", simulateSpAndGeneTrees.getParams(), 100); - size_t N = ApplicationTools::getIntParameter("population.size",simulateSpAndGeneTrees.getParams(),1000); - size_t n = ApplicationTools::getIntParameter("number.of.species",simulateSpAndGeneTrees.getParams(),10); - size_t N_g = ApplicationTools::getIntParameter("number.of.genes",simulateSpAndGeneTrees.getParams(),100); + double delta = ApplicationTools::getDoubleParameter( + "delta", simulateSpAndGeneTrees.getParams(), 0.01); // 0.20; + double tau = ApplicationTools::getDoubleParameter( + "tau", simulateSpAndGeneTrees.getParams(), 0.01); // 0.31; + double lambda = ApplicationTools::getDoubleParameter( + "lambda", simulateSpAndGeneTrees.getParams(), 0.1); // 0.39; - double delta = ApplicationTools::getDoubleParameter("delta", simulateSpAndGeneTrees.getParams(), 0.01 );//0.20; - double tau = ApplicationTools::getDoubleParameter("tau", simulateSpAndGeneTrees.getParams(), 0.01 );//0.31; - double lambda = ApplicationTools::getDoubleParameter("lambda", simulateSpAndGeneTrees.getParams(), 0.1 );//0.39; + // If a seed is given + int seed = ApplicationTools::getIntParameter( + "seed", simulateSpAndGeneTrees.getParams(), 0); + if (seed != 0) { + RandomTools::setSeed(seed); + } - //If a seed is given - int seed = ApplicationTools::getIntParameter("seed", simulateSpAndGeneTrees.getParams(), 0 ); - if (seed != 0 ) { - RandomTools::setSeed ( seed ) ; - } + exODT_sim *simulation = new exODT_sim(N, seed); - exODT_sim* simulation=new exODT_sim(N, seed); + // We then have to sample n species and get back a newick string of the + // represented phylogeny: -//We then have to sample n species and get back a newick string of the represented phylogeny: - -string Sstring=simulation->sample_species(n); + string Sstring = simulation->sample_species(n); stringstream fnameS; - fnameS << "S" << ".tree"; - ofstream sp_out( fnameS.str().c_str() ); + fnameS << "S" + << ".tree"; + ofstream sp_out(fnameS.str().c_str()); sp_out << Sstring << ";" << endl; sp_out.close(); -//We can after sampling ask for a vector of simulated gene tree newick strings + // We can after sampling ask for a vector of simulated gene tree newick + // strings -vector Gstrings=simulation->simulate_gene_trees(2*N_g, delta, tau, lambda, 0, false, seed); + vector Gstrings = simulation->simulate_gene_trees( + 2 * N_g, delta, tau, lambda, 0, false, seed); -while (Gstrings.size() < N_g) { - size_t todo = N_g - Gstrings.size() ; - vector Gstrings2=simulation->simulate_gene_trees(2*todo, delta, tau, lambda, 0, false, seed); - VectorTools::append(Gstrings, Gstrings2); -} + while (Gstrings.size() < N_g) { + size_t todo = N_g - Gstrings.size(); + vector Gstrings2 = simulation->simulate_gene_trees( + 2 * todo, delta, tau, lambda, 0, false, seed); + VectorTools::append(Gstrings, Gstrings2); + } - for (size_t i = 0; i + Gstrings=simulation->simulate_gene_trees(N_g,delta,tau,omega=0,only_root=false,G_seed=-1); + + N_g: number of gene families present at the start of the simulation; + delta: D rate; + tau: T rate; + lambda: L rate; + omega: uniform origination rate over the complete phylogeny; + only_root: if true N_g gene families are present in the single species + from which all n sampled species descend; if false N_g genes are present in + each of the N species at the start of the simulation; G_Seed: if =-1 it is + taken from /dev/random.. ; + + The gene trees are time-like, i.e. are ultrametric and have branch lengths in + coalescent units. The DTL events generating the family are not currently + recorded. Doing this properly probably increases the complexity of the + simulation. The current complexity is something like N^2 x (number of genes) + in both time and space. I tested the a few things like extinction probability + etc. that I could calculate analytically, so the simulation is probably + correct. + */ -//exODT_sim* simulation=new exODT_sim(N,S_seed); - -//The parameters of the complete phylogeny simulation are the following: - -/*exODT_sim* simulation=new exODT_sim(N,S_seed=-1,init_t=2); - - N: the number of species; - S_seed: if =-1 it is taken from /dev/random.. ; - init_t: the amount of time from the beginning of the simulation until the end in coalescent units. -*/ -//The parameters of the gene tree simulation are: - -/*vector Gstrings=simulation->simulate_gene_trees(N_g,delta,tau,omega=0,only_root=false,G_seed=-1); - - N_g: number of gene families present at the start of the simulation; - delta: D rate; - tau: T rate; - lambda: L rate; - omega: uniform origination rate over the complete phylogeny; - only_root: if true N_g gene families are present in the single species from which all n sampled species descend; if false N_g genes are present in each of the N species at the start of the simulation; - G_Seed: if =-1 it is taken from /dev/random.. ; - -The gene trees are time-like, i.e. are ultrametric and have branch lengths in coalescent units. The DTL events generating the family are not currently recorded. Doing this properly probably increases the complexity of the simulation. The current complexity is something like N^2 x (number of genes) in both time and space. I tested the a few things like extinction probability etc. that I could calculate analytically, so the simulation is probably correct. -*/ - - return 1; - - } diff --git a/src/simulation.cpp b/src/simulation.cpp index 2134cfb..28ad4da 100644 --- a/src/simulation.cpp +++ b/src/simulation.cpp @@ -3,563 +3,560 @@ using namespace std; using namespace bpp; -unsigned int good_seed() -{ - unsigned int random_seed, random_seed_a, random_seed_b; - std::ifstream file ("/dev/random", std::ios::binary); - if (file.is_open()) - { - char * memblock; - int size = sizeof(int); - memblock = new char [size]; - file.read (memblock, size); - file.close(); - random_seed_a = long(memblock); - delete[] memblock; - }// end if - else - { - random_seed_a = 0; - } - random_seed_b = std::time(0); - random_seed = random_seed_a xor random_seed_b; - return random_seed; +unsigned int good_seed() { + unsigned int random_seed, random_seed_a, random_seed_b; + std::ifstream file("/dev/random", std::ios::binary); + if (file.is_open()) { + char *memblock; + int size = sizeof(int); + memblock = new char[size]; + file.read(memblock, size); + file.close(); + random_seed_a = long(memblock); + delete[] memblock; + } // end if + else { + random_seed_a = 0; + } + random_seed_b = std::time(0); + random_seed = random_seed_a xor random_seed_b; + return random_seed; } // end good_seed() - -int main(int argc, char ** argv) -{ - - //simulate - cout << " First run: \n ./simulation N n"<< endl; - cout << " In this command line, N corresponds to the total number of species, including extinct ones; "<< endl; - cout << " n corresponds to the number of extant species only. "<< endl; - cout << " This will produce a species tree in two files S_XXX.tree and R_XXX.tree. "<< endl; - cout << " In the two files the tree is the same, but in R_XXX.tree the leaves have been renamed. "<< endl; - cout << " XXX here corresponds to the species seed. "<< endl; - cout << " if you like the species tree then remember the species seed S_seed and run: "<< endl; +int main(int argc, char **argv) { + + // simulate + cout << " First run: \n ./simulation N n" << endl; + cout << " In this command line, N corresponds to the total number of " + "species, including extinct ones; " + << endl; + cout << " n corresponds to the number of extant species only. " << endl; + cout << " This will produce a species tree in two files S_XXX.tree and " + "R_XXX.tree. " + << endl; + cout << " In the two files the tree is the same, but in R_XXX.tree the " + "leaves have been renamed. " + << endl; + cout << " XXX here corresponds to the species seed. " << endl; + cout << " if you like the species tree then remember the species seed " + "S_seed and run: " + << endl; cout << " ./simulate N n S_seed omega delta tau lambda " << endl; - cout << " where omega delta tau lambda are the origination at the root, duplication, transfer and loss rate parameters." << endl; - cout << " Do this many times to get a lot of gene trees... "<< endl; - if (argc<3) return 1; - long int S_seed=good_seed(); - if (argc>3) S_seed=atol(argv[3]); - long int G_seed=good_seed(); - //if (argc>2) G_seed=atol(argv[2]); - - - int N=atoi(argv[1]); - int n=atoi(argv[2]); - - scalar_type omega=N*0.; - //O rate overall - scalar_type delta=0; - //D rate per gene - scalar_type tau=0; - //T rate per gene - scalar_type lambda=0; - //L rate per gene - int G_n=0; - if (argc>4) - { - G_n=1; - omega=N*atof(argv[4]); - //O rate overall - delta=atof(argv[5]); - //D rate per gene - tau=atof(argv[6]); - //T rate per gene - lambda=atof(argv[7]); - //L rate per gene - cout << G_n << endl; - } - scalar_type init_t=2; - scalar_type sigma=N; + cout << " where omega delta tau lambda are the origination at the root, " + "duplication, transfer and loss rate parameters." + << endl; + cout << " Do this many times to get a lot of gene trees... " << endl; + if (argc < 3) + return 1; + long int S_seed = good_seed(); + if (argc > 3) + S_seed = atol(argv[3]); + long int G_seed = good_seed(); + // if (argc>2) G_seed=atol(argv[2]); + + int N = atoi(argv[1]); + int n = atoi(argv[2]); + + scalar_type omega = N * 0.; + // O rate overall + scalar_type delta = 0; + // D rate per gene + scalar_type tau = 0; + // T rate per gene + scalar_type lambda = 0; + // L rate per gene + int G_n = 0; + if (argc > 4) { + G_n = 1; + omega = N * atof(argv[4]); + // O rate overall + delta = atof(argv[5]); + // D rate per gene + tau = atof(argv[6]); + // T rate per gene + lambda = atof(argv[7]); + // L rate per gene + cout << G_n << endl; + } + scalar_type init_t = 2; + scalar_type sigma = N; cout << "# Species seed is : " << S_seed << endl; - cout << "# Genes seed is : " << G_seed << endl; + cout << "# Genes seed is : " << G_seed << endl; RandomTools::setSeed(S_seed); - vector population; - long int next_index=0; - for (int i=0;i population; + long int next_index = 0; + for (int i = 0; i < N; i++) { + population.push_back(i); + next_index = i; + } next_index++; - //we record history - vector< vector< long int > > families; - map event_times; - - vector < int > births; - vector < int > deaths; - - - scalar_type t=init_t; - long long species_event=0; - - int Ds=0; - int Ts=0; - int Ls=0; - int Os=0; - int Ss=0; - - while (1) - { - scalar_type t_next=RandomTools::randExponential(1./(sigma*N)); - t-=t_next; - if (t<0) break; - - int death=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(N); - deaths.push_back(death); - - int birth=death; - while (birth==death) birth=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(N); - births.push_back(birth); - - vector family; - long int mother=population[birth]; - long int daugther=next_index; - next_index++; - long int son=next_index; - next_index++; - family.push_back(mother); - family.push_back(daugther); - family.push_back(son); - - population[birth]=daugther; - population[death]=son; - - families.push_back(family); - event_times[species_event]=t; - - species_event++; - - } - long long number_of_species_events=species_event; - //sample + // we record history + vector> families; + map event_times; + + vector births; + vector deaths; + + scalar_type t = init_t; + long long species_event = 0; + + int Ds = 0; + int Ts = 0; + int Ls = 0; + int Os = 0; + int Ss = 0; + + while (1) { + scalar_type t_next = RandomTools::randExponential(1. / (sigma * N)); + t -= t_next; + if (t < 0) + break; + + int death = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(N); + deaths.push_back(death); + + int birth = death; + while (birth == death) + birth = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(N); + births.push_back(birth); + + vector family; + long int mother = population[birth]; + long int daugther = next_index; + next_index++; + long int son = next_index; + next_index++; + family.push_back(mother); + family.push_back(daugther); + family.push_back(son); + + population[birth] = daugther; + population[death] = son; + + families.push_back(family); + event_times[species_event] = t; + + species_event++; + } + long long number_of_species_events = species_event; + // sample - vector population_indicies; - for (int i=0;i population_indicies; + for (int i = 0; i < N; i++) + population_indicies.push_back(i); - vector sampled_population_indicies; - for (int i=0;i sampled_population_indicies; + for (int i = 0; i < n; i++) + sampled_population_indicies.push_back(-1); - RandomTools::getSample(population_indicies,sampled_population_indicies); + RandomTools::getSample(population_indicies, sampled_population_indicies); vector sampled_population; - for (int i=0;i sampled_population_counts; - map strings; - map age; - for (int i=0;i sampled_population_counts; + map strings; + map age; + for (int i = 0; i < n; i++) { + long int extant_species = sampled_population[i]; + stringstream extant_species_name; + extant_species_name << i; // extant_species; + strings[extant_species] = extant_species_name.str(); + sampled_population_counts[extant_species] = 1; + age[extant_species] = 0; + } long int lca; - int rank=0; - for(vector >::reverse_iterator event=families.rbegin();event!=families.rend();event++) - { - scalar_type t_event=event_times[species_event-1]; - long int mother=(*event)[0]; - long int daugther=(*event)[1]; - long int son=(*event)[2]; - if (sampled_population_counts[daugther]==1 and sampled_population_counts[son]==1) - { - rank++; - sampled_population_counts[daugther]=0; - sampled_population_counts[son]=0; - sampled_population_counts[mother]=1; - stringstream sons_bl; - stringstream daugthers_bl; - stringstream rank_bs; - rank_bs << rank; - sons_bl << t_event - age[son]; - daugthers_bl << t_event - age[daugther]; - strings[mother]="("+strings[daugther]+":"+daugthers_bl.str()+","+strings[son]+":"+sons_bl.str()+")"+rank_bs.str(); - age[mother]=t_event; - lca=mother; - } - else if (sampled_population_counts[daugther]==1) - { - sampled_population_counts[daugther]=0; - sampled_population_counts[mother]=1; - strings[mother]=strings[daugther]; - age[mother]=age[daugther]; - - } - else if (sampled_population_counts[son]==1) - { - sampled_population_counts[son]=0; - sampled_population_counts[mother]=1; - strings[mother]=strings[son]; - age[mother]=age[son]; - } - species_event--; + int rank = 0; + for (vector>::reverse_iterator event = families.rbegin(); + event != families.rend(); event++) { + scalar_type t_event = event_times[species_event - 1]; + long int mother = (*event)[0]; + long int daugther = (*event)[1]; + long int son = (*event)[2]; + if (sampled_population_counts[daugther] == 1 and + sampled_population_counts[son] == 1) { + rank++; + sampled_population_counts[daugther] = 0; + sampled_population_counts[son] = 0; + sampled_population_counts[mother] = 1; + stringstream sons_bl; + stringstream daugthers_bl; + stringstream rank_bs; + rank_bs << rank; + sons_bl << t_event - age[son]; + daugthers_bl << t_event - age[daugther]; + strings[mother] = "(" + strings[daugther] + ":" + daugthers_bl.str() + + "," + strings[son] + ":" + sons_bl.str() + ")" + + rank_bs.str(); + age[mother] = t_event; + lca = mother; + } else if (sampled_population_counts[daugther] == 1) { + sampled_population_counts[daugther] = 0; + sampled_population_counts[mother] = 1; + strings[mother] = strings[daugther]; + age[mother] = age[daugther]; + + } else if (sampled_population_counts[son] == 1) { + sampled_population_counts[son] = 0; + sampled_population_counts[mother] = 1; + strings[mother] = strings[son]; + age[mother] = age[son]; } + species_event--; + } stringstream root_bl; - root_bl< random_tree_population; - map random_tree_ages; - for (int i=0;i random_tree_ages; + for (int i = 0; i < n; i++) { stringstream tmp; - tmp<1) - { - int Nr=random_tree_population.size(); - scalar_type t_next=RandomTools::randExponential(1./(2*Nr)); - t+=t_next; - int i=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - int j=i; - while (i==j) j=RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); - stringstream tmp; - tmp<<"("<i) - { - random_tree_population.erase(random_tree_population.begin()+j); - random_tree_population.erase(random_tree_population.begin()+i); - } - else - { - random_tree_population.erase(random_tree_population.begin()+i); - random_tree_population.erase(random_tree_population.begin()+j); - } - random_tree_population.push_back(tmp.str()); + t = 0; + while (random_tree_population.size() > 1) { + int Nr = random_tree_population.size(); + scalar_type t_next = RandomTools::randExponential(1. / (2 * Nr)); + t += t_next; + int i = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + int j = i; + while (i == j) + j = RandomTools::giveIntRandomNumberBetweenZeroAndEntry(Nr); + stringstream tmp; + tmp << "(" << random_tree_population[i] << ":" + << t - random_tree_ages[random_tree_population[i]] << "," + << random_tree_population[j] << ":" + << t - random_tree_ages[random_tree_population[j]] << ")"; + random_tree_ages[tmp.str()] = t; + if (j > i) { + random_tree_population.erase(random_tree_population.begin() + j); + random_tree_population.erase(random_tree_population.begin() + i); + } else { + random_tree_population.erase(random_tree_population.begin() + i); + random_tree_population.erase(random_tree_population.begin() + j); } - r_out << random_tree_population[0]<<";" << endl; + random_tree_population.push_back(tmp.str()); + } + r_out << random_tree_population[0] << ";" << endl; // genes RandomTools::setSeed(G_seed); - //we seed genes .. this part is trvilally parallelizable.. - long int gene_count=0; - map > population_of_genes; - for (int i=0;i genes_in_species_i; - //we could have more genes per species .. - for (int j=0;j> population_of_genes; + for (int i = 0; i < N; i++) { + vector genes_in_species_i; + // we could have more genes per species .. + for (int j = 0; j < G_n; j++) { + genes_in_species_i.push_back(gene_count); + gene_count++; + } + population_of_genes[i] = genes_in_species_i; + }; //..we replay species history - species_event=0; - //and record gene stories.. - long int next_gene=gene_count; - vector > > gene_families; - map gene_event_times; - map gene_event_types; - - t=init_t; - species_event=0; + species_event = 0; + // and record gene stories.. + long int next_gene = gene_count; + vector>> gene_families; + map gene_event_times; + map gene_event_types; + + t = init_t; + species_event = 0; cout << "#gene stories .." << endl; - long long gene_event=0; - boost::progress_display show_progress( number_of_species_events ); - while(species_event t_species_event) - { - t-=t_next; - - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(rate_sum); - if (r family; - long int father=population_of_genes[species][gene]; - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - - vector < vector > fam_vec; - fam_vec.push_back(family); - gene_families.push_back(fam_vec); - gene_event++; - - gene_event_times[father]=t; - gene_event_types[father]="D"; - - population_of_genes[species][gene]=daugther; - population_of_genes[species].push_back(son); - gene_count++; - // cout << "D."< family; - long int father=population_of_genes[species][gene]; - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - vector < vector > fam_vec; - fam_vec.push_back(family); - gene_families.push_back(fam_vec); - gene_event++; - - gene_event_times[father]=t; - gene_event_types[father]="T"; - - population_of_genes[species][gene]=daugther; - population_of_genes[T_to].push_back(son); - gene_count++; - //cout << "T."< genes_in_daugther_species; - vector genes_in_son_species; - vector < vector > fam_vec; - - for (vector ::iterator gene=population_of_genes[births[species_event]].begin();gene!=population_of_genes[births[species_event]].end();gene++) - { - vector family; - long int father=(*gene); - family.push_back(father); - long int daugther=next_gene; - next_gene++; - family.push_back(daugther); - long int son=next_gene; - next_gene++; - family.push_back(son); - fam_vec.push_back(family); - - genes_in_daugther_species.push_back(daugther); - genes_in_son_species.push_back(son); - gene_count++; - gene_event_types[father]="S"; - gene_event_times[father]=t; - - } - - gene_count-=population_of_genes[deaths[species_event]].size(); - - population_of_genes[births[species_event]].clear(); - population_of_genes[births[species_event]]=genes_in_daugther_species; - population_of_genes[deaths[species_event]]=genes_in_son_species; - - //cout << "S."< t_species_event) { + t -= t_next; + + scalar_type r = + RandomTools::giveRandomNumberBetweenZeroAndEntry(rate_sum); + if (r < gene_count * delta + gene_count * tau + gene_count * lambda) { + int gene = -1; + int species = -1; + + long int gene_r = + RandomTools::giveIntRandomNumberBetweenZeroAndEntry(gene_count); + long int gene_re_count = 0; + gene_re_count = 0; + for (int i = 0; i < N; i++) { + if (gene_r < + gene_re_count + (long int)population_of_genes[i].size()) { + gene = (gene_re_count + (long int)population_of_genes[i].size() - + gene_r) - + 1; + species = i; + break; + } + gene_re_count += population_of_genes[i].size(); + } + + if (r < gene_count * delta) + // D + { + // cout << "D"< family; + long int father = population_of_genes[species][gene]; + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + + vector> fam_vec; + fam_vec.push_back(family); + gene_families.push_back(fam_vec); + gene_event++; + + gene_event_times[father] = t; + gene_event_types[father] = "D"; + + population_of_genes[species][gene] = daugther; + population_of_genes[species].push_back(son); + gene_count++; + // cout << "D."< family; + long int father = population_of_genes[species][gene]; + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + vector> fam_vec; + fam_vec.push_back(family); + gene_families.push_back(fam_vec); + gene_event++; + + gene_event_times[father] = t; + gene_event_types[father] = "T"; + + population_of_genes[species][gene] = daugther; + population_of_genes[T_to].push_back(son); + gene_count++; + // cout << "T."< genes_in_daugther_species; + vector genes_in_son_species; + vector> fam_vec; + + for (vector::iterator gene = + population_of_genes[births[species_event]].begin(); + gene != population_of_genes[births[species_event]].end(); gene++) { + vector family; + long int father = (*gene); + family.push_back(father); + long int daugther = next_gene; + next_gene++; + family.push_back(daugther); + long int son = next_gene; + next_gene++; + family.push_back(son); + fam_vec.push_back(family); + + genes_in_daugther_species.push_back(daugther); + genes_in_son_species.push_back(son); + gene_count++; + gene_event_types[father] = "S"; + gene_event_times[father] = t; + } - cout << "#gene simulation ends "<< Ds << " Ds; "<< Ts << " Ts; " << Ls << " Ls; " << Os <<" Os; "<< Ss << " Ss."< sampled_gene_counts; - map gene_strings; - map gene_age; - int j=0; - for (int i=0;i::iterator git=population_of_genes[ sampled_i ].begin();git!=population_of_genes[ sampled_i ].end();git++) - { - long int extant_gene=(*git); - stringstream extant_gene_name; - extant_gene_name << strings[extant_species] << "_" << extant_gene; - //extant_gene_name << sampled_i << "_" << j;// extant_species << "_" << extant_gene; - gene_strings[extant_gene]=extant_gene_name.str(); - sampled_gene_counts[extant_gene]=1; - gene_age[extant_gene]=0; - j++; - } - } - int gene_rank=0; + // cout << "S."< suffix; - for(vector > > ::reverse_iterator event_vec=gene_families.rbegin();event_vec!=gene_families.rend();event_vec++) - { + // XX oh my, what a waste of time .. + gene_families.push_back(fam_vec); + gene_event++; - for(vector< vector > ::iterator event=(*event_vec).begin();event!=(*event_vec).end();event++) - { - - long int father=(*event)[0]; - long int daugther=(*event)[1]; - long int son=(*event)[2]; - scalar_type t_event=gene_event_times[father]; - - if (sampled_gene_counts[daugther]==1 and sampled_gene_counts[son]==1) - { - gene_rank++; - sampled_gene_counts[daugther]=0; - sampled_gene_counts[son]=0; - sampled_gene_counts[father]=1; - stringstream sons_bl; - stringstream daugthers_bl; - stringstream gene_rank_bs; - gene_rank_bs << gene_event_types[father] << suffix[son] << suffix[daugther]; - suffix[father]=""; - sons_bl << t_event - gene_age[son]; - daugthers_bl << t_event - gene_age[daugther]; - gene_strings[father]="("+gene_strings[daugther]+":"+daugthers_bl.str()+","+gene_strings[son]+":"+sons_bl.str()+")"+gene_rank_bs.str(); - gene_age[father]=t_event; - //lca=father; - } - else if (sampled_gene_counts[daugther]==1) - { - sampled_gene_counts[daugther]=0; - sampled_gene_counts[father]=1; - gene_strings[father]=gene_strings[daugther]; - suffix[father]=suffix[daugther]; - gene_age[father]=gene_age[daugther]; - - } - else if (sampled_gene_counts[son]==1) - { - sampled_gene_counts[son]=0; - sampled_gene_counts[father]=1; - gene_strings[father]=gene_strings[son]; - suffix[father]=suffix[son]; - if (gene_event_types[father]=="T") suffix[father]+="T"; - gene_age[father]=gene_age[son]; - } - } - gene_event--; - ++show_trace_progress; + species_event++; + ++show_progress; } + } - int i=0; - for (map::iterator git=sampled_gene_counts.begin(); git!=sampled_gene_counts.end(); git++) - if ((*git).second==1 and gene_strings[(*git).first].find("(")!=string::npos ) - { - stringstream fname; - fname << "G_" << S_seed << "_" << G_seed << "_" << i << ".tree"; - ofstream gene_out( fname.str().c_str() ); - gene_out << gene_strings[(*git).first] << ";" << endl; - i++; + cout << "#gene simulation ends " << Ds << " Ds; " << Ts << " Ts; " << Ls + << " Ls; " << Os << " Os; " << Ss << " Ss." << endl; + + // traceback gene stories + map sampled_gene_counts; + map gene_strings; + map gene_age; + int j = 0; + for (int i = 0; i < n; i++) { + int sampled_i = sampled_population_indicies[i]; + long int extant_species = sampled_population[i]; + if (extant_species == population[sampled_i]) + for (vector::iterator git = + population_of_genes[sampled_i].begin(); + git != population_of_genes[sampled_i].end(); git++) { + long int extant_gene = (*git); + stringstream extant_gene_name; + extant_gene_name << strings[extant_species] << "_" << extant_gene; + // extant_gene_name << sampled_i << "_" << j;// extant_species << "_" << + // extant_gene; + gene_strings[extant_gene] = extant_gene_name.str(); + sampled_gene_counts[extant_gene] = 1; + gene_age[extant_gene] = 0; + j++; } + } + int gene_rank = 0; + + cout << "#traceback begins.." << endl; + boost::progress_display show_trace_progress(gene_event); + map suffix; + for (vector>>::reverse_iterator event_vec = + gene_families.rbegin(); + event_vec != gene_families.rend(); event_vec++) { + + for (vector>::iterator event = (*event_vec).begin(); + event != (*event_vec).end(); event++) { + + long int father = (*event)[0]; + long int daugther = (*event)[1]; + long int son = (*event)[2]; + scalar_type t_event = gene_event_times[father]; + + if (sampled_gene_counts[daugther] == 1 and + sampled_gene_counts[son] == 1) { + gene_rank++; + sampled_gene_counts[daugther] = 0; + sampled_gene_counts[son] = 0; + sampled_gene_counts[father] = 1; + stringstream sons_bl; + stringstream daugthers_bl; + stringstream gene_rank_bs; + gene_rank_bs << gene_event_types[father] << suffix[son] + << suffix[daugther]; + suffix[father] = ""; + sons_bl << t_event - gene_age[son]; + daugthers_bl << t_event - gene_age[daugther]; + gene_strings[father] = "(" + gene_strings[daugther] + ":" + + daugthers_bl.str() + "," + gene_strings[son] + + ":" + sons_bl.str() + ")" + gene_rank_bs.str(); + gene_age[father] = t_event; + // lca=father; + } else if (sampled_gene_counts[daugther] == 1) { + sampled_gene_counts[daugther] = 0; + sampled_gene_counts[father] = 1; + gene_strings[father] = gene_strings[daugther]; + suffix[father] = suffix[daugther]; + gene_age[father] = gene_age[daugther]; + + } else if (sampled_gene_counts[son] == 1) { + sampled_gene_counts[son] = 0; + sampled_gene_counts[father] = 1; + gene_strings[father] = gene_strings[son]; + suffix[father] = suffix[son]; + if (gene_event_types[father] == "T") + suffix[father] += "T"; + gene_age[father] = gene_age[son]; + } + } + gene_event--; + ++show_trace_progress; + } + int i = 0; + for (map::iterator git = sampled_gene_counts.begin(); + git != sampled_gene_counts.end(); git++) + if ((*git).second == 1 and + gene_strings[(*git).first].find("(") != string::npos) { + stringstream fname; + fname << "G_" << S_seed << "_" << G_seed << "_" << i << ".tree"; + ofstream gene_out(fname.str().c_str()); + gene_out << gene_strings[(*git).first] << ";" << endl; + i++; + } return 1; - - } diff --git a/src/test.cpp b/src/test.cpp index ebd0c90..3c48c3c 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -2,123 +2,125 @@ #include "exODT_sim.h" #include "ALE_util.h" -#include #include +#include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree +int main(int argc, char **argv) { + // we need a species tree - string sname=argv[1]; + string sname = argv[1]; string Sstring; - ifstream file_stream (sname.c_str()); - getline (file_stream,Sstring); + ifstream file_stream(sname.c_str()); + getline(file_stream, Sstring); - string ale_file=argv[2]; - approx_posterior * ale; - ale=load_ALE_from_file(ale_file); + string ale_file = argv[2]; + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); - exODT_model* model=new exODT_model(); + exODT_model *model = new exODT_model(); - //XX - //XX + // XX + // XX - //exODT_sim* simulation=new exODT_sim(100,1010); - - scalar_type delta=atof(argv[3]); - scalar_type tau=atof(argv[4]); - scalar_type lambda=atof(argv[5]); + // exODT_sim* simulation=new exODT_sim(100,1010); - //simulation->sample_species(10); + scalar_type delta = atof(argv[3]); + scalar_type tau = atof(argv[4]); + scalar_type lambda = atof(argv[5]); - - //for (vector::iterator it=simulation->gene_trees.begin();it!=simulation->gene_trees.end();it++) - //cout << (*it) << endl; + // simulation->sample_species(10); - //cout << simulation->S_string << endl; + // for (vector::iterator + // it=simulation->gene_trees.begin();it!=simulation->gene_trees.end();it++) + // cout << (*it) << endl; - model->set_model_parameter("min_D",3); - model->set_model_parameter("grid_delta_t",0.005); - model->set_model_parameter("DD",10); + // cout << simulation->S_string << endl; + + model->set_model_parameter("min_D", 3); + model->set_model_parameter("grid_delta_t", 0.005); + model->set_model_parameter("DD", 10); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("delta",delta); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - model->set_model_parameter("leaf_events",1); + model->set_model_parameter("leaf_events", 1); model->calculate_EGb(); cout << model->p(ale) << endl; - cout << ".."< res = model->p_MLRec(ale); + pair res = model->p_MLRec(ale); cout << res.first <MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <MLRec_events["D"] << "\t" << + model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << + model->MLRec_events["S"] < sample_trees; - string outname=ale_file+".samples"; - ofstream fout( outname.c_str() ); - string outname2=ale_file+".Ttokens"; - ofstream fout2( outname2.c_str() ); - - int subsamples=atoi(argv[6]); - boost::progress_display pd( subsamples ); - - for (int i=0;isample(false); - fout << sample_tree << endl; - for (vector::iterator it=model->Ttokens.begin();it!=model->Ttokens.end();it++) fout2<< i << " " <<(*it)< leaves = G->getLeaves(); - for (vector::iterator it=leaves.begin();it!=leaves.end();it++ ) - { - string name=(*it)->getName(); - vector tokens; - boost::split(tokens,name,boost::is_any_of(".@"),boost::token_compress_on); - (*it)->setName(tokens[0]); - tokens.clear(); - } - leaves.clear(); - sample_trees.push_back(G); + vector sample_trees; + string outname = ale_file + ".samples"; + ofstream fout(outname.c_str()); + string outname2 = ale_file + ".Ttokens"; + ofstream fout2(outname2.c_str()); + + int subsamples = atoi(argv[6]); + boost::progress_display pd(subsamples); + + for (int i = 0; i < subsamples; i++) { + ++pd; + string sample_tree = model->sample(false); + fout << sample_tree << endl; + for (vector::iterator it = model->Ttokens.begin(); + it != model->Ttokens.end(); it++) + fout2 << i << " " << (*it) << endl; + + tree_type *G = TreeTemplateTools::parenthesisToTree(sample_tree, false); + vector leaves = G->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + vector tokens; + boost::split(tokens, name, boost::is_any_of(".@"), + boost::token_compress_on); + (*it)->setName(tokens[0]); + tokens.clear(); } - + leaves.clear(); + sample_trees.push_back(G); + } + cout << model->counts_string(); - cout << "Os" <show_counts("Os"); - cout << "Ds" <show_counts("Ds"); - cout << "Ts" <show_counts("Ts"); - cout << "Ts from" <show_counts("Tfroms"); - cout << "Ls" <show_counts("Ls"); - cout << "copies" <show_counts("copies"); - Tree* con_tree= TreeTools::thresholdConsensus(sample_trees,0.5); - TreeTools::computeBootstrapValues(*con_tree,sample_trees); + Tree *con_tree = TreeTools::thresholdConsensus(sample_trees, 0.5); + TreeTools::computeBootstrapValues(*con_tree, sample_trees); cout << endl; - cout << "thcon: "< mpp_res=sale->mpp_tree(); Tree* mpp_T = TreeTemplateTools::parenthesisToTree(mpp_res.first,false); @@ -131,15 +133,11 @@ int main(int argc, char ** argv) approx_posterior * cale=observe_ALE_from_string(con_str); */ - pair res = model->p_MLRec(ale); + pair res = model->p_MLRec(ale); cout << endl; - cout << "ML: "<< endl; + cout << "ML: " << endl; cout << res.first << endl; cout << endl; - return 1; - - } - diff --git a/src/test_simpleML.cpp b/src/test_simpleML.cpp index f844fe6..0ae8996 100644 --- a/src/test_simpleML.cpp +++ b/src/test_simpleML.cpp @@ -1,62 +1,63 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" #include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - //we need a species tree - string sname=argv[1]; +int main(int argc, char **argv) { + // we need a species tree + string sname = argv[1]; string Sstring; - ifstream file_stream (sname.c_str()); - getline (file_stream,Sstring); - //we need an ale - string ale_name=argv[2]; - approx_posterior * ale=load_ALE_from_file(ale_name); - - - cout << "Read species tree from: " << argv[1] <<".."<observations<<" trees from: " << ale_name <<".. with :" << ale->count_trees() << " possible amalgamations .." << endl << endl ; - - cout << "and the most liely tree is.."<< endl; + ifstream file_stream(sname.c_str()); + getline(file_stream, Sstring); + // we need an ale + string ale_name = argv[2]; + approx_posterior *ale = load_ALE_from_file(ale_name); + + cout << "Read species tree from: " << argv[1] << ".." << endl; + // we need an .ale file containing observed conditional clade probabilities + // cf. ALEobserve + cout << "Read summary of tree sample for " << ale->observations + << " trees from: " << ale_name << ".. with :" << ale->count_trees() + << " possible amalgamations .." << endl + << endl; + + cout << "and the most liely tree is.." << endl; // initilaize the exODT model using some initial DTL rates - exODT_model* model=new exODT_model(); - int D=1; // this is the simplest parsimony like setting of one event node per slice.. - model->set_model_parameter("D",D); + exODT_model *model = new exODT_model(); + int D = 1; // this is the simplest parsimony like setting of one event node + // per slice.. + model->set_model_parameter("D", D); model->construct(Sstring); - scalar_type delta=0.01; - scalar_type tau=0.01; - scalar_type lambda=0.01; - model->set_model_parameter("delta",delta); - model->set_model_parameter("tau",tau); - model->set_model_parameter("lambda",lambda); + scalar_type delta = 0.01; + scalar_type tau = 0.01; + scalar_type lambda = 0.01; + model->set_model_parameter("delta", delta); + model->set_model_parameter("tau", tau); + model->set_model_parameter("lambda", lambda); - // likelihood only calculation, the function E and G in the paper.. + // likelihood only calculation, the function E and G in the paper.. model->calculate_EGb(); - - //find ML reconciled gene tree: - pair res = model->p_MLRec(ale); - //output tree + + // find ML reconciled gene tree: + pair res = model->p_MLRec(ale); + // output tree cout << res.first << endl; - //output number of events - cout << "# of\t Duplications\tTransfers\tLosses\tSpeciations" <MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; - //ususally one needs to run instead of model->p_MLRec(ale) the below: + // ususally one needs to run instead of model->p_MLRec(ale) the below: - //model->calculate_EGb(); - //cout << model->p(ale) << endl; + // model->calculate_EGb(); + // cout << model->p(ale) << endl; - //so maybe makes more sense to compare to that .. + // so maybe makes more sense to compare to that .. return 1; - - } - diff --git a/src/times.cpp b/src/times.cpp index 4186478..956ab9a 100644 --- a/src/times.cpp +++ b/src/times.cpp @@ -1,136 +1,136 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; - -class p_fun: - public virtual Function, - public AbstractParametrizable -{ + +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - exODT_model* model_pointer; - approx_posterior* ale_pointer; -public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1//,double sigma_hat_start=1. -) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; - //addParameter_( new Parameter("sigma_hat", sigma_hat_start, constraint) ) ; + exODT_model *model_pointer; + approx_posterior *ale_pointer; +public: + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, + double lambda_start = 0.1 //,double sigma_hat_start=1. + ) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); + // addParameter_( new Parameter("sigma_hat", sigma_hat_start, constraint) ) + // ; } - - p_fun* clone() const { return new p_fun(*this); } - + + p_fun *clone() const { return new p_fun(*this); } + public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { - matchParametersValues(pl); - } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { - double delta = getParameterValue("delta"); - double tau = getParameterValue("tau"); - double lambda = getParameterValue("lambda"); - //double sigma_hat = getParameterValue("sigma_hat"); - - model_pointer->set_model_parameter("delta",delta); - model_pointer->set_model_parameter("tau",tau); - model_pointer->set_model_parameter("lambda",lambda); - //model_pointer->set_model_parameter("sigma_hat",sigma_hat); - model_pointer->calculate_EGb(); - double y=-log(model_pointer->p(ale_pointer)); - //cout <set_model_parameter("delta", delta); + model_pointer->set_model_parameter("tau", tau); + model_pointer->set_model_parameter("lambda", lambda); + // model_pointer->set_model_parameter("sigma_hat",sigma_hat); + model_pointer->calculate_EGb(); + double y = -log(model_pointer->p(ale_pointer)); + // cout <observations<<" trees from: " << ale_file <<".."<3) + ifstream file_stream_S(argv[1]); + getline(file_stream_S, Sstring); + cout << "Read species tree from: " << argv[1] << ".." << endl; + // we need an .ale file containing observed conditional clade probabilities + // cf. ALEobserve + string ale_file = argv[2]; + approx_posterior *ale; + ale = load_ALE_from_file(ale_file); + cout << "Read summary of tree sample for " << ale->observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + int D = 3; + if (argc > 3) model->set_model_parameter("gene_name_separators", argv[3]); - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); - model->set_model_parameter("min_D",D); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", D); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); model->calculate_EGb(); - tree_type * T1=TreeTemplateTools::parenthesisToTree(ale->constructor_string,false); - vector nodes1=T1->getLeaves(); - map names; - for (vector ::iterator it=nodes1.begin();it!=nodes1.end();it++) - { - vector tokens; - string name=(*it)->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - names[tokens[0]]+=1; - } - + tree_type *T1 = + TreeTemplateTools::parenthesisToTree(ale->constructor_string, false); + vector nodes1 = T1->getLeaves(); + map names; + for (vector::iterator it = nodes1.begin(); it != nodes1.end(); it++) { + vector tokens; + string name = (*it)->getName(); + boost::split(tokens, name, boost::is_any_of("_"), boost::token_compress_on); + names[tokens[0]] += 1; + } + cout << T1->getNumberOfLeaves() << " " << names.size() << endl; - boost::timer * t = new boost::timer(); - string outname=ale_file+".times"; - ofstream fout( outname.c_str() ); + boost::timer *t = new boost::timer(); + string outname = ale_file + ".times"; + ofstream fout(outname.c_str()); model->p(ale); fout << t->elapsed() << "\t"; fout << ale->Dip_counts.size() << "\t"; - fout << T1->getNumberOfLeaves() << "\t"; - fout << names.size() << "\t"; + fout << T1->getNumberOfLeaves() << "\t"; + fout << names.size() << "\t"; fout << ale_file << endl; - + return 0; } - diff --git a/src/times_undated.cpp b/src/times_undated.cpp index 8404854..ebb8d81 100644 --- a/src/times_undated.cpp +++ b/src/times_undated.cpp @@ -1,92 +1,89 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ +int main(int argc, char **argv) { - - //we need a dated species tree in newick format + // we need a dated species tree in newick format string Sstring; - ifstream file_stream_S (argv[1]); - getline (file_stream_S,Sstring); - cout << "Read species tree from: " << argv[1] <<".."<observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; - //we initialise a coarse grained reconciliation model for calculating the sum - exODT_model* model=new exODT_model(); + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); - if (argc>3) + if (argc > 3) model->set_model_parameter("gene_name_separators", argv[3]); - model->set_model_parameter("BOOT_STRAP_LABLES","yes"); + model->set_model_parameter("BOOT_STRAP_LABLES", "yes"); model->construct_undated(Sstring); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->calculate_undatedEs(); - int leaves=1; - map names; - if (ale->constructor_string.find("(")!=ale->constructor_string.npos) - { - tree_type * T1=TreeTemplateTools::parenthesisToTree(ale->constructor_string,false); - vector nodes1=T1->getLeaves(); - for (vector ::iterator it=nodes1.begin();it!=nodes1.end();it++) - { - vector tokens; - string name=(*it)->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - names[tokens[0]]+=1; - } - leaves=T1->getNumberOfLeaves(); + int leaves = 1; + map names; + if (ale->constructor_string.find("(") != ale->constructor_string.npos) { + tree_type *T1 = + TreeTemplateTools::parenthesisToTree(ale->constructor_string, false); + vector nodes1 = T1->getLeaves(); + for (vector::iterator it = nodes1.begin(); it != nodes1.end(); + it++) { + vector tokens; + string name = (*it)->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + names[tokens[0]] += 1; } - else - { - vector tokens; - boost::split(tokens,ale->constructor_string,boost::is_any_of(","),boost::token_compress_on); - for (vector ::iterator it=tokens.begin();it!=tokens.end();it++) - { - vector tokens2; - string name=(*it); - boost::split(tokens2,name,boost::is_any_of("_"),boost::token_compress_on); - names[tokens2[0]]+=1; - leaves+=1; - } + leaves = T1->getNumberOfLeaves(); + } else { + vector tokens; + boost::split(tokens, ale->constructor_string, boost::is_any_of(","), + boost::token_compress_on); + for (vector::iterator it = tokens.begin(); it != tokens.end(); + it++) { + vector tokens2; + string name = (*it); + boost::split(tokens2, name, boost::is_any_of("_"), + boost::token_compress_on); + names[tokens2[0]] += 1; + leaves += 1; } - + } - boost::timer * t = new boost::timer(); - string outname=ale_file+".utimes"; - ofstream fout( outname.c_str() ); - scalar_type times=100; + boost::timer *t = new boost::timer(); + string outname = ale_file + ".utimes"; + ofstream fout(outname.c_str()); + scalar_type times = 100; scalar_type ll; - - for (int i=0;i<100;i++) - ll=model->pun(ale); - fout << t->elapsed()/times << "\t"; + + for (int i = 0; i < 100; i++) + ll = model->pun(ale); + fout << t->elapsed() / times << "\t"; fout << ale->Dip_counts.size() << "\t"; fout << leaves << "\t"; - fout << names.size() << "\t"; + fout << names.size() << "\t"; fout << ale_file; //<< "\t"; fout << endl; - //fout << ll << endl; - + // fout << ll << endl; + return 0; } - diff --git a/src/traceback.cpp b/src/traceback.cpp index debc843..f5510c9 100644 --- a/src/traceback.cpp +++ b/src/traceback.cpp @@ -2,778 +2,745 @@ using namespace std; using namespace bpp; -//High memory usage lowmem=false traceback is deprecated! -//The current lowmem=true method uses sample(true) cf. sample.cpp. -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -pair exODT_model::p_MLRec(approx_posterior *ale, bool lowmem) -{ - ale_pointer=ale; - //cout << "start" << endl; - //iterate over directed patitions (i.e. clades) ordered by the number of leaves - //cout << "start loop" << endl; - - //test - //long int tmp_g_id=-1; - //cout << ale->set2name(ale->id_sets[tmp_g_id]) < g_ids;//del-loc - vector g_id_sizes;//del-loc - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root biprartitino needs to be handled seperatly +// High memory usage lowmem=false traceback is deprecated! +// The current lowmem=true method uses sample(true) cf. sample.cpp. +// The general structure of the calculation, and lot of the code, is the same as +// p(ale) cf. model.cpp. +pair exODT_model::p_MLRec(approx_posterior *ale, + bool lowmem) { + ale_pointer = ale; + // cout << "start" << endl; + // iterate over directed patitions (i.e. clades) ordered by the number of + // leaves cout << "start loop" << endl; + + // test + // long int tmp_g_id=-1; + // cout << ale->set2name(ale->id_sets[tmp_g_id]) < g_ids; // del-loc + vector g_id_sizes; // del-loc + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root biprartitino needs to be handled seperatly g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); // // gene<->species mapping - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; - for (int rank=0;rank temp = ale->id_sets[g_id]; - for (auto i = 0; i < ale->Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[ i ] ) { - id = i; - break; - } + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + scalar_type t = time_slice_times[rank][t_i]; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + q[g_id][t][e] = 0; } - string gene_name=ale->id_leaves[ id ]; -// string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - - bool found=false; - for (map::iterator tmpit=extant_species.begin();tmpit!=extant_species.end();tmpit++) - if ((*tmpit).second==species_name) found = true; - if (not found) cout << species_name << endl; - - gid_sps[g_id]=species_name; - } + q[g_id][t][alpha] = 0; + } } - - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - } + + if (g_id_sizes[i] == 1) { + int id = 0; + boost::dynamic_bitset<> temp = ale->id_sets[g_id]; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit ( temp, i) ) { + if (temp[i]) { + id = i; + break; + } + } + string gene_name = ale->id_leaves[id]; + // string gene_name=ale->id_leaves[(* + //(ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; else - { - //root biprartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; -/* for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; + species_name = tokens[(int)scalar_parameter["species_field"]]; + + bool found = false; + for (map::iterator tmpit = extant_species.begin(); + tmpit != extant_species.end(); tmpit++) + if ((*tmpit).second == species_name) + found = true; + if (not found) + cout << species_name << endl; + + gid_sps[g_id] = species_name; + } + } + + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } + } + else { + // root biprartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /* for (set::iterator + st=ale->Gamma.begin();st!=ale->Gamma.end();st++) if + (gamma.count(*st)==0) not_gamma.insert(*st);*/ + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; + } + BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + /* gamma.clear(); + not_gamma.clear();*/ + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[gp_id] <= scalar_parameter["min_bip_count"] and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt, tpdt_nl; + // if ( t_i < scalar_parameter["D"]-1 ) + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; + + if (scalar_parameter["event_node"] == 1 and 0) + tpdt_nl = t; + else + tpdt_nl = tpdt; + + // root + scalar_type Delta_t = tpdt - t; + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + scalar_type p_Delta_bar = Delta_bar * Delta_t; + + scalar_type Ebar = Ee[-1][t]; + ; + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) + q[g_id][t][e] = 1; + else + q[g_id][t][e] = 0; } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - /* gamma.clear(); - not_gamma.clear();*/ - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //sum scalar_type q_sum=0; - //q[g_id][t][e]=0; - //max - scalar_type max_term=0; - step max_step; - //max - - scalar_type SL_fLg=q[g_id][t][f]*Egt; - scalar_type SL_Lfg=q[g_id][t][g]*Eft; - //SL EVENT - // q_sum+=SL_fLg+SL_Lfg; - //q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; - //SL. - //max - if (max_term 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // sum scalar_type q_sum=0; + // q[g_id][t][e]=0; + // max + scalar_type max_term = 0; + step max_step; + // max + + scalar_type SL_fLg = q[g_id][t][f] * Egt; + scalar_type SL_Lfg = q[g_id][t][g] * Eft; + // SL EVENT + // q_sum+=SL_fLg+SL_Lfg; + // q[g_id][t][e]=q[g_id][t][f]*Egt + q[g_id][t][g]*Eft; + // SL. + // max + if (max_term < SL_fLg) { + max_term = SL_fLg; + max_step.e = f; + max_step.ep = -1; + max_step.epp = -1; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = g_id; + max_step.gp_id = -1; + max_step.gpp_id = -1; + max_step.event = "SL"; + } + if (max_term < SL_Lfg) { + max_term = SL_Lfg; + max_step.e = g; + max_step.ep = -1; + max_step.epp = -1; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = g_id; + max_step.gp_id = -1; + max_step.gpp_id = -1; + max_step.event = "SL"; + } + // max + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type S_pf_ppg = + q[gp_id][t][f] * q[gpp_id][t][g] * pp; + scalar_type S_ppf_pg = + q[gpp_id][t][f] * q[gp_id][t][g] * pp; + // S EVENT + // q[g_id][t][e]+=q[gp_id][t][f]*q[gpp_id][t][g] + // +q[gpp_id][t][f]*q[gp_id][t][g]; sum q_sum+= S_pf_ppg + + // S_ppf_pg; S. max + if (max_term < S_pf_ppg) { + max_term = S_pf_ppg; + max_step.e = -1; + max_step.ep = f; + max_step.epp = g; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = -1; + max_step.gp_id = gp_id; + max_step.gpp_id = gpp_id; + max_step.event = "S"; + } + if (max_term < S_ppf_pg) { + max_term = S_ppf_pg; + max_step.e = -1; + max_step.ep = g; + max_step.epp = f; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = -1; + max_step.gp_id = gp_id; + max_step.gpp_id = gpp_id; + max_step.event = "S"; + } + // max + } + + // sum q[g_id][t][e]=q_sum; + q[g_id][t][e] = max_term; + if (not lowmem) + q_step[g_id][t][e] = max_step; + } + // branches that cross to next time slice + else { + // trivial + ; // q[g_id][t][e]=q[g_id][t][e]; + } + } + // boundaries for branch e. + } + } + if (1) { + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) + q[g_id][t][alpha] = 0; + // boundary between slice rank and rank-1 slice is trivial + ; // q[g_id][t][alpha]=q[g_id][t][alpha]; + // boundaries for branch alpha virtual branch. + + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = Ge[-1][t]; + q[g_id][tpdt][alpha] = 0; + // sum scalar_type q_sum=0; + // sum scalar_type q_sum_nl=0; + // max + scalar_type max_term = 0; + scalar_type max_term_nl = 0; + step max_step; + step max_step_nl; + // max + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type T_ep_app = + p_Ntau_e * q[gp_id][t][e] * q[gpp_id][t][alpha] * pp; + scalar_type T_ap_epp = + p_Ntau_e * q[gp_id][t][alpha] * q[gpp_id][t][e] * pp; + // Tb EVENT + // sum q_sum_nl+=T_ep_app+T_ap_epp; + // q[g_id][tpdt][alpha]+=p_Ntau_e*(q[gp_id][t][e]*q[gpp_id][t][alpha]+q[gp_id][t][alpha]*q[gpp_id][t][e]); + // Tb. + // max + if (max_term_nl < T_ep_app) { + max_term_nl = T_ep_app; + max_step_nl.e = -1; + max_step_nl.ep = e; + max_step_nl.epp = alpha; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "Tb"; + } + if (max_term_nl < T_ap_epp) { + max_term_nl = T_ap_epp; + max_step_nl.e = -1; + max_step_nl.ep = alpha; + max_step_nl.epp = e; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "Tb"; + } + // max + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type Sb = p_Delta_bar * + (2 * q[gp_id][t][alpha] * q[gpp_id][t][alpha]) * + pp; + // S_bar EVENT + // sum q_sum_nl+=Sb; + // q[g_id][tpdt][alpha]+=p_Delta_bar*(2*q[gp_id][t][alpha]*q[gpp_id][t][alpha]); + // S_bar. + // max + if (max_term_nl < Sb) { + max_term_nl = Sb; + max_step_nl.e = -1; + max_step_nl.ep = alpha; + max_step_nl.epp = alpha; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "Sb"; + } + // max + } + + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = p_Ntau_e * Ebar * q[g_id][t][e]; + // TL_bar EVENT + // sum q_sum+=TLb; + // q[g_id][tpdt][alpha]+=p_Ntau_e*Ebar*q[g_id][t][e]; + // TL_bar. + // max + if (max_term_nl < TLb) { + max_term_nl = TLb; + max_step_nl.e = e; + max_step_nl.ep = -1; + max_step_nl.epp = -1; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = g_id; + max_step_nl.gp_id = -1; + max_step_nl.gpp_id = -1; + max_step_nl.event = "TLb"; + } + // max + } + + // sum q[g_id][tpdt_nl][alpha]+=q_sum_nl; + if (q[g_id][tpdt_nl][alpha] < max_term_nl) { + q[g_id][tpdt_nl][alpha] = max_term_nl; + if (not lowmem) + q_step[g_id][tpdt_nl][alpha] = max_step_nl; + } + + // 0 EVENT + scalar_type empty = G_bar * q[g_id][t][alpha]; + // sum q_sum+=empty; + // q[g_id][tpdt][alpha]+=G_bar*q[g_id][t][alpha]; + // 0. + // max + if (max_term < empty) { + max_term = empty; + max_step.e = alpha; + max_step.ep = -1; + max_step.epp = -1; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = g_id; + max_step.gp_id = -1; + max_step.gpp_id = -1; + max_step.event = "0"; + } + // max + + // sum q[g_id][tpdt][alpha]+=q_sum; + if (q[g_id][tpdt][alpha] < max_term) { + q[g_id][tpdt][alpha] = max_term; + if (not lowmem) + q_step[g_id][tpdt][alpha] = max_step; + } + // events within slice rank at time t on alpha virtual branch. + } + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type Get = Ge[e][t]; + scalar_type Eet = Ee[e][t]; + scalar_type delta_e = vector_parameter["delta"][e]; + scalar_type p_delta_e = delta_e * Delta_t; + + // events within slice rank at time t on branch e + q[g_id][tpdt][e] = 0; + // sum scalar_type q_sum=0; + // sum scalar_type q_sum_nl=0; + // max + scalar_type max_term = 0; + scalar_type max_term_nl = 0; + step max_step; + step max_step_nl; + // max + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type qpe = q[gp_id][t][e]; + scalar_type qppe = q[gpp_id][t][e]; + scalar_type Sb_pa_ppe = + p_Delta_bar * q[gp_id][t][alpha] * qppe * pp; + scalar_type Sb_pe_ppa = + p_Delta_bar * qpe * q[gpp_id][t][alpha] * pp; + // S_bar EVENT + // sum q_sum_nl+= Sb_pa_ppe + Sb_pe_ppa; + // q[g_id][tpdt][e]+=p_Delta_bar*(q[gp_id][t][alpha]*q[gpp_id][t][e]+q[gp_id][t][e]*q[gpp_id][t][alpha]); + // S_bar. + // max + if (max_term_nl < Sb_pa_ppe) { + max_term_nl = Sb_pa_ppe; + max_step_nl.e = -1; + max_step_nl.ep = alpha; + max_step_nl.epp = e; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "Sb"; + } + if (max_term_nl < Sb_pe_ppa) { + max_term_nl = Sb_pe_ppa; + max_step_nl.e = -1; + max_step_nl.ep = e; + max_step_nl.epp = alpha; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "Sb"; + } + // max + + scalar_type D = 2 * p_delta_e * qpe * qppe * pp; + // D EVENT + // sum q_sum_nl+= D; + // q[g_id][tpdt][e]+=p_delta_e*q[gp_id][t][e]*q[gpp_id][t][e]; + // D. + // max + if (max_term_nl < D) { + max_term_nl = D; + max_step_nl.e = -1; + max_step_nl.ep = e; + max_step_nl.epp = e; + max_step_nl.t = t; + max_step_nl.rank = rank; + max_step_nl.g_id = -1; + max_step_nl.gp_id = gp_id; + max_step_nl.gpp_id = gpp_id; + max_step_nl.event = "D"; + } + // max + } + // sum q[g_id][tpdt_nl][e]+=q_sum_nl; + if (q[g_id][tpdt_nl][e] < max_term_nl) { + q[g_id][tpdt_nl][e] = max_term_nl; + if (not lowmem) + q_step[g_id][tpdt_nl][e] = max_step_nl; + } + + scalar_type empty = Get * q[g_id][t][e]; + // 0 EVENT + // sum q_sum+=empty; + // q[g_id][tpdt][e]=Get*q[g_id][t][e]; + // 0. + // max + if (max_term < empty) { + max_term = empty; + max_step.e = e; + max_step.ep = -1; + max_step.epp = -1; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = g_id; + max_step.gp_id = -1; + max_step.gpp_id = -1; + max_step.event = "0"; + } + // max + + scalar_type SLb = p_Delta_bar * Eet * q[g_id][t][alpha]; + // SL_bar EVENT + // sum q_sum+=SLb; + // q[g_id][tpdt][e]+=p_Delta_bar*Eet*q[g_id][t][alpha]; + // SL_bar. + // max + if (max_term < SLb) { + max_term = SLb; + max_step.e = alpha; + max_step.ep = -1; + max_step.epp = -1; + max_step.t = t; + max_step.rank = rank; + max_step.g_id = g_id; + max_step.gp_id = -1; + max_step.gpp_id = -1; + max_step.event = "SLb"; + } + // max + + // sum q[g_id][tpdt][e]+=q_sum; + if (q[g_id][tpdt][e] < max_term) { + q[g_id][tpdt][e] = max_term; + if (not lowmem) + q_step[g_id][tpdt][e] = max_step; + } + // events within slice rank at time t on branch e. + } + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + } } - //del-locs + gp_ids.clear(); + gpp_ids.clear(); + p_part.clear(); + } + // del-locs g_ids.clear(); g_id_sizes.clear(); - if (not lowmem) - { - return traceback(); + if (not lowmem) { + return traceback(); + } else { + // cout << "LOWMEM" < return_pair; - MLRec_events.clear(); - Ttokens.clear(); - register_O(max_e); - return_pair.first=sample(false,-1,max_t,max_rank,max_e,0,"","",true)+";\n"; - return_pair.second=max_term/root_norm; - return return_pair; + + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + scalar_type t = time_slice_times[rank][t_i]; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + if (max_term < q[g_id][t][e]) { + max_term = q[g_id][t][e]; + max_e = e; + max_t = t_i; + max_rank = rank; + } + } + if (max_term < q[g_id][t][alpha]) { + max_term = q[g_id][t][alpha]; + max_e = alpha; + max_t = t_i; + max_rank = rank; + } + } } + pair return_pair; + MLRec_events.clear(); + Ttokens.clear(); + register_O(max_e); + return_pair.first = + sample(false, -1, max_t, max_rank, max_e, 0, "", "", true) + ";\n"; + return_pair.second = max_term / root_norm; + return return_pair; + } } -//deprecated -pair exODT_model::traceback() -{ +// deprecated +pair exODT_model::traceback() { stringstream signal_stream; - scalar_type max_term=0; - long int g_id=-1; - int max_e=-11; - scalar_type max_t=-11; - scalar_type max_rank=-11; - - - scalar_type root_norm=0; - for (int rank=0;rank return_pair; + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + scalar_type t = time_slice_times[rank][t_i]; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + signal_stream << g_id << " " << t << " " << e << " " << q[g_id][t][e] + << endl; + if (max_term < q[g_id][t][e]) { + max_term = q[g_id][t][e]; + max_e = e; + max_t = t; + max_rank = rank; + } + } + signal_stream << g_id << " " << t << " " << alpha << " " + << q[g_id][t][alpha] << endl; + if (max_term < q[g_id][t][alpha]) { + max_term = q[g_id][t][alpha]; + max_e = alpha; + max_t = t; + max_rank = rank; + } + } + } + signal_string = signal_stream.str(); + pair return_pair; MLRec_events.clear(); Ttokens.clear(); register_O(max_e); - return_pair.first=traceback(g_id,max_t,max_rank,max_e,0,"")+";\n"; - return_pair.second=max_term/root_norm; - - for (std::map > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } + return_pair.first = traceback(g_id, max_t, max_rank, max_e, 0, "") + ";\n"; + return_pair.second = max_term / root_norm; + + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } q.clear(); - for (std::map > >::iterator it=q_step.begin();it!=q_step.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); - (*it).second.clear(); - } - q_step.clear(); + for (std::map>>::iterator + it = q_step.begin(); + it != q_step.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); + (*it).second.clear(); + } + q_step.clear(); return return_pair; } -//deprecated -string exODT_model::traceback(long int g_id,scalar_type t,scalar_type rank,int e,scalar_type branch_length,string branch_events, string transfer_token) -{ +// deprecated +string exODT_model::traceback(long int g_id, scalar_type t, scalar_type rank, + int e, scalar_type branch_length, + string branch_events, string transfer_token) { /* if (e==alpha) cout << "b "<<-1; @@ -781,383 +748,377 @@ string exODT_model::traceback(long int g_id,scalar_type t,scalar_type rank,int e cout << "b "<set2name(ale_pointer->id_sets[g_id]) << endl; + + cout << "from " << e << " " << t << " " << endl; + cout << "g_id "<< g_id <<" is " + <set2name(ale_pointer->id_sets[g_id]) << endl; */ - step max_step=q_step[g_id][t][e]; - - //cout << max_step.event< temp = ale_pointer->id_sets[g_id]; - for (auto i = 0; i < ale_pointer->Gamma_size + 1; ++i) { - // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[ i] ) { - size++; - } - } - if (t==0 and size==1 and e!=-1) - { - register_leaf(e); - stringstream branch_string; - if (scalar_parameter["leaf_events"]==1) branch_string<set2name(ale_pointer->id_sets[g_id])+branch_string.str(); + step max_step = q_step[g_id][t][e]; + + // cout << max_step.event< temp = ale_pointer->id_sets[g_id]; + for (auto i = 0; i < ale_pointer->Gamma_size + 1; ++i) { + // if ( BipartitionTools::testBit ( temp, i) ) { + if (temp[i]) { + size++; } + } + if (t == 0 and size == 1 and e != -1) { + register_leaf(e); + stringstream branch_string; + if (scalar_parameter["leaf_events"] == 1) + branch_string << branch_events; + branch_string << ":" << new_branch_length; + return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + + branch_string.str(); + } /* if (ale_pointer->Bip_counts[g_id]>0) new_branch_length=ale_pointer->Bip_bls[g_id]/ale_pointer->Bip_counts[g_id]; else new_branch_length=ale_pointer->Bip_bls[g_id]/ale_pointer->observations; */ - if (max_step.event=="D" or max_step.event=="Tb" or max_step.event=="S" or max_step.event=="Sb") - { - - //cout << max_step.event << " " << max_step.ep << "-" << max_step.epp <<" "<constructor_string <constructor_string << endl; + signal = -11; + } return "error"; } -//used by sample() consider moving to sample.cpp -void exODT_model::register_O(int e) -{ - if (e>-1) branch_counts["count"].at(e)+=1; - if (e>-1) branch_counts["Os"].at(e)+=1; +// used by sample() consider moving to sample.cpp +void exODT_model::register_O(int e) { + if (e > -1) + branch_counts["count"].at(e) += 1; + if (e > -1) + branch_counts["Os"].at(e) += 1; } -void exODT_model::register_D(int e) -{ - MLRec_events["D"]+=1; - if (e>-1) branch_counts["Ds"].at(e)+=1; +void exODT_model::register_D(int e) { + MLRec_events["D"] += 1; + if (e > -1) + branch_counts["Ds"].at(e) += 1; } -void exODT_model::register_Tto(int e) -{ - MLRec_events["T"]+=1; - if (e>-1) branch_counts["Ts"].at(e)+=1; +void exODT_model::register_Tto(int e) { + MLRec_events["T"] += 1; + if (e > -1) + branch_counts["Ts"].at(e) += 1; } -void exODT_model::register_Tfrom(int e) -{ - if (e>-1) branch_counts["Tfroms"].at(e)+=1; +void exODT_model::register_Tfrom(int e) { + if (e > -1) + branch_counts["Tfroms"].at(e) += 1; } -void exODT_model::register_L(int e) -{ - MLRec_events["L"]+=1; - if (e>-1) branch_counts["Ls"].at(e)+=1; +void exODT_model::register_L(int e) { + MLRec_events["L"] += 1; + if (e > -1) + branch_counts["Ls"].at(e) += 1; } -void exODT_model::register_S(int e) -{ - MLRec_events["S"]+=1; - if (e>-1) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - branch_counts["copies"].at(e)+=1; - branch_counts["count"].at(f)+=1; - branch_counts["count"].at(g)+=1; - } +void exODT_model::register_S(int e) { + MLRec_events["S"] += 1; + if (e > -1) { + int f = daughters[e][0]; + int g = daughters[e][1]; + branch_counts["copies"].at(e) += 1; + branch_counts["count"].at(f) += 1; + branch_counts["count"].at(g) += 1; + } } -void exODT_model::register_leaf(int e) -{ - if (e>-1) branch_counts["copies"].at(e)+=1; - //MLRec_events["genes"]+=1; +void exODT_model::register_leaf(int e) { + if (e > -1) + branch_counts["copies"].at(e) += 1; + // MLRec_events["genes"]+=1; } -void exODT_model::register_Ttoken(string token) -{ - Ttokens.push_back(token); -} +void exODT_model::register_Ttoken(string token) { Ttokens.push_back(token); } -//ad hoc function should be moved to a future exODT_util.cpp -void exODT_model::show_counts(string name) -{ - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - if (branch==last_branch-1) out<<"|"<setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_counts(string name) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + if (branch == last_branch - 1) + out << "|" << name << "|"; + out << branch_counts[name][branch]; + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); } - + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); + } + } } -//ad hoc function should be moved to a future exODT_util.cpp -string exODT_model::counts_string() -{ +// ad hoc function should be moved to a future exODT_util.cpp +string exODT_model::counts_string() { stringstream out; - for (int branch=0;branch::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - if (branch==last_branch-1) out<<"|"<setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_rates(string name) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + if (branch == last_branch - 1) + out << "|" << name << "|"; + if (name == "tau") + out << vector_parameter[name][branch] * vector_parameter["N"][0]; + else + out << vector_parameter[name][branch]; + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); + } + + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); } + } } diff --git a/src/traceback_qvec.cpp b/src/traceback_qvec.cpp index 3954769..4f46df8 100644 --- a/src/traceback_qvec.cpp +++ b/src/traceback_qvec.cpp @@ -2,741 +2,705 @@ using namespace std; using namespace bpp; -//The current lowmem=true method uses sample(true) cf. sample.cpp. -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -pair exODT_model::p_MLRec(approx_posterior *ale, bool lowmem) -{ - - ale_pointer=ale; - //directed partitions and thier sizes - vector g_ids;//del-loc - vector g_id_sizes;//del-loc - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root biprartitino needs to be handled seperatly +// The current lowmem=true method uses sample(true) cf. sample.cpp. +// The general structure of the calculation, and lot of the code, is the same as +// p(ale) cf. model.cpp. +pair exODT_model::p_MLRec(approx_posterior *ale, + bool lowmem) { + + ale_pointer = ale; + // directed partitions and thier sizes + vector g_ids; // del-loc + vector g_id_sizes; // del-loc + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root biprartitino needs to be handled seperatly g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); // gene<->species mapping - //vector > > > qvec; - qvec.clear();//hope this doesn't leak.. + // vector > > > qvec; + qvec.clear(); // hope this doesn't leak.. // gene<->species mapping - // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + // approx_posterior // { // long int g_id=g_ids[i]; // cerr<<"i: "< case vide + vector>> vrank; + vector> vt_i; + map vbranch; + vt_i.push_back(vbranch); + vrank.push_back(vt_i); + qvec.push_back(vrank); + } else { + // vector > > vrank; + vector>> vrank; + for (int rank = 0; rank < last_rank; + rank++) // Going through time slices, from leaves to root + { + // cerr<<"\trank: "< > vt_i; + vector> vt_i; + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); + t_i++) // Going through the subslices + { + // cerr<<"\t\tt_i: "< vbranch(n, 0.); + map vbranch; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + // cerr<<"\t\t\te: "< case vide - vector > > vrank; - vector > vt_i; - map vbranch; - vt_i.push_back(vbranch); - vrank.push_back(vt_i); - qvec.push_back(vrank); + /* int id = 0; + boost::dynamic_bitset<> temp = ale->id_sets[g_id]; + for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { + // if ( BipartitionTools::testBit ( temp, i) ) { + if ( temp[i] ) { + id = i; + break; + } + }*/ + + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; + } } - else{ - //vector > > vrank; - vector > > vrank; - for (int rank=0;rank > vt_i; - vector > vt_i; - for (int t_i=0;t_i<(int)time_slice_times[rank].size();t_i++) //Going through the subslices - { - //cerr<<"\t\tt_i: "< vbranch(n, 0.); - map vbranch; - for (int branch_i=0;branch_iid_leaves[id /*g_id*/]; + + // string gene_name=ale->id_leaves[ g_id ]; + // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + // p_parts is filled up with CCPs + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } } - } - for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior - { - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) //a leaf, mapping is by name - { - /* int id = 0; - boost::dynamic_bitset<> temp = ale->id_sets[g_id]; - for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { - // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { - id = i; - break; - } - }*/ - - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; + else { + // root biprartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /* for (set::iterator + st=ale->Gamma.begin();st!=ale->Gamma.end();st++) if + (gamma.count(*st)==0) not_gamma.insert(*st);*/ + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; } - } - - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - - // string gene_name=ale->id_leaves[ g_id ]; - // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - //p_parts is filled up with CCPs - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - } - else - { - //root biprartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - /* for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - /* gamma.clear(); - not_gamma.clear();*/ - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //sum scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - //max - scalar_type max_term=0; - //max - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT - // q_sum+=SL_fLg+SL_Lfg; - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - //SL. - //max - if (max_termnbint);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + /* gamma.clear(); + not_gamma.clear();*/ + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[gp_id] <= scalar_parameter["min_bip_count"] and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else + // top of root ste + { + tpdt = t_begin[time_slices[rank][0]]; + tpdt_rank = rank; + tpdt_t_i = 0; + } + + // root + scalar_type Delta_t = tpdt - t; + scalar_type Delta_bar = vector_parameter["Delta_bar"][rank]; + scalar_type p_Delta_bar = Delta_bar * Delta_t; + + scalar_type Ebar = Ee[-1][t]; + ; + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) + qvec[g_id + 1][rank][t_i][e] = 1; + else + qvec[g_id + 1][rank][t_i][e] = 0; + } + // boundary between slice rank and rank-1 + else if (t_i == 0) { + // terminating branch is last in time_slices and defines a + // represented speciation + if (branch_i == n - 1 && rank > 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // sum scalar_type q_sum=0; + // qvec[g_id+1][rank][t_i][e]=0; + // max + scalar_type max_term = 0; + // max + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT + // q_sum+=SL_fLg+SL_Lfg; + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; SL. max + if (max_term < SL_fLg) { + max_term = SL_fLg; + } + if (max_term < SL_Lfg) { + max_term = SL_Lfg; + } + // max + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // S EVENT + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + // sum q_sum+= S_pf_ppg + S_ppf_pg; + // S. + // max + if (max_term < S_pf_ppg) { + max_term = S_pf_ppg; + } + if (max_term < S_ppf_pg) { + max_term = S_ppf_pg; + } + // max + } + + // sum qvec[g_id+1][rank][t_i][e]=q_sum; + qvec[g_id + 1][rank][t_i][e] = max_term; + } + // branches that cross to next time slice + else { + // trivial + ; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + } + // boundaries for branch e. + } + } + if (1) { + // boundaries for branch alpha virtual branch + // boundary at present + if (t == 0) + qvec[g_id + 1][rank][t_i][alpha] = 0; + // boundary between slice rank and rank-1 slice is trivial + ; // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + // boundaries for branch alpha virtual branch. + + // events within slice rank at time t on alpha virtual branch + scalar_type G_bar = Ge[-1][t]; + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + // sum scalar_type q_sum=0; + // max + scalar_type max_term = 0; + // max + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type T_ep_app = p_Ntau_e * + qvec[gp_id + 1][rank][t_i][e] * + qvec[gpp_id + 1][rank][t_i][alpha] * pp; + scalar_type T_ap_epp = p_Ntau_e * + qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][e] * pp; + // Tb EVENT + // sum q_sum+=T_ep_app+T_ap_epp; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Ntau_e*(qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][alpha]+qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][e]); + // Tb. + // max + if (max_term < T_ep_app) { + max_term = T_ep_app; + } + if (max_term < T_ap_epp) { + max_term = T_ap_epp; + } + // max + } + } + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids[i]; + scalar_type pp = p_part[i]; + scalar_type Sb = p_Delta_bar * + (2 * qvec[gp_id + 1][rank][t_i][alpha] * + qvec[gpp_id + 1][rank][t_i][alpha]) * + pp; + // S_bar EVENT + // sum q_sum+=Sb; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Delta_bar*(2*qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][alpha]); + // S_bar. + // max + if (max_term < Sb) { + max_term = Sb; + } + // max + } + + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type tau_e = vector_parameter["tau"][e]; + scalar_type p_Ntau_e = tau_e * Delta_t; + scalar_type TLb = p_Ntau_e * Ebar * qvec[g_id + 1][rank][t_i][e]; + // TL_bar EVENT + // sum q_sum+=TLb; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=p_Ntau_e*Ebar*qvec[g_id+1][rank][t_i][e]; + // TL_bar. + // max + if (max_term < TLb) { + max_term = TLb; + } + // max + } + + // sum qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=q_sum_nl; + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] < max_term) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = max_term; + } + + // 0 EVENT + scalar_type empty = G_bar * qvec[g_id + 1][rank][t_i][alpha]; + // sum q_sum+=empty; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=G_bar*qvec[g_id+1][rank][t_i][alpha]; + // 0. + // max + if (max_term < empty) { + max_term = empty; + } + // max + + // sum qvec[g_id+1][tpdt_rank][tpdt_t_i][alpha]+=q_sum; + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] < max_term) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = max_term; + } + // events within slice rank at time t on alpha virtual branch. + } + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + scalar_type Get = Ge[e][t]; + scalar_type Eet = Ee[e][t]; + scalar_type delta_e = vector_parameter["delta"][e]; + scalar_type p_delta_e = delta_e * Delta_t; + + // events within slice rank at time t on branch e + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = 0; + // sum scalar_type q_sum=0; + // max + scalar_type max_term = 0; + // max + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type qpe = qvec[gp_id + 1][rank][t_i][e]; + scalar_type qppe = qvec[gpp_id + 1][rank][t_i][e]; + scalar_type Sb_pa_ppe = + p_Delta_bar * qvec[gp_id + 1][rank][t_i][alpha] * qppe * pp; + scalar_type Sb_pe_ppa = + p_Delta_bar * qpe * qvec[gpp_id + 1][rank][t_i][alpha] * pp; + // S_bar EVENT + // sum q_sum+= Sb_pa_ppe + Sb_pe_ppa; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*(qvec[gp_id+1][rank][t_i][alpha]*qvec[gpp_id+1][rank][t_i][e]+qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][alpha]); + // S_bar. + // max + if (max_term < Sb_pa_ppe) { + max_term = Sb_pa_ppe; + } + if (max_term < Sb_pe_ppa) { + max_term = Sb_pe_ppa; + } + // max + + scalar_type D = 2 * p_delta_e * qpe * qppe * pp; + // D EVENT + // sum q_sum+= D; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=2*p_delta_e*qvec[gp_id+1][rank][t_i][e]*qvec[gpp_id+1][rank][t_i][e]; + // D. + // max + if (max_term < D) { + max_term = D; + } + // max + } + // sum qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=q_sum; + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] < max_term) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = max_term; + } + + scalar_type empty = Get * qvec[g_id + 1][rank][t_i][e]; + // 0 EVENT + // sum q_sum+=empty; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]=Get*qvec[g_id+1][rank][t_i][e]; + // 0. + // max + if (max_term < empty) { + max_term = empty; + } + // max + + scalar_type SLb = + p_Delta_bar * Eet * qvec[g_id + 1][rank][t_i][alpha]; + // SL_bar EVENT + // sum q_sum+=SLb; + // qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=p_Delta_bar*Eet*qvec[g_id+1][rank][t_i][alpha]; + // SL_bar. + // max + if (max_term < SLb) { + max_term = SLb; + } + // max + + // sum qvec[g_id+1][tpdt_rank][tpdt_t_i][e]+=q_sum; + if (qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] < max_term) { + qvec[g_id + 1][tpdt_rank][tpdt_t_i][e] = max_term; + } + // events within slice rank at time t on branch e. + } + } + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + } } - //del-locs + gp_ids.clear(); + gpp_ids.clear(); + p_part.clear(); + } + // del-locs g_ids.clear(); g_id_sizes.clear(); - scalar_type max_term=0; - int max_e=-11; - scalar_type max_t=-11; - scalar_type max_rank=-11; - - - scalar_type root_norm=0; - for (int rank=0;rank return_pair; + } + + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // scalar_type t=time_slice_times[rank][t_i]; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + if (max_term < qvec[0][rank][t_i][e]) { + max_term = qvec[0][rank][t_i][e]; + max_e = e; + max_t = t_i; + max_rank = rank; + } + } + if (max_term < qvec[0][rank][t_i][alpha]) { + max_term = qvec[0][rank][t_i][alpha]; + max_e = alpha; + max_t = t_i; + max_rank = rank; + } + } + } + pair return_pair; MLRec_events.clear(); Ttokens.clear(); register_O(max_e); - return_pair.first=sample(false,-1,max_t,max_rank,max_e,0,"","",true)+";\n"; - return_pair.second=max_term/root_norm; + return_pair.first = + sample(false, -1, max_t, max_rank, max_e, 0, "", "", true) + ";\n"; + return_pair.second = max_term / root_norm; return return_pair; } -//used by sample() consider moving to sample.cpp -void exODT_model::register_O(int e) -{ - if (e>-1) branch_counts["count"].at(e)+=1; - if (e>-1) branch_counts["Os"].at(e)+=1; +// used by sample() consider moving to sample.cpp +void exODT_model::register_O(int e) { + if (e > -1) + branch_counts["count"].at(e) += 1; + if (e > -1) + branch_counts["Os"].at(e) += 1; } -void exODT_model::register_D(int e) -{ - MLRec_events["D"]+=1; - if (e>-1) branch_counts["Ds"].at(e)+=1; +void exODT_model::register_D(int e) { + MLRec_events["D"] += 1; + if (e > -1) + branch_counts["Ds"].at(e) += 1; } -void exODT_model::register_Tto(int e) -{ - MLRec_events["T"]+=1; - if (e>-1) branch_counts["Ts"].at(e)+=1; +void exODT_model::register_Tto(int e) { + MLRec_events["T"] += 1; + if (e > -1) + branch_counts["Ts"].at(e) += 1; } -void exODT_model::register_Tfrom(int e) -{ - if (e>-1) branch_counts["Tfroms"].at(e)+=1; +void exODT_model::register_Tfrom(int e) { + if (e > -1) + branch_counts["Tfroms"].at(e) += 1; } -void exODT_model::register_L(int e) -{ - MLRec_events["L"]+=1; - if (e>-1) branch_counts["Ls"].at(e)+=1; -} -void exODT_model::register_S(int e) -{ - MLRec_events["S"]+=1; - if (e>-1) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - branch_counts["copies"].at(e)+=1; - branch_counts["count"].at(f)+=1; - branch_counts["count"].at(g)+=1; - } +void exODT_model::register_L(int e) { + MLRec_events["L"] += 1; + if (e > -1) + branch_counts["Ls"].at(e) += 1; } -void exODT_model::register_leaf(int e) -{ - if (e>-1) branch_counts["copies"].at(e)+=1; - //MLRec_events["genes"]+=1; +void exODT_model::register_S(int e) { + MLRec_events["S"] += 1; + if (e > -1) { + int f = daughters[e][0]; + int g = daughters[e][1]; + branch_counts["copies"].at(e) += 1; + branch_counts["count"].at(f) += 1; + branch_counts["count"].at(g) += 1; + } } - -void exODT_model::register_Ttoken(string token) -{ - Ttokens.push_back(token); +void exODT_model::register_leaf(int e) { + if (e > -1) + branch_counts["copies"].at(e) += 1; + // MLRec_events["genes"]+=1; } -//ad hoc function should be moved to a future exODT_util.cpp -void exODT_model::show_counts(string name) -{ - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - if (branch==last_branch-1) out<<"|"<setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } +void exODT_model::register_Ttoken(string token) { Ttokens.push_back(token); } + +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_counts(string name) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + if (branch == last_branch - 1) + out << "|" << name << "|"; + out << branch_counts[name][branch]; + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); } - + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); + } + } } -//ad hoc function should be moved to a future exODT_util.cpp -string exODT_model::counts_string() -{ +// ad hoc function should be moved to a future exODT_util.cpp +string exODT_model::counts_string() { stringstream out; - for (int branch=0;branch::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - if (branch==last_branch-1) out<<"|"<setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_rates(string name) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + if (branch == last_branch - 1) + out << "|" << name << "|"; + if (name == "tau") + out << vector_parameter[name][branch] * vector_parameter["N"][0]; + else + out << vector_parameter[name][branch]; + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); + } + + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); } + } } diff --git a/src/traceback_scaled.cpp b/src/traceback_scaled.cpp index 7de1dd8..5af73f7 100644 --- a/src/traceback_scaled.cpp +++ b/src/traceback_scaled.cpp @@ -2,852 +2,826 @@ using namespace std; using namespace bpp; -//The current lowmem=true method uses sample(true) cf. sample.cpp. -//The general structure of the calculation, and lot of the code, is the same as p(ale) cf. model.cpp. -pair exODT_model::p_MLRec(approx_posterior *ale, bool lowmem) -{ - - ale_pointer=ale; - //directed partitions and thier sizes - vector g_ids;//del-loc - vector g_id_sizes;//del-loc - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root biprartitino needs to be handled seperatly +// The current lowmem=true method uses sample(true) cf. sample.cpp. +// The general structure of the calculation, and lot of the code, is the same as +// p(ale) cf. model.cpp. +pair exODT_model::p_MLRec(approx_posterior *ale, + bool lowmem) { + + ale_pointer = ale; + // directed partitions and thier sizes + vector g_ids; // del-loc + vector g_id_sizes; // del-loc + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root biprartitino needs to be handled seperatly g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); // gene<->species mapping - //vector > > > qvec; - qvec.clear();//hope this doesn't leak.. + // vector > > > qvec; + qvec.clear(); // hope this doesn't leak.. // gene<->species mapping - // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior + // for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the + // approx_posterior // { // long int g_id=g_ids[i]; // cerr<<"i: "< case vide + vector>> vrank; + vector> vt_i; + map vbranch; + vt_i.push_back(vbranch); + vrank.push_back(vt_i); + qvec.push_back(vrank); + } else { + // vector > > vrank; + vector>> vrank; + for (int rank = 0; rank < last_rank; + rank++) // Going through time slices, from leaves to root + { + // cerr<<"\trank: "< > vt_i; + vector> vt_i; + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); + t_i++) // Going through the subslices + { + // cerr<<"\t\tt_i: "< vbranch(n, 0.); + map vbranch; + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + // cerr<<"\t\t\te: "< case vide - vector > > vrank; - vector > vt_i; - map vbranch; - vt_i.push_back(vbranch); - vrank.push_back(vt_i); - qvec.push_back(vrank); + /* int id = 0; + boost::dynamic_bitset<> temp = ale->id_sets[g_id]; + for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { + // if ( BipartitionTools::testBit ( temp, i) ) { + if ( temp[i] ) { + id = i; + break; + } + }*/ + + int id = 0; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; + break; + } } - else{ - //vector > > vrank; - vector > > vrank; - for (int rank=0;rank > vt_i; - vector > vt_i; - for (int t_i=0;t_i<(int)time_slice_times[rank].size();t_i++) //Going through the subslices - { - //cerr<<"\t\tt_i: "< vbranch(n, 0.); - map vbranch; - for (int branch_i=0;branch_iid_leaves[id /*g_id*/]; + + // string gene_name=ale->id_leaves[ g_id ]; + // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); + string species_name; + if ((int)scalar_parameter["species_field"] == -1) + species_name = tokens[tokens.size() - 1]; + else + species_name = tokens[(int)scalar_parameter["species_field"]]; + gid_sps[g_id] = species_name; + } + } + + // p_parts is filled up with CCPs + for (int i = 0; i < (int)g_ids.size(); i++) { + // directed partition (dip) gamma's id + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + + vector gp_ids; // del-loc + vector gpp_ids; // del-loc + vector p_part; // del-loc + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else { + p_part.push_back(ale->p_dip(g_id, gp_id, gpp_id)); + } } - } - for (int i=0;i<(int)g_ids.size();i++) //Going through each clade of the approx_posterior - { - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) //a leaf, mapping is by name - { - /* int id = 0; - boost::dynamic_bitset<> temp = ale->id_sets[g_id]; - for (auto i = 0; i < ale->Gamma_size + 1 ; ++i) { - // if ( BipartitionTools::testBit ( temp, i) ) { - if ( temp[i] ) { - id = i; - break; - } - }*/ - - int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) { - if ( ale->id_sets[g_id][i] ) { - id=i; - break; + else { + // root biprartition needs to be handled seperatly + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; + boost::dynamic_bitset<> not_gamma = ~gamma; + not_gamma[0] = 0; + /* for (set::iterator + st=ale->Gamma.begin();st!=ale->Gamma.end();st++) if + (gamma.count(*st)==0) not_gamma.insert(*st);*/ + /* for (auto i = 0; i < ale->nbint; ++i) { + not_gamma[i] = 0; } - } - - string gene_name=ale->id_leaves[ id /*g_id*/ ]; - - // string gene_name=ale->id_leaves[ g_id ]; - // string gene_name=ale->id_leaves[(* (ale->id_sets[g_id].begin()) )]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); - string species_name; - if ((int)scalar_parameter["species_field"]==-1) - species_name=tokens[tokens.size()-1]; - else - species_name=tokens[(int)scalar_parameter["species_field"]]; - gid_sps[g_id]=species_name; - } - } - - //p_parts is filled up with CCPs - for (int i=0;i<(int)g_ids.size();i++) - { - // directed partition (dip) gamma's id - bool is_a_leaf=false; - long int g_id=g_ids[i]; - if (g_id_sizes[i]==1) - is_a_leaf=true; - - vector gp_ids;//del-loc - vector gpp_ids;//del-loc - vector p_part;//del-loc - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - { - p_part.push_back(ale->p_dip(g_id,gp_id,gpp_id)); - } - } - else - { - //root biprartition needs to be handled seperatly - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma = ale->id_sets[gp_id]; - boost::dynamic_bitset<> not_gamma = ~gamma; - not_gamma[0] = 0; - /* for (set::iterator st=ale->Gamma.begin();st!=ale->Gamma.end();st++) - if (gamma.count(*st)==0) - not_gamma.insert(*st);*/ - /* for (auto i = 0; i < ale->nbint; ++i) { - not_gamma[i] = 0; - } - BipartitionTools::bitNot(not_gamma, gamma, ale->nbint);*/ - long int gpp_id = ale->set_ids[not_gamma]; - set parts; - parts.insert(gp_id); - parts.insert(gpp_id); - bip_parts[parts]=1; - /* gamma.clear(); - not_gamma.clear();*/ - } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) parts.push_back((*sit)); - long int gp_id=parts[0]; - long int gpp_id=parts[1]; - gp_ids.push_back(gp_id); - gpp_ids.push_back(gpp_id); - if (ale->Bip_counts[gp_id]<=scalar_parameter["min_bip_count"] and not ale->Gamma_size<4) - p_part.push_back(0); - else - p_part.push_back(ale->p_bip(gp_id)); - } - bip_parts.clear(); - } - int N_parts=gp_ids.size(); - - //iterate over all postions along S - for (int rank=0;rank0) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - scalar_type Eft=Ee[f][t]; - scalar_type Egt=Ee[g][t]; - - //sum scalar_type q_sum=0; - //qvec[g_id+1][rank][t_i][e]=0; - //max - scalar_type max_term=0; - //max - - scalar_type SL_fLg=qvec[g_id+1][rank][t_i][f]*Egt; - scalar_type SL_Lfg=qvec[g_id+1][rank][t_i][g]*Eft; - //SL EVENT - // q_sum+=SL_fLg+SL_Lfg; - //qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + qvec[g_id+1][rank][t_i][g]*Eft; - //SL. - //max - if (max_termnbint);*/ + long int gpp_id = ale->set_ids[not_gamma]; + set parts; + parts.insert(gp_id); + parts.insert(gpp_id); + bip_parts[parts] = 1; + /* gamma.clear(); + not_gamma.clear();*/ + } + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) + parts.push_back((*sit)); + long int gp_id = parts[0]; + long int gpp_id = parts[1]; + gp_ids.push_back(gp_id); + gpp_ids.push_back(gpp_id); + if (ale->Bip_counts[gp_id] <= scalar_parameter["min_bip_count"] and + not ale->Gamma_size < 4) + p_part.push_back(0); + else + p_part.push_back(ale->p_bip(gp_id)); + } + bip_parts.clear(); + } + int N_parts = gp_ids.size(); + + // iterate over all postions along S + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; + int tpdt_rank, tpdt_t_i; + if (t_i < (int)time_slice_times[rank].size() - 1) { + tpdt = time_slice_times[rank][t_i + 1]; + tpdt_rank = rank; + tpdt_t_i = t_i + 1; + } else if (rank < last_rank - 1) { + tpdt = time_slice_times[rank + 1][0]; + tpdt_rank = rank + 1; + tpdt_t_i = 0; + } else + // top of root ste + { + tpdt = t_begin[time_slices[rank][0]]; + tpdt_rank = rank; + tpdt_t_i = 0; + } + + // root + scalar_type Delta_t = tpdt - t; + // Delat_bar corresponds to \hat \sigma + scalar_type ni = time_slices[rank].size(); + scalar_type delta_avg = scalar_parameter["delta_avg"]; + scalar_type tau_avg = scalar_parameter["tau_avg"]; + scalar_type lambda_avg = scalar_parameter["lambda_avg"]; + scalar_type sigma_hat = scalar_parameter["sigma_hat"]; + scalar_type H_hat = Ee[-1][t]; + + // boundary at present + if (t == 0) + qvec[g_id + 1][rank][t_i][alpha] = 0; + + if (1) { + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + + // boundaries for branch e + // boundary at present + if (t == 0) { + if (is_a_leaf && extant_species[e] == gid_sps[g_id]) + qvec[g_id + 1][rank][t_i][e] = 1; + else + qvec[g_id + 1][rank][t_i][e] = 0; + } + // boundary between slice rank and rank-1 + else if (t_i == 0) { + // terminating branch is last in time_slices and defines a + // represented speciation + if (branch_i == n - 1 && rank > 0) { + int f = daughters[e][0]; + int g = daughters[e][1]; + scalar_type Eft = Ee[f][t]; + scalar_type Egt = Ee[g][t]; + + // sum scalar_type q_sum=0; + // qvec[g_id+1][rank][t_i][e]=0; + // max + scalar_type max_term = 0; + // max + + scalar_type SL_fLg = qvec[g_id + 1][rank][t_i][f] * Egt; + scalar_type SL_Lfg = qvec[g_id + 1][rank][t_i][g] * Eft; + // SL EVENT + // q_sum+=SL_fLg+SL_Lfg; + // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][f]*Egt + + // qvec[g_id+1][rank][t_i][g]*Eft; SL. max + if (max_term < SL_fLg) { + max_term = SL_fLg; + } + if (max_term < SL_Lfg) { + max_term = SL_Lfg; + } + // max + + // non-leaf directed partition + if (not is_a_leaf) + for (int i = 0; i < N_parts; i++) { + long int gp_id = gp_ids.at(i); + long int gpp_id = gpp_ids.at(i); + scalar_type pp = p_part.at(i); + scalar_type S_pf_ppg = qvec[gp_id + 1][rank][t_i][f] * + qvec[gpp_id + 1][rank][t_i][g] * pp; + scalar_type S_ppf_pg = qvec[gpp_id + 1][rank][t_i][f] * + qvec[gp_id + 1][rank][t_i][g] * pp; + // S EVENT + // qvec[g_id+1][rank][t_i][e]+=qvec[gp_id+1][rank][t_i][f]*qvec[gpp_id+1][rank][t_i][g] + // +qvec[gpp_id+1][rank][t_i][f]*qvec[gp_id+1][rank][t_i][g]; + // sum q_sum+= S_pf_ppg + S_ppf_pg; + // S. + // max + if (max_term < S_pf_ppg) { + max_term = S_pf_ppg; + } + if (max_term < S_ppf_pg) { + max_term = S_ppf_pg; + } + // max + } + + // sum qvec[g_id+1][rank][t_i][e]=q_sum; + qvec[g_id + 1][rank][t_i][e] = max_term; + } + // branches that cross to next time slice + else { + // trivial + ; // qvec[g_id+1][rank][t_i][e]=qvec[g_id+1][rank][t_i][e]; + } + } + // boundaries for branch e. + } + } + if (1) { + // boundaries for branch alpha virtual branch + // boundary at present + // if (t==0) + // qvec[g_id+1][rank][t_i][alpha]=0; + // boundary between slice rank and rank-1 slice is trivial + ; // qvec[g_id+1][rank][t_i][alpha]=qvec[g_id+1][rank][t_i][alpha]; + // boundaries for branch alpha virtual branch. + + // events within slice rank at time t on alpha virtual branch + // scalar_type G_bar=Ge[-1][t]; + qvec[g_id + 1][tpdt_rank][tpdt_t_i][alpha] = 0; + // sum scalar_type q_sum=0; + // max + scalar_type max_term = 0; + // max + /* + for (int branch_i=0;branch_i return_pair; + } + + for (int rank = 0; rank < last_rank; rank++) { + int n = time_slices[rank].size(); + for (int t_i = 0; t_i < (int)time_slice_times[rank].size(); t_i++) { + scalar_type t = time_slice_times[rank][t_i]; + scalar_type tpdt; + if (t_i < (int)time_slice_times[rank].size() - 1) + tpdt = time_slice_times[rank][t_i + 1]; + else if (rank < last_rank - 1) + tpdt = time_slice_times[rank + 1][0]; + else + // top of root stem + tpdt = t_begin[time_slices[rank][0]]; + + // root + scalar_type Delta_t = (tpdt - t) * 1; + + for (int branch_i = 0; branch_i < n; branch_i++) { + int e = time_slices[rank][branch_i]; + if (max_term < qvec[0][rank][t_i][e]) { + max_term = qvec[0][rank][t_i][e] / + (1 - Ee[e][time_slice_times[rank][t_i]]); //*Delta_t; + max_e = e; + max_t = t_i; + max_rank = rank; + } + } + if (max_term < qvec[0][rank][t_i][alpha]) { + max_term = qvec[0][rank][t_i][alpha] / + (1 - Ee[alpha][time_slice_times[rank][t_i]]); //*Delta_t; + max_e = alpha; + max_t = t_i; + max_rank = rank; + } + } + } + pair return_pair; MLRec_events.clear(); Ttokens.clear(); register_O(max_e); - - // cout << max_t << " " << max_rank << " " << max_e << " " << log(max_term) << endl; - return_pair.first=sample(false,-1,max_t,max_rank,max_e,0,"","",true)+";\n"; - return_pair.second=max_term/(1-Ee[max_e][ time_slice_times[max_rank][max_t] ]); + // cout << max_t << " " << max_rank << " " << max_e << " " << log(max_term) + // << endl; + + return_pair.first = + sample(false, -1, max_t, max_rank, max_e, 0, "", "", true) + ";\n"; + return_pair.second = + max_term / (1 - Ee[max_e][time_slice_times[max_rank][max_t]]); return return_pair; } -//used by sample() consider moving to sample.cpp -void exODT_model::register_O(int e) -{ - if (e>-1) branch_counts["count"].at(e)+=1; - if (e>-1) branch_counts["Os"].at(e)+=1; +// used by sample() consider moving to sample.cpp +void exODT_model::register_O(int e) { + if (e > -1) + branch_counts["count"].at(e) += 1; + if (e > -1) + branch_counts["Os"].at(e) += 1; } -void exODT_model::register_D(int e) -{ - MLRec_events["D"]+=1; - if (e>-1) branch_counts["Ds"].at(e)+=1; +void exODT_model::register_D(int e) { + MLRec_events["D"] += 1; + if (e > -1) + branch_counts["Ds"].at(e) += 1; } -void exODT_model::register_Tto(int e) -{ - MLRec_events["T"]+=1; - if (e>-1) branch_counts["Ts"].at(e)+=1; +void exODT_model::register_Tto(int e) { + MLRec_events["T"] += 1; + if (e > -1) + branch_counts["Ts"].at(e) += 1; } -void exODT_model::register_Tfrom(int e) -{ - if (e>-1) branch_counts["Tfroms"].at(e)+=1; +void exODT_model::register_Tfrom(int e) { + if (e > -1) + branch_counts["Tfroms"].at(e) += 1; } -void exODT_model::register_L(int e) -{ - MLRec_events["L"]+=1; - if (e>-1) branch_counts["Ls"].at(e)+=1; +void exODT_model::register_L(int e) { + MLRec_events["L"] += 1; + if (e > -1) + branch_counts["Ls"].at(e) += 1; } -void exODT_model::register_S(int e) -{ - MLRec_events["S"]+=1; - if (e>-1) - { - int f=daughters[e][0]; - int g=daughters[e][1]; - branch_counts["copies"].at(e)+=1; - branch_counts["count"].at(f)+=1; - branch_counts["count"].at(g)+=1; - } +void exODT_model::register_S(int e) { + MLRec_events["S"] += 1; + if (e > -1) { + int f = daughters[e][0]; + int g = daughters[e][1]; + branch_counts["copies"].at(e) += 1; + branch_counts["count"].at(f) += 1; + branch_counts["count"].at(g) += 1; + } } -void exODT_model::register_leaf(int e) -{ - if (e>-1) branch_counts["copies"].at(e)+=1; - //MLRec_events["genes"]+=1; +void exODT_model::register_leaf(int e) { + if (e > -1) + branch_counts["copies"].at(e) += 1; + // MLRec_events["genes"]+=1; } -void exODT_model::register_Ttoken(string token) -{ - Ttokens.push_back(token); -} - -//ad hoc function should be moved to a future exODT_util.cpp -void exODT_model::show_counts(string name, bool as_branch_length, bool per_copy) -{ - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - out<setBranchProperty("ID",BppString(out.str())); - } - else if (not as_branch_length) - { - tmp_node->setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - else - { - tmp_node->setDistanceToFather(value); - } +void exODT_model::register_Ttoken(string token) { Ttokens.push_back(token); } + +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_counts(string name, bool as_branch_length, + bool per_copy) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + scalar_type value = branch_counts[name][branch]; + if (per_copy) + value = + value / max((double)1., (double)branch_counts["copies"][branch]); + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + out << value; + if (branch == last_branch - 1) { + out << "|" << name << "|"; + tmp_node->setBranchProperty("ID", BppString(out.str())); + } else if (not as_branch_length) { + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); + } else { + tmp_node->setDistanceToFather(value); } - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } } - + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); + } + } } -//ad hoc function should be moved to a future exODT_util.cpp -string exODT_model::counts_string(scalar_type samples) -{ - +// ad hoc function should be moved to a future exODT_util.cpp +string exODT_model::counts_string(scalar_type samples) { + stringstream out; - for (int branch=0;branchPRESENT" or gid_events[g_id][last_i]==">S" or ( gid_events[g_id][last_i]==">Sfrom" and gid_gidp[g_id][last_i]!=gid_gidpp[g_id][last_i]) or gid_events[g_id][last_i]==">D")) - { - return vertical_string(gid_gidp[g_id][last_i],event_stream.str(),t_0); - } - else - { - scalar_type bnorm;//=ale_pointer->Bip_counts[g_id]; - if (ale_pointer->Bip_counts.count(g_id)==0 ) - bnorm=ale_pointer->observations; - else - bnorm=ale_pointer->Bip_counts[g_id]; - if (bnorm==0) - bnorm=ale_pointer->observations; - - event_stream<<" "<Bip_bls[g_id]/bnorm <<"\t"<< gid_gidp[g_id][last_i] << "\t" << gid_gidpp[g_id][last_i]; - return ancestral_string+event_stream.str(); - } + if (t_0 == -1) + t_0 = gid_times[g_id][0]; + for (int i = 0; i < (int)gid_branches[g_id].size(); i++) { + int branch = gid_branches[g_id][i]; + stringstream named_branch; + if (branch == alpha) + named_branch << -1; + else if (id_ranks[branch] == 0) { + named_branch << extant_species[branch]; + } else + named_branch << id_ranks[branch]; + + event_stream << gid_events[g_id][i] << "@" + << named_branch.str() + //<<"*"<PRESENT" or + gid_events[g_id][last_i] == ">S" or + (gid_events[g_id][last_i] == ">Sfrom" and + gid_gidp[g_id][last_i] != gid_gidpp[g_id][last_i]) or + gid_events[g_id][last_i] == ">D")) { + return vertical_string(gid_gidp[g_id][last_i], event_stream.str(), t_0); + } else { + scalar_type bnorm; //=ale_pointer->Bip_counts[g_id]; + if (ale_pointer->Bip_counts.count(g_id) == 0) + bnorm = ale_pointer->observations; + else + bnorm = ale_pointer->Bip_counts[g_id]; + if (bnorm == 0) + bnorm = ale_pointer->observations; + + event_stream << " " << t_0 << " " << gid_times[g_id][last_i] << " " + << ale_pointer->Bip_bls[g_id] / bnorm << "\t" + << gid_gidp[g_id][last_i] << "\t" << gid_gidpp[g_id][last_i]; + return ancestral_string + event_stream.str(); + } } -string exODT_model::gid_string(long int g_id) -{ +string exODT_model::gid_string(long int g_id) { stringstream event_stream; - scalar_type t_0=gid_times[g_id][0]; - for (int i = 0; i < (int)gid_branches[g_id].size(); i++) - { - int branch = gid_branches[g_id][i]; - stringstream named_branch; - if (branch==alpha) - named_branch<<-1; - else if (id_ranks[branch]==0) - { - named_branch<Bip_counts[g_id]; - if (bnorm==0) - bnorm=ale_pointer->observations; - event_stream<<" "<Bip_bls[g_id]/bnorm <<"\t"<< gid_gidp[g_id][last_i] << "\t" << gid_gidpp[g_id][last_i]; + scalar_type t_0 = gid_times[g_id][0]; + for (int i = 0; i < (int)gid_branches[g_id].size(); i++) { + int branch = gid_branches[g_id][i]; + stringstream named_branch; + if (branch == alpha) + named_branch << -1; + else if (id_ranks[branch] == 0) { + named_branch << extant_species[branch]; + } else + named_branch << id_ranks[branch]; + + event_stream << gid_events[g_id][i] + //<<"@"<Bip_counts[g_id]; + if (bnorm == 0) + bnorm = ale_pointer->observations; + event_stream << " " << t_0 << " " << gid_times[g_id][last_i] << " " + << ale_pointer->Bip_bls[g_id] / bnorm << "\t" + << gid_gidp[g_id][last_i] << "\t" << gid_gidpp[g_id][last_i]; return event_stream.str(); } -//ad hoc function should be moved to a future exODT_util.cpp -void exODT_model::show_rates(string name) -{ - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - (*it).first->setBranchProperty("ID",BppString("")); - - for (int branch=0;branch(tmp_node->getBranchProperty("ID")))).toSTL(); - //out<< id_ranks[branch]; - if (branch==last_branch-1) out<<"|"<setBranchProperty("ID",BppString(out.str())); - if (tmp_node->isLeaf()) - tmp_node->setName(tmp_node->getName()+"_"+out.str()); - } - - cout << TreeTemplateTools::treeToParenthesis(*S,false,"ID") << endl; - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - (*it).first->setBranchProperty("ID",BppString("")); - if ((*it).first->isLeaf()) - { - vector tokens; - name=(*it).first->getName(); - boost::split(tokens,name,boost::is_any_of("_"),boost::token_compress_on); - (*it).first->setName(tokens[0]); - } +// ad hoc function should be moved to a future exODT_util.cpp +void exODT_model::show_rates(string name) { + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) + (*it).first->setBranchProperty("ID", BppString("")); + + for (int branch = 0; branch < last_branch; branch++) + if (id_nodes.count(branch)) { + Node *tmp_node = id_nodes[branch]; + + stringstream out; + string old_name = (*(dynamic_cast( + tmp_node->getBranchProperty("ID")))) + .toSTL(); + // out<< id_ranks[branch]; + if (branch == last_branch - 1) + out << "|" << name << "|"; + if (name == "tau") + out << vector_parameter[name][branch] * vector_parameter["N"][0]; + else + out << vector_parameter[name][branch]; + tmp_node->setBranchProperty("ID", BppString(out.str())); + if (tmp_node->isLeaf()) + tmp_node->setName(tmp_node->getName() + "_" + out.str()); + } + + cout << TreeTemplateTools::treeToParenthesis(*S, false, "ID") << endl; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + (*it).first->setBranchProperty("ID", BppString("")); + if ((*it).first->isLeaf()) { + vector tokens; + name = (*it).first->getName(); + boost::split(tokens, name, boost::is_any_of("_"), + boost::token_compress_on); + (*it).first->setName(tokens[0]); } + } } diff --git a/src/undated.cpp b/src/undated.cpp index b63e19a..b873f53 100644 --- a/src/undated.cpp +++ b/src/undated.cpp @@ -3,121 +3,116 @@ using namespace std; using namespace bpp; -static scalar_type EPSILON = numeric_limits< scalar_type >::min(); +static scalar_type EPSILON = numeric_limits::min(); -void exODT_model::construct_undated(const string& Sstring, const string& fractionMissingFile) -{ +void exODT_model::construct_undated(const string &Sstring, + const string &fractionMissingFile) { daughter.clear(); son.clear(); name_node.clear(); node_name.clear(); node_ids.clear(); id_nodes.clear(); - - string_parameter["S_un"]=Sstring; - S=TreeTemplateTools::parenthesisToTree(string_parameter["S_un"], true);//(string_parameter["BOOTSTRAP_LABELS"]=="yes") - S_root = S->getRootNode(); - vector nodes = TreeTemplateTools::getNodes(*S_root); + string_parameter["S_un"] = Sstring; + S = TreeTemplateTools::parenthesisToTree( + string_parameter["S_un"], + true); //(string_parameter["BOOTSTRAP_LABELS"]=="yes") - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) - if ((*it)->isLeaf()) - { - name_node[(*it)->getName()]=(*it); - node_name[(*it)]=(*it)->getName(); - } - else - { - vector leafnames=TreeTemplateTools::getLeavesNames(*(*it)); - sort(leafnames.begin(),leafnames.end()); - stringstream name; - for (vector ::iterator st=leafnames.begin();st!=leafnames.end();st++ ) - name<<(*st)<<"."; - - name_node[name.str()]=(*it); - node_name[(*it)]=name.str(); + S_root = S->getRootNode(); + vector nodes = TreeTemplateTools::getNodes(*S_root); + + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) + if ((*it)->isLeaf()) { + name_node[(*it)->getName()] = (*it); + node_name[(*it)] = (*it)->getName(); + } else { + vector leafnames = TreeTemplateTools::getLeavesNames(*(*it)); + sort(leafnames.begin(), leafnames.end()); + stringstream name; + for (vector::iterator st = leafnames.begin(); + st != leafnames.end(); st++) + name << (*st) << "."; - } + name_node[name.str()] = (*it); + node_name[(*it)] = name.str(); + } // register species - last_branch=0; - last_leaf=0; - - - - set saw; - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) - if ((*it).second->isLeaf()) - { - Node * node = (*it).second; - extant_species[last_branch]=node->getName(); - //stringstream name; - //name << extant_species[last_branch] <<"("<< last_branch<<")"; - //node->setName(name); - node_ids[node]=last_branch; - id_nodes[last_branch]=node; - last_branch++; - last_leaf++; - saw.insert(node); - // a leaf - daughter[last_branch]=-1; - // a leaf - son[last_branch]=-1; - vector_parameter["BL_rate_multiplier"].push_back(node->getDistanceToFather()); - vector_parameter["rate_multiplier_tau_to"].push_back(1); - vector_parameter["rate_multiplier_tau_from"].push_back(1); - wT.push_back(1); - rmD.push_back(1); - rmT.push_back(1); - rmL.push_back(1); - vector_parameter["rate_multiplier_delta"].push_back(1); - vector_parameter["rate_multiplier_lambda"].push_back(1); - vector_parameter["rate_multiplier_O"].push_back(1); - - } + last_branch = 0; + last_leaf = 0; + + set saw; + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if ((*it).second->isLeaf()) { + Node *node = (*it).second; + extant_species[last_branch] = node->getName(); + // stringstream name; + // name << extant_species[last_branch] <<"("<< last_branch<<")"; + // node->setName(name); + node_ids[node] = last_branch; + id_nodes[last_branch] = node; + last_branch++; + last_leaf++; + saw.insert(node); + // a leaf + daughter[last_branch] = -1; + // a leaf + son[last_branch] = -1; + vector_parameter["BL_rate_multiplier"].push_back( + node->getDistanceToFather()); + vector_parameter["rate_multiplier_tau_to"].push_back(1); + vector_parameter["rate_multiplier_tau_from"].push_back(1); + wT.push_back(1); + rmD.push_back(1); + rmT.push_back(1); + rmL.push_back(1); + vector_parameter["rate_multiplier_delta"].push_back(1); + vector_parameter["rate_multiplier_lambda"].push_back(1); + vector_parameter["rate_multiplier_O"].push_back(1); + } - + // ad-hoc postorder + vector next_generation; + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if ((*it).second->isLeaf()) { + Node *node = (*it).second; + next_generation.push_back(node); + } + while (next_generation.size()) { + vector new_generation; + for (vector::iterator it = next_generation.begin(); + it != next_generation.end(); it++) { + Node *node = (*it); + if (node->hasFather()) { + Node *father = node->getFather(); + vector sons = father->getSons(); + Node *sister; + if (sons[0] == node) + sister = sons[1]; + else + sister = sons[0]; - - //ad-hoc postorder - vector next_generation; - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) - if ((*it).second->isLeaf()) - { - Node * node = (*it).second; - next_generation.push_back(node); - } - while(next_generation.size()) - { - vector new_generation; - for (vector::iterator it=next_generation.begin();it!=next_generation.end();it++ ) - { - Node * node = (*it); - if (node->hasFather() ) - { - Node * father=node->getFather(); - vector sons=father->getSons(); - Node * sister; - if (sons[0]==node) sister=sons[1]; else sister=sons[0]; - - if (not node_ids.count(father) and saw.count(sister)) - { - node_ids[father]=last_branch; - id_nodes[last_branch]=father; + if (not node_ids.count(father) and saw.count(sister)) { + node_ids[father] = last_branch; + id_nodes[last_branch] = father; stringstream name; name << last_branch; - vector_parameter["BL_rate_multiplier"].push_back(node->getDistanceToFather()); + vector_parameter["BL_rate_multiplier"].push_back( + node->getDistanceToFather()); vector_parameter["rate_multiplier_tau_to"].push_back(1); vector_parameter["rate_multiplier_tau_from"].push_back(1); wT.push_back(1); rmD.push_back(1); rmT.push_back(1); - rmL.push_back(1); - vector_parameter["rate_multiplier_delta"].push_back(1); - vector_parameter["rate_multiplier_lambda"].push_back(1); - vector_parameter["rate_multiplier_O"].push_back(1); + rmL.push_back(1); + vector_parameter["rate_multiplier_delta"].push_back(1); + vector_parameter["rate_multiplier_lambda"].push_back(1); + vector_parameter["rate_multiplier_O"].push_back(1); - father->setBranchProperty("ID",BppString(name.str())); + father->setBranchProperty("ID", BppString(name.str())); last_branch++; saw.insert(father); @@ -126,31 +121,28 @@ void exODT_model::construct_undated(const string& Sstring, const string& fractio } } next_generation.clear(); - for (vector::iterator it=new_generation.begin();it!=new_generation.end();it++ ) - next_generation.push_back((*it)); + for (vector::iterator it = new_generation.begin(); + it != new_generation.end(); it++) + next_generation.push_back((*it)); } - - //for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) (*it)->setDistanceToFather(1); - below.clear(); - for (int e=0;egetFather()) <<" e:"<getFather()) < height(id_nodes[f]) ) - { - //cout << e << " below " << f << endl; - below[e][f]=1; - } - else - below[e][f]=0; - } + // for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) + // (*it)->setDistanceToFather(1); + below.clear(); + for (int e = 0; e < last_branch - 1; e++) { + for (int f = 0; f < last_branch - 1; f++) { + // cout <getFather()) <<" e:"<getFather()) < height(id_nodes[f])) { + // cout << e << " below " << f << endl; + below[e][f] = 1; + } else + below[e][f] = 0; + } } - - - vector_parameter["BL_rate_multiplier"][last_branch]=scalar_parameter["root_BL"]; + vector_parameter["BL_rate_multiplier"][last_branch] = + scalar_parameter["root_BL"]; vector_parameter["rate_multiplier_tau_to"].push_back(1); vector_parameter["rate_multiplier_tau_from"].push_back(1); wT.push_back(1); @@ -158,103 +150,91 @@ void exODT_model::construct_undated(const string& Sstring, const string& fractio vector_parameter["rate_multiplier_lambda"].push_back(1); vector_parameter["rate_multiplier_O"].push_back(1); - - for (map ::iterator it=node_ids.begin();it!=node_ids.end();it++ ) - { - Node * node = (*it).first; + for (map::iterator it = node_ids.begin(); it != node_ids.end(); + it++) { + Node *node = (*it).first; int branch = (*it).second; stringstream out; stringstream out1; stringstream out2; - out1<hasBranchProperty("bootstrap") ) - { - rank2label[rank]=node->getBootstrapValue(); - //cout <"<hasBranchProperty("bootstrap")) { + rank2label[rank] = node->getBootstrapValue(); + // cout <"<setBranchProperty("ID",BppString(out.str())); + node->setBranchProperty("ID", BppString(out.str())); } - string_parameter["S_with_ranks"]=TreeTemplateTools::treeToParenthesis(*S,false,"ID"); + string_parameter["S_with_ranks"] = + TreeTemplateTools::treeToParenthesis(*S, false, "ID"); - - //map > ancestral_names; - //map > ancestral; + // map > ancestral_names; + // map > ancestral; ancestors.clear(); - for (int e=0;e tmp; + for (int e = 0; e < last_branch; e++) { + vector tmp; ancestors.push_back(tmp); - for (int f=0;f::iterator it=nodes.begin();it!=nodes.end();it++ ) - { - Node * node=(*it); + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { + Node *node = (*it); int e = node_ids[node]; stringstream name_from; - if (e tmp; - ancestral_names[name_from.str()]=tmp; - while (node->hasFather()) - { + name_from << e; + map tmp; + ancestral_names[name_from.str()] = tmp; + while (node->hasFather()) { stringstream name_to; int f = node_ids[node]; - if (fgetFather(); - ancestral_names[name_from.str()][name_to.str()]=1; + name_to << f; + node = node->getFather(); + ancestral_names[name_from.str()][name_to.str()] = 1; if (not ancestral[e][f]) - ancestors[e].push_back(f); - ancestral[e][f]=1; + ancestors[e].push_back(f); + ancestral[e][f] = 1; } stringstream name_to; int f = node_ids[node]; - name_to<::iterator it=name_node.begin();it!=name_node.end();it++ ) - if (not (*it).second->isLeaf()) - { - Node * node = (*it).second; - vector sons=node->getSons(); - daughter[node_ids[node]]=node_ids[sons[0]]; - son[node_ids[node]]=node_ids[sons[1]]; - //cout << node_ids[node] << " => " << node_ids[sons[0]] << " & " << node_ids[sons[1]] << endl; - //cout << node_name[node] << " => " << node_name[sons[0]] << " & " << node_name[sons[1]] << endl; + if (scalar_parameter["reldate"] == true) + for (int e = 0; e < last_branch; e++) { + for (int f = 0; f < last_branch; f++) + if (below[e][f] == 1) { + if (not ancestral[e][f]) + ancestors[e].push_back(f); + ancestral[e][f] = 1; + } + } - } + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) + if (not(*it).second->isLeaf()) { + Node *node = (*it).second; + vector sons = node->getSons(); + daughter[node_ids[node]] = node_ids[sons[0]]; + son[node_ids[node]] = node_ids[sons[1]]; + // cout << node_ids[node] << " => " << node_ids[sons[0]] << " & " << + // node_ids[sons[1]] << endl; cout << node_name[node] << " => " << + // node_name[sons[0]] << " & " << node_name[sons[1]] << endl; + } branch_counts["Os"].clear(); branch_counts["Ds"].clear(); branch_counts["Ts"].clear(); @@ -263,12 +243,11 @@ void exODT_model::construct_undated(const string& Sstring, const string& fractio branch_counts["count"].clear(); branch_counts["presence"].clear(); branch_counts["saw"].clear(); - branch_counts["O_LL"].clear(); + branch_counts["O_LL"].clear(); branch_counts["copies"].clear(); branch_counts["singleton"].clear(); - for (int e=0;e tmp; + for (int e = 0; e < last_branch; e++) { + vector tmp; T_to_from.push_back(tmp); - for (int f=0;f (last_leaf, 0.0); - //Put user-defined values, if available + // Put default values for the fraction of missing genes at the leaves. + vector_parameter["fraction_missing"] = vector(last_leaf, 0.0); + // Put user-defined values, if available if (fractionMissingFile == "") { - } - else { + } else { fraction_missing = readFractionMissingFile(fractionMissingFile); - // Now we need to fill up the vector_parameter, and we have to be careful about the order. + // Now we need to fill up the vector_parameter, and we have to be careful + // about the order. size_t index = 0; - for (map ::iterator it=name_node.begin();it!=name_node.end();it++ ) { - if ((*it).second->isLeaf()) - { - Node * node = (*it).second; + for (map::iterator it = name_node.begin(); + it != name_node.end(); it++) { + if ((*it).second->isLeaf()) { + Node *node = (*it).second; string currentSpecies = node->getName(); - vector_parameter["fraction_missing"][index] = fraction_missing[currentSpecies]; + vector_parameter["fraction_missing"][index] = + fraction_missing[currentSpecies]; index++; } } VectorTools::print(vector_parameter["fraction_missing"]); } - - } -void exODT_model::calculate_undatedEs() -{ +void exODT_model::calculate_undatedEs() { uE.clear(); fm.clear(); mPTE_ancestral_correction.clear(); @@ -330,969 +303,987 @@ void exODT_model::calculate_undatedEs() tau_norm.clear(); PL.clear(); PS.clear(); - //scalar_type P_T=0; - map rm_norms; - rm_norms["tau_to"]=0; - rm_norms["tau_from"]=0; - rm_norms["delta"]=0; - rm_norms["lambda"]=0; - rm_norms["O"]=0; - - for (int e=0;e rm_norms; + rm_norms["tau_to"] = 0; + rm_norms["tau_from"] = 0; + rm_norms["delta"] = 0; + rm_norms["lambda"] = 0; + rm_norms["O"] = 0; + + for (int e = 0; e < last_branch; e++) { + rm_norms["tau_to"] += vector_parameter["rate_multiplier_tau_to"][e]; + rm_norms["tau_from"] += vector_parameter["rate_multiplier_tau_from"][e]; + rm_norms["delta"] += vector_parameter["rate_multiplier_delta"][e]; + rm_norms["lambda"] += vector_parameter["rate_multiplier_lambda"][e]; + rm_norms["O"] += vector_parameter["rate_multiplier_O"][e]; + } - - for (int e=0;e::iterator it=ancestors[e].begin(); it!=ancestors[e].end() ;it++) - { - int f=(*it); - tau_norm[e]-=wT[f]; - } - tau_norm[e]/=P_T; - + for (vector::iterator it = ancestors[e].begin(); + it != ancestors[e].end(); it++) { + int f = (*it); + tau_norm[e] -= wT[f]; + } + tau_norm[e] /= P_T; + PL.push_back(P_L); PS.push_back(P_S); uE.push_back(0); - if (e ancestral_correction; - if (i>0) // There should be no need for this loop at the first iteration, because then it leaves mPTE_ancestral_correction at 0. + // In the loop below with 4 iterations, we calculate the mean probability mPTE + // for a gene to become extinct across all branches. + mPTE = 0; + for (int i = 0; i < 4; i++) { + scalar_type newmPTE = 0; + // vector ancestral_correction; + if (i > 0) // There should be no need for this loop at the first iteration, + // because then it leaves mPTE_ancestral_correction at 0. { - for (int e=0;e::iterator it=ancestral[e].begin();( it!=ancestral[e].end() and i>0);it++) - for (vector::iterator it=ancestors[e].begin(); it!=ancestors[e].end() ;it++) - { - //int f=(*it).first; - int f=(*it); - //if (ancestral[e][f]==1) - mPTE_ancestral_correction[e]+= (wT[f])*uE[f]; //That's how we forbid transfers to ancestors of a branch + for (int e = 0; e < last_branch; e++) { + mPTE_ancestral_correction[e] = 0; + // for (map::iterator it=ancestral[e].begin();( + // it!=ancestral[e].end() and i>0);it++) + for (vector::iterator it = ancestors[e].begin(); + it != ancestors[e].end(); it++) { + // int f=(*it).first; + int f = (*it); + // if (ancestral[e][f]==1) + mPTE_ancestral_correction[e] += + (wT[f]) * + uE[f]; // That's how we forbid transfers to ancestors of a branch } } } - for (int e=0;e > >::iterator it=q.begin();it!=q.end();it++) - { - for ( std::map< scalar_type, std::map >::iterator jt=(*it).second.begin();jt!=(*it).second.end();jt++) - (*jt).second.clear(); +scalar_type exODT_model::pun(approx_posterior *ale, bool verbose, bool no_T) { + scalar_type survive = 0; + scalar_type root_sum = 0; + scalar_type O_norm = 0; + mPTuq_ancestral_correction.clear(); + uq.clear(); + mPTuq.clear(); // XX + ale_pointer = ale; + + for (std::map>>::iterator + it = q.begin(); + it != q.end(); it++) { + for (std::map>::iterator jt = + (*it).second.begin(); + jt != (*it).second.end(); jt++) + (*jt).second.clear(); (*it).second.clear(); } q.clear(); - - //directed partitions and their sizes - //vector g_ids; - //vector g_id_sizes; + // directed partitions and their sizes + // vector g_ids; + // vector g_id_sizes; g_ids.clear(); g_id_sizes.clear(); - for (map > :: iterator it = ale->size_ordered_bips.begin(); it != ale->size_ordered_bips.end(); it++) - for (vector :: iterator jt = (*it).second.begin(); jt != (*it).second.end(); jt++) - { - g_ids.push_back((*jt)); - g_id_sizes.push_back((*it).first); - } - //root bipartition needs to be handled separately + for (map>::iterator it = ale->size_ordered_bips.begin(); + it != ale->size_ordered_bips.end(); it++) + for (vector::iterator jt = (*it).second.begin(); + jt != (*it).second.end(); jt++) { + g_ids.push_back((*jt)); + g_id_sizes.push_back((*it).first); + } + // root bipartition needs to be handled separately g_ids.push_back(-1); g_id_sizes.push_back(ale->Gamma_size); - root_i=g_ids.size()-1; + root_i = g_ids.size() - 1; // gene<->species mapping // mpi_tree's pun() breaks if mapping is saved.. Sz.G. 12.29/ gid_sps.clear(); if (gid_sps.size() == 0) // If the mapping has not been done yet { - //Test that the species associated to genes are really in the species tree + // Test that the species associated to genes are really in the species tree std::set species_set; - for (std::map::iterator iter = extant_species.begin(); iter!=extant_species.end(); ++iter) - { - species_set.insert(iter->second); + for (std::map::iterator iter = extant_species.begin(); + iter != extant_species.end(); ++iter) { + species_set.insert(iter->second); } - if (verbose) cout << "\nGene" << "\t:\t"<< "Species" << endl; - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; + if (verbose) + cout << "\nGene" + << "\t:\t" + << "Species" << endl; + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; - if (g_id_sizes[i]==1) - { + if (g_id_sizes[i] == 1) { int id = 0; - for (auto i=0; i< ale->Gamma_size + 1; ++i) - { - if ( ale->id_sets[g_id][i] ) - { - id=i; + for (auto i = 0; i < ale->Gamma_size + 1; ++i) { + if (ale->id_sets[g_id][i]) { + id = i; break; } } - string gene_name=ale->id_leaves[ id ]; - vector tokens; - boost::split(tokens,gene_name,boost::is_any_of(string_parameter["gene_name_separators"]),boost::token_compress_on); + string gene_name = ale->id_leaves[id]; + vector tokens; + boost::split(tokens, gene_name, + boost::is_any_of(string_parameter["gene_name_separators"]), + boost::token_compress_on); string species_name; - if ((int)scalar_parameter["species_field"]==-1) - { - species_name=tokens[1]; - for (int fi=2;fi g_id2i; - //XX ancestral_correction .. - for (int i=0;i<(int)g_ids.size();i++) - { - long int g_id=g_ids[i]; - g_id2i[g_id]=i; - - if (not ( i<(int)uq.size() ) ) - { - vector tmp; + // map g_id2i; + // XX ancestral_correction .. + for (int i = 0; i < (int)g_ids.size(); i++) { + long int g_id = g_ids[i]; + g_id2i[g_id] = i; + + if (not(i < (int)uq.size())) { + vector tmp; uq.push_back(tmp); - vector tmp2; + vector tmp2; mPTuq_ancestral_correction.push_back(tmp2); mPTuq.push_back(0); - } - else - mPTuq[i]=0; - - for (int e=0;e gp_is; - vector gpp_is; - vector p_part; - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale->Dip_counts[g_id].begin(); kt != ale->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - int gp_i=g_id2i[parts.first]; - int gpp_i=g_id2i[parts.second]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - if (ale->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back( pow( (scalar_type) ale->p_dip(g_id,gp_id,gpp_id) , (scalar_type) scalar_parameter["seq_beta"] ) );//set pp - } - else - { - //XX - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale->Bip_counts.begin(); it != ale->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale->id_sets.at(gp_id); + bool is_a_leaf = false; + long int g_id = g_ids[i]; + if (g_id_sizes[i] == 1) + is_a_leaf = true; + vector gp_is; + vector gpp_is; + vector p_part; + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale->Dip_counts[g_id].begin(); + kt != ale->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + int gp_i = g_id2i[parts.first]; + int gpp_i = g_id2i[parts.second]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + if (ale->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back( + pow((scalar_type)ale->p_dip(g_id, gp_id, gpp_id), + (scalar_type)scalar_parameter["seq_beta"])); // set pp + } + else { + // XX + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = ale->Bip_counts.begin(); + it != ale->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale->id_sets.at(gp_id); boost::dynamic_bitset<> not_gamma = ~gamma; not_gamma[0] = 0; long int gpp_id = ale->set_ids.at(not_gamma); - set parts; + set parts; parts.insert(gp_id); parts.insert(gpp_id); - bip_parts[parts]=1; + bip_parts[parts] = 1; // gamma.clear(); // not_gamma.clear(); } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { parts.push_back((*sit)); } - long int gp_id=parts[0]; - //long int gpp_id=parts[1]; + long int gp_id = parts[0]; + // long int gpp_id=parts[1]; - int gp_i=g_id2i[parts[0]]; - int gpp_i=g_id2i[parts[1]]; + int gp_i = g_id2i[parts[0]]; + int gpp_i = g_id2i[parts[1]]; gp_is.push_back(gp_i); gpp_is.push_back(gpp_i); - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale->Gamma_size<4) - p_part.push_back(0); + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale->Bip_counts[gp_id] <= scalar_parameter.at("min_bip_count") and + not ale->Gamma_size < 4) + p_part.push_back(0); else - p_part.push_back(pow ( (scalar_type) ale->p_bip(gp_id) , (scalar_type) scalar_parameter["seq_beta"] ) );//set pp + p_part.push_back( + pow((scalar_type)ale->p_bip(gp_id), + (scalar_type)scalar_parameter["seq_beta"])); // set pp } bip_parts.clear(); } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### - for (int e=0;e::iterator it=ancestral[e].begin(); it!=ancestral[e].end();it++) - for (vector::iterator it=ancestors[e].begin(); it!=ancestors[e].end();it++) - { - //int f=(*it).first; - int f=(*it); - //if (ancestral[e][f]==1) - mPTuq_ancestral_correction[i][e]+=(wT[f])*uq_sum; + if (not no_T) + uq_sum += + ((mPTuq[i] - mPTuq_ancestral_correction[i][e]) / tau_norm[e] * + uE[e] + + uq[i][e] * (mPTE - mPTE_ancestral_correction[e]) / tau_norm[e]); + if (uq_sum < EPSILON) + uq_sum = EPSILON; + uq[i][e] = uq_sum; + new_mPTuq += (wT[e]) * uq_sum; + mPTuq_ancestral_correction[i][e] = 0; + // for (map::iterator it=ancestral[e].begin(); + // it!=ancestral[e].end();it++) + for (vector::iterator it = ancestors[e].begin(); + it != ancestors[e].end(); it++) { + // int f=(*it).first; + int f = (*it); + // if (ancestral[e][f]==1) + mPTuq_ancestral_correction[i][e] += (wT[f]) * uq_sum; } } - mPTuq[i]=new_mPTuq; + mPTuq[i] = new_mPTuq; - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### } - survive=0; - root_sum=0; - bool single_O=false; - for (int e=0;e0) - { - vector_parameter["rate_multiplier_O"][e]=0; - } - else - { - vector_parameter["rate_multiplier_O"][e]=1; - } - } - } - - for (int e=0;e 0) { + vector_parameter["rate_multiplier_O"][e] = 0; + } else { + vector_parameter["rate_multiplier_O"][e] = 1; + } } - //cout << root_sum/survive << endl; + } - + for (int e = 0; e < last_branch; e++) { + scalar_type O_p = vector_parameter["rate_multiplier_O"][e]; + if (e == (last_branch - 1) and O_p == 1) + O_p = scalar_parameter["O_R"]; + root_sum += uq[root_i][e] * O_p; + survive += O_p * (1 - uE[e]); + } + for (int e = 0; e < last_branch; e++) { + scalar_type O_p = vector_parameter["rate_multiplier_O"][e]; + if (e == (last_branch - 1) and O_p == 1) + O_p = scalar_parameter["O_R"]; + branch_counts["O_LL"].at(e) = log(uq[root_i][e]) + log(O_p); + } + // cout << root_sum/survive << endl; } - return root_sum*last_branch/survive; - + return root_sum * last_branch / survive; } -string exODT_model::sample_undated(bool no_T) -{ +string exODT_model::sample_undated(bool no_T) { - scalar_type r=RandomTools::giveRandomNumberBetweenZeroAndEntry(1); + scalar_type r = RandomTools::giveRandomNumberBetweenZeroAndEntry(1); - scalar_type root_sum=0; - scalar_type O_norm=0; - for (int e=0;eBip_counts.count(g_id) and ale_pointer->Bip_counts[g_id]>0) - bl<Bip_bls[g_id]/ale_pointer->Bip_counts[g_id],(scalar_type)scalar_parameter["min_branch_lenghts"]); + if (ale_pointer->Bip_counts.count(g_id) and ale_pointer->Bip_counts[g_id] > 0) + bl << max(ale_pointer->Bip_bls[g_id] / ale_pointer->Bip_counts[g_id], + (scalar_type)scalar_parameter["min_branch_lenghts"]); else - bl<Bip_bls[g_id]/ale_pointer->observations,(scalar_type)scalar_parameter["min_branch_lenghts"]); - string branch_length=bl.str(); - - vector gp_is; - vector gpp_is; - vector p_part; - if (g_id!=-1) - for (unordered_map< pair,scalar_type> :: iterator kt = ale_pointer->Dip_counts[g_id].begin(); kt != ale_pointer->Dip_counts[g_id].end(); kt++) - { - pair parts = (*kt).first; - long int gp_id=parts.first; - long int gpp_id=parts.second; - int gp_i=g_id2i[parts.first]; - int gpp_i=g_id2i[parts.second]; - gp_is.push_back(gp_i); - gpp_is.push_back(gpp_i); - if (ale_pointer->Bip_counts[g_id]<=scalar_parameter["min_bip_count"]) - p_part.push_back(0); - else - p_part.push_back( pow( (scalar_type) ale_pointer->p_dip(g_id,gp_id,gpp_id) , (scalar_type) scalar_parameter["seq_beta"] ) );//set pp - } - else - { - //root bipartition needs to be handled separately - map,int> bip_parts; - for (map :: iterator it = ale_pointer->Bip_counts.begin(); it != ale_pointer->Bip_counts.end(); it++) - { - long int gp_id=(*it).first; - boost::dynamic_bitset<> gamma =ale_pointer->id_sets.at(gp_id); + bl << max(ale_pointer->Bip_bls[g_id] / ale_pointer->observations, + (scalar_type)scalar_parameter["min_branch_lenghts"]); + string branch_length = bl.str(); + + vector gp_is; + vector gpp_is; + vector p_part; + if (g_id != -1) + for (unordered_map, scalar_type>::iterator kt = + ale_pointer->Dip_counts[g_id].begin(); + kt != ale_pointer->Dip_counts[g_id].end(); kt++) { + pair parts = (*kt).first; + long int gp_id = parts.first; + long int gpp_id = parts.second; + int gp_i = g_id2i[parts.first]; + int gpp_i = g_id2i[parts.second]; + gp_is.push_back(gp_i); + gpp_is.push_back(gpp_i); + if (ale_pointer->Bip_counts[g_id] <= scalar_parameter["min_bip_count"]) + p_part.push_back(0); + else + p_part.push_back( + pow((scalar_type)ale_pointer->p_dip(g_id, gp_id, gpp_id), + (scalar_type)scalar_parameter["seq_beta"])); // set pp + } + else { + // root bipartition needs to be handled separately + map, int> bip_parts; + for (map::iterator it = + ale_pointer->Bip_counts.begin(); + it != ale_pointer->Bip_counts.end(); it++) { + long int gp_id = (*it).first; + boost::dynamic_bitset<> gamma = ale_pointer->id_sets.at(gp_id); boost::dynamic_bitset<> not_gamma = ~gamma; not_gamma[0] = 0; long int gpp_id = ale_pointer->set_ids.at(not_gamma); - set parts; + set parts; parts.insert(gp_id); parts.insert(gpp_id); - bip_parts[parts]=1; + bip_parts[parts] = 1; // gamma.clear(); // not_gamma.clear(); } - for (map,int> :: iterator kt = bip_parts.begin();kt!=bip_parts.end();kt++) - { - vector parts; - for (set::iterator sit=(*kt).first.begin();sit!=(*kt).first.end();sit++) { + for (map, int>::iterator kt = bip_parts.begin(); + kt != bip_parts.end(); kt++) { + vector parts; + for (set::iterator sit = (*kt).first.begin(); + sit != (*kt).first.end(); sit++) { parts.push_back((*sit)); } - long int gp_id=parts[0]; - //long int gpp_id=parts[1]; + long int gp_id = parts[0]; + // long int gpp_id=parts[1]; - int gp_i=g_id2i[parts[0]]; - int gpp_i=g_id2i[parts[1]]; + int gp_i = g_id2i[parts[0]]; + int gpp_i = g_id2i[parts[1]]; gp_is.push_back(gp_i); gpp_is.push_back(gpp_i); - //Here we can create a new ale->Bip_counts[gp_id], in particular for leaves. - //We may want to add the leaf entries for Bip_counts when Bip_counts is first created. - if (ale_pointer->Bip_counts[gp_id]<=scalar_parameter.at("min_bip_count") and not ale_pointer->Gamma_size<4) - p_part.push_back(0); + // Here we can create a new ale->Bip_counts[gp_id], in particular for + // leaves. We may want to add the leaf entries for Bip_counts when + // Bip_counts is first created. + if (ale_pointer->Bip_counts[gp_id] <= + scalar_parameter.at("min_bip_count") and + not ale_pointer->Gamma_size < 4) + p_part.push_back(0); else - p_part.push_back( pow ( (scalar_type) ale_pointer->p_bip(gp_id) , (scalar_type) scalar_parameter["seq_beta"] ) );//set pp + p_part.push_back( + pow((scalar_type)ale_pointer->p_bip(gp_id), + (scalar_type)scalar_parameter["seq_beta"])); // set pp } bip_parts.clear(); } - - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### - scalar_type uq_sum=0; + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### + scalar_type uq_sum = 0; // S leaf and G leaf - if (eset2name(ale_pointer->id_sets[g_id])+branch_string+":"+branch_length; + uq_resum += PS[e] * 1 + EPSILON; + if (r * uq_sum < uq_resum) { + register_leafu(e, last_event); + return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + branch_string + + ":" + branch_length; } } // G internal - if (not is_a_leaf) - { - int N_parts=gp_is.size(); - for (int i=0;i"<set2name(ale_pointer->id_sets[g_ids[gpp_i]]); - Ttokens.push_back(Ttoken.str()); - - return "("+sample_undated(e,gp_i,"S","",no_T)+","+sample_undated(f,gpp_i,"T","",no_T)+").T@"+estr+"->"+fstr+branch_string+":"+branch_length; - } - uq_resum+=uq[gpp_i][e]*(wT[f]/tau_norm[e])*uq[gp_i][f]*pp+EPSILON; - if (r*uq_sum"<set2name(ale_pointer->id_sets[g_ids[gp_i]]); - Ttokens.push_back(Ttoken.str()); - return "("+sample_undated(e,gpp_i,"S","",no_T)+","+sample_undated(f,gp_i,"T","",no_T)+").T@"+estr+"->"+fstr+branch_string+":"+branch_length; + for (int f = 0; f < last_branch; f++) + if (not ancestral[e][f] and not no_T) { + stringstream fstring; + if (not(f < last_leaf)) + fstring << f; + else + fstring << extant_species[f]; + string fstr = fstring.str(); + + uq_resum += + uq[gp_i][e] * (wT[f] / tau_norm[e]) * uq[gpp_i][f] * pp + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gpp_i]]); + Ttokens.push_back(Ttoken.str()); + + return "(" + sample_undated(e, gp_i, "S", "", no_T) + "," + + sample_undated(f, gpp_i, "T", "", no_T) + ").T@" + estr + + "->" + fstr + branch_string + ":" + branch_length; + } + uq_resum += + uq[gpp_i][e] * (wT[f] / tau_norm[e]) * uq[gp_i][f] * pp + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gp_i]]); + Ttokens.push_back(Ttoken.str()); + return "(" + sample_undated(e, gpp_i, "S", "", no_T) + "," + + sample_undated(f, gp_i, "T", "", no_T) + ").T@" + estr + + "->" + fstr + branch_string + ":" + branch_length; + } } - - } } } - if (not (e"<set2name(ale_pointer->id_sets[g_id]); - Ttokens.push_back(Ttoken.str()); - */ - register_L(e); - return sample_undated(f,i,"T",".T@"+estr+"->"+fstr+branch_string,no_T); - } - uq_resum+=(wT[f]/tau_norm[e])*uE[f]*uq[i][e]+EPSILON; - if (r*uq_sum"<set2name(ale_pointer->id_sets[g_id]); + Ttokens.push_back(Ttoken.str()); + */ + register_L(e); + return sample_undated(f, i, "T", + ".T@" + estr + "->" + fstr + branch_string, no_T); + } + uq_resum += (wT[f] / tau_norm[e]) * uE[f] * uq[i][e] + EPSILON; + if (r * uq_sum < uq_resum) { + return sample_undated(e, i, "S", "", no_T); + } } - } - //###################################################################################################################### - //#########################################INNNER LOOP################################################################## - //###################################################################################################################### + // ###################################################################################################################### + // #########################################INNNER + // LOOP################################################################## + // ###################################################################################################################### cout << "sum error!" << endl; return "-!=-"; } -string exODT_model::counts_string_undated(scalar_type samples) -{ +string exODT_model::counts_string_undated(scalar_type samples) { stringstream out; - for (int e=0;e-1) - { - int f=daughter[e]; - int g=son[e]; - if (last_event=="S" or last_event=="O") branch_counts["singleton"].at(e)+=1; - branch_counts["copies"].at(e)+=1; - if (branch_counts["saw"].at(e)==0) branch_counts["presence"].at(e)+=1; - branch_counts["saw"].at(e)=1; - - branch_counts["count"].at(f)+=1; - branch_counts["count"].at(g)+=1; +void exODT_model::register_Su(int e, string last_event) { + MLRec_events["S"] += 1; + if (e > -1) { + int f = daughter[e]; + int g = son[e]; + if (last_event == "S" or last_event == "O") + branch_counts["singleton"].at(e) += 1; + branch_counts["copies"].at(e) += 1; + if (branch_counts["saw"].at(e) == 0) + branch_counts["presence"].at(e) += 1; + branch_counts["saw"].at(e) = 1; + + branch_counts["count"].at(f) += 1; + branch_counts["count"].at(g) += 1; } } -void exODT_model::register_leafu(int e,string last_event) -{ - if (e>-1) - { - branch_counts["copies"].at(e)+=1; - if (branch_counts["saw"].at(e)==0) branch_counts["presence"].at(e)+=1; - branch_counts["saw"].at(e)=1; - if (last_event=="S" or last_event=="O") branch_counts["singleton"].at(e)+=1; +void exODT_model::register_leafu(int e, string last_event) { + if (e > -1) { + branch_counts["copies"].at(e) += 1; + if (branch_counts["saw"].at(e) == 0) + branch_counts["presence"].at(e) += 1; + branch_counts["saw"].at(e) = 1; + if (last_event == "S" or last_event == "O") + branch_counts["singleton"].at(e) += 1; } - //MLRec_events["genes"]+=1; + // MLRec_events["genes"]+=1; } -void exODT_model::register_T_to_from(int e,int f) -{ - T_to_from[e][f]+=1; -} +void exODT_model::register_T_to_from(int e, int f) { T_to_from[e][f] += 1; } -void exODT_model::reset_T_to_from() -{ - for (int e=0;egetRootNode(); + vector nodes = TreeTemplateTools::getNodes(*newS_root); -string exODT_model::feSPR(int e, int f) -{ - tree_type * newS=TreeTemplateTools::parenthesisToTree(string_parameter["S_un"], (string_parameter["BOOTSTRAP_LABELS"]=="yes")); - Node * newS_root = newS->getRootNode(); - vector nodes = TreeTemplateTools::getNodes(*newS_root); - - string e_name=node_name[id_nodes[e]]; - string f_name=node_name[id_nodes[f]];; + string e_name = node_name[id_nodes[e]]; + string f_name = node_name[id_nodes[f]]; + ; Node *e_node, *f_node; - - for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) - { + for (vector::iterator it = nodes.begin(); it != nodes.end(); it++) { string name_it; - if ((*it)->isLeaf()) - { - name_it=(*it)->getName(); - } - else - { - vector leafnames=TreeTemplateTools::getLeavesNames(*(*it)); - sort(leafnames.begin(),leafnames.end()); + if ((*it)->isLeaf()) { + name_it = (*it)->getName(); + } else { + vector leafnames = TreeTemplateTools::getLeavesNames(*(*it)); + sort(leafnames.begin(), leafnames.end()); stringstream name; - for (vector ::iterator st=leafnames.begin();st!=leafnames.end();st++ ) - name<<(*st)<<"."; + for (vector::iterator st = leafnames.begin(); + st != leafnames.end(); st++) + name << (*st) << "."; - name_it=name.str(); + name_it = name.str(); } - if (name_it==e_name) e_node=(*it); - if (name_it==f_name) f_node=(*it); + if (name_it == e_name) + e_node = (*it); + if (name_it == f_name) + f_node = (*it); } - if (e==f) return string_parameter["S_un"]; + if (e == f) + return string_parameter["S_un"]; - bool e_below_f=false; - Node * node; - node=e_node; - while (node->hasFather()) - { - node=node->getFather(); - if (node==f_node) e_below_f=true; + bool e_below_f = false; + Node *node; + node = e_node; + while (node->hasFather()) { + node = node->getFather(); + if (node == f_node) + e_below_f = true; } - if (e_below_f) - { - Node * swap_tmp=e_node; - e_node=f_node; - f_node=swap_tmp; + if (e_below_f) { + Node *swap_tmp = e_node; + e_node = f_node; + f_node = swap_tmp; } - if (f_node->hasFather() and f_node->getFather()==e_node ) return string_parameter["S_un"]; - - Node * f_father=f_node->getFather(); - vector f_sons=f_father->getSons(); - Node * f_sister; - if (f_sons[0]==f_node) f_sister=f_sons[1]; else f_sister=f_sons[0]; + if (f_node->hasFather() and f_node->getFather() == e_node) + return string_parameter["S_un"]; + + Node *f_father = f_node->getFather(); + vector f_sons = f_father->getSons(); + Node *f_sister; + if (f_sons[0] == f_node) + f_sister = f_sons[1]; + else + f_sister = f_sons[0]; f_father->removeSon(f_sister); - if (f_father->hasFather()) - { - Node * f_grand_father=f_father->getFather(); + if (f_father->hasFather()) { + Node *f_grand_father = f_father->getFather(); f_grand_father->removeSon(f_father); f_grand_father->addSon(f_sister); - } - else - { + } else { newS->setRootNode(f_sister); } - if (e_node->hasFather()) - { - Node * e_father=e_node->getFather(); + if (e_node->hasFather()) { + Node *e_father = e_node->getFather(); e_father->removeSon(e_node); e_father->addSon(f_father); - } - else - newS->setRootNode(f_father); + } else + newS->setRootNode(f_father); f_father->addSon(e_node); - //for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) (*it)->setDistanceToFather(1); + // for (vector ::iterator it=nodes.begin();it!=nodes.end();it++ ) + // (*it)->setDistanceToFather(1); //? - - return TreeTemplateTools::treeToParenthesis(*newS,false,"ID"); + + return TreeTemplateTools::treeToParenthesis(*newS, false, "ID"); } -vector exODT_model::NNIs(int e) -{ +vector exODT_model::NNIs(int e) { vector NNIs; - int left_e,right_e,f; + int left_e, right_e, f; - Node * root = id_nodes[e]; + Node *root = id_nodes[e]; - if (root->isLeaf()) return NNIs; + if (root->isLeaf()) + return NNIs; - vector roots_sons=root->getSons(); + vector roots_sons = root->getSons(); - right_e=node_ids[roots_sons[0]]; - left_e=node_ids[roots_sons[1]]; + right_e = node_ids[roots_sons[0]]; + left_e = node_ids[roots_sons[1]]; if (roots_sons[0]->isLeaf()) - ; - else - { - vector right_sons=roots_sons[0]->getSons(); - f=node_ids[right_sons[0]]; - NNIs.push_back(feSPR(left_e,f)); - f=node_ids[right_sons[1]]; - NNIs.push_back(feSPR(left_e,f)); + ; + else { + vector right_sons = roots_sons[0]->getSons(); + f = node_ids[right_sons[0]]; + NNIs.push_back(feSPR(left_e, f)); + f = node_ids[right_sons[1]]; + NNIs.push_back(feSPR(left_e, f)); } if (roots_sons[1]->isLeaf()) - ; - else - { - vector left_sons=roots_sons[1]->getSons(); - f=node_ids[left_sons[0]]; - NNIs.push_back(feSPR(right_e,f)); - f=node_ids[left_sons[1]]; - NNIs.push_back(feSPR(right_e,f)); + ; + else { + vector left_sons = roots_sons[1]->getSons(); + f = node_ids[left_sons[0]]; + NNIs.push_back(feSPR(right_e, f)); + f = node_ids[left_sons[1]]; + NNIs.push_back(feSPR(right_e, f)); } return NNIs; - } diff --git a/src/wALE_ml_sample.cpp b/src/wALE_ml_sample.cpp index 2689065..63a6378 100644 --- a/src/wALE_ml_sample.cpp +++ b/src/wALE_ml_sample.cpp @@ -1,197 +1,202 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; - -class p_fun: - public virtual Function, - public AbstractParametrizable -{ + +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - exODT_model* model_pointer; - approx_posterior* ale_pointer; + exODT_model *model_pointer; + approx_posterior *ale_pointer; + public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, double lambda_start = 0.1) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); } - - p_fun* clone() const { return new p_fun(*this); } - + + p_fun *clone() const { return new p_fun(*this); } + public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { + void setParameters(const ParameterList &pl) throw(ParameterNotFoundException, + ConstraintException, + Exception) { matchParametersValues(pl); } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { + double getValue() const throw(Exception) { return fval_; } + void fireParameterChanged(const ParameterList &pl) { double delta = getParameterValue("delta"); double tau = getParameterValue("tau"); double lambda = getParameterValue("lambda"); - model_pointer->set_model_parameter("delta",delta); - model_pointer->set_model_parameter("tau",tau); - model_pointer->set_model_parameter("lambda",lambda); + model_pointer->set_model_parameter("delta", delta); + model_pointer->set_model_parameter("tau", tau); + model_pointer->set_model_parameter("lambda", lambda); model_pointer->calculate_EGb(); - double y=-log(model_pointer->p(ale_pointer)); - cout <observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); + + int D = 4; model->set_model_parameter("gene_name_separators", "."); - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); - model->set_model_parameter("min_D",D); - model->set_model_parameter("grid_delta_t",0.05); + model->set_model_parameter("min_D", D); + model->set_model_parameter("grid_delta_t", 0.05); model->construct(Sstring); - model->set_model_parameter("event_node",0); - model->set_model_parameter("leaf_events",1); - model->set_model_parameter("N",1); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; - if (argc>6) - delta=atof(argv[4]),tau=atof(argv[5]),lambda=atof(argv[6]); + model->set_model_parameter("event_node", 0); + model->set_model_parameter("leaf_events", 1); + model->set_model_parameter("N", 1); + + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; + if (argc > 6) + delta = atof(argv[4]), tau = atof(argv[5]), lambda = atof(argv[6]); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); model->set_model_parameter("sigma_hat", 1); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 cout << "#Mapping begin:" << endl; - for (int branch=0;branchlast_branch;branch++) - { - stringstream named_branch; - if (branch==model->alpha) - named_branch<<-1; - else if (model->id_ranks[branch]==0) - named_branch<extant_species[branch]; + for (int branch = 0; branch < model->last_branch; branch++) { + stringstream named_branch; + if (branch == model->alpha) + named_branch << -1; + else if (model->id_ranks[branch] == 0) + named_branch << model->extant_species[branch]; + else { + if (model->rank2label[model->id_ranks[branch]] != -1) + named_branch << model->rank2label[model->id_ranks[branch]]; else - { - if (model->rank2label[model->id_ranks[branch]]!=-1) - named_branch<rank2label[model->id_ranks[branch]]; - else - named_branch<<"ROOT"; - } - cout << named_branch.str() << "->" << branch << endl; + named_branch << "ROOT"; } + cout << named_branch.str() << "->" << branch << endl; + } cout << "#mapping end." << endl; model->calculate_EGb(); - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); - optimizer->setMessageHandler(0); - optimizer->setVerbose(2); - - optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - - // FunctionStopCondition stop(optimizer, 1);//1e-1); - // optimizer->setStopCondition(stop); - //TEMP - //optimizer->setMaximumNumberOfEvaluations( 10 ); - - optimizer->optimize(); - - //optimizer->getParameters().printParameters(cout); - delta=optimizer->getParameterValue("delta"); - tau=optimizer->getParameterValue("tau"); - lambda=optimizer->getParameterValue("lambda"); - - scalar_type mlll=-optimizer->getFunctionValue(); - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="< res = model->p_MLRec(ale); - //and output it.. - string outname=S_tree_name+".ml_rec"; - - ofstream fout( outname.c_str() ); - fout << "#ALEml using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <counts_string(); - - cout << "Results in: " << outname << endl; - - } + cout << "Reconciliation model initialised, starting DTL rate optimisation" + << ".." << endl; + + if (true) { + + // we use the Nelder–Mead method implemented in Bio++ + Function *f = new p_fun(model, ale, delta, tau, lambda); + Optimizer *optimizer = new DownhillSimplexMethod(f); + + optimizer->setProfiler(0); + optimizer->setMessageHandler(0); + optimizer->setVerbose(2); + + optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. + + // FunctionStopCondition stop(optimizer, 1);//1e-1); + // optimizer->setStopCondition(stop); + // TEMP + // optimizer->setMaximumNumberOfEvaluations( 10 ); + + optimizer->optimize(); + + // optimizer->getParameters().printParameters(cout); + delta = optimizer->getParameterValue("delta"); + tau = optimizer->getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + + scalar_type mlll = -optimizer->getFunctionValue(); + cout << endl + << "ML rates: " + << " delta=" << delta << "; tau=" << tau << "; lambda=" << lambda + << "." << endl; + cout << "LL=" << mlll << endl; + + cout << "Calculating ML reconciled gene tree.." << endl; + + pair res = model->p_MLRec(ale); + // and output it.. + string outname = S_tree_name + ".ml_rec"; + + ofstream fout(outname.c_str()); + fout << "#ALEml using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl + << endl; + fout << "S:\t" << model->string_parameter["S_with_ranks"] << endl; + fout << endl; + fout << "Input ale from:\t" << ale_file << endl; + fout << "rate of\t Duplications\tTransfers\tLosses" << endl; + fout << "ML \t" << delta << "\t" << tau << "\t" << lambda << endl; + fout << endl; + + fout << "reconciled G:\t" << res.first << endl; + fout << endl; + fout << "# of\t Duplications\tTransfers\tLosses\tSpeciations" << endl; + fout << "Total \t" << model->MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; + fout << endl; + fout << "# of\t Duplications\tTransfers\tLosses\tgene copies" << endl; + fout << model->counts_string(); + + cout << "Results in: " << outname << endl; + } cout << "strating sampling.. " << endl; - string sample_name=S_tree_name+".ml_samples"; - ofstream sample_out( sample_name.c_str() ); + string sample_name = S_tree_name + ".ml_samples"; + ofstream sample_out(sample_name.c_str()); - string hist_name=S_tree_name+".hist"; + string hist_name = S_tree_name + ".hist"; - map > > event_histograms; + map>> event_histograms; vector event_types; event_types.push_back("Os"); event_types.push_back("Ds"); @@ -199,77 +204,75 @@ int main(int argc, char ** argv) event_types.push_back("Ls"); event_types.push_back("Tfroms"); event_types.push_back("copies"); - scalar_type samples_in_histogram=0; - - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - for (int i=0;i<100;++i) - event_histograms[(*et)][branch][i]=0; - //model->branch_counts[(*et)][branch]; - } + scalar_type samples_in_histogram = 0; + + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + for (int i = 0; i < 100; ++i) + event_histograms[(*et)][branch][i] = 0; + // model->branch_counts[(*et)][branch]; + } samples_in_histogram++; - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - event_histograms[(*et)][branch][ model->branch_counts[(*et)][branch] ]++; - model->branch_counts[(*et)][branch]=0; - } + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + event_histograms[(*et)][branch][model->branch_counts[(*et)][branch]]++; + model->branch_counts[(*et)][branch] = 0; + } cout << model->p(ale) << endl; - - for (int i =0; i < 10000; i++ ) - { - string sample_tree=model->sample(false); - //cout << sample_tree << endl; - samples_in_histogram++; - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - event_histograms[(*et)][branch][ model->branch_counts[(*et)][branch] ]++; - model->branch_counts[(*et)][branch]=0; - } - sample_out << sample_tree << endl; - } + + for (int i = 0; i < 10000; i++) { + string sample_tree = model->sample(false); + // cout << sample_tree << endl; + samples_in_histogram++; + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + event_histograms[(*et)][branch][model->branch_counts[(*et)][branch]]++; + model->branch_counts[(*et)][branch] = 0; + } + sample_out << sample_tree << endl; + } cout << "sampling done." << endl; - - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - { - ofstream h_out( S_tree_name+"_"+((*et)+".h").c_str() ); - h_out << "# Fraction of samples with number of " << (*et) << " in " <last_branch;branch++) - { - - stringstream named_branch; - if (branch==model->alpha) - named_branch<<-1; - else if (model->id_ranks[branch]==0) - named_branch<extant_species[branch]; - else - { - if (model->rank2label[model->id_ranks[branch]]!=-1) - named_branch<rank2label[model->id_ranks[branch]]; - else - named_branch<<"ROOT"; - } - - h_out<t_begin[branch]; - h_out<<"\t"<t_end[branch]; - h_out<<"\t"<branch_counts[(*et)][branch]; - } + + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) { + ofstream h_out(S_tree_name + "_" + ((*et) + ".h").c_str()); + h_out << "# Fraction of samples with number of " << (*et) << " in " + << samples_in_histogram << " samples for each branch of S." << endl; + h_out << "#id\ttb\tte\trnk"; + for (int i = 0; i < 10; ++i) + h_out << "\t" << i; + h_out << endl; + + for (int branch = 0; branch < model->last_branch; branch++) { + + stringstream named_branch; + if (branch == model->alpha) + named_branch << -1; + else if (model->id_ranks[branch] == 0) + named_branch << model->extant_species[branch]; + else { + if (model->rank2label[model->id_ranks[branch]] != -1) + named_branch << model->rank2label[model->id_ranks[branch]]; + else + named_branch << "ROOT"; + } + + h_out << branch; + h_out << "\t" << model->t_begin[branch]; + h_out << "\t" << model->t_end[branch]; + h_out << "\t" << named_branch.str(); + for (int i = 0; i < 10; ++i) + h_out << "\t" + << event_histograms[(*et)][branch][i] / samples_in_histogram; + h_out << endl; + // model->branch_counts[(*et)][branch]; } + } return 0; } - diff --git a/src/wALE_ml_sample_undated.cpp b/src/wALE_ml_sample_undated.cpp index 0639727..78298cf 100644 --- a/src/wALE_ml_sample_undated.cpp +++ b/src/wALE_ml_sample_undated.cpp @@ -1,174 +1,180 @@ -#include "exODT.h" #include "ALE_util.h" +#include "exODT.h" -#include -#include #include #include +#include +#include using namespace std; using namespace bpp; - -class p_fun: - public virtual Function, - public AbstractParametrizable -{ + +class p_fun : public virtual Function, public AbstractParametrizable { private: double fval_; - exODT_model* model_pointer; - approx_posterior* ale_pointer; + exODT_model *model_pointer; + approx_posterior *ale_pointer; + public: - p_fun(exODT_model* model,approx_posterior* ale, double delta_start=0.01,double tau_start=0.01,double lambda_start=0.1) : AbstractParametrizable(""), fval_(0), model_pointer(model), ale_pointer(ale) - { - //We declare parameters here: - // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); - IntervalConstraint* constraint = new IntervalConstraint ( 1e-6, 10-1e-6, true, true ); - addParameter_( new Parameter("delta", delta_start, constraint) ) ; - addParameter_( new Parameter("tau", tau_start, constraint) ) ; - addParameter_( new Parameter("lambda", lambda_start, constraint) ) ; + p_fun(exODT_model *model, approx_posterior *ale, double delta_start = 0.01, + double tau_start = 0.01, double lambda_start = 0.1) + : AbstractParametrizable(""), fval_(0), model_pointer(model), + ale_pointer(ale) { + // We declare parameters here: + // IncludingInterval* constraint = new IncludingInterval(1e-6, 10-1e-6); + IntervalConstraint *constraint = + new IntervalConstraint(1e-6, 10 - 1e-6, true, true); + addParameter_(new Parameter("delta", delta_start, constraint)); + addParameter_(new Parameter("tau", tau_start, constraint)); + addParameter_(new Parameter("lambda", lambda_start, constraint)); } - - p_fun* clone() const { return new p_fun(*this); } - + + p_fun *clone() const { return new p_fun(*this); } + public: - - void setParameters(const ParameterList& pl) - throw (ParameterNotFoundException, ConstraintException, Exception) - { + void setParameters(const ParameterList &pl) throw(ParameterNotFoundException, + ConstraintException, + Exception) { matchParametersValues(pl); } - double getValue() const throw (Exception) { return fval_; } - void fireParameterChanged(const ParameterList& pl) - { + double getValue() const throw(Exception) { return fval_; } + void fireParameterChanged(const ParameterList &pl) { double delta = getParameterValue("delta"); double tau = getParameterValue("tau"); double lambda = getParameterValue("lambda"); - model_pointer->set_model_parameter("delta",delta); - model_pointer->set_model_parameter("tau",tau); - model_pointer->set_model_parameter("lambda",lambda); + model_pointer->set_model_parameter("delta", delta); + model_pointer->set_model_parameter("tau", tau); + model_pointer->set_model_parameter("lambda", lambda); model_pointer->calculate_undatedEs(); - double y=-log(model_pointer->pun(ale_pointer)); - cout <observations<<" trees from: " << ale_file <<".."<observations + << " trees from: " << ale_file << ".." << endl; + + // we initialise a coarse grained reconciliation model for calculating the sum + exODT_model *model = new exODT_model(); cout << "o" << endl; - int D=4; + int D = 4; model->set_model_parameter("gene_name_separators", "."); - model->set_model_parameter("BOOTSTRAP_LABELS","yes"); + model->set_model_parameter("BOOTSTRAP_LABELS", "yes"); model->construct_undated(Sstring); - - //a set of inital rates - scalar_type delta=0.01,tau=0.01,lambda=0.1; - if (argc>6) - delta=atof(argv[4]),tau=atof(argv[5]),lambda=atof(argv[6]); + // a set of inital rates + scalar_type delta = 0.01, tau = 0.01, lambda = 0.1; + if (argc > 6) + delta = atof(argv[4]), tau = atof(argv[5]), lambda = atof(argv[6]); model->set_model_parameter("delta", delta); model->set_model_parameter("tau", tau); model->set_model_parameter("lambda", lambda); - //calculate_EGb() must always be called after changing rates to calculate E-s and G-s - //cf. http://arxiv.org/abs/1211.4606 + // calculate_EGb() must always be called after changing rates to calculate E-s + // and G-s cf. http://arxiv.org/abs/1211.4606 model->calculate_undatedEs(); - cout << "Reconciliation model initialised, starting DTL rate optimisation" <<".."<setProfiler(0); - optimizer->setMessageHandler(0); - optimizer->setVerbose(2); - - optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); - optimizer->init(f->getParameters()); //Here we optimize all parameters, and start with the default values. - - // FunctionStopCondition stop(optimizer, 1);//1e-1); - // optimizer->setStopCondition(stop); - //TEMP - //optimizer->setMaximumNumberOfEvaluations( 10 ); - - optimizer->optimize(); - - //optimizer->getParameters().printParameters(cout); - delta=optimizer->getParameterValue("delta"); - tau=optimizer->getParameterValue("tau"); - lambda=optimizer->getParameterValue("lambda"); - - scalar_type mlll=-optimizer->getFunctionValue(); - cout << endl << "ML rates: " << " delta=" << delta << "; tau=" << tau << "; lambda="<sample_undated(); - //and output it.. - string outname=S_tree_name+".uml_rec"; - - ofstream fout( outname.c_str() ); - fout << "#ALEml using ALE v"<< ALE_VERSION <<" by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;"<string_parameter["S_with_ranks"] <MLRec_events["D"] << "\t" << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"]<< "\t" << model->MLRec_events["S"] <counts_string(); - - cout << "Results in: " << outname << endl; - - } + cout << "Reconciliation model initialised, starting DTL rate optimisation" + << ".." << endl; + + if (true) { + + // we use the Nelder–Mead method implemented in Bio++ + Function *f = new p_fun(model, ale, delta, tau, lambda); + Optimizer *optimizer = new DownhillSimplexMethod(f); + + optimizer->setProfiler(0); + optimizer->setMessageHandler(0); + optimizer->setVerbose(2); + + optimizer->setConstraintPolicy(AutoParameter::CONSTRAINTS_AUTO); + optimizer->init(f->getParameters()); // Here we optimize all parameters, and + // start with the default values. + + // FunctionStopCondition stop(optimizer, 1);//1e-1); + // optimizer->setStopCondition(stop); + // TEMP + // optimizer->setMaximumNumberOfEvaluations( 10 ); + + optimizer->optimize(); + + // optimizer->getParameters().printParameters(cout); + delta = optimizer->getParameterValue("delta"); + tau = optimizer->getParameterValue("tau"); + lambda = optimizer->getParameterValue("lambda"); + + scalar_type mlll = -optimizer->getFunctionValue(); + cout << endl + << "ML rates: " + << " delta=" << delta << "; tau=" << tau << "; lambda=" << lambda + << "." << endl; + cout << "LL=" << mlll << endl; + + cout << "Sampling a reconciled gene tree.." << endl; + + string sample = model->sample_undated(); + // and output it.. + string outname = S_tree_name + ".uml_rec"; + + ofstream fout(outname.c_str()); + fout << "#ALEml using ALE v" << ALE_VERSION + << " by Szollosi GJ et al.; ssolo@elte.hu; CC BY-SA 3.0;" << endl + << endl; + fout << "S:\t" << model->string_parameter["S_with_ranks"] << endl; + fout << endl; + fout << "Input ale from:\t" << ale_file << endl; + fout << "rate of\t Duplications\tTransfers\tLosses" << endl; + fout << "ML \t" << delta << "\t" << tau << "\t" << lambda << endl; + fout << endl; + + fout << "reconciled G:\t" << sample << endl; + fout << endl; + fout << "# of\t Duplications\tTransfers\tLosses\tSpeciations" << endl; + fout << "Total \t" << model->MLRec_events["D"] << "\t" + << model->MLRec_events["T"] << "\t" << model->MLRec_events["L"] << "\t" + << model->MLRec_events["S"] << endl; + fout << endl; + fout << "# of\t Duplications\tTransfers\tLosses\tgene copies" << endl; + fout << model->counts_string(); + + cout << "Results in: " << outname << endl; + } cout << "strating sampling.. " << endl; - string sample_name=S_tree_name+".uml_samples"; - ofstream sample_out( sample_name.c_str() ); + string sample_name = S_tree_name + ".uml_samples"; + ofstream sample_out(sample_name.c_str()); - string hist_name=S_tree_name+".hist"; + string hist_name = S_tree_name + ".hist"; - map > > event_histograms; + map>> event_histograms; vector event_types; event_types.push_back("Os"); event_types.push_back("Ds"); @@ -176,76 +182,74 @@ int main(int argc, char ** argv) event_types.push_back("Ls"); event_types.push_back("Tfroms"); event_types.push_back("copies"); - scalar_type samples_in_histogram=0; - - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - for (int i=0;i<100;++i) - event_histograms[(*et)][branch][i]=0; - //model->branch_counts[(*et)][branch]; - } + scalar_type samples_in_histogram = 0; + + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + for (int i = 0; i < 100; ++i) + event_histograms[(*et)][branch][i] = 0; + // model->branch_counts[(*et)][branch]; + } samples_in_histogram++; - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - event_histograms[(*et)][branch][ model->branch_counts[(*et)][branch] ]++; - model->branch_counts[(*et)][branch]=0; - } + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + event_histograms[(*et)][branch][model->branch_counts[(*et)][branch]]++; + model->branch_counts[(*et)][branch] = 0; + } cout << model->pun(ale) << endl; - - for (int i =0; i < 1000; i++ ) - { - string sample_tree=model->sample_undated(); - //cout << sample_tree << endl; - samples_in_histogram++; - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - for (int branch=0;branchlast_branch;branch++) - { - event_histograms[(*et)][branch][ model->branch_counts[(*et)][branch] ]++; - model->branch_counts[(*et)][branch]=0; - } - sample_out << sample_tree << endl; - } + + for (int i = 0; i < 1000; i++) { + string sample_tree = model->sample_undated(); + // cout << sample_tree << endl; + samples_in_histogram++; + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) + for (int branch = 0; branch < model->last_branch; branch++) { + event_histograms[(*et)][branch][model->branch_counts[(*et)][branch]]++; + model->branch_counts[(*et)][branch] = 0; + } + sample_out << sample_tree << endl; + } cout << "sampling done." << endl; - for (vector::iterator et=event_types.begin();et!=event_types.end();++et ) - { - ofstream h_out( S_tree_name+"_"+((*et)+".uh").c_str() ); - h_out << "# Fraction of samples with number of " << (*et) << " in " <last_branch;branch++) - { - - stringstream named_branch; - if (branch==model->alpha) - named_branch<<-1; - else if (branchlast_leaf) - named_branch<extant_species[branch]; - else - { - if (model->rank2label[branch]!=-1) - named_branch<rank2label[branch]; - else - named_branch<<"ROOT"; - } - - h_out<t_begin[branch]; - h_out<<"\t"<t_end[branch]; - h_out<<"\t"<branch_counts[(*et)][branch]; - } + for (vector::iterator et = event_types.begin(); + et != event_types.end(); ++et) { + ofstream h_out(S_tree_name + "_" + ((*et) + ".uh").c_str()); + h_out << "# Fraction of samples with number of " << (*et) << " in " + << samples_in_histogram << " samples for each branch of S." << endl; + h_out << "#id\ttb\tte\trnk"; + for (int i = 0; i < 10; ++i) + h_out << "\t" << i; + h_out << endl; + + for (int branch = 0; branch < model->last_branch; branch++) { + + stringstream named_branch; + if (branch == model->alpha) + named_branch << -1; + else if (branch < model->last_leaf) + named_branch << model->extant_species[branch]; + else { + if (model->rank2label[branch] != -1) + named_branch << model->rank2label[branch]; + else + named_branch << "ROOT"; + } + + h_out << branch; + h_out << "\t" << model->t_begin[branch]; + h_out << "\t" << model->t_end[branch]; + h_out << "\t" << named_branch.str(); + for (int i = 0; i < 10; ++i) + h_out << "\t" + << event_histograms[(*et)][branch][i] / samples_in_histogram; + h_out << endl; + // model->branch_counts[(*et)][branch]; } + } return 0; } - diff --git a/src/wol_host.cpp b/src/wol_host.cpp index 545156d..b76353e 100644 --- a/src/wol_host.cpp +++ b/src/wol_host.cpp @@ -2,63 +2,56 @@ using namespace std; using namespace bpp; -int main(int argc, char ** argv) -{ - ifstream file_stream (argv[1]); +int main(int argc, char **argv) { + ifstream file_stream(argv[1]); string name; - map > keep; - while(! file_stream.eof()) - { - getline (file_stream,name); - if (name.find("_")!=name.npos ) - { - vector tokens; - boost::trim(name); - boost::split(tokens,name,boost::is_any_of(" \t"),boost::token_compress_on); - string sp=tokens[0]; - string small_group=tokens[1]; - string large_group=tokens[2]; - keep[small_group][sp]=1; - keep[large_group][sp]=1; - } - + map> keep; + while (!file_stream.eof()) { + getline(file_stream, name); + if (name.find("_") != name.npos) { + vector tokens; + boost::trim(name); + boost::split(tokens, name, boost::is_any_of(" \t"), + boost::token_compress_on); + string sp = tokens[0]; + string small_group = tokens[1]; + string large_group = tokens[2]; + keep[small_group][sp] = 1; + keep[large_group][sp] = 1; } - for (int i=2;i >::iterator kit= keep.begin() ; kit!=keep.end() ; kit++ ) - { - ifstream file_stream1 (argv[i]); - string name=argv[i]; - vector tokens; - boost::split(tokens,name,boost::is_any_of("."),boost::token_compress_on); - string outname="wol_hosts_"+tokens[0]+"_"+(*kit).first+".tree"; - //string outname="wol_paras_"+tokens[0]+"s_"+(*kit).first+".trees"; - ofstream fout( outname.c_str() ); + } + for (int i = 2; i < argc; i++) { + for (map>::iterator kit = keep.begin(); + kit != keep.end(); kit++) { + ifstream file_stream1(argv[i]); + string name = argv[i]; + vector tokens; + boost::split(tokens, name, boost::is_any_of("."), + boost::token_compress_on); + string outname = "wol_hosts_" + tokens[0] + "_" + (*kit).first + ".tree"; + // string outname="wol_paras_"+tokens[0]+"s_"+(*kit).first+".trees"; + ofstream fout(outname.c_str()); - string tree; - while(! file_stream1.eof()) - { - getline (file_stream1,tree); - if (tree.find(")")!=tree.npos ) - { - - tree_type * T=TreeTemplateTools::parenthesisToTree(tree,false,"ID"); - vector leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - //cout << name << endl; - if (not (*kit).second.count(name)==1) - { - //cout << "drop" << " " < leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + // cout << name << endl; + if (not(*kit).second.count(name) == 1) { + // cout << "drop" << " " < > keep; - while(! file_stream.eof()) - { - getline (file_stream,name); - if (name.find("_")!=name.npos ) - { - vector tokens; - boost::trim(name); - boost::split(tokens,name,boost::is_any_of(" \t"),boost::token_compress_on); - string sp=tokens[0]; - string small_group=tokens[1]; - string large_group=tokens[2]; - keep[small_group][sp]=1; - keep[large_group][sp]=1; - } - + map> keep; + while (!file_stream.eof()) { + getline(file_stream, name); + if (name.find("_") != name.npos) { + vector tokens; + boost::trim(name); + boost::split(tokens, name, boost::is_any_of(" \t"), + boost::token_compress_on); + string sp = tokens[0]; + string small_group = tokens[1]; + string large_group = tokens[2]; + keep[small_group][sp] = 1; + keep[large_group][sp] = 1; } - for (int i=2;i >::iterator kit= keep.begin() ; kit!=keep.end() ; kit++ ) - { - ifstream file_stream1 (argv[i]); - string name=argv[i]; - vector tokens; - boost::split(tokens,name,boost::is_any_of("."),boost::token_compress_on); - //string outname="wol_hosts_"+tokens[0]+"_"+(*kit).first+".tree"; - string outname="wol_paras_"+tokens[0]+"s_"+(*kit).first+".trees"; - ofstream fout( outname.c_str() ); + } + for (int i = 2; i < argc; i++) { + for (map>::iterator kit = keep.begin(); + kit != keep.end(); kit++) { + ifstream file_stream1(argv[i]); + string name = argv[i]; + vector tokens; + boost::split(tokens, name, boost::is_any_of("."), + boost::token_compress_on); + // string outname="wol_hosts_"+tokens[0]+"_"+(*kit).first+".tree"; + string outname = + "wol_paras_" + tokens[0] + "s_" + (*kit).first + ".trees"; + ofstream fout(outname.c_str()); - string tree; - while(! file_stream1.eof()) - { - getline (file_stream1,tree); - if (tree.find(")")!=tree.npos ) - { - - tree_type * T=TreeTemplateTools::parenthesisToTree(tree,false,"ID"); - vector leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - //cout << name << endl; - if (not (*kit).second.count(name)==1) - { - //cout << "drop" << " " < leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + // cout << name << endl; + if (not(*kit).second.count(name) == 1) { + // cout << "drop" << " " < > keep; - while(! file_stream.eof()) - { - getline (file_stream,name); - if (name.find("_")!=name.npos ) - { - vector tokens; - boost::trim(name); - boost::split(tokens,name,boost::is_any_of(" \t"),boost::token_compress_on); - string sp=tokens[0]; - string small_group=tokens[1]; - string large_group=tokens[2]; - keep[small_group][sp]=1; - keep[large_group][sp]=1; - } - + map> keep; + while (!file_stream.eof()) { + getline(file_stream, name); + if (name.find("_") != name.npos) { + vector tokens; + boost::trim(name); + boost::split(tokens, name, boost::is_any_of(" \t"), + boost::token_compress_on); + string sp = tokens[0]; + string small_group = tokens[1]; + string large_group = tokens[2]; + keep[small_group][sp] = 1; + keep[large_group][sp] = 1; } - for (int i=2;i >::iterator kit= keep.begin() ; kit!=keep.end() ; kit++ ) - { - ifstream file_stream1 (argv[i]); - string name=argv[i]; - vector tokens; - boost::split(tokens,name,boost::is_any_of("."),boost::token_compress_on); - //string outname="wol_hosts_"+tokens[0]+"_"+(*kit).first+".tree"; - string outname="wol_paras_"+tokens[0]+"s_"+(*kit).first+".trees"; - ofstream fout( outname.c_str() ); + } + for (int i = 2; i < argc; i++) { + for (map>::iterator kit = keep.begin(); + kit != keep.end(); kit++) { + ifstream file_stream1(argv[i]); + string name = argv[i]; + vector tokens; + boost::split(tokens, name, boost::is_any_of("."), + boost::token_compress_on); + // string outname="wol_hosts_"+tokens[0]+"_"+(*kit).first+".tree"; + string outname = + "wol_paras_" + tokens[0] + "s_" + (*kit).first + ".trees"; + ofstream fout(outname.c_str()); - string tree; - while(! file_stream1.eof()) - { - getline (file_stream1,tree); - if (tree.find(")")!=tree.npos ) - { - - tree_type * T=TreeTemplateTools::parenthesisToTree(tree,false,"ID"); - vector leaves=T->getLeaves(); - for (vector ::iterator it=leaves.begin();it!=leaves.end();it++) - { - string name=(*it)->getName(); - //cout << name << endl; - if (not (*kit).second.count(name)==1) - { - //cout << "drop" << " " < leaves = T->getLeaves(); + for (vector::iterator it = leaves.begin(); it != leaves.end(); + it++) { + string name = (*it)->getName(); + // cout << name << endl; + if (not(*kit).second.count(name) == 1) { + // cout << "drop" << " " < Date: Thu, 6 Jun 2024 16:13:34 +0300 Subject: [PATCH 3/4] fix(undatedDTL): fix ancestral correction sum --- src/undated.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/undated.cpp b/src/undated.cpp index b873f53..70d60cc 100644 --- a/src/undated.cpp +++ b/src/undated.cpp @@ -706,15 +706,13 @@ scalar_type exODT_model::pun(approx_posterior *ale, bool verbose, bool no_T) { uq_sum = EPSILON; uq[i][e] = uq_sum; new_mPTuq += (wT[e]) * uq_sum; + } + for (int e = 0; e < last_branch; e++) { mPTuq_ancestral_correction[i][e] = 0; - // for (map::iterator it=ancestral[e].begin(); - // it!=ancestral[e].end();it++) for (vector::iterator it = ancestors[e].begin(); it != ancestors[e].end(); it++) { - // int f=(*it).first; int f = (*it); - // if (ancestral[e][f]==1) - mPTuq_ancestral_correction[i][e] += (wT[f]) * uq_sum; + mPTuq_ancestral_correction[i][e] += (wT[f]) * uq[i][f]; } } mPTuq[i] = new_mPTuq; From dede93ef61b2e267f721ae8ae31f0b13d285d32b Mon Sep 17 00:00:00 2001 From: Noah Wahl Date: Wed, 12 Jun 2024 09:55:58 +0300 Subject: [PATCH 4/4] fix(undatedDTL): correct likelihood accumulation in base case --- src/undated.cpp | 409 ++++++++++++++++++++++++------------------------ 1 file changed, 208 insertions(+), 201 deletions(-) diff --git a/src/undated.cpp b/src/undated.cpp index 70d60cc..22ab15c 100644 --- a/src/undated.cpp +++ b/src/undated.cpp @@ -656,52 +656,53 @@ scalar_type exODT_model::pun(approx_posterior *ale, bool verbose, bool no_T) { extant_species[e] == gid_sps[g_id]) { // present uq_sum += PS[e] * 1; - } - // G internal - if (not is_a_leaf) { - int N_parts = gp_is.size(); - for (int i = 0; i < N_parts; i++) { - int gp_i = gp_is[i]; - int gpp_i = gpp_is[i]; - scalar_type pp = p_part[i]; - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // S event - uq_sum += - PS[e] * - (uq[gp_i][f] * uq[gpp_i][g] + uq[gp_i][g] * uq[gpp_i][f]) * - pp; + } else { + // G internal + if (not is_a_leaf) { + int N_parts = gp_is.size(); + for (int i = 0; i < N_parts; i++) { + int gp_i = gp_is[i]; + int gpp_i = gpp_is[i]; + scalar_type pp = p_part[i]; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // S event + uq_sum += + PS[e] * + (uq[gp_i][f] * uq[gpp_i][g] + uq[gp_i][g] * uq[gpp_i][f]) * + pp; + } + // D event + uq_sum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e]) * + pp; // no factor of two needed here + // T event + if (not no_T) + uq_sum += + (uq[gp_i][e] * + (mPTuq[gpp_i] - mPTuq_ancestral_correction[gpp_i][e]) / + tau_norm[e] + + uq[gpp_i][e] * + (mPTuq[gp_i] - mPTuq_ancestral_correction[gp_i][e]) / + tau_norm[e]) * + pp; } - // D event - uq_sum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e]) * - pp; // no factor of two needed here - // T event - if (not no_T) - uq_sum += - (uq[gp_i][e] * - (mPTuq[gpp_i] - mPTuq_ancestral_correction[gpp_i][e]) / - tau_norm[e] + - uq[gpp_i][e] * - (mPTuq[gp_i] - mPTuq_ancestral_correction[gp_i][e]) / - tau_norm[e]) * - pp; } + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // SL event + uq_sum += PS[e] * (uq[i][f] * uE[g] + uq[i][g] * uE[f]); + } + // DL event + uq_sum += PD[e] * (uq[i][e] * uE[e] * 2); + // TL event + if (not no_T) + uq_sum += ((mPTuq[i] - mPTuq_ancestral_correction[i][e]) / + tau_norm[e] * uE[e] + + uq[i][e] * (mPTE - mPTE_ancestral_correction[e]) / + tau_norm[e]); } - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // SL event - uq_sum += PS[e] * (uq[i][f] * uE[g] + uq[i][g] * uE[f]); - } - // DL event - uq_sum += PD[e] * (uq[i][e] * uE[e] * 2); - // TL event - if (not no_T) - uq_sum += - ((mPTuq[i] - mPTuq_ancestral_correction[i][e]) / tau_norm[e] * - uE[e] + - uq[i][e] * (mPTE - mPTE_ancestral_correction[e]) / tau_norm[e]); if (uq_sum < EPSILON) uq_sum = EPSILON; uq[i][e] = uq_sum; @@ -882,52 +883,53 @@ string exODT_model::sample_undated(int e, int i, string last_event, if (e < last_leaf and is_a_leaf and extant_species[e] == gid_sps[g_id]) { // present uq_sum += PS[e] * 1 + EPSILON; - } + } else { - // G internal - if (not is_a_leaf) { - int N_parts = gp_is.size(); - for (int i = 0; i < N_parts; i++) { - int gp_i = gp_is[i]; - int gpp_i = gpp_is[i]; - scalar_type pp = p_part[i]; - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // S event - uq_sum += PS[e] * uq[gp_i][f] * uq[gpp_i][g] * pp + EPSILON; - uq_sum += PS[e] * uq[gp_i][g] * uq[gpp_i][f] * pp + EPSILON; - } - // D event - uq_sum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp + EPSILON; - // T event - for (int f = 0; f < last_branch; f++) - if (not ancestral[e][f] and not no_T) { - uq_sum += - uq[gp_i][e] * (wT[f] / tau_norm[e]) * uq[gpp_i][f] * pp + EPSILON; - uq_sum += - uq[gpp_i][e] * (wT[f] / tau_norm[e]) * uq[gp_i][f] * pp + EPSILON; + // G internal + if (not is_a_leaf) { + int N_parts = gp_is.size(); + for (int i = 0; i < N_parts; i++) { + int gp_i = gp_is[i]; + int gpp_i = gpp_is[i]; + scalar_type pp = p_part[i]; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // S event + uq_sum += PS[e] * uq[gp_i][f] * uq[gpp_i][g] * pp + EPSILON; + uq_sum += PS[e] * uq[gp_i][g] * uq[gpp_i][f] * pp + EPSILON; } + // D event + uq_sum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp + EPSILON; + // T event + for (int f = 0; f < last_branch; f++) + if (not ancestral[e][f] and not no_T) { + uq_sum += uq[gp_i][e] * (wT[f] / tau_norm[e]) * uq[gpp_i][f] * pp + + EPSILON; + uq_sum += uq[gpp_i][e] * (wT[f] / tau_norm[e]) * uq[gp_i][f] * pp + + EPSILON; + } + } } - } - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // SL event - uq_sum += PS[e] * uq[i][f] * uE[g] + EPSILON; - uq_sum += PS[e] * uq[i][g] * uE[f] + EPSILON; - } - - // DL event - uq_sum += PD[e] * (uq[i][e] * uE[e] * 2) + EPSILON; - // TL event - for (int f = 0; f < last_branch; f++) - if (not ancestral[e][f] and not no_T) { - uq_sum += (wT[f] / tau_norm[e]) * uq[i][f] * uE[e] + EPSILON; - uq_sum += (wT[f] / tau_norm[e]) * uE[f] * uq[i][e] + EPSILON; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // SL event + uq_sum += PS[e] * uq[i][f] * uE[g] + EPSILON; + uq_sum += PS[e] * uq[i][g] * uE[f] + EPSILON; } + // DL event + uq_sum += PD[e] * (uq[i][e] * uE[e] * 2) + EPSILON; + // TL event + for (int f = 0; f < last_branch; f++) + if (not ancestral[e][f] and not no_T) { + uq_sum += (wT[f] / tau_norm[e]) * uq[i][f] * uE[e] + EPSILON; + uq_sum += (wT[f] / tau_norm[e]) * uE[f] * uq[i][e] + EPSILON; + } + } + // ###################################################################################################################### // #########################################INNNER // LOOP################################################################## @@ -955,135 +957,140 @@ string exODT_model::sample_undated(int e, int i, string last_event, return ale_pointer->set2name(ale_pointer->id_sets[g_id]) + branch_string + ":" + branch_length; } - } - // G internal - if (not is_a_leaf) { - int N_parts = gp_is.size(); - for (int i = 0; i < N_parts; i++) { - int gp_i = gp_is[i]; - int gpp_i = gpp_is[i]; - scalar_type pp = p_part[i]; - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // S event - uq_resum += PS[e] * uq[gp_i][f] * uq[gpp_i][g] * pp + EPSILON; - if (r * uq_sum < uq_resum) { - register_Su(e, last_event); - return "(" + sample_undated(f, gp_i, "S", "", no_T) + "," + - sample_undated(g, gpp_i, "S", "", no_T) + ")." + estr + - branch_string + ":" + branch_length; - } - uq_resum += PS[e] * uq[gp_i][g] * uq[gpp_i][f] * pp + EPSILON; - if (r * uq_sum < uq_resum) { - register_Su(e, last_event); - return "(" + sample_undated(g, gp_i, "S", "", no_T) + "," + - sample_undated(f, gpp_i, "S", "", no_T) + ")." + estr + - branch_string + ":" + branch_length; - } - } - // D event - uq_resum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp + EPSILON; - if (r * uq_sum < uq_resum or no_T) { - register_D(e); - return "(" + sample_undated(e, gp_i, "D", "", no_T) + "," + - sample_undated(e, gpp_i, "D", "", no_T) + ").D@" + estr + - branch_string + ":" + branch_length; - } - - // T event - for (int f = 0; f < last_branch; f++) - if (not ancestral[e][f] and not no_T) { - stringstream fstring; - if (not(f < last_leaf)) - fstring << f; - else - fstring << extant_species[f]; - string fstr = fstring.str(); - - uq_resum += - uq[gp_i][e] * (wT[f] / tau_norm[e]) * uq[gpp_i][f] * pp + EPSILON; + } else { + // G internal + if (not is_a_leaf) { + int N_parts = gp_is.size(); + for (int i = 0; i < N_parts; i++) { + int gp_i = gp_is[i]; + int gpp_i = gpp_is[i]; + scalar_type pp = p_part[i]; + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // S event + uq_resum += PS[e] * uq[gp_i][f] * uq[gpp_i][g] * pp + EPSILON; if (r * uq_sum < uq_resum) { - register_Tfrom(e); - register_Tto(f); - register_T_to_from(e, f); - stringstream Ttoken; - Ttoken << estr << ">" << fstr << "|" - << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gpp_i]]); - Ttokens.push_back(Ttoken.str()); - - return "(" + sample_undated(e, gp_i, "S", "", no_T) + "," + - sample_undated(f, gpp_i, "T", "", no_T) + ").T@" + estr + - "->" + fstr + branch_string + ":" + branch_length; + register_Su(e, last_event); + return "(" + sample_undated(f, gp_i, "S", "", no_T) + "," + + sample_undated(g, gpp_i, "S", "", no_T) + ")." + estr + + branch_string + ":" + branch_length; } - uq_resum += - uq[gpp_i][e] * (wT[f] / tau_norm[e]) * uq[gp_i][f] * pp + EPSILON; + uq_resum += PS[e] * uq[gp_i][g] * uq[gpp_i][f] * pp + EPSILON; if (r * uq_sum < uq_resum) { - register_Tfrom(e); - register_Tto(f); - register_T_to_from(e, f); - stringstream Ttoken; - Ttoken << estr << ">" << fstr << "|" - << ale_pointer->set2name(ale_pointer->id_sets[g_ids[gp_i]]); - Ttokens.push_back(Ttoken.str()); - return "(" + sample_undated(e, gpp_i, "S", "", no_T) + "," + - sample_undated(f, gp_i, "T", "", no_T) + ").T@" + estr + - "->" + fstr + branch_string + ":" + branch_length; + register_Su(e, last_event); + return "(" + sample_undated(g, gp_i, "S", "", no_T) + "," + + sample_undated(f, gpp_i, "S", "", no_T) + ")." + estr + + branch_string + ":" + branch_length; } } - } - } - if (not(e < last_leaf)) { - int f = daughter[e]; - int g = son[e]; - // SL event - uq_resum += PS[e] * uq[i][f] * uE[g] + EPSILON; - if (r * uq_sum < uq_resum) { - register_Su(e, last_event); - register_L(g); - return sample_undated(f, i, "S", "." + estr + branch_string, no_T); - } - uq_resum += PS[e] * uq[i][g] * uE[f] + EPSILON; - if (r * uq_sum < uq_resum) { - register_Su(e, last_event); - register_L(f); - return sample_undated(g, i, "S", "." + estr + branch_string, no_T); - } - } - // DL event - uq_resum += PD[e] * (uq[i][e] * uE[e] * 2) + EPSILON; - if (r * uq_sum < uq_resum) { - return sample_undated(e, i, "S", branch_string, no_T); - } - // TL event - for (int f = 0; f < last_branch; f++) - if (not ancestral[e][f] and not no_T) { - stringstream fstring; - if (not(f < last_leaf)) - fstring << f; - else - fstring << extant_species[f]; - string fstr = fstring.str(); + // D event + uq_resum += PD[e] * (uq[gp_i][e] * uq[gpp_i][e] * 2) * pp + EPSILON; + if (r * uq_sum < uq_resum or no_T) { + register_D(e); + return "(" + sample_undated(e, gp_i, "D", "", no_T) + "," + + sample_undated(e, gpp_i, "D", "", no_T) + ").D@" + estr + + branch_string + ":" + branch_length; + } - uq_resum += (wT[f] / tau_norm[e]) * uq[i][f] * uE[e] + EPSILON; + // T event + for (int f = 0; f < last_branch; f++) + if (not ancestral[e][f] and not no_T) { + stringstream fstring; + if (not(f < last_leaf)) + fstring << f; + else + fstring << extant_species[f]; + string fstr = fstring.str(); + + uq_resum += + uq[gp_i][e] * (wT[f] / tau_norm[e]) * uq[gpp_i][f] * pp + + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name( + ale_pointer->id_sets[g_ids[gpp_i]]); + Ttokens.push_back(Ttoken.str()); + + return "(" + sample_undated(e, gp_i, "S", "", no_T) + "," + + sample_undated(f, gpp_i, "T", "", no_T) + ").T@" + estr + + "->" + fstr + branch_string + ":" + branch_length; + } + uq_resum += + uq[gpp_i][e] * (wT[f] / tau_norm[e]) * uq[gp_i][f] * pp + + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + stringstream Ttoken; + Ttoken << estr << ">" << fstr << "|" + << ale_pointer->set2name( + ale_pointer->id_sets[g_ids[gp_i]]); + Ttokens.push_back(Ttoken.str()); + return "(" + sample_undated(e, gpp_i, "S", "", no_T) + "," + + sample_undated(f, gp_i, "T", "", no_T) + ").T@" + estr + + "->" + fstr + branch_string + ":" + branch_length; + } + } + } + } + if (not(e < last_leaf)) { + int f = daughter[e]; + int g = son[e]; + // SL event + uq_resum += PS[e] * uq[i][f] * uE[g] + EPSILON; if (r * uq_sum < uq_resum) { - register_Tfrom(e); - register_Tto(f); - register_T_to_from(e, f); - /* - stringstream Ttoken; - Ttoken<"<set2name(ale_pointer->id_sets[g_id]); - Ttokens.push_back(Ttoken.str()); - */ - register_L(e); - return sample_undated(f, i, "T", - ".T@" + estr + "->" + fstr + branch_string, no_T); + register_Su(e, last_event); + register_L(g); + return sample_undated(f, i, "S", "." + estr + branch_string, no_T); } - uq_resum += (wT[f] / tau_norm[e]) * uE[f] * uq[i][e] + EPSILON; + uq_resum += PS[e] * uq[i][g] * uE[f] + EPSILON; if (r * uq_sum < uq_resum) { - return sample_undated(e, i, "S", "", no_T); + register_Su(e, last_event); + register_L(f); + return sample_undated(g, i, "S", "." + estr + branch_string, no_T); } } + // DL event + uq_resum += PD[e] * (uq[i][e] * uE[e] * 2) + EPSILON; + if (r * uq_sum < uq_resum) { + return sample_undated(e, i, "S", branch_string, no_T); + } + // TL event + for (int f = 0; f < last_branch; f++) + if (not ancestral[e][f] and not no_T) { + stringstream fstring; + if (not(f < last_leaf)) + fstring << f; + else + fstring << extant_species[f]; + string fstr = fstring.str(); + + uq_resum += (wT[f] / tau_norm[e]) * uq[i][f] * uE[e] + EPSILON; + if (r * uq_sum < uq_resum) { + register_Tfrom(e); + register_Tto(f); + register_T_to_from(e, f); + /* + stringstream Ttoken; + Ttoken<"<set2name(ale_pointer->id_sets[g_id]); + Ttokens.push_back(Ttoken.str()); + */ + register_L(e); + return sample_undated( + f, i, "T", ".T@" + estr + "->" + fstr + branch_string, no_T); + } + uq_resum += (wT[f] / tau_norm[e]) * uE[f] * uq[i][e] + EPSILON; + if (r * uq_sum < uq_resum) { + return sample_undated(e, i, "S", "", no_T); + } + } + } // ###################################################################################################################### // #########################################INNNER // LOOP##################################################################