Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
4c4dccd
add flag for doing spr
trungnotchung Jun 28, 2025
0b4346e
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Jun 28, 2025
1c541ef
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Jun 28, 2025
92c7000
perform spr transformation after placement
trungnotchung Jul 1, 2025
b66abb8
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Jul 4, 2025
e7476bd
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Jul 13, 2025
3940e51
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Jul 25, 2025
ddbfe54
wip: perform spr transformation without breaking original tree
trungnotchung Jul 25, 2025
60ee8d1
update spacing consistent with current lib
trungnotchung Aug 1, 2025
e2f1b8d
update spr transformation to not break original tree
trungnotchung Aug 1, 2025
7b86c95
check correct tree after performing spr transformation
trungnotchung Aug 1, 2025
cfbb327
update readme.md
trungnotchung Aug 1, 2025
751b298
feat: check preserved original tree using hash function
trungnotchung Sep 9, 2025
c6459ed
preserve node hash when transform data structure to pll
trungnotchung Sep 9, 2025
ea8e849
create unique file for treehash logic
trungnotchung Sep 21, 2025
3c7d7e7
recompute hash again in pll
trungnotchung Sep 21, 2025
395f2a6
update spr transformation condition
trungnotchung Sep 21, 2025
6f24f42
enable add flexible missing sequences using params
trungnotchung Nov 23, 2025
0f6dbf9
bump zlib version to latest to fix mac sdk issue
trungnotchung Nov 23, 2025
f4b2c46
Merge branch 'feat/placement' into feat/spr-with-placement
trungnotchung Nov 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"Bash(find:*)",
"Bash(make:*)",
"Bash(cmake:*)"
],
"deny": [],
"ask": []
}
}
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ configure_file (
# add the binary tree to the search path for include files
# so that we will find iqtree_config.h
include_directories("${PROJECT_BINARY_DIR}")
include_directories("${PROJECT_BINARY_DIR}/zlib-1.2.7")
include_directories("${PROJECT_BINARY_DIR}/zlib-1.3.1")


##################################################################
Expand All @@ -334,7 +334,7 @@ add_subdirectory(pllrepo/src)
add_subdirectory(ncl)
add_subdirectory(whtest)
add_subdirectory(sprng)
add_subdirectory(zlib-1.2.7)
add_subdirectory(zlib-1.3.1)
add_subdirectory(vectorclass)
add_subdirectory(model)

Expand Down
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,52 @@ The compiler will generate an executable file named **mpboot-avx**
6. To analyze file **example.phy** with 2 processes, run command:
`mpiexec -n 2 ./mpboot-avx -s example.phy`
> Option **n** specifies the number of processes used to run MPBoot-MPI

## MPBoot-Placement

MPBoot-Placement is a specialized tool for phylogenetic placement that efficiently places new samples onto an existing phylogenetic tree using parsimony-based algorithms. It processes VCF files and can handle large datasets through intelligent batch processing.

### Downloading source code
You can clone the source code from GitHub with:

`git clone https://github.com/diepthihoang/mpboot.git`

### Compilation
MPBoot-Placement uses the **same compilation process** as standard MPBoot (see sections above). The executable generated is the same `mpboot-avx` (or `mpboot` for SSE).

### Usage

**Basic placement command:**
```bash
./mpboot-avx -pp_on -s variants.vcf -pp_tree reference.newick -pp_n 1000 -pp_k 10
```

**Placement-specific options:**
- `-pp_on` - Enable phylogenetic placement mode (REQUIRED)
- `-pp_tree <treefile>` - Reference phylogenetic tree in Newick format (REQUIRED)
- `-pp_n <number>` - Number of existing sequences already in the tree
- `-pp_k <number>` - Number of new sequences to place onto the tree
- `-pp_spr` - Enable SPR (Subtree Pruning and Regrafting) optimizations
- `-pp_test_optimize` - Verify that the original tree structure is preserved

**Input requirements:**
- `-s <vcf_file>` - Input VCF file containing genetic variants (REQUIRED)
- VCF file must have standard format with proper header lines
- Reference tree in Newick format containing existing sequences

**Example commands:**

1. **Basic placement:**
```bash
./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50
```

2. **With SPR optimization:**
```bash
./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50 -pp_spr
```

3. **With verification (testing mode):**
```bash
./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50 -pp_spr -pp_test_optimize
```
38 changes: 20 additions & 18 deletions alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) {
}
}

Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence) : vector<Pattern>() {
Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence, int num_missing_sequence) : vector<Pattern>() {
num_states = 0;
frac_const_sites = 0.0;
codon_table = NULL;
Expand All @@ -422,7 +422,7 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int
readPhylip(filename, sequence_type);
} else if (intype == IN_VCF) {
cout << "VCF format detected" << endl;
readVCF(filename, sequence_type, existing_sequence);
readVCF(filename, sequence_type, existing_sequence, num_missing_sequence);
} else {
outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format");
}
Expand Down Expand Up @@ -1456,7 +1456,7 @@ void Alignment::updateAlignmentNewSequences(const vector<string> &new_sequences,
}

// Read partial VCF file and update alignment
int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col, int existing_sequence, int start_index, int num_column) {
int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col, int existing_sequence, int missing_sequence, int start_index, int num_column) {
if (in.eof()) {
return 0;
}
Expand All @@ -1478,8 +1478,6 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
split(line, words, "\t");
if (words.size() == 1)
continue;
if (words.size() != 9 + nseq + missing_sample_mutations.size())
throw "Number of columns in VCF file is not consistent";
vector<string> alleles;
Mutation mutation;
int variant_pos = std::stoi(words[1]);
Expand Down Expand Up @@ -1517,9 +1515,11 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
mutation.is_missing = true;
}
if (i - 9 >= existing_sequence) {
if (i - 9 < existing_sequence + missing_sequence) {
if (mutation.mut_nuc != mutation.ref_nuc) {
mutation.par_nuc = mutation.ref_nuc;
missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
mutation.par_nuc = mutation.ref_nuc;
missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
}
}
}
else {
Expand Down Expand Up @@ -1547,7 +1547,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
return num_processed_column;
}

int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) {
int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence, int missing_sequence) {
StrVector sequences;
ifstream in;
in.exceptions(ios::failbit | ios::badbit);
Expand All @@ -1572,8 +1572,10 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
// Sample names start from the 10th word in the header
for (int i = 9; i < words.size(); i++) {
if (i - 9 >= existing_sequence) {
missing_seq_names.push_back(words[i]);
num_missing_sequence++;
if (i - 9 < existing_sequence + missing_sequence) {
missing_seq_names.push_back(words[i]);
num_missing_sequence++;
}
}
else {
seq_names.push_back(words[i]);
Expand All @@ -1586,8 +1588,6 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
missing_sample_mutations.resize(num_missing_sequence);
}
else {
if (words.size() != 9 + nseq + num_missing_sequence)
throw "Number of columns in VCF file is not consistent";
vector<string> alleles;
Mutation mutation;
int variant_pos = std::stoi(words[1]);
Expand All @@ -1607,15 +1607,15 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
std::string allele = alleles[allele_id - 1];
if (i - 9 < existing_sequence)
sequences[i - 9].push_back(allele[0]);
else
else if (i - 9 < existing_sequence + missing_sequence)
missing_sequences[i - 9 - existing_sequence].push_back(allele[0]);

mutation.mut_nuc = getMutationFromState(allele[0]);
}
else {
if (i - 9 < existing_sequence)
sequences[i - 9].push_back(words[3][0]);
else
else if (i - 9 < existing_sequence + missing_sequence)
missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]);

mutation.mut_nuc = getMutationFromState(words[3][0]);
Expand All @@ -1624,15 +1624,17 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
else {
if (i - 9 < existing_sequence)
sequences[i - 9].push_back('-');
else
else if (i - 9 < existing_sequence + missing_sequence)
missing_sequences[i - 9 - existing_sequence].push_back('-');
mutation.mut_nuc = getMutationFromState('N');
mutation.is_missing = true;
}
if (i - 9 >= existing_sequence) {
if (mutation.mut_nuc != mutation.ref_nuc) {
mutation.par_nuc = mutation.ref_nuc;
missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
if (i - 9 < existing_sequence + missing_sequence) {
if (mutation.mut_nuc != mutation.ref_nuc) {
mutation.par_nuc = mutation.ref_nuc;
missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
}
}
}
else
Expand Down
6 changes: 3 additions & 3 deletions alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class Alignment : public vector<Pattern> {
@param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
@param intype (OUT) input format of the file
*/
Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX);
Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX, int missing_sequence = INT_MAX);

/**
destructor
Expand Down Expand Up @@ -714,7 +714,7 @@ class Alignment : public vector<Pattern> {
* It updates the alignment with new sequence data and mutation information.
*/
int readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col,
int existing_sequence, int start_index, int num_column);
int existing_sequence, int missing_sequence, int start_index, int num_column);

/**
* Reads and processes a complete VCF file.
Expand All @@ -729,7 +729,7 @@ class Alignment : public vector<Pattern> {
* - Building patterns and updating the alignment
* - Tracking mutations for both existing and missing sequences
*/
int readVCF(char *file_name, char *sequence_type, int existing_sequence);
int readVCF(char *file_name, char *sequence_type, int existing_sequence, int missing_sequence);
protected:


Expand Down
2 changes: 1 addition & 1 deletion gzstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
// standard C++ with new header file names and std:: namespace
#include <iostream>
#include <fstream>
#include "zlib-1.2.7/zlib.h"
#include "zlib-1.3.1/zlib.h"

#ifdef GZSTREAM_NAMESPACE
namespace GZSTREAM_NAMESPACE {
Expand Down
Loading