trungnotchung · trungnotchung · Jun 28, 2025 · Jun 28, 2025 · Jun 28, 2025 · Jul 1, 2025
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -0,0 +1,11 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(find:*)",
+      "Bash(make:*)",
+      "Bash(cmake:*)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -324,7 +324,7 @@ configure_file (
 # add the binary tree to the search path for include files
 # so that we will find iqtree_config.h
 include_directories("${PROJECT_BINARY_DIR}")
-include_directories("${PROJECT_BINARY_DIR}/zlib-1.2.7")
+include_directories("${PROJECT_BINARY_DIR}/zlib-1.3.1")
 
 
 ##################################################################
@@ -334,7 +334,7 @@ add_subdirectory(pllrepo/src)
 add_subdirectory(ncl)
 add_subdirectory(whtest)
 add_subdirectory(sprng)
-add_subdirectory(zlib-1.2.7)
+add_subdirectory(zlib-1.3.1)
 add_subdirectory(vectorclass)
 add_subdirectory(model)
 

diff --git a/README.md b/README.md
@@ -135,3 +135,52 @@ The compiler will generate an executable file named **mpboot-avx**
 6. To analyst file **example.phy** with 2 processes, run command:  
 `mpiexec -n 2 ./mpboot-avx -s example.phy`
 > Option **n** specifies the number of processes used to run MPBoot-MPI
+
+## MPBoot-Placement
+
+MPBoot-Placement is the phylogenetic placement mode of MPBoot: it efficiently places new samples onto an existing phylogenetic tree using parsimony-based algorithms. It processes VCF files and handles large datasets by processing them in batches.
+
+### Downloading source code
+You can clone the source code from GitHub with:
+
+`git clone https://github.com/diepthihoang/mpboot.git`
+
+### Compilation
+MPBoot-Placement uses the **same compilation process** as standard MPBoot (see the sections above). The generated executable is the same `mpboot-avx` (or `mpboot` for the SSE build).
+
+### Usage
+
+**Basic placement command:**
+```bash
+./mpboot-avx -pp_on -s variants.vcf -pp_tree reference.newick -pp_n 1000 -pp_k 10
+```
+
+**Placement-specific options:**
+- `-pp_on` - Enable phylogenetic placement mode (REQUIRED)
+- `-pp_tree <treefile>` - Reference phylogenetic tree in Newick format (REQUIRED)
+- `-pp_n <number>` - Number of existing sequences already in the tree
+- `-pp_k <number>` - Number of new sequences to place onto the tree
+- `-pp_spr` - Enable SPR (Subtree Pruning and Regrafting) optimizations
+- `-pp_test_optimize` - Verify that the original tree structure is preserved
+
+**Input requirements:**
+- `-s <vcf_file>` - Input VCF file containing genetic variants (REQUIRED)
+- The VCF file must be in standard VCF format, including proper header lines
+- The reference tree must be in Newick format and contain the existing sequences
+
+**Example commands:**
+
+1. **Basic placement:**
+   ```bash
+   ./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50
+   ```
+
+2. **With SPR optimization:**
+   ```bash
+   ./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50 -pp_spr
+   ```
+
+3. **With verification (testing mode):**
+   ```bash
+   ./mpboot-avx -pp_on -s variants.vcf -pp_tree reference_tree.newick -pp_n 500 -pp_k 50 -pp_spr -pp_test_optimize
+   ```
diff --git a/alignment.cpp b/alignment.cpp
@@ -398,7 +398,7 @@ void Alignment::checkGappySeq(bool force_error) {
     }
 }
 
-Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence) : vector<Pattern>() {
+Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence, int num_missing_sequence) : vector<Pattern>() {
     num_states = 0;
     frac_const_sites = 0.0;
     codon_table = NULL;
@@ -422,7 +422,7 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, int
             readPhylip(filename, sequence_type);
         } else if (intype == IN_VCF) {
             cout << "VCF format detected" << endl;
-            readVCF(filename, sequence_type, existing_sequence);
+            readVCF(filename, sequence_type, existing_sequence, num_missing_sequence);
         } else {
             outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format");
         }
@@ -1456,7 +1456,7 @@ void Alignment::updateAlignmentNewSequences(const vector<string> &new_sequences,
 }
 
 // Read partial VCF file and update alignment
-int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col, int existing_sequence, int start_index, int num_column) {
+int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col, int existing_sequence, int missing_sequence, int start_index, int num_column) {
 	if (in.eof()) {
 		return 0;
 	}
@@ -1478,8 +1478,6 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
 		split(line, words, "\t");
 		if (words.size() == 1)
 			continue;
-		if (words.size() != 9 + nseq + missing_sample_mutations.size())
-			throw "Number of columns in VCF file is not consistent";
 		vector<string> alleles;
 		Mutation mutation;
 		int variant_pos = std::stoi(words[1]);
@@ -1517,9 +1515,11 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
 				mutation.is_missing = true;
 			}
 			if (i - 9 >= existing_sequence) {
+				if (i - 9 < existing_sequence + missing_sequence) {
 				if (mutation.mut_nuc != mutation.ref_nuc) {
-					mutation.par_nuc = mutation.ref_nuc;
-					missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
+						mutation.par_nuc = mutation.ref_nuc;
+						missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
+					}
 				}
 			}
 			else {
@@ -1547,7 +1547,7 @@ int Alignment::readPartialVCF(ifstream &in, char *sequence_type, vector<int> &pe
 	return num_processed_column;
 }
 
-int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence) {
+int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequence, int missing_sequence) {
 	StrVector sequences;
 	ifstream in;
 	in.exceptions(ios::failbit | ios::badbit);
@@ -1572,8 +1572,10 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
 			// Sample names start from the 10th word in the header
 			for (int i = 9; i < words.size(); i++) {
 				if (i - 9 >= existing_sequence) {
-					missing_seq_names.push_back(words[i]);
-					num_missing_sequence++;
+                    if (i - 9 < existing_sequence + missing_sequence) {
+                        missing_seq_names.push_back(words[i]);
+                        num_missing_sequence++;
+                    }
 				}
 				else {
 					seq_names.push_back(words[i]);
@@ -1586,8 +1588,6 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
 			missing_sample_mutations.resize(num_missing_sequence);
 		}
 		else {
-			if (words.size() != 9 + nseq + num_missing_sequence)
-				throw "Number of columns in VCF file is not consistent";
 			vector<string> alleles;
 			Mutation mutation;
 			int variant_pos = std::stoi(words[1]);
@@ -1607,15 +1607,15 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
 						std::string allele = alleles[allele_id - 1];
 						if (i - 9 < existing_sequence)
 							sequences[i - 9].push_back(allele[0]);
-						else
+						else if (i - 9 < existing_sequence + missing_sequence)
 							missing_sequences[i - 9 - existing_sequence].push_back(allele[0]);
 
 						mutation.mut_nuc = getMutationFromState(allele[0]);
 					}
 					else {
 						if (i - 9 < existing_sequence)
 							sequences[i - 9].push_back(words[3][0]);
-						else
+						else if (i - 9 < existing_sequence + missing_sequence)
 							missing_sequences[i - 9 - existing_sequence].push_back(words[3][0]);
 
 						mutation.mut_nuc = getMutationFromState(words[3][0]);
@@ -1624,15 +1624,17 @@ int Alignment::readVCF(char *filename, char *sequence_type, int existing_sequenc
 				else {
 					if (i - 9 < existing_sequence)
 						sequences[i - 9].push_back('-');
-					else
+					else if (i - 9 < existing_sequence + missing_sequence)
 						missing_sequences[i - 9 - existing_sequence].push_back('-');
 					mutation.mut_nuc = getMutationFromState('N');
 					mutation.is_missing = true;
 				}
 				if (i - 9 >= existing_sequence) {
-					if (mutation.mut_nuc != mutation.ref_nuc) {
-						mutation.par_nuc = mutation.ref_nuc;
-						missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
+					if (i - 9 < existing_sequence + missing_sequence) {
+						if (mutation.mut_nuc != mutation.ref_nuc) {
+							mutation.par_nuc = mutation.ref_nuc;
+							missing_sample_mutations[i - 9 - existing_sequence].push_back(mutation);
+						}
 					}
 				}
 				else

diff --git a/alignment.h b/alignment.h
@@ -59,7 +59,7 @@ class Alignment : public vector<Pattern> {
             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
             @param intype (OUT) input format of the file
      */
-    Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX);
+    Alignment(char *filename, char *sequence_type, InputType &intype, int existing_sequence = INT_MAX, int missing_sequence = INT_MAX);
 
     /**
             destructor
@@ -714,7 +714,7 @@ class Alignment : public vector<Pattern> {
 	 * It updates the alignment with new sequence data and mutation information.
 	 */
 	int readPartialVCF(ifstream &in, char *sequence_type, vector<int> &perm_col, 
-					int existing_sequence, int start_index, int num_column);
+					int existing_sequence, int missing_sequence, int start_index, int num_column);
 
 	/**
 	 * Reads and processes a complete VCF file.
@@ -729,7 +729,7 @@ class Alignment : public vector<Pattern> {
 	 * - Building patterns and updating the alignment
 	 * - Tracking mutations for both existing and missing sequences
 	 */
-	int readVCF(char *file_name, char *sequence_type, int existing_sequence);
+	int readVCF(char *file_name, char *sequence_type, int existing_sequence, int missing_sequence);
 protected:
 
 

diff --git a/gzstream.h b/gzstream.h
@@ -32,7 +32,7 @@
 // standard C++ with new header file names and std:: namespace
 #include <iostream>
 #include <fstream>
-#include "zlib-1.2.7/zlib.h"
+#include "zlib-1.3.1/zlib.h"
 
 #ifdef GZSTREAM_NAMESPACE
 namespace GZSTREAM_NAMESPACE {