From 3e00fbaa9bd3b2beff46a2d36f022f9763524e3e Mon Sep 17 00:00:00 2001
From: Yuhao Tong <yuhtong@spartan-login2.hpc.unimelb.edu.au>
Date: Tue, 18 Nov 2025 17:38:54 +1100
Subject: [PATCH 1/4] Add split at calculating trimmed mean in Condenser.py and
 thus works for the singlem bespoke package for plastid markers.

---
 singlem/condense.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/singlem/condense.py b/singlem/condense.py
index e1f7b285..f9128388 100644
--- a/singlem/condense.py
+++ b/singlem/condense.py
@@ -520,8 +520,13 @@ def best_hit_genera_from_otu(otu):
                 else:
                     num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')])
                 logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values()))
-                trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent)
-                next_genus_to_coverage[tax] = trimmed_mean
+                try:
+                    trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent)
+                    next_genus_to_coverage[tax] = trimmed_mean
+                except:
+                    # breakpoint()
+                    print(f"When calculating trimmed mean for tax {tax}, the ZeroDivisionError occurred\
+                          . Consider empty genes_per_domain which has no genes listed for the taxon's domain.", file=sys.stderr)
 
             # Has any species changed in abundance by a large enough amount? If not, we're done
             need_another_iteration = False

From 4a7b67b14cd0d2f8fe58bd218e221eeb1f58439f Mon Sep 17 00:00:00 2001
From: Yuhao Tong <yuhtong@spartan-login2.hpc.unimelb.edu.au>
Date: Thu, 4 Dec 2025 14:39:01 +1100
Subject: [PATCH 2/4] found another spot where using eukaryote metapackage
 failed

---
 singlem/condense.py | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/singlem/condense.py b/singlem/condense.py
index f9128388..e6d05d1d 100644
--- a/singlem/condense.py
+++ b/singlem/condense.py
@@ -1,6 +1,7 @@
 import logging
 import tempfile
 import csv
+# from gReLU.build.lib.grelu.sequence.utils import trim
 import numpy as np
 import extern
 import sys
@@ -98,13 +99,13 @@ def _condense_to_otu_table(self, metapackage, **kwargs):
                     target_domains["Viruses"] += [marker_name]
                 else:
                     raise Exception("Domain: {} not supported.".format(domain))
-                
+        
+        # breakpoint()
         for domain in target_domains:
             if target_domains[domain] in [1, 2]:
                 raise Exception("Number of markers for all domains must either be >= 3 or equal to 0. Only {} markers for domain '{}' found".format(target_domains[domain], domain))
 
         for sample, sample_otus in input_otu_table.each_sample_otus(generate_archive_otu_table=True):
-
             logging.debug("Processing sample {} ..".format(sample))
             apply_diamond_expectation_maximisation = True
             yield self._condense_a_sample(sample, sample_otus, markers, target_domains, trim_percent, min_taxon_coverage, 
@@ -114,7 +115,6 @@ def _condense_a_sample(self, sample, sample_otus, markers, target_domains, trim_
             apply_query_expectation_maximisation, apply_diamond_expectation_maximisation, metapackage,
             output_after_em_otu_table, viral_mode):
 
-
         # Remove off-target OTUs genes
         logging.debug("Total OTU coverage by query: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD])))
         logging.debug("Total OTU coverage by diamond: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD])))
@@ -199,6 +199,8 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):
 
         num_otus_changed = 0
         sequence_ids = set()
+
+        target_domain = [metapackage.singlem_packages[i].target_domains() for i in range(len(metapackage.singlem_packages))]
         # Step 1: Gather dictionary of sequence IDs to taxon strings
         for otu in sample_otus:
             if otu.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD:
@@ -220,8 +222,15 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):
                     for seq_id in seq_id_list:
                         taxon_name = sequence_id_to_taxon[seq_id]
                         if not taxon_name[-2].startswith('g__'):
+                            # This part expect bacterial and archaeal sequences as on-target and eukaryotes as off-targets.
                             if not taxon_name[0] == 'd__Eukaryota':
-                                raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
+                                # add one check to ensure the target taxon to be d__Eukaryota for all metapackages.
+                                if all("Eukaryota" in domain for domain in target_domain):
+                                    # then the bacterial/archaeal sequences are off-target this time. 
+                                    logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name))
+                                    continue
+                                else:
+                                    raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
                             else:
                                 # This can happen when taxonomy is overall
                                 # Archaea so not previously filtered out, but
@@ -401,7 +410,8 @@ def _remove_off_target_otus(self, sample_otus, markers):
                 raise Exception("Stopping: sample {} had too many unassigned OTUs".format(list(sample_otus)[0].sample_name))
         table.data = [otu.data for otu in sample_otus if \
             self._is_targeted_by_marker(otu, otu.taxonomy_array(), markers) and \
-            otu.taxonomy_assignment_method() is not None]
+            otu.taxonomy_assignment_method() is not None] # This code is removing OTUs that have unmatched taxonomy domain.
+    
         num_no_assignment_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is None])
         num_assigned_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is not None])
         logging.info("After removing off-target OTUs, found {:.2f} assigned and {:.2f} unassigned OTU coverage units".format(num_assigned_otus, num_no_assignment_otus))
@@ -454,13 +464,13 @@ def _apply_genus_expectation_maximization_core(self, sample_otus, trim_percent,
         def best_hit_genera_from_otu(otu):
             # The DIAMOND assignments are already truncated to genus level.
             # But the query-based ones go to species level.
-            best_hit_taxonomies = otu.equal_best_hit_taxonomies()
+            best_hit_taxonomies = otu.equal_best_hit_taxonomies() # there is a list called EQUAL_BEST_HIT_TAXONOMIES_INDEX in ArchiveOtuTable
             method = otu.taxonomy_assignment_method()
             if method == DIAMOND_ASSIGNMENT_METHOD:
                 best_hit_genera = best_hit_taxonomies
             elif method in (QUERY_BASED_ASSIGNMENT_METHOD, QUERY_BASED_ASSIGNMENT_METHOD+'_abandoned'):
                 best_hit_genera = set(
-                    [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies])
+                    [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) # so Cyanobacteria are from here. 
             else:
                 raise Exception("Unexpected taxonomy assignment method: {}".format(otu.taxonomy_assignment_method()))
             return best_hit_genera
@@ -495,7 +505,6 @@ def best_hit_genera_from_otu(otu):
             for otu in sample_otus:
                 unnormalised_coverages = {}
                 best_hit_genera = best_hit_genera_from_otu(otu)
-
                 for best_hit_genus in best_hit_genera:
                     if best_hit_genus in genus_to_coverage: # Can get removed during trimmed mean
                         unnormalised_coverages[best_hit_genus] = genus_to_coverage[best_hit_genus]
@@ -505,6 +514,10 @@ def best_hit_genera_from_otu(otu):
                 #     continue
 
                 for tax, unnormalised_coverage in unnormalised_coverages.items():
+                    # So, there will be tax identified as Bacteria here,
+                    # and also, you have Bacteria in your taxonomy references. 
+                    if "Bacteria" in tax:
+                        logging.info("Assigning coverage for OTU {} to genus {}: unnormalised coverage {}, total coverage {}, otu coverage {}".format(otu, tax, unnormalised_coverage, total_coverage, otu.coverage))
                     # Record the total for each gene so a trimmed mean can be taken afterwards
                     if tax not in next_genus_to_gene_to_coverage:
                         next_genus_to_gene_to_coverage[tax] = {}
@@ -514,19 +527,15 @@ def best_hit_genera_from_otu(otu):
                     
             # Calculate the trimmed mean for each genus
             next_genus_to_coverage = {}
+            # breakpoint()
             for tax, gene_to_coverage in next_genus_to_gene_to_coverage.items():
                 if avg_num_genes_per_species is not None:
                     num_markers = avg_num_genes_per_species
                 else:
                     num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')])
                 logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values()))
-                try:
-                    trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent)
-                    next_genus_to_coverage[tax] = trimmed_mean
-                except:
-                    # breakpoint()
-                    print(f"When calculating trimmed mean for tax {tax}, the ZeroDivisionError occurred\
-                          . Consider empty genes_per_domain which has no genes listed for the taxon's domain.", file=sys.stderr)
+                trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent)
+                next_genus_to_coverage[tax] = trimmed_mean                
 
             # Has any species changed in abundance by a large enough amount? If not, we're done
             need_another_iteration = False
@@ -591,8 +600,6 @@ def _apply_species_expectation_maximization_core(self, sample_otus, trim_percent
         best_hit_taxonomy_sets = set()
         some_em_to_do = False
         species_genes = {}
-
-
         for otu in sample_otus:
             best_hit_taxonomies = otu.equal_best_hit_taxonomies()
             if otu.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD and best_hit_taxonomies is not None:

From 0890de28ef0e88dc2d4983f83aad1422fd2224f6 Mon Sep 17 00:00:00 2001
From: Yuhao Tong <yuhtong@spartan-login1.hpc.unimelb.edu.au>
Date: Mon, 8 Dec 2025 13:02:19 +1100
Subject: [PATCH 3/4] modify condense.py based on Ben'

---
 singlem/condense.py | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/singlem/condense.py b/singlem/condense.py
index e6d05d1d..306648df 100644
--- a/singlem/condense.py
+++ b/singlem/condense.py
@@ -1,7 +1,6 @@
 import logging
 import tempfile
 import csv
-# from gReLU.build.lib.grelu.sequence.utils import trim
 import numpy as np
 import extern
 import sys
@@ -79,7 +78,7 @@ def _condense_to_otu_table(self, metapackage, **kwargs):
         logging.info("Using minimum taxon coverage of {}".format(min_taxon_coverage))
 
         markers = {} # set of markers used to the domains they target
-        target_domains = {"Archaea": [], "Bacteria": [], "Eukaryota": [], "Viruses": []}
+        target_domains = {}
         
         for spkg in metapackage.singlem_packages:
             # ensure v3 packages
@@ -89,23 +88,16 @@ def _condense_to_otu_table(self, metapackage, **kwargs):
             markers[marker_name] = spkg.target_domains()
             # count number of markers for each domain
             for domain in spkg.target_domains():
-                if domain == "Archaea":
-                    target_domains["Archaea"] += [marker_name]
-                elif domain == "Bacteria":
-                    target_domains["Bacteria"] += [marker_name]
-                elif domain == "Eukaryota":
-                    target_domains["Eukaryota"] += [marker_name]
-                elif domain == "Viruses":
-                    target_domains["Viruses"] += [marker_name]
-                else:
-                    raise Exception("Domain: {} not supported.".format(domain))
-        
-        # breakpoint()
+                if domain not in target_domains:
+                    target_domains[domain] = []
+                target_domains[domain].append(marker_name)
+                
         for domain in target_domains:
             if target_domains[domain] in [1, 2]:
                 raise Exception("Number of markers for all domains must either be >= 3 or equal to 0. Only {} markers for domain '{}' found".format(target_domains[domain], domain))
 
         for sample, sample_otus in input_otu_table.each_sample_otus(generate_archive_otu_table=True):
+
             logging.debug("Processing sample {} ..".format(sample))
             apply_diamond_expectation_maximisation = True
             yield self._condense_a_sample(sample, sample_otus, markers, target_domains, trim_percent, min_taxon_coverage, 
@@ -115,6 +107,7 @@ def _condense_a_sample(self, sample, sample_otus, markers, target_domains, trim_
             apply_query_expectation_maximisation, apply_diamond_expectation_maximisation, metapackage,
             output_after_em_otu_table, viral_mode):
 
+
         # Remove off-target OTUs genes
         logging.debug("Total OTU coverage by query: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD])))
         logging.debug("Total OTU coverage by diamond: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD])))
@@ -222,7 +215,6 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):
                     for seq_id in seq_id_list:
                         taxon_name = sequence_id_to_taxon[seq_id]
                         if not taxon_name[-2].startswith('g__'):
-                            # This part expect bacterial and archaeal sequences as on-target and eukaryotes as off-targets.
                             if not taxon_name[0] == 'd__Eukaryota':
                                 # add one check to ensure the target taxon to be d__Eukaryota for all metapackages.
                                 if all("Eukaryota" in domain for domain in target_domain):
@@ -410,8 +402,7 @@ def _remove_off_target_otus(self, sample_otus, markers):
                 raise Exception("Stopping: sample {} had too many unassigned OTUs".format(list(sample_otus)[0].sample_name))
         table.data = [otu.data for otu in sample_otus if \
             self._is_targeted_by_marker(otu, otu.taxonomy_array(), markers) and \
-            otu.taxonomy_assignment_method() is not None] # This code is removing OTUs that have unmatched taxonomy domain.
-    
+            otu.taxonomy_assignment_method() is not None]
         num_no_assignment_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is None])
         num_assigned_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is not None])
         logging.info("After removing off-target OTUs, found {:.2f} assigned and {:.2f} unassigned OTU coverage units".format(num_assigned_otus, num_no_assignment_otus))
@@ -464,13 +455,13 @@ def _apply_genus_expectation_maximization_core(self, sample_otus, trim_percent,
         def best_hit_genera_from_otu(otu):
             # The DIAMOND assignments are already truncated to genus level.
             # But the query-based ones go to species level.
-            best_hit_taxonomies = otu.equal_best_hit_taxonomies() # there is a list called EQUAL_BEST_HIT_TAXONOMIES_INDEX in ArchiveOtuTable
+            best_hit_taxonomies = otu.equal_best_hit_taxonomies()
             method = otu.taxonomy_assignment_method()
             if method == DIAMOND_ASSIGNMENT_METHOD:
                 best_hit_genera = best_hit_taxonomies
             elif method in (QUERY_BASED_ASSIGNMENT_METHOD, QUERY_BASED_ASSIGNMENT_METHOD+'_abandoned'):
                 best_hit_genera = set(
-                    [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) # so Cyanobacteria are from here. 
+                    [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies])
             else:
                 raise Exception("Unexpected taxonomy assignment method: {}".format(otu.taxonomy_assignment_method()))
             return best_hit_genera
@@ -505,6 +496,7 @@ def best_hit_genera_from_otu(otu):
             for otu in sample_otus:
                 unnormalised_coverages = {}
                 best_hit_genera = best_hit_genera_from_otu(otu)
+
                 for best_hit_genus in best_hit_genera:
                     if best_hit_genus in genus_to_coverage: # Can get removed during trimmed mean
                         unnormalised_coverages[best_hit_genus] = genus_to_coverage[best_hit_genus]
@@ -514,10 +506,6 @@ def best_hit_genera_from_otu(otu):
                 #     continue
 
                 for tax, unnormalised_coverage in unnormalised_coverages.items():
-                    # So, there will be tax identified as Bacteria here,
-                    # and also, you have Bacteria in your taxonomy references. 
-                    if "Bacteria" in tax:
-                        logging.info("Assigning coverage for OTU {} to genus {}: unnormalised coverage {}, total coverage {}, otu coverage {}".format(otu, tax, unnormalised_coverage, total_coverage, otu.coverage))
                     # Record the total for each gene so a trimmed mean can be taken afterwards
                     if tax not in next_genus_to_gene_to_coverage:
                         next_genus_to_gene_to_coverage[tax] = {}
@@ -527,7 +515,6 @@ def best_hit_genera_from_otu(otu):
                     
             # Calculate the trimmed mean for each genus
             next_genus_to_coverage = {}
-            # breakpoint()
             for tax, gene_to_coverage in next_genus_to_gene_to_coverage.items():
                 if avg_num_genes_per_species is not None:
                     num_markers = avg_num_genes_per_species
@@ -535,7 +522,7 @@ def best_hit_genera_from_otu(otu):
                     num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')])
                 logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values()))
                 trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent)
-                next_genus_to_coverage[tax] = trimmed_mean                
+                next_genus_to_coverage[tax] = trimmed_mean
 
             # Has any species changed in abundance by a large enough amount? If not, we're done
             need_another_iteration = False
@@ -600,6 +587,8 @@ def _apply_species_expectation_maximization_core(self, sample_otus, trim_percent
         best_hit_taxonomy_sets = set()
         some_em_to_do = False
         species_genes = {}
+
+
         for otu in sample_otus:
             best_hit_taxonomies = otu.equal_best_hit_taxonomies()
             if otu.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD and best_hit_taxonomies is not None:

From e8b9933131c8e26b2bacd0392a3b40589855953b Mon Sep 17 00:00:00 2001
From: Yuhao Tong <yuhtong@spartan-login1.hpc.unimelb.edu.au>
Date: Tue, 9 Dec 2025 16:32:21 +1100
Subject: [PATCH 4/4] Modify the otu writing loop by Ben's advice, tested and
 passed on a sample read data

---
 singlem/condense.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/singlem/condense.py b/singlem/condense.py
index 306648df..3b3c373d 100644
--- a/singlem/condense.py
+++ b/singlem/condense.py
@@ -215,14 +215,15 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):
                     for seq_id in seq_id_list:
                         taxon_name = sequence_id_to_taxon[seq_id]
                         if not taxon_name[-2].startswith('g__'):
-                            if not taxon_name[0] == 'd__Eukaryota':
-                                # add one check to ensure the target taxon to be d__Eukaryota for all metapackages.
-                                if all("Eukaryota" in domain for domain in target_domain):
-                                    # then the bacterial/archaeal sequences are off-target this time. 
-                                    logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name))
-                                    continue
-                                else:
-                                    raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
+                            if not taxon_name[0] in target_domain:
+                                # # add one check to ensure the target taxon to be d__Eukaryota for all metapackages.
+                                # if all("Eukaryota" in domain for domain in target_domain):
+                                #     # then the bacterial/archaeal sequences are off-target this time. 
+                                #     logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name))
+                                #     continue
+                                # else:
+                                #     raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
+                                continue
                             else:
                                 # This can happen when taxonomy is overall
                                 # Archaea so not previously filtered out, but