From 3e00fbaa9bd3b2beff46a2d36f022f9763524e3e Mon Sep 17 00:00:00 2001 From: Yuhao Tong Date: Tue, 18 Nov 2025 17:38:54 +1100 Subject: [PATCH 1/4] Add split at calculating trimmed mean in Condenser.py and thus works for the singlem bespoke package for plastid markers. --- singlem/condense.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/singlem/condense.py b/singlem/condense.py index e1f7b285..f9128388 100644 --- a/singlem/condense.py +++ b/singlem/condense.py @@ -520,8 +520,13 @@ def best_hit_genera_from_otu(otu): else: num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')]) logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values())) - trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent) - next_genus_to_coverage[tax] = trimmed_mean + try: + trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent) + next_genus_to_coverage[tax] = trimmed_mean + except: + # breakpoint() + print(f"When calculating trimmed mean for tax {tax}, the ZeroDivisionError occurred\ + . Consider empty genes_per_domain which has no genes listed for the taxon's domain.", file=sys.stderr) # Has any species changed in abundance by a large enough amount? If not, we're done need_another_iteration = False From 4a7b67b14cd0d2f8fe58bd218e221eeb1f58439f Mon Sep 17 00:00:00 2001 From: Yuhao Tong Date: Thu, 4 Dec 2025 14:39:01 +1100 Subject: [PATCH 2/4] found another spot where using eukaryote metapackage failed --- singlem/condense.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/singlem/condense.py b/singlem/condense.py index f9128388..e6d05d1d 100644 --- a/singlem/condense.py +++ b/singlem/condense.py @@ -1,6 +1,7 @@ import logging import tempfile import csv +# from gReLU.build.lib.grelu.sequence.utils import trim import numpy as np import extern import sys @@ -98,13 +99,13 @@ def _condense_to_otu_table(self, metapackage, **kwargs): target_domains["Viruses"] += [marker_name] else: raise Exception("Domain: {} not supported.".format(domain)) - + + # breakpoint() for domain in target_domains: if target_domains[domain] in [1, 2]: raise Exception("Number of markers for all domains must either be >= 3 or equal to 0. Only {} markers for domain '{}' found".format(target_domains[domain], domain)) for sample, sample_otus in input_otu_table.each_sample_otus(generate_archive_otu_table=True): - logging.debug("Processing sample {} ..".format(sample)) apply_diamond_expectation_maximisation = True yield self._condense_a_sample(sample, sample_otus, markers, target_domains, trim_percent, min_taxon_coverage, @@ -114,7 +115,6 @@ def _condense_a_sample(self, sample, sample_otus, markers, target_domains, trim_ apply_query_expectation_maximisation, apply_diamond_expectation_maximisation, metapackage, output_after_em_otu_table, viral_mode): - # Remove off-target OTUs genes logging.debug("Total OTU coverage by query: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD]))) logging.debug("Total OTU coverage by diamond: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD]))) @@ -199,6 +199,8 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus): num_otus_changed = 0 sequence_ids = set() + + target_domain = [metapackage.singlem_packages[i].target_domains() for i in range(len(metapackage.singlem_packages))] # Step 1: Gather dictionary of sequence IDs to taxon strings for otu in sample_otus: if otu.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD: @@ -220,8 +222,15 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus): for seq_id in seq_id_list: taxon_name = sequence_id_to_taxon[seq_id] if not taxon_name[-2].startswith('g__'): + # This part expect bacterial and archaeal sequences as on-target and eukaryotes as off-targets. if not taxon_name[0] == 'd__Eukaryota': - raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id)) + # add one check to ensure the target taxon to be d__Eukaryota for all metapackages. + if all("Eukaryota" in domain for domain in target_domain): + # then the bacterial/archaeal sequences are off-target this time. + logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name)) + continue + else: + raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id)) else: # This can happen when taxonomy is overall # Archaea so not previously filtered out, but @@ -401,7 +410,8 @@ def _remove_off_target_otus(self, sample_otus, markers): raise Exception("Stopping: sample {} had too many unassigned OTUs".format(list(sample_otus)[0].sample_name)) table.data = [otu.data for otu in sample_otus if \ self._is_targeted_by_marker(otu, otu.taxonomy_array(), markers) and \ - otu.taxonomy_assignment_method() is not None] + otu.taxonomy_assignment_method() is not None] # This code is removing OTUs that have unmatched taxonomy domain. + num_no_assignment_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is None]) num_assigned_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is not None]) logging.info("After removing off-target OTUs, found {:.2f} assigned and {:.2f} unassigned OTU coverage units".format(num_assigned_otus, num_no_assignment_otus)) @@ -454,13 +464,13 @@ def _apply_genus_expectation_maximization_core(self, sample_otus, trim_percent, def best_hit_genera_from_otu(otu): # The DIAMOND assignments are already truncated to genus level. # But the query-based ones go to species level. - best_hit_taxonomies = otu.equal_best_hit_taxonomies() + best_hit_taxonomies = otu.equal_best_hit_taxonomies() # there is a list called EQUAL_BEST_HIT_TAXONOMIES_INDEX in ArchiveOtuTable method = otu.taxonomy_assignment_method() if method == DIAMOND_ASSIGNMENT_METHOD: best_hit_genera = best_hit_taxonomies elif method in (QUERY_BASED_ASSIGNMENT_METHOD, QUERY_BASED_ASSIGNMENT_METHOD+'_abandoned'): best_hit_genera = set( - [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) + [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) # so Cyanobacteria are from here. else: raise Exception("Unexpected taxonomy assignment method: {}".format(otu.taxonomy_assignment_method())) return best_hit_genera @@ -495,7 +505,6 @@ def best_hit_genera_from_otu(otu): for otu in sample_otus: unnormalised_coverages = {} best_hit_genera = best_hit_genera_from_otu(otu) - for best_hit_genus in best_hit_genera: if best_hit_genus in genus_to_coverage: # Can get removed during trimmed mean unnormalised_coverages[best_hit_genus] = genus_to_coverage[best_hit_genus] @@ -505,6 +514,10 @@ def best_hit_genera_from_otu(otu): # continue for tax, unnormalised_coverage in unnormalised_coverages.items(): + # So, there will be tax identified as Bacteria here, + # and also, you have Bacteria in your taxonomy references. + if "Bacteria" in tax: + logging.info("Assigning coverage for OTU {} to genus {}: unnormalised coverage {}, total coverage {}, otu coverage {}".format(otu, tax, unnormalised_coverage, total_coverage, otu.coverage)) # Record the total for each gene so a trimmed mean can be taken afterwards if tax not in next_genus_to_gene_to_coverage: next_genus_to_gene_to_coverage[tax] = {} @@ -514,19 +527,15 @@ def best_hit_genera_from_otu(otu): # Calculate the trimmed mean for each genus next_genus_to_coverage = {} + # breakpoint() for tax, gene_to_coverage in next_genus_to_gene_to_coverage.items(): if avg_num_genes_per_species is not None: num_markers = avg_num_genes_per_species else: num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')]) logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values())) - try: - trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent) - next_genus_to_coverage[tax] = trimmed_mean - except: - # breakpoint() - print(f"When calculating trimmed mean for tax {tax}, the ZeroDivisionError occurred\ - . Consider empty genes_per_domain which has no genes listed for the taxon's domain.", file=sys.stderr) + trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent) + next_genus_to_coverage[tax] = trimmed_mean # Has any species changed in abundance by a large enough amount? If not, we're done need_another_iteration = False @@ -591,8 +600,6 @@ def _apply_species_expectation_maximization_core(self, sample_otus, trim_percent best_hit_taxonomy_sets = set() some_em_to_do = False species_genes = {} - - for otu in sample_otus: best_hit_taxonomies = otu.equal_best_hit_taxonomies() if otu.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD and best_hit_taxonomies is not None: From 0890de28ef0e88dc2d4983f83aad1422fd2224f6 Mon Sep 17 00:00:00 2001 From: Yuhao Tong Date: Mon, 8 Dec 2025 13:02:19 +1100 Subject: [PATCH 3/4] modify condense.py based on Ben' --- singlem/condense.py | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/singlem/condense.py b/singlem/condense.py index e6d05d1d..306648df 100644 --- a/singlem/condense.py +++ b/singlem/condense.py @@ -1,7 +1,6 @@ import logging import tempfile import csv -# from gReLU.build.lib.grelu.sequence.utils import trim import numpy as np import extern import sys @@ -79,7 +78,7 @@ def _condense_to_otu_table(self, metapackage, **kwargs): logging.info("Using minimum taxon coverage of {}".format(min_taxon_coverage)) markers = {} # set of markers used to the domains they target - target_domains = {"Archaea": [], "Bacteria": [], "Eukaryota": [], "Viruses": []} + target_domains = {} for spkg in metapackage.singlem_packages: # ensure v3 packages @@ -89,23 +88,16 @@ def _condense_to_otu_table(self, metapackage, **kwargs): markers[marker_name] = spkg.target_domains() # count number of markers for each domain for domain in spkg.target_domains(): - if domain == "Archaea": - target_domains["Archaea"] += [marker_name] - elif domain == "Bacteria": - target_domains["Bacteria"] += [marker_name] - elif domain == "Eukaryota": - target_domains["Eukaryota"] += [marker_name] - elif domain == "Viruses": - target_domains["Viruses"] += [marker_name] - else: - raise Exception("Domain: {} not supported.".format(domain)) - - # breakpoint() + if domain not in target_domains: + target_domains[domain] = [] + target_domains[domain].append(marker_name) + for domain in target_domains: if target_domains[domain] in [1, 2]: raise Exception("Number of markers for all domains must either be >= 3 or equal to 0. Only {} markers for domain '{}' found".format(target_domains[domain], domain)) for sample, sample_otus in input_otu_table.each_sample_otus(generate_archive_otu_table=True): + logging.debug("Processing sample {} ..".format(sample)) apply_diamond_expectation_maximisation = True yield self._condense_a_sample(sample, sample_otus, markers, target_domains, trim_percent, min_taxon_coverage, @@ -115,6 +107,7 @@ def _condense_a_sample(self, sample, sample_otus, markers, target_domains, trim_ apply_query_expectation_maximisation, apply_diamond_expectation_maximisation, metapackage, output_after_em_otu_table, viral_mode): + # Remove off-target OTUs genes logging.debug("Total OTU coverage by query: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD]))) logging.debug("Total OTU coverage by diamond: {}".format(sum([o.coverage for o in sample_otus if o.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD]))) @@ -222,7 +215,6 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus): for seq_id in seq_id_list: taxon_name = sequence_id_to_taxon[seq_id] if not taxon_name[-2].startswith('g__'): - # This part expect bacterial and archaeal sequences as on-target and eukaryotes as off-targets. if not taxon_name[0] == 'd__Eukaryota': # add one check to ensure the target taxon to be d__Eukaryota for all metapackages. if all("Eukaryota" in domain for domain in target_domain): @@ -410,8 +402,7 @@ def _remove_off_target_otus(self, sample_otus, markers): raise Exception("Stopping: sample {} had too many unassigned OTUs".format(list(sample_otus)[0].sample_name)) table.data = [otu.data for otu in sample_otus if \ self._is_targeted_by_marker(otu, otu.taxonomy_array(), markers) and \ - otu.taxonomy_assignment_method() is not None] # This code is removing OTUs that have unmatched taxonomy domain. - + otu.taxonomy_assignment_method() is not None] num_no_assignment_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is None]) num_assigned_otus = sum([otu.data[ArchiveOtuTable.COVERAGE_FIELD_INDEX] for otu in table if otu.taxonomy_assignment_method() is not None]) logging.info("After removing off-target OTUs, found {:.2f} assigned and {:.2f} unassigned OTU coverage units".format(num_assigned_otus, num_no_assignment_otus)) @@ -464,13 +455,13 @@ def _apply_genus_expectation_maximization_core(self, sample_otus, trim_percent, def best_hit_genera_from_otu(otu): # The DIAMOND assignments are already truncated to genus level. # But the query-based ones go to species level. - best_hit_taxonomies = otu.equal_best_hit_taxonomies() # there is a list called EQUAL_BEST_HIT_TAXONOMIES_INDEX in ArchiveOtuTable + best_hit_taxonomies = otu.equal_best_hit_taxonomies() method = otu.taxonomy_assignment_method() if method == DIAMOND_ASSIGNMENT_METHOD: best_hit_genera = best_hit_taxonomies elif method in (QUERY_BASED_ASSIGNMENT_METHOD, QUERY_BASED_ASSIGNMENT_METHOD+'_abandoned'): best_hit_genera = set( - [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) # so Cyanobacteria are from here. + [';'.join([s.strip() for s in taxon.split(';')[:-1]]) for taxon in best_hit_taxonomies]) else: raise Exception("Unexpected taxonomy assignment method: {}".format(otu.taxonomy_assignment_method())) return best_hit_genera @@ -505,6 +496,7 @@ def best_hit_genera_from_otu(otu): for otu in sample_otus: unnormalised_coverages = {} best_hit_genera = best_hit_genera_from_otu(otu) + for best_hit_genus in best_hit_genera: if best_hit_genus in genus_to_coverage: # Can get removed during trimmed mean unnormalised_coverages[best_hit_genus] = genus_to_coverage[best_hit_genus] @@ -514,10 +506,6 @@ def best_hit_genera_from_otu(otu): # continue for tax, unnormalised_coverage in unnormalised_coverages.items(): - # So, there will be tax identified as Bacteria here, - # and also, you have Bacteria in your taxonomy references. - if "Bacteria" in tax: - logging.info("Assigning coverage for OTU {} to genus {}: unnormalised coverage {}, total coverage {}, otu coverage {}".format(otu, tax, unnormalised_coverage, total_coverage, otu.coverage)) # Record the total for each gene so a trimmed mean can be taken afterwards if tax not in next_genus_to_gene_to_coverage: next_genus_to_gene_to_coverage[tax] = {} @@ -527,7 +515,6 @@ def best_hit_genera_from_otu(otu): # Calculate the trimmed mean for each genus next_genus_to_coverage = {} - # breakpoint() for tax, gene_to_coverage in next_genus_to_gene_to_coverage.items(): if avg_num_genes_per_species is not None: num_markers = avg_num_genes_per_species @@ -535,7 +522,7 @@ def best_hit_genera_from_otu(otu): num_markers = len(genes_per_domain[tax.split(';')[1].strip().replace('d__','')]) logging.debug("Using {} markers for OTU taxonomy {}, with coverages {}".format(num_markers, tax, gene_to_coverage.values())) trimmed_mean = self.calculate_abundance(list(gene_to_coverage.values()), num_markers, trim_percent) - next_genus_to_coverage[tax] = trimmed_mean + next_genus_to_coverage[tax] = trimmed_mean # Has any species changed in abundance by a large enough amount? If not, we're done need_another_iteration = False @@ -600,6 +587,8 @@ def _apply_species_expectation_maximization_core(self, sample_otus, trim_percent best_hit_taxonomy_sets = set() some_em_to_do = False species_genes = {} + + for otu in sample_otus: best_hit_taxonomies = otu.equal_best_hit_taxonomies() if otu.taxonomy_assignment_method() == QUERY_BASED_ASSIGNMENT_METHOD and best_hit_taxonomies is not None: From e8b9933131c8e26b2bacd0392a3b40589855953b Mon Sep 17 00:00:00 2001 From: Yuhao Tong Date: Tue, 9 Dec 2025 16:32:21 +1100 Subject: [PATCH 4/4] Modify the otu writing loop by Ben's advice, tested and passed on a sample read data --- singlem/condense.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/singlem/condense.py b/singlem/condense.py index 306648df..3b3c373d 100644 --- a/singlem/condense.py +++ b/singlem/condense.py @@ -215,14 +215,15 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus): for seq_id in seq_id_list: taxon_name = sequence_id_to_taxon[seq_id] if not taxon_name[-2].startswith('g__'): - if not taxon_name[0] == 'd__Eukaryota': - # add one check to ensure the target taxon to be d__Eukaryota for all metapackages. - if all("Eukaryota" in domain for domain in target_domain): - # then the bacterial/archaeal sequences are off-target this time. - logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name)) - continue - else: - raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id)) + if not taxon_name[0] in target_domain: + # # add one check to ensure the target taxon to be d__Eukaryota for all metapackages. + # if all("Eukaryota" in domain for domain in target_domain): + # # then the bacterial/archaeal sequences are off-target this time. + # logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name)) + # continue + # else: + # raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id)) + continue else: # This can happen when taxonomy is overall # Archaea so not previously filtered out, but