Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions singlem/condense.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _condense_to_otu_table(self, metapackage, **kwargs):
logging.info("Using minimum taxon coverage of {}".format(min_taxon_coverage))

markers = {} # set of markers used to the domains they target
target_domains = {"Archaea": [], "Bacteria": [], "Eukaryota": [], "Viruses": []}
target_domains = {}

for spkg in metapackage.singlem_packages:
# ensure v3 packages
Expand All @@ -88,16 +88,9 @@ def _condense_to_otu_table(self, metapackage, **kwargs):
markers[marker_name] = spkg.target_domains()
# count number of markers for each domain
for domain in spkg.target_domains():
if domain == "Archaea":
target_domains["Archaea"] += [marker_name]
elif domain == "Bacteria":
target_domains["Bacteria"] += [marker_name]
elif domain == "Eukaryota":
target_domains["Eukaryota"] += [marker_name]
elif domain == "Viruses":
target_domains["Viruses"] += [marker_name]
else:
raise Exception("Domain: {} not supported.".format(domain))
if domain not in target_domains:
target_domains[domain] = []
target_domains[domain].append(marker_name)

for domain in target_domains:
if target_domains[domain] in [1, 2]:
Expand Down Expand Up @@ -199,6 +192,8 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):

num_otus_changed = 0
sequence_ids = set()

target_domain = [metapackage.singlem_packages[i].target_domains() for i in range(len(metapackage.singlem_packages))]
# Step 1: Gather dictionary of sequence IDs to taxon strings
for otu in sample_otus:
if otu.taxonomy_assignment_method() == DIAMOND_ASSIGNMENT_METHOD:
Expand All @@ -220,8 +215,15 @@ def _convert_diamond_best_hit_ids_to_taxonomies(self, metapackage, sample_otus):
for seq_id in seq_id_list:
taxon_name = sequence_id_to_taxon[seq_id]
if not taxon_name[-2].startswith('g__'):
if not taxon_name[0] == 'd__Eukaryota':
raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
if not taxon_name[0] in target_domain:
# # add one check to ensure the target taxon to be d__Eukaryota for all metapackages.
# if all("Eukaryota" in domain for domain in target_domain):
# # then the bacterial/archaeal sequences are off-target this time.
# logging.debug("Ignoring off-target prokaryotic taxon {}".format(taxon_name))
# continue
# else:
# raise Exception("Expected genus level taxon, but found {}, from ID {}".format(taxon_name, seq_id))
continue
else:
# This can happen when taxonomy is overall
# Archaea so not previously filtered out, but
Expand Down