Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions singlem/supplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def generate_new_singlem_package(myargs):


def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matched_protein_sequences,
working_directory, hmmsearch_evalue, concatenated_hmms):
working_directory, hmmsearch_evalue, concatenated_hmms, hmm_name_to_package):
total_num_transcripts = 0
failure_genomes = 0
num_transcriptomes = 0
Expand Down Expand Up @@ -414,9 +414,12 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matc
num_printed += 1

if output_matched_protein_sequences:
transcript_to_hmm = {}
transcript_to_package = {}
for row in df3.rows(named=True):
transcript_to_hmm[row['transcript']] = row['hmm']
hmm_name = row['hmm']
if hmm_name not in hmm_name_to_package:
raise Exception("Unexpected HMM name {} not found in package mapping".format(hmm_name))
transcript_to_package[row['transcript']] = hmm_name_to_package[hmm_name]

num_printed_proteins = 0
with open(tranp.protein_fasta) as g:
Expand All @@ -425,7 +428,7 @@ def run_hmmsearch_on_one_genome(lock, data, matched_transcripts_fna, output_matc
for name, seq, _ in SeqReader().readfq(g):
if name in matched_transcript_ids:
genome_basename = remove_extension(os.path.basename(genome))
new_name = genome_basename + '‡' + name + '‡' + transcript_to_hmm[name] # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
new_name = genome_basename + '‡' + name + '‡' + transcript_to_package[name] # Use ‡ to separate genome and original ID, must be kept in check with elsewhere in the code
print('>' + new_name + '\n' + seq, file=f)
num_printed_proteins += 1
logging.debug("Printed {} proteins for {}".format(num_printed_proteins, genome))
Expand All @@ -448,9 +451,22 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
# Create concatenated HMMs in working_directory/concatenated_alignment_hmms.hmm
concatenated_hmms = os.path.join(working_directory, 'concatenated_alignment_hmms.hmm')
num_hmms = 0
hmm_name_to_package = {}
with open(concatenated_hmms, 'w') as f:
for spkg in old_metapackage.singlem_packages:
with open(spkg.graftm_package().alignment_hmm_path()) as g:
hmm_path = spkg.graftm_package().alignment_hmm_path()
hmm_name = None
with open(hmm_path) as g:
for line in g:
if line.startswith('NAME'):
hmm_name = line.split()[1]
break
if hmm_name is None:
raise Exception("Unable to find HMM NAME in {}".format(hmm_path))
if hmm_name in hmm_name_to_package:
raise Exception("Duplicate HMM NAME {} found for {}".format(hmm_name, hmm_path))
Comment on lines +466 to +467

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Allow metapackages with shared HMM NAMEs

The new duplicate check raises an exception whenever two packages share the same HMM NAME. Metapackages can legitimately include multiple SingleM packages that reuse the same GraftM alignment HMM (e.g., different window sizes or taxonomy variants backed by the same align_hmm), which means their NAME fields collide; in that case supplement will now abort before running hmmsearch even though it previously worked. If you need the package basename in matched protein IDs, consider disambiguating by rewriting HMM NAME entries to include the package basename (or falling back to the original HMM name when duplicates exist) instead of failing on duplicates.

Useful? React with 👍 / 👎.

hmm_name_to_package[hmm_name] = spkg.graftm_package_basename()
with open(hmm_path) as g:
f.write(g.read())
num_hmms += 1

Expand Down Expand Up @@ -481,7 +497,7 @@ def gather_hmmsearch_results(num_threads, working_directory, old_metapackage, ne
with get_context('spawn').Pool(num_threads) as pool:
map_result = pool.imap_unordered(
_run_hmmsearch_on_one_genome_star,
[(lock, data, matched_transcripts_fna, output_matched_protein_sequences, working_directory, hmmsearch_evalue, concatenated_hmms)
[(lock, data, matched_transcripts_fna, output_matched_protein_sequences, working_directory, hmmsearch_evalue, concatenated_hmms, hmm_name_to_package)
for data in new_genome_transcripts_and_proteins.items()],
chunksize=1)

Expand Down
6 changes: 5 additions & 1 deletion test/test_supplement.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ def test_output_matched_protein_sequences(self):
)
extern.run(cmd)

mpkg = Metapackage.acquire(f"{path_to_data}/4.11.22seqs.gpkg.spkg.smpkg/")
package_basenames = {spkg.graftm_package_basename() for spkg in mpkg.singlem_packages}

with open(protein_fasta) as f:
input_protein_names = {name for name, _, _ in SeqReader().readfq(f)}

Expand All @@ -115,12 +118,13 @@ def test_output_matched_protein_sequences(self):

self.assertGreater(len(matched_records), 0)
for name, _, _ in matched_records:
genome_name, protein_name, hmm = name.split('‡', 2)
genome_name, protein_name, package_basename = name.split('‡', 2)
self.assertEqual(
genome_name,
'GCA_011373445.1_genomic.mutated93_ms.manually_added_nongaps',
)
self.assertIn(protein_name, input_protein_names)
self.assertIn(package_basename, package_basenames)

@pytest.mark.expensive
def test_auto_taxonomy(self):
Expand Down