From 59b756ecfc368fc33e5ee656fbf41c93799af946 Mon Sep 17 00:00:00 2001 From: stelviopas Date: Fri, 1 Aug 2025 11:40:06 +0200 Subject: [PATCH 1/4] Add modules for downstream analysis --- mgexpose/base_logger.py | 4 + mgexpose/batch_helpers.py | 223 ++++++++++++++++ mgexpose/clean_workdir.py | 49 ++++ mgexpose/clustering_parser.py | 2 +- mgexpose/db.py | 156 +++++++++++ mgexpose/downstream.py | 448 ++++++++++++++++++++++++++++++++ mgexpose/gene.py | 29 +-- mgexpose/gene_annotator.py | 24 +- mgexpose/get_cluster_data.py | 67 +++++ mgexpose/get_db_seqs.py | 109 ++++++++ mgexpose/get_eggnog.py | 52 ++++ mgexpose/get_eggnog_f13.py | 74 ++++++ mgexpose/gffio.py | 58 ++++- mgexpose/handle_args.py | 11 +- mgexpose/island_processing.py | 8 +- mgexpose/islands.py | 219 ++++++++-------- mgexpose/mge_annotation.py | 317 ++++++++++++++++++++++ mgexpose/parse_hmmsearch.py | 98 +++++++ mgexpose/query_db.py | 279 ++++++++++++++++++++ mgexpose/readers.py | 26 +- mgexpose/recombinases.py | 8 +- mgexpose/test_mge_annotation.py | 129 +++++++++ 22 files changed, 2190 insertions(+), 200 deletions(-) create mode 100644 mgexpose/base_logger.py create mode 100644 mgexpose/batch_helpers.py create mode 100644 mgexpose/clean_workdir.py create mode 100644 mgexpose/db.py create mode 100644 mgexpose/downstream.py create mode 100644 mgexpose/get_cluster_data.py create mode 100644 mgexpose/get_db_seqs.py create mode 100644 mgexpose/get_eggnog.py create mode 100644 mgexpose/get_eggnog_f13.py create mode 100644 mgexpose/mge_annotation.py create mode 100644 mgexpose/parse_hmmsearch.py create mode 100644 mgexpose/query_db.py create mode 100644 mgexpose/test_mge_annotation.py diff --git a/mgexpose/base_logger.py b/mgexpose/base_logger.py new file mode 100644 index 0000000..47b94df --- /dev/null +++ b/mgexpose/base_logger.py @@ -0,0 +1,4 @@ +import logging + +logger = logging +logger.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) \ No newline at end of file diff --git 
a/mgexpose/batch_helpers.py b/mgexpose/batch_helpers.py new file mode 100644 index 0000000..b764d9d --- /dev/null +++ b/mgexpose/batch_helpers.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# pylint: disable=R0912,R0914,R0915 +''' Collection of functions to collect MGEs from batches of files''' +import os +from collections import Counter, defaultdict +import json + +from dask.distributed import Client, progress, WorkerPlugin +import dask.bag as db +from dask.bag import from_delayed +from dask.delayed import delayed +import dask + +from gffio import read_mge_genomic_islands_gff + +from base_logger import logger +import traceback + + +# Create a dictionary: genome: list of MGE IDs. This is needed to filter out only relevant MGEs per genome.The input is a corresponding list (same order) of genomes and MGE IDs. +def get_genome2mges(genomes, mges): + genome2mge_id = {} + for id, genome_id in zip(mges, genomes): + if genome_id not in genome2mge_id: + genome2mge_id[genome_id] = [] # Initialize the list + genome2mge_id[genome_id].append(id) + return genome2mge_id + +# Helper function to extract genome ID/bin ID from file path +def get_genome_id_from_path(path): + """ + Extract genome ID from a file path. + """ + genome_id = None + try: + genome_id = path.split("/")[-2] + except Exception as e: + logger.error(f"Error extracting genome/bin ID{path}: {e}") + return genome_id + +def collect_batch_mges(gff_paths, i, relevant_ids=None): + """ + Collect MGEs from a batch of GFF files. + + Parameters: + - gff_paths: List of GFF file paths. + - i: Index of the batch. + - relevant_ids: Optional dictionary of relevant MGE IDs per genome ID. + + Returns: + - List of MGE islands for all files in the batch. 
+ """ + islands = [] + for gff_path in gff_paths: + genome_id = get_genome_id_from_path(gff_path) + #logger.info(f"Processing genome: {genome_id}") + + try: + if relevant_ids: + relevant_mges = list(read_mge_genomic_islands_gff(gff_path, relevant_ids[genome_id])) + else: + relevant_mges = list(read_mge_genomic_islands_gff(gff_path)) + + islands.extend(relevant_mges) + + except Exception as e: + logger.error(f"Error processing {gff_path}: {e}") + logger.error(traceback.format_exc()) # Full traceback + + logger.info(f"Batch {i} completed, MGE islands found: {len(islands)}") + return islands + + +def apply_per_batch(islands, funcs): + """ + Calculate statistics for a batch of MGEs using a list of functions. + + Parameters: + - islands: List of MGE objects in the batch. + - funcs: List of functions to be applied to the batch. + + Returns: + - Dictionary with function names as keys and their results as values. + """ + results = {} + for func in funcs: + func_name = func.__name__ # Get the function's name + results[func_name] = func(islands) # Apply the function and store the result + return results + +def apply_one_per_batch(islands, func): + """ + Apply a single function to a batch of MGE objects. + + Parameters: + - islands: List of MGE objects in the batch. + - func: A single function to be applied to the batch. + + Returns: + - The result of applying the function to the islands. + """ + try: + return func(islands) + except Exception as e: + raise RuntimeError(f"Error applying function '{func.__name__}' to batch: {e}") + + +def write_batch_json(batch_count, i, dir, base_filename): + """ + Saves batch statistics to a JSON file using a Dask delayed function. This allows the function to be part of a larger + Dask computation graph, potentially executed in parallel. + + Parameters: + - batch_count (dict): Batch statistics, typically a dictionary with `Counter` values. + - i (int): Index of the current batch. This is used to generate a unique filename for each batch. 
+ - dir (str): Path to the directory where the batch JSON files will be stored. + - base_filename (str): Base name for the JSON files. The batch index will be appended to this base name to create the full filename. + + Ensures that the specified directory exists before writing the file. If it does not exist, the directory will be created. + + Returns: + - A Dask delayed object which, when executed, will write the batch statistics to the specified file path in JSON format. + """ + # Ensure directory exists + if not os.path.exists(dir): + os.makedirs(dir) + + # Delayed function to write JSON + def delayed_write(path, data): + with open(path, 'w') as file: + json.dump(data, file, indent=4) + + # Construct the full path with batch number + path = os.path.join(dir, f"{base_filename}_{i}.json") + return delayed(delayed_write)(path, batch_count) + + +def write_batch_tsv(tsv_string, i, dir, base_filename): + """ + Saves a TSV-formatted string to a file as part of a Dask computation graph. + + Parameters: + - tsv_string (str): TSV-formatted data returned from a processing function. + - i (int): Index of the current batch, used to make unique filenames. + - dir (str): Directory where TSV files will be saved. + - base_filename (str): Base name for the output files. + + Returns: + - A Dask delayed object that writes the TSV to disk when executed. + """ + # Ensure the output directory exists + if not os.path.exists(dir): + os.makedirs(dir) + + # Define the write function + def delayed_write(path, content): + with open(path, 'w') as file: + file.write(content) + + # Create the output file path + path = os.path.join(dir, f"{base_filename}_{i}.tsv") + + # Return the delayed write operation + return delayed(delayed_write)(path, tsv_string) + + +def aggregate_attr(batches): + """ + Aggregate string attributes across all batches. + + Parameters: + - batches: List of batch statistics (dictionaries with str values e.g. cluster: COG category). 
+ + {'90371.SAMN11043730.GCA_007435405_02914': 'S', '90371.SAMN14863315.GCA_013264555_00909': 'S', '28150.SAMN09228819.GCA_007140965_00837': 'K', '28901.SAMN13391507.GCA_011477875_00875': 'no_cog_fcat', '1967657.SAMN09203654.GCA_010924635_01295': 'S', '28901.SAMN13057743.GCA_009231785_02754': 'no_cog_fcat', '90371.SAMN11355433.GCA_007687065_04488': 'S', '28901.SAMN06645026.GCA_009179045_04086': 'no_cog_fcat', '28901.SAMN15658059.GCA_013797405_02028': 'S', '796732.SAMN01805325.GCA_000272735_04407': 'no_cog_fcat', '28901.SAMN12571445.GCA_010939435_00055': 'S', '28901.SAMN13747386.GCA_010741235_03029': 'no_cog_fcat', '115981.SAMN14080650.GCA_011486465_01735': 'S', '28901.SAMN14341880.GCA_011465135_00878': 'H', '1151002.SAMN09403228.GCA_004177825_01940': 'no_cog_fcat', '1029990.SAMN02415182.GCA_000484355_00589': 'S', '28901.SAMN12287151.GCA_007468615_01067': 'S', '28901.SAMN13057273.GCA_009230165_00580': 'no_cog_fcat', '611.SAMN21335643.GCA_019899165_00890': 'EH', '28901.SAMN10095790.GCA_005443695_02309': 'no_cog_fcat', '340190.SAMN15147492.GCA_013661605_01048': 'S', '224729.SAMN19336595.GCA_018502705_02901': 'no_cog_fcat', '28901.SAMN16355443.GCA_015155595_01828': 'no_cog_fcat', '59201.SAMN10093771.GCA_007777665_02600': 'no_cog_fcat', '59201.SAMN17835677.GCA_017072195_01803': 'S'} + +2025-01-31 18:35:27,669 - {'611.SAMN17086052.GCA_016740915_04210': 'K', '611.SAMN07152477.GCA_007233055_03707': 'S', '1173835.SAMN01088029.GCA_000962395_02473': 'L', '1620419.SAMN03894126.GCA_001241425_04679': 'T', '568709.SAMEA2272227.GCA_000493535_02720': 'no_cog_fcat', '28901.SAMN18448990.GCA_017574325_01595': 'L', '90371.SAMEA6057931.GCA_016228905_04588': 'L', '28901.SAMN10177571.GCA_005772365_02802': 'N', '28901.SAMN14050865.GCA_011246635_04142': 'G', '90371.SAMN09387768.GCA_007158225_04690': 'N', '28144.SAMN07734943.GCA_003548115_02128': 'no_cog_fcat', '90105.SAMN09474912.GCA_004184575_03995': 'G', '59201.SAMN10756627.GCA_007583145_03925': 'S', 
'90371.SAMN03169328.GCA_008018515_03842': 'K', '1620419.SAMN04255380.GCA_010457935_04445': 'no_cog_fcat', '28901.SAMN16124589.GCA_014542005_01530': 'G', '28901.SAMN17005521.GCA_015838815_01302': 'no_cog_fcat', '28901.SAMN19285790.GCA_018468945_04349': 'G', '28901.SAMN10425133.GCA_010255835_04194': 'S', '28901.SAMN12344366.GCA_007726245_00622': 'S', '28901.SAMN12107692.GCA_006482085_01260': 'S', '440524.SAMN02867573.GCA_010663445_04243': 'M', '28901.SAMN20181473.GCA_020012815_04294': 'L', '28901.SAMEA6514879.GCA_011786425_00695': 'G', '90371.SAMN07279560.GCA_002260995_02309': 'K', '90371.SAMN19798225.GCA_018997055_00491': 'L', '28901.SAMN12823265.GCA_008717395_04569': 'V', '1173837.SAMN01088030.GCA_000962405_02064': 'L', '399584.SAMN13050934.GCA_009225065_04346': 'G', '28901.SAMN13057273.GCA_009230165_01317': 'no_eggnog'} + + Returns: + - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: COG_category}}} + """ + aggregated = {} + + for batch in batches: + for func_name, mge_dict in batch.items(): + aggregated[func_name] = {} + for mge_type, attr_dict in mge_dict.items(): + if mge_type not in aggregated[func_name]: + aggregated[func_name][mge_type] = defaultdict(list) # TODO: Generalise to function + # Update using the contents of the nested dictionary + for cluster_id, value in attr_dict.items(): + aggregated[func_name][mge_type][cluster_id] = value # TODO: replace with proper majority vote + # flatten the COG value per batch + return aggregated + +def aggregate_counts(batch_counts): + """ + Aggregate statistics across all batches. + + Parameters: + - batch_counts: List of batch statistics (dictionaries with Counter values). 
+ + Returns: + - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: count}}} + """ + aggregated = {} + + for batch in batch_counts: + for func_name, mge_counter in batch.items(): + aggregated[func_name] = {} + for mge_type, nested_counter in mge_counter.items(): + if mge_type not in aggregated[func_name]: + aggregated[func_name][mge_type] = Counter() # Initialize if not already present + # Update using the contents of the nested dictionary + for cluster_id, value in nested_counter.items(): + if isinstance(value, str): + aggregated[func_name][mge_type][cluster_id] = value # Overwrite previous COG category with the latest batch + else: + if cluster_id in aggregated[func_name][mge_type]: + aggregated[func_name][mge_type][cluster_id] += value + else: + aggregated[func_name][mge_type][cluster_id] = value + return aggregated + diff --git a/mgexpose/clean_workdir.py b/mgexpose/clean_workdir.py new file mode 100644 index 0000000..56c89ff --- /dev/null +++ b/mgexpose/clean_workdir.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +""" Module to clean nextflow workdir """ + +import argparse +import csv +import os +import re + + +def main(): + """ Main. 
""" + ap = argparse.ArgumentParser() + ap.add_argument("workdir") + ap.add_argument("tracefile") + args = ap.parse_args() + + with open(args.tracefile, "rt", encoding="UTF-8") as _in: + keep = set( + row["hash"] + for row in csv.DictReader(_in, delimiter="\t") + if row["status"] in ("CACHED", "COMPLETED") + ) + + # level2 = args.workdir.rstrip("/").count("/") + 2 + walk = os.walk(args.workdir) + wd, dirs, _ = next(walk) + for d in dirs: + if re.match(r'^[0-9a-f]{2}$', d): + _, subdirs, _ = next(os.walk(os.path.join(wd, d))) + for sd in subdirs: + wd_hash = f"{d}/{sd[:6]}" + if wd_hash in keep: + print(f"keeping {os.path.join((wd, d, sd))}") + + # for wd, _, _ in os.walk(args.workdir): + # if wd.count("/") == level2: + # # 03/717392 + # # work/81/9299777e91fb27fb6626980719cf1f + # wd_hash = wd[len(args.workdir.rstrip("/")) + 1:][:9] + # if wd_hash not in keep: + # # print(f"removing {wd}") + # ... + # else: + # print(f"keeping {wd}") + + +if __name__ == "__main__": + main() diff --git a/mgexpose/clustering_parser.py b/mgexpose/clustering_parser.py index 57a3979..e767efd 100644 --- a/mgexpose/clustering_parser.py +++ b/mgexpose/clustering_parser.py @@ -9,7 +9,7 @@ from contextlib import nullcontext from dataclasses import dataclass -from .chunk_reader import get_lines_from_chunks +from chunk_reader import get_lines_from_chunks logger = logging.getLogger(__name__) diff --git a/mgexpose/db.py b/mgexpose/db.py new file mode 100644 index 0000000..b821dc3 --- /dev/null +++ b/mgexpose/db.py @@ -0,0 +1,156 @@ +# pylint: disable=W2301,R0903,E1101,C0103 + +""" Functions for database access. """ + +import json +import random +import time + +from functools import lru_cache + +from sqlalchemy import create_engine, MetaData, Table +from sqlalchemy.orm import mapper, sessionmaker, registry +from sqlalchemy.exc import OperationalError + + +class DbGene: + """ Placeholder Gene class""" + ... + + +class DbGeneCluster: + """ Placeholder GeneCluster class""" + ... 
+ + +class DbEmapperResult: + """ Emapper results class""" + HEADERS = [ + "#query", + "seed_ortholog", + "evalue", + "score", + "eggNOG_OGs", + "max_annot_lvl", + "COG_category", + "Description", + "Preferred_name", + "GOs", + "EC", + "KEGG_ko", + "KEGG_Pathway", + "KEGG_Module", + "KEGG_Reaction", + "KEGG_rclass", + "BRITE", + "KEGG_TC", + "CAZy", + "BiGG_Reaction", + ] + + def __str__(self): + """ String representation. """ + return "\t".join(str(v) for k, v in self.__dict__.items() if k != "project_id") + + +def read_db_details(f): + """ Reads database credentials from JSON file. """ + with open(f, "rt", encoding="UTF-8") as _in: + return json.load(_in) + + +@lru_cache(maxsize=10000) +def get_cluster(db_session, cluster_id): + """ Queries GeneCluster table by cluster_id. """ + cluster = ( + db_session.query(DbGeneCluster).filter(DbGeneCluster.id == cluster_id).one_or_none() + ) + return cluster + + +def get_gene(db_session, gene_id): + """ Queries Gene table by gene_id. """ + gene = ( + db_session.query(DbGene).filter(DbGene.id == gene_id).one_or_none() + ) + return gene + + +def db_available(cluster_id): + """ Checks if cluster data is stored in db. + (Only the largest 10 clusters are.) + """ + return int(cluster_id.replace("specI_v4_", "")) in range(9) + + +def initialise_db(db_details, db_name, cluster_id=""): + """ Initialises database connection. 
""" + + db_access = read_db_details(db_details)[db_name] + + engine = create_engine( + f"postgresql+psycopg2://{db_access['username']}:" + f"{db_access['password']}@{db_access['host']}/{db_name}" + ) + + metadata = MetaData() + + if cluster_id.lower() == "speci_v4_00000": + cluster_id = "" + + # strips "_" in case of specI_v4_00000 + gene_table_name = f"{cluster_id}_gene".strip("_") + gene_cluster_table_name = f"{cluster_id}_gene_cluster".strip("_") + + while 1: + try: + gene_table = Table( + gene_table_name, + metadata, + autoload_with=engine + ) + + gene_cluster_table = Table( + gene_cluster_table_name, + metadata, + autoload_with=engine + ) + + # [mapper(DbGene, gene_table), mapper(DbGeneCluster, gene_cluster_table)] + mapper_registry = registry() + mapper_registry.map_imperatively(DbGene, gene_table) + mapper_registry.map_imperatively(DbGeneCluster, gene_cluster_table) + + Session = sessionmaker(bind=engine) + session = Session() + except OperationalError: + time.sleep(random.randint(1, 31)) + else: + break + + return session + + +def initialise_pg3_db(db_details, db_name): + """ Initialises connection to PG3 database for emapper queries. 
""" + db_access = read_db_details(db_details)[db_name] + + engine = create_engine( + f"postgresql+psycopg2://{db_access['username']}:" + f"{db_access['password']}@{db_access['host']}/{db_name}" + ) + + metadata = MetaData(engine) + + emapper_table = Table( + "eggnog", + metadata, + autoload=True + ) + + mapper(DbEmapperResult, emapper_table) + + Session = sessionmaker(bind=engine) + session = Session() + + return session diff --git a/mgexpose/downstream.py b/mgexpose/downstream.py new file mode 100644 index 0000000..0fc86d2 --- /dev/null +++ b/mgexpose/downstream.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python +# pylint: disable=R0912,R0914,R0915 +import os + +from collections import Counter, defaultdict + +from gffio import read_mge_genomic_islands_gff + +from base_logger import logger + +def stat_nested(islands): + """Calculate statistics on nested MGEs and return as a dictionary.""" + total = 0 + nested = 0 + for island in islands: + if island.mge_type == "nested": + nested += 1 + total += 1 + nested_percentage = (nested / total * 100) if total > 0 else 0 + non_nested_percentage = ((total - nested) / total * 100) if total > 0 else 0 + + return { + "total": total, + "nested_count": nested, + "non_nested_count": total - nested, + "nested_percentage": nested_percentage, + "non_nested_percentage": non_nested_percentage, + } + + +def count_nested(islands): + """ + Calculate the count of nested MGEs. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of nested MGEs. 
+ """ + nested = sum(1 for island in islands if island.mge_type == "nested") + return nested + + +def stat_core(islands): + """Calculate statistics on MGEs in the core genome and return as a dictionary.""" + total = 0 + core = 0 + for island in islands: + if island.is_core: + core += 1 + total += 1 + core_percentage = (core / total * 100) if total > 0 else 0 + accessory_percentage = ((total - core) / total * 100) if total > 0 else 0 + + return { + "total": total, + "core_count": core, + "accessory_count": total - core, + "core_percentage": core_percentage, + "accessory_percentage": accessory_percentage, + } + + +def count_core(islands): + """ + Calculate the count of MGEs in the core genome. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of MGEs in the core genome. + """ + core = sum(1 for island in islands if island.is_core) + return core + + +def count_total_islands(islands): + """ + Count the total number of MGE islands. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of total MGE islands. + """ + return len(list(islands)) + + +def stat_mge_type(islands): + """ + Calculate counts of each MGE type and return as a Counter object. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Counter object with counts of each MGE type. 
+ """ + mge_counts = Counter() + for island in islands: + try: + if island.mge_type == "nested": + mge_counts["nested"] += 1 + else: + # Get the first key (assuming there's only one key) + mge = next(iter(island.mge.keys())) + mge_counts[mge] += 1 + except Exception as e: + raise ValueError(f"Unknown or absent MGE type: {e}") + + return mge_counts + + +def stat_mean_genes(islands): + """Calculate the mean number of genes per MGE and return as a dictionary.""" + genes_lst = [island.n_genes for island in islands] + mean_genes = (sum(genes_lst) / len(genes_lst)) if genes_lst else 0 + + # Return the result as a dictionary + return {"mean_genes_per_mge": mean_genes} + + +def extract_cargo(island): + cargo_genes = [] + for gene in island.genes: + if (gene.phage is None) and (gene.recombinase is None) and (gene.secretion_system is None): + cargo_genes.append(gene) + return cargo_genes + + +def get_kegg_ko(gene): + for key, value in gene.eggnog: + if key == "kegg_ko": + return value + + +def get_cazy(gene): + for key, value in gene.eggnog: + if key == "cazy": + return value + + +def get_cog_category(gene): + if gene.eggnog: + for key, value in gene.eggnog: + if key == "cog_fcat": + if value: + return value + else: + return '-' + else: + return 'no_cog_fcat' + else: + return 'no_eggnog' + + +# SP95 for SPIRE +def get_gene_cluster(gene): + return gene.cluster + + +def get_gene_id(gene): + id_lst = gene.id.split('.') # orginal ID e.g. 28901.SAMN15849311.GCA_014242155_04079 + return id_lst[2] # extract to match CY clustering IDs e.g. 
GCA_xxx_xxx + +# Extract MGE recombinases +def extract_mger(island): + mgeR_genes = [] + for gene in island.genes: + if gene.recombinase: + try: + gene_id = gene.id + gene_cluster = get_gene_cluster(gene) + mgeR = gene.recombinase + annot = [gene_id, gene_cluster, mgeR] + mgeR_genes.append(annot) + except Exception as e: + logger.error(f"Error processing recombinase gene {gene}: {e}") + logger.error(traceback.format_exc()) + return mgeR_genes + + +# Extract secretion system genes +def extract_secretion_system(island): + secretion_genes = [] + for gene in island.genes: + if gene.secretion_system: + try: + gene_id = gene.id + gene_cluster = get_gene_cluster(gene) + info = gene.secretion_system + annot = [gene_id, gene_cluster, info] + secretion_genes.append(annot) + except Exception as e: + logger.error(f"Error processing secretion system gene {gene}: {e}") + logger.error(traceback.format_exc()) + return secretion_genes + + +# Extract phage genes +def extract_phage(island): + phage_genes = [] + for gene in island.genes: + if gene.phage: + try: + gene_id = gene.id + gene_cluster = get_gene_cluster(gene) + info = gene.phage + annot = [gene_id, gene_cluster, info] + phage_genes.append(annot) + except Exception as e: + logger.error(f"Error processing phage gene {gene}: {e}") + logger.error(traceback.format_exc()) + return phage_genes + + +def get_most_common_kegg_ko(genes): + """ + Calculate the most common KEGG KO annotations for a list of genes. + + Parameters: + - genes: List of gene objects. + + Returns: + - Counter object with counts of KEGG KO annotations. + """ + kos = [get_kegg_ko(gene) for gene in genes] + return Counter(kos) + + +def count_gene_clusters(genes, **kwargs): + """ + Count genes in gene clusters. + + Parameters: + - genes: List of gene objects. + + Returns: + - Counter object with counts of cluster genes. 
+ """ + gene_clusters = [get_gene_cluster(gene) for gene in genes] + return Counter(gene_clusters) + + +def get_majority_cog_category(genes, **kwargs): + cluster_to_categories = defaultdict(list) + + for gene in genes: + gene_cluster = get_gene_cluster(gene) + cog_category = get_cog_category(gene) + cluster_to_categories[gene_cluster].append(cog_category) + + majority_cog_category = {} + + # Determine the majority cog category for each cluster + for cluster, categories in cluster_to_categories.items(): + category_count = Counter(categories) + majority_cog_category[cluster] = category_count.most_common(1)[0][0] # a list of tuples where each tuple is a category and its count; [0][0] extracts the category with the highest count. + #logger.info(majority_cog_category) + return majority_cog_category + + +def get_gene_annotation(genes, func, **kwargs): + gene_annot_dict = {} + for gene in genes: + gene_id = get_gene_id(gene) + func_annot = func(gene) + gene_annot_dict[gene_id] = func_annot + + return gene_annot_dict + + +def get_genes_cog_categories(genes, **kwargs): + return get_gene_annotation(genes, get_cog_category) + + +def get_genes_ko(genes, *_args): + return get_gene_annotation(genes, get_kegg_ko) + + +def get_genes_cazy(genes, *_args): + return get_gene_annotation(genes, get_cazy) + + +def get_genes_clusters(genes, **kwargs): + return get_gene_annotation(genes, get_gene_cluster) + + +def get_genes(genes, mge_id): + gene_ids = [get_gene_id(gene) for gene in genes] + return {mge_id: gene_ids} + + +def count_per_mge_cargo(islands, func): + """ + Extract and count cargo genes associated with each MGE type and return as a dictionary of Counter objects. + + Parameters: + - islands: List of MGE objects. + - func: function to extract cargo e.g. count KO terms or gene clusters + + Returns: + - Dictionary where keys are MGE types, and values are Counter objects with KEGG KO counts. 
+ """ + mge_cargo_counts = { + "nested": Counter(), + "phage": Counter(), + "phage_like": Counter(), + "is_tn": Counter(), + "ce": Counter(), + "mi": Counter(), + "integron": Counter(), + "cellular": Counter(), + } + + for island in islands: + try: + cargo = extract_cargo(island) + if island.mge_type == "nested": + mge_cargo_counts["nested"].update(func(cargo)) + else: + # Get the first key (assuming there's only one key) + mge = next(iter(island.mge.keys())) + mge_cargo_counts[mge].update(func(cargo)) + except Exception as e: + raise ValueError(f"Error processing cargo for island: {e}") + + return mge_cargo_counts + + +def get_per_mge_cargo(islands, func): + mge_cargo_annot = { + "nested": {}, + "phage": {}, + "phage_like": {}, + "is_tn": {}, + "ce": {}, + "mi": {}, + "integron": {}, + "cellular": {}, + } + + for island in islands: + try: + cargo = extract_cargo(island) + mge_id = island.get_id() + if island.mge_type == "nested": + mge_cargo_annot["nested"].update(func(cargo, mge_id)) # Merges another dictionary into existing one + else: + # Get the first key (assuming there's only one key) + mge = next(iter(island.mge.keys())) + mge_cargo_annot[mge].update(func(cargo, mge_id)) + except Exception as e: + raise ValueError(f"Error processing cargo for island: {e}") + + return mge_cargo_annot + + +def get_machinery_genes_tsv(islands): + tsv_rows = [] + header = ['mge_id', 'mge', 'n_genes', 'gene_id', 'gene_cluster', 'feature_type', 'feature_info'] + tsv_rows.append('\t'.join(header)) + + for island in islands: + try: + mge_id = island.get_id() + n_genes = len(island.genes) + mge = ",".join(f"{k}:{v}" for k, v in island.mge.items()) + + recombinases = extract_mger(island) # list of [gene_id, gene_cluster, info] + conj_machinery = extract_secretion_system(island) + phage_machinery = extract_phage(island) + + for gene in recombinases: + gene_id, gene_cluster, info = gene + tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tmgeR\t{info}") + + for 
gene in conj_machinery: + gene_id, gene_cluster, info = gene + tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tsecretion_system\t{info}") + + for gene in phage_machinery: + gene_id, gene_cluster, info = gene + tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tphage\t{info}") + + except Exception as e: + raise ValueError(f"Error processing machinery for island {island}: {e}") + + tsv_output = '\n'.join(tsv_rows) + return tsv_output + +def get_cargo_genes_tsv(islands): + tsv_rows = [] + header = ['mge_id', 'mge', 'gene_ids', 'gene_clusters'] + tsv_rows.append('\t'.join(header)) + + for island in islands: + gene_ids = [] + gene_clusters = [] + try: + mge_id = island.get_id() + mge = ",".join(f"{k}:{v}" for k, v in island.mge.items()) + + cargo_genes = extract_cargo(island) + + for gene in cargo_genes: + gene_ids.append(gene.id) + gene_clusters.append(get_gene_cluster(gene)) + gene_ids = ';'.join(gene_ids) + gene_clusters = ';'.join(gene_clusters) + tsv_rows.append(f"{mge_id}\t{mge}\t{gene_ids}\t{gene_clusters}") + + except Exception as e: + raise ValueError(f"Error processing cargo for island {island}: {e}") + + tsv_output = '\n'.join(tsv_rows) + return tsv_output + +# Counting works with aggregation since the objects are small. Getting only works with batch saving, since the output is huge. +def count_cargo_gene_clusters(islands): + return count_per_mge_cargo(islands, count_gene_clusters) + + +def get_cargo_species_gene_clusters(islands): + return get_per_mge_cargo(islands, func=get_genes_clusters) + + +# Output: for each mge_type output a dictionary with geneID: COG. geneID supposed to be unique -> overwriting is okay. 
+def get_cargo_genes_cog(islands): + return get_per_mge_cargo(islands, func=get_genes_cog_categories) + + +def get_cargo_genes_ko(islands): + return get_per_mge_cargo(islands, func=get_genes_ko) + + +def get_cargo_genes_cazy(islands): + return get_per_mge_cargo(islands, func=get_genes_cazy) + + +# Output: for each mge_type output a dictionary mge_id: list(cargo_ids) +def get_cargo_genes(islands): + return get_per_mge_cargo(islands, func=get_genes) + + diff --git a/mgexpose/gene.py b/mgexpose/gene.py index 23e09b6..6fc357d 100644 --- a/mgexpose/gene.py +++ b/mgexpose/gene.py @@ -1,10 +1,10 @@ -# pylint: disable=R0902,R0917,R0913 +# pylint: disable=R0902 """ Gene module """ from dataclasses import dataclass -from .readers import EggnogReader +from readers import EggnogReader @dataclass @@ -34,7 +34,7 @@ class Gene: # specify optional annotations here # when adding new class variables, # otherwise output will be suppressed. - OPTIONAL_ANNOTATIONS = ("phage", "secretion_system", "secretion_rule", "recombinase", "eggnog",) + OPTIONAL_ANNOTATIONS = ("phage", "secretion_system", "secretion_rule", "recombinase", "eggnog") # these are only optional when core genome calculations # are disabled, e.g. co-transferred region inputs CLUSTER_ANNOTATIONS = ("cluster", "is_core",) @@ -42,15 +42,7 @@ class Gene: @staticmethod def rtype(is_core): """ Returns is_core-tag. 
""" - if is_core is None: - return "NA" return ("ACC", "COR")[is_core] - - @staticmethod - def is_core_gene(occ, n_genomes, core_threshold=0.95, strict=True): - if strict or n_genomes == 2 or n_genomes > 20: - return occ / n_genomes > core_threshold - return occ >= n_genomes - 1 def stringify_eggnog(self): """ convert eggnog annotation into gff-col9 key-value pairs """ @@ -107,16 +99,12 @@ def from_gff(cls, *cols): end=int(cols[4]), # end strand=cols[6], # strand recombinase=attribs.get("recombinase"), - cluster=attribs.get("cluster") or attribs.get("Cluster"), + cluster=attribs.get("cluster"), is_core=attribs.get("genome_type") == "COR", phage=attribs.get("phage"), secretion_system=attribs.get("secretion_system"), secretion_rule=attribs.get("secretion_rule"), - eggnog=tuple( - (k, attribs.get(k)) - for k in EggnogReader.EMAPPER_FIELDS["v2.1.2"] - if attribs.get(k) and k != "description" - ), + eggnog=tuple((k, attribs.get(k)) for k in EggnogReader.EMAPPER_FIELDS["v2.1.2"] if attribs.get(k) and k != "description"), ) def to_gff( @@ -125,17 +113,12 @@ def to_gff( genomic_island_id, add_functional_annotation=False, intermediate_dump=False, - add_header=False, ): """ dump gene to gff record """ - - if add_header: - print("##gff-version 3", file=gff_outstream) - attribs = { "ID": self.id, "Parent": genomic_island_id, - "cluster": self.cluster, + "Cluster": self.cluster, "size": len(self), "secretion_system": self.secretion_system, "secretion_rule": self.secretion_rule, diff --git a/mgexpose/gene_annotator.py b/mgexpose/gene_annotator.py index eb9153f..1ff16ac 100644 --- a/mgexpose/gene_annotator.py +++ b/mgexpose/gene_annotator.py @@ -1,4 +1,4 @@ -# pylint: disable=R0912,R0913,R0914,R0917 +# pylint: disable=R0912,R0913,R0914 """ Classes for integrating gene annotations. 
""" @@ -6,10 +6,10 @@ from contextlib import nullcontext -from .clustering_parser import parse_full_seq_clusters, parse_y_clusters, parse_db_clusters -from .gene import Gene -from .phage import PhageDetection -from .readers import ( +from clustering_parser import parse_full_seq_clusters, parse_y_clusters, parse_db_clusters +from gene import Gene +from phage import PhageDetection +from readers import ( EggnogReader, parse_macsyfinder_report, read_recombinase_hits, @@ -66,7 +66,6 @@ def add_cluster( use_y_clusters=False, core_threshold=0.95, output_dir=None, - strict=True, ): """ Add information from gene clustering to allow for core/accessory gene classification """ @@ -128,12 +127,10 @@ def add_cluster( if cluster_genes: occ = cluster_genes[cluster] - # gene.is_core = any(( - # occ / n_genomes > core_threshold, - # (2 < n_genomes <= 20 and occ >= n_genomes - 1), - # (n_genomes == 2 and occ == 2), - # )) - gene.is_core = Gene.is_core_gene(occ, n_genomes, core_threshold=core_threshold, strict=strict,) + gene.is_core = any(( + occ / n_genomes > core_threshold, + (n_genomes <= 20 and occ >= n_genomes - 1), + )) elif core_threshold == -1: gene.is_core = is_core @@ -168,11 +165,10 @@ def annotate_genes( use_y_clusters=False, core_threshold=None, output_dir=None, - pyhmmer=True, ): """ Annotate genes with MGE-relevant data. """ self.add_recombinases( - read_recombinase_hits(recombinases, pyhmmer=pyhmmer,) + read_recombinase_hits(recombinases) ) if all(secretion_annotation): self.add_secretion_system( diff --git a/mgexpose/get_cluster_data.py b/mgexpose/get_cluster_data.py new file mode 100644 index 0000000..8d8d240 --- /dev/null +++ b/mgexpose/get_cluster_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +""" Module for processing mmseqs2 linclust output. """ + +import argparse +import gzip +import os +import warnings + +from db import initialise_db, db_available, get_gene, get_cluster +from readers import read_prodigal_gff + + +def main(): + """ maaaaaaain... 
""" + ap = argparse.ArgumentParser() + ap.add_argument("genome_id", type=str) + ap.add_argument("speci", type=str) + ap.add_argument("prodigal_gff", type=str) + ap.add_argument("--cluster_db_credentials", type=str) + ap.add_argument("--output_dir", "-o", type=str, default=".") + ap.add_argument("--dump_intermediate_steps", action="store_true") + + args = ap.parse_args() + + db_session = None + if args.cluster_db_credentials: + if db_available(args.speci): + db_session = initialise_db( + args.cluster_db_credentials, + "mge_clusters", + cluster_id=args.speci.lower(), + ) + else: + warnings.warn( + "Could not connect to database.\n" + f"Check if {args.speci} database exists in specified " + f"database ({args.cluster_db_credentials}.)" + ) + return None + + print("DB_SESSION", db_session is not None) + + gene_clusters_out = gzip.open( + os.path.join(args.output_dir, f"{args.genome_id}.db_gene_clusters.txt.gz"), + "wt", + ) + + with gene_clusters_out: + for gene_id, _ in read_prodigal_gff(args.prodigal_gff): + db_gene = get_gene(db_session, gene_id) + if db_gene is not None: + db_cluster = get_cluster(db_session, db_gene.cluster_id) + if db_cluster is not None: + print( + gene_id, + db_cluster.name, + db_gene.is_core, + sep="\t", + file=gene_clusters_out, + ) + + return None + + +if __name__ == "__main__": + main() diff --git a/mgexpose/get_db_seqs.py b/mgexpose/get_db_seqs.py new file mode 100644 index 0000000..602a2b3 --- /dev/null +++ b/mgexpose/get_db_seqs.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# pylint: disable=E0401,C0415,R0914,W0702,W0621,W0404,C0301 +# flake8: noqa + +""" Functions to load speci cluster data from cache/seq repo. 
""" + +import argparse +import gzip +import json +import os + +import pymongo + + +def get_sequences_from_cluster(mongo_db_str, cluster_id, seqfile): + """ Get cluster sequences from cache/seq repo """ + + try: + import pymongo + except ImportError: + return 0 + + client = pymongo.MongoClient(mongo_db_str,) + fr_db = client["progenomes"] + + n_genes = 0 + files = [] + with gzip.open(seqfile, "wt") as genes_out: + for record in fr_db.samples.find({'fr13_cluster': cluster_id}): + genes_file = f"{record['analysis_path']}/ref_genome_called_genes/{record['sample_id']}.genes.fa.gz" + files.append(genes_file) + for genes_file in files: + with gzip.open(genes_file, "rt") as genes_in: + genes_raw = genes_in.read() + n_genes += genes_raw.count(">") + print(genes_raw, file=genes_out, end="" if genes_raw[-1] == "\n" else "\n") + + return n_genes + + +def main(): + """ Main. """ + + ap = argparse.ArgumentParser() + ap.add_argument("dbname", type=str) + ap.add_argument("dbcred", type=str) + ap.add_argument("cluster_id", type=str) + ap.add_argument("outfile", type=str) + ap.add_argument("outfile_ids", type=str) + ap.add_argument("--cache", type=str) + args = ap.parse_args() + + try: + with open(args.dbcred, "rt", encoding="UTF-8") as _in: + db_d = json.load(_in).get(args.dbname) + except: + db_d = {} + + user = db_d.get("username") + host = db_d.get("host") + pw = db_d.get("password") + port = db_d.get("port") + + dbstr = f"mongodb://{user}:{pw}@{host}:{port}" if (user and host and pw and port) else None + + client = pymongo.MongoClient(dbstr,) + fr_db = client["progenomes"] + + n_genes = 0 + files = [] + + n_seqs = 0 + if args.cache and os.path.isdir(args.cache): + print("Looking up seq_cache...") + expected_files = [ + os.path.join(args.cache, f"{args.cluster_id}.{suffix}") + for suffix in ("genes.ffn.gz", "genes.nseqs", "genes.ids.gz") + ] + if all(os.path.isfile(f) and os.stat(f).st_size for f in expected_files): + with open(os.path.join(args.cache, 
f"{args.cluster_id}.genes.nseqs"), "rt", encoding="UTF-8") as _in: + n_seqs = int(_in.read().strip()) + print("Copying sequences from seq_cache:", args.cluster_id, args.outfile, "...", end="") + # shutil.copyfile(os.path.join(args.cache, f"{args.cluster_id}.genes.ffn.gz"), args.outfile) + + os.symlink(os.path.join(args.cache, f"{args.cluster_id}.genes.ffn.gz"), args.outfile) + os.symlink(os.path.join(args.cache, f"{args.cluster_id}.genes.ids.gz"), args.outfile_ids) + print(n_seqs) + + if not n_seqs: + with gzip.open(args.outfile, "wt") as genes_out, gzip.open(args.outfile_ids, "wt") as geneids_out: + for record in fr_db.samples.find({'fr13_cluster': args.cluster_id}): + genes_file = f"{record['analysis_path']}/ref_genome_called_genes/{record['sample_id']}.genes.fa.gz" + files.append(genes_file) + for genes_file in files: + with gzip.open(genes_file, "rt") as genes_in: + genes_raw = genes_in.read() + n_genes += genes_raw.count(">") + print(genes_raw, file=genes_out, end="" if genes_raw[-1] == "\n" else "\n") + + genes_raw = ( + line[1:].split(" ")[0] + for line in genes_raw.strip().split("\n") + if line[0] == ">" + ) + print(*genes_raw, file=geneids_out, sep="\n") + + +if __name__ == "__main__": + main() diff --git a/mgexpose/get_eggnog.py b/mgexpose/get_eggnog.py new file mode 100644 index 0000000..a54dcb8 --- /dev/null +++ b/mgexpose/get_eggnog.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# pylint: disable=C0301 +# flake8: noqa + +""" Get emapper data from database. """ + +import argparse + +import pandas as pd + +from db import read_db_details, DbEmapperResult + + +def main(): + """ Main. 
""" + ap = argparse.ArgumentParser() + ap.add_argument("database", type=str) + ap.add_argument("credentials", type=str) + ap.add_argument("--project_id", type=str) + ap.add_argument("--sample_name", type=str) + ap.add_argument("--bulk_file", type=str) + + args = ap.parse_args() + + db_access = read_db_details(args.credentials)[args.database] + + conn = f"postgresql://{db_access['host']}/{args.database}?user={db_access['username']}&password={db_access['password']}" + + if args.bulk_file: + with open(args.bulk_file, "rt", encoding="UTF-8") as _in: + query_list = [line.strip() for line in _in] + query_list_str = ", ".join(line for line in query_list) + query = f"SELECT * from eggnog WHERE project_id IN ({query_list_str});" + else: + column, column_value = ("project_id", args.project_id) if args.project_id else ("sample_name", args.sample_name) + query = f"SELECT * from eggnog WHERE {column} = '{column_value}';" + + eggnog = pd.read_sql(query, conn) + try: + eggnog = eggnog.drop("project_id", axis=1) + except KeyError: + pass + try: + eggnog = eggnog.drop(["sample_name", "contig_name", "pfams",], axis=1) + except KeyError: + pass + eggnog.columns = DbEmapperResult.HEADERS + eggnog.to_csv("emapper.annotations", sep="\t", index=False) + + +if __name__ == "__main__": + main() diff --git a/mgexpose/get_eggnog_f13.py b/mgexpose/get_eggnog_f13.py new file mode 100644 index 0000000..0129a52 --- /dev/null +++ b/mgexpose/get_eggnog_f13.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# pylint: disable=C0301,R0801 +# flake8: noqa + +""" Get emapper data from database. """ + +import argparse +import os +import pathlib + +import pandas as pd + +from db import read_db_details, DbEmapperResult + + +def main(): + """ Main. 
""" + ap = argparse.ArgumentParser() + ap.add_argument("database", type=str) + ap.add_argument("credentials", type=str) + ap.add_argument("--project_id", type=str) + ap.add_argument("--sample_name", type=str) + ap.add_argument("--bulk_file", type=str) + ap.add_argument("--sample_tax_map", type=str) + ap.add_argument( + "--fill_missing", + action="store_true", + help="Generate empty annotation file if genome does not have a database record.", + ) + + args = ap.parse_args() + + db_access = read_db_details(args.credentials)[args.database] + + conn = f"postgresql://{db_access['host']}/{args.database}?user={db_access['username']}&password={db_access['password']}" + + with open(args.sample_tax_map, "rt", encoding="UTF-8") as _in: + st_map = dict(line.strip().split("\t")[::-1] for line in _in) + with open(args.bulk_file, "rt", encoding="UTF-8") as _in: + query_list = [(st_map.get(".".join(line.strip().split("/")[1].split(".")[:-1])), line.strip()) for line in _in] + query_list_str = ", ".join(f"'{pid}'" for pid, _ in query_list if pid is not None) + query = f"SELECT * from eggnog WHERE project_id IN ({query_list_str});" + + st_map = {v: k for k, v in st_map.items()} + p_map = dict(query_list) + eggnog = pd.read_sql(query, conn) + annotated_genomes = [] + print(query_list[:10]) + for pid in eggnog["project_id"].unique(): + print(pid) + genome_id = st_map.get(pid) + annotated_genomes.append(genome_id) + eggnog[eggnog["project_id"] == pid].drop("project_id", axis=1).to_csv( + os.path.join(p_map.get(pid), f"{genome_id.split('/')[0]}.emapper_annotations"), + sep="\t", + header=DbEmapperResult.HEADERS, + index=False, + ) + + if args.fill_missing: + # write empty emapper annotations + # for rare cases where we neither have db records + # nor annotations on the file system + # genome will not have cargo/phage annotation + # but could still contain e.g. 
recombinase signals + for genome_id in set(v for _, v in query_list).difference(annotated_genomes): + path = pathlib.Path(genome_id) + path.mkdir(parents=True, exist_ok=True) + (path / f"{genome_id.split('/')[1]}.emapper_annotations").touch() + + + +if __name__ == "__main__": + main() diff --git a/mgexpose/gffio.py b/mgexpose/gffio.py index 9dcb99a..2c268d3 100644 --- a/mgexpose/gffio.py +++ b/mgexpose/gffio.py @@ -1,20 +1,19 @@ -""" GFF I/O -- wannabe serialisation module """ +from gene import Gene +from islands import GenomicIsland, MgeGenomicIsland -from .gene import Gene -from .islands import GenomicIsland, MgeGenomicIsland +from base_logger import logger -def read_island_gff(fn, island_cls): - """ Read island gff """ +def read_genomic_islands_gff(fn): with open(fn, "rt", encoding="UTF-8") as _in: island = None for line in _in: line = line.strip() if line and line[0] != "#": cols = line.split("\t") - if cols[2] == island_cls.GFFTYPE: + if cols[2] == "region": if island is not None: yield island - island = island_cls.from_gff(*cols) + island = GenomicIsland.from_gff(*cols) elif cols[2] == "gene": gene = Gene.from_gff(*cols) if island is not None: @@ -23,13 +22,46 @@ def read_island_gff(fn, island_cls): raise ValueError("Found gene but no island.") if island is not None: yield island + +def read_mge_genomic_islands_gff(fn, relevant_ids=None): + """ + Generator function to read and parse MGEs and associated genes from a GFF file. + Parameters: + - fn: Path to the GFF file. + - relevant_ids: Optional set of relevant MGE IDs to filter. If None, all MGEs are processed. + Yields: + - MgeGenomicIsland objects that match the relevant IDs or all if None. 
+ """ + with open(fn, "rt", encoding="UTF-8") as _in: + island = None + for line in _in: + line = line.strip() + if line and line[0] != "#": + cols = line.split("\t") + attributes = {kv.split('=')[0]: kv.split('=')[1] for kv in cols[8].split(';') if '=' in kv} -def read_genomic_islands_gff(fn): - """ reads a set of genomic islands + genes from a gff3 """ - yield from read_island_gff(fn, GenomicIsland) + if cols[2] == "mobile_genetic_element": + mge_id = attributes.get("ID") -def read_mge_genomic_islands_gff(fn): - """ reads a set of mge genomic islands + genes from a gff3 """ - yield from read_island_gff(fn, MgeGenomicIsland) + if relevant_ids is None or mge_id in relevant_ids: + if island is not None: + yield island + island = MgeGenomicIsland.from_gff(*cols) + + elif cols[2] == "gene": + parent_id = attributes.get("Parent") + + if island is not None: + if relevant_ids is None or parent_id in relevant_ids: + gene = Gene.from_gff(*cols) + island.genes.add(gene) + else: + continue + else: + # This situation should not happen unless the GFF is malformed + raise ValueError("Found gene with no preceding island.") + + if island is not None: + yield island diff --git a/mgexpose/handle_args.py b/mgexpose/handle_args.py index 6988298..ff45882 100644 --- a/mgexpose/handle_args.py +++ b/mgexpose/handle_args.py @@ -2,15 +2,16 @@ import argparse -from .readers import EggnogReader +from readers import EggnogReader -from . import __version__ +__version__ = "3.6.0" + def handle_args(): """ Argument handling """ ap = argparse.ArgumentParser( - prog="mgexpose", + prog="profile_me", formatter_class=argparse.RawTextHelpFormatter, ) @@ -97,9 +98,6 @@ def handle_args(): help="If specified, per gene emapper annotations are stored in the gff." 
) # ensure newest eggnog version - denovo_ap.add_argument("--extract_islands", type=str) - - denovo_ap.add_argument("--pyhmmer_input", action="store_true") denovo_ap.set_defaults(func=None) # TODO @@ -189,6 +187,5 @@ def handle_args_old(): help="Core/accessory gene sets were precomputed." ) ap.add_argument("--skip_island_identification", action="store_true") - ap.add_argument("--extract_islands", type=str) return ap.parse_args() diff --git a/mgexpose/island_processing.py b/mgexpose/island_processing.py index 4b87bbc..1bb51ed 100644 --- a/mgexpose/island_processing.py +++ b/mgexpose/island_processing.py @@ -4,7 +4,7 @@ import logging -from .islands import GenomicIsland, AnnotatedGenomicIsland, MgeGenomicIsland +from islands import GenomicIsland, AnnotatedGenomicIsland, MgeGenomicIsland logger = logging.getLogger(__name__) @@ -173,16 +173,16 @@ def evaluate_islands(islands, rules, outstream=None, outstream2=None): yield mge_island -def prepare_precomputed_islands(single_island=None, island_file=None, genome_id=None,): +def prepare_precomputed_islands(single_island=None, island_file=None): """ Helper function to deal with precomputed regions/islands. 
""" precomputed_islands = None if single_island and island_file: raise ValueError("Both --single_island and --precomputed_islands set.") if single_island and not island_file: - precomputed_islands = [GenomicIsland.from_region_string(single_island, genome_id=genome_id,)] + precomputed_islands = [GenomicIsland.from_region_string(single_island)] elif not single_island and island_file: with open(island_file, "rt", encoding="UTF-8",) as _in: - precomputed_islands = [GenomicIsland.from_region_string(line, genome_id=genome_id,) for line in _in] + precomputed_islands = [GenomicIsland.from_region_string(line) for line in _in] if precomputed_islands is not None: precomputed_islands_by_contig = {} diff --git a/mgexpose/islands.py b/mgexpose/islands.py index f22ea55..86af044 100644 --- a/mgexpose/islands.py +++ b/mgexpose/islands.py @@ -1,4 +1,4 @@ -# pylint: disable=C0116,C0301,R0902,R0916,R0913,R0917 +# pylint: disable=C0116,C0301,R0902,R0916 """ Data Structures Module @@ -18,14 +18,13 @@ import itertools as it import logging import sys -import warnings +import re from collections import Counter from dataclasses import dataclass, field -from .gene import Gene -from .recombinases import MgeRule, MGE_ALIASES - +from gene import Gene +from recombinases import MgeRule, MGE_ALIASES logger = logging.getLogger(__name__) @@ -33,7 +32,7 @@ @dataclass class GenomicIsland: '''The following class describes a generic genomic region - with one or more identified recombinases (recombinases). + with one or more identified recombinases. This region is then referred as Recombinase Island. The Genomic Island is represented by contig, start and end coordinates, set of genes, some of which are recombinases and MGE machinery. 
@@ -48,7 +47,6 @@ class GenomicIsland: "end", "gene_list", ) - GFFTYPE = "region" speci: str = None genome: str = None @@ -62,15 +60,6 @@ class GenomicIsland: # recombinases: list = field(default_factory=list) recombinases: Counter = field(default_factory=Counter) - @staticmethod - def parse_id(id_string): - """ Parse genome id, contig id, start and end coordinates from id string. - Reverses get_id(). """ - cols = id_string.split("_") - contig, coords = cols[3].split(':') - - return "_".join(cols[1:3]), contig, int(coords[0]), int(coords[1]) - @staticmethod def get_fieldnames(): """ Returns column headers for island table. """ @@ -88,12 +77,12 @@ def get_fieldnames(): ) @classmethod - def from_region_string(cls, region, genome_id=None,): + def from_region_string(cls, region): """ Creates island from a predefined region string. """ _, _, contig, start_end, *_ = region.strip().split(".") contig = contig.split(".")[-1] start, end = map(int, start_end.split("-")) - return cls(None, genome_id, None, contig, start, end, region.strip()) + return cls(None, None, None, contig, start, end, region.strip()) @classmethod def from_gene(cls, gene): @@ -120,8 +109,8 @@ def __str__(self): genes = ( f"{gene.id}.{gene.cluster}" for gene in sorted( - self.genes, key=lambda g: (g.start, g.end, g.strand) - ) + self.genes, key=lambda g: (g.start, g.end, g.strand) + ) ) return "\t".join( @@ -136,7 +125,7 @@ def add_gene(self, gene): """ Adds a gene to the island. 
""" if gene not in self.genes: self.end = max(self.end, gene.end) - if gene.recombinase: + if gene.recombinase is not None: # self.recombinases.append( # (f"{gene.id}.{gene.cluster}", gene.recombinase) # ) @@ -174,13 +163,20 @@ def get_id(self): @classmethod def from_gff(cls, *cols): - attribs = dict(item.split("=") for item in cols[-1].split(";")) - recombinases = Counter( - { - item.split(":")[0]: int(item.split(":")[1]) - for item in attribs["recombinases"].split(",") - } - ) + try: + attribs = dict(item.split("=") for item in cols[-1].split(";")) + except: + raise ValueError(f"not enough cols? {cols}") + + try: + recombinases = Counter( + dict( + item.split(":") + for item in attribs["recombinases"].split(",") + ) + ) + except: + raise ValueError(f"recombinase string weird? {attribs['recombinases'].split(',')}") return cls( attribs["specI"], @@ -189,23 +185,12 @@ def from_gff(cls, *cols): cols[0], # contig int(cols[3]), # start int(cols[4]), # end - genes=set(), recombinases=recombinases, + genes=set(), ) - def to_gff( - self, - gff_outstream, - source_db, - write_genes=False, - add_functional_annotation=False, - intermediate_dump=False, - add_header=False, - ): - - if add_header: - print("##gff-version 3", file=gff_outstream) - + def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False, + intermediate_dump=False): island_id = self.get_id() attribs = { "ID": island_id, @@ -222,7 +207,7 @@ def to_gff( ) if self.recombinases else "" ), - "specI": self.speci, + "specI": self.speci, #TODO: does it work? 
} if self.name: attribs["name"] = self.name @@ -235,7 +220,7 @@ def to_gff( print( self.contig, source, - GenomicIsland.GFFTYPE, + "region", self.start, self.end, len(self), # Score field @@ -248,7 +233,7 @@ def to_gff( if write_genes: # GFF3 child term: genes - for gene in sorted(self.genes, key=lambda g: (g.start, g.end,)): + for gene in sorted(self.genes, key=lambda g: g.id): gene.to_gff( gff_outstream, genomic_island_id=island_id, @@ -370,7 +355,6 @@ class MgeGenomicIsland(AnnotatedGenomicIsland): "conj_man_count", "recombinases", ) - GFFTYPE = "mobile_genetic_element" integron: int = 0 cellular: int = 0 @@ -385,9 +369,14 @@ class MgeGenomicIsland(AnnotatedGenomicIsland): tn3_found: bool = False ser_found: bool = False + mge: Counter = field(default_factory=Counter) + mge_type: str = None + size: int = 0 + n_genes: int = 0 + def __post_init__(self): """ Apply annotations. """ - recombinases = (",".join(it.chain(*((r,) * v for r, v in self.recombinases.items())))).lower() + recombinases = (",".join(r for _, r in self.get_recombinases())).lower() for name, alias in MGE_ALIASES.items(): recombinases = recombinases.replace(name, alias) @@ -396,13 +385,13 @@ def __post_init__(self): # integron self.integron = int("integron" in recombinases) + # tag recombinase island with more than 3 recombinases + # self.c_nmi = int(len(self.recombinases) > 3) + self.c_nmi = sum(self.recombinases.values()) # self.recombinases = recombinases.split(",") if recombinases else [] self.recombinases = Counter(recombinases.split(",")) - # tag recombinase island with more than 3 recombinases - self.c_nmi = sum(self.recombinases.values()) > 3 - def __str__(self): """ String representation. """ return "\t".join( @@ -424,7 +413,55 @@ def __str__(self): self.name, ) ) - + + @staticmethod + def parse_mge_id(mge_id): + """ + Generalized parser for MGE IDs. 
+ + Returns: + genome_id (str): parsed bin or genome identifier + contig (str): contig name (usually something like k141_32063) + start (int): start coordinate + end (int): end coordinate + """ + try: + # Extract coordinates + coord_match = re.search(r":(\d+)-(\d+)$", mge_id) + if not coord_match: + raise ValueError("No coordinates found in MGE ID.") + start, end = map(int, coord_match.groups()) + + # Remove leading MGE_ and SPIRE_ (if present) + cleaned = mge_id + if cleaned.startswith("MGE_"): + cleaned = cleaned[4:] + if cleaned.startswith("SPIRE_"): + cleaned = cleaned[6:] + + # Strip coordinates + core = cleaned.split(':')[0] + + # Assembly-style pattern (e.g., GCA_019800745.1) + if re.match(r"GCA_[\d.]+", core): + genome_id = core.split('_')[0] + '_' + core.split('_')[1] + contig = core.split('.')[-1] + return genome_id, contig, start, end + + # Bin-style pattern: extract contig and genome_id + kmer_match = re.search(r"(_k\d+_\d+)$", core) + if kmer_match: + contig = kmer_match.group(1)[1:] # remove leading underscore + genome_id = core[: -len(kmer_match.group(1))] # remove contig + return genome_id, contig, start, end + + # Fallback for unknown formats + raise ValueError(f"Unrecognized MGE ID format: {mge_id}") + + except Exception as e: + raise ValueError(f"Failed to parse MGE ID '{mge_id}': {e}") + + def get_mge_metrics(self): """ Cast mge metrics to int. """ return tuple( @@ -458,8 +495,7 @@ def get_annotated_mge_metrics(self): def is_nested(annotated_mge_metrics): n_mges = sum(v for _, v in annotated_mge_metrics) if not n_mges: - # raise UserWarning("No MGEs were assigned to recombinase island") - warnings.warn("No MGEs were assigned to recombinase island") + raise UserWarning("No MGEs were assigned to recombinase island") # Solitary or nested MGE? 
return n_mges > 1 @@ -519,7 +555,7 @@ def evaluate_recombinases(self, rules, outstream=None, outstream2=None): self.c_tn = patch_c_tn if outstream: - print(self, sep="\t", file=outstream,) + print(self, sep="\t", file=outstream, ) # previous step in some cases generates overlap between Phage/Phage_like and Mobility island # this step specifically resolves such instances based on recombinase presence and presence/ @@ -537,7 +573,7 @@ def evaluate_recombinases(self, rules, outstream=None, outstream2=None): self.phage, self.c_mi = True, False if outstream2: - print(self, sep="\t", file=outstream2,) + print(self, sep="\t", file=outstream2, ) @classmethod def from_annotated_genomic_island(cls, ag_island): @@ -549,11 +585,13 @@ def from_annotated_genomic_island(cls, ag_island): def get_id(self): return f"MGE_{self.genome}_{self.contig}:{self.start}-{self.end}" - - def get_attribs(self): + + + def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False): + island_id = self.get_id() mge_metrics = self.get_annotated_mge_metrics() attribs = { - "ID": self.get_id(), + "ID": island_id, "mge": ",".join(f"{k}:{v}" for k, v in mge_metrics), # Count each mge type "genome_type": Gene.rtype(self.is_core), "mge_type": self.mge_num_island_type(self.is_nested(mge_metrics)), @@ -570,52 +608,16 @@ def get_attribs(self): } if self.name: attribs["name"] = self.name - return attribs - - def to_gff( - self, - gff_outstream, - source_db, - write_genes=False, - add_functional_annotation=False, - intermediate_dump=False, - add_header=False, - ): - if add_header: - print("##gff-version 3", file=gff_outstream) - - # island_id = self.get_id() - # mge_metrics = self.get_annotated_mge_metrics() - # attribs = { - # "ID": island_id, - # "mge": ",".join(f"{k}:{v}" for k, v in mge_metrics), # Count each mge type - # "genome_type": Gene.rtype(self.is_core), - # "mge_type": self.mge_num_island_type(self.is_nested(mge_metrics)), - # "size": len(self), - # "n_genes": 
len(self.genes), - # "mgeR": ( - # ",".join( - # f"{k}:{v}" - # # for k, v in sorted(Counter(self.recombinases).items()) - # for k, v in sorted(self.recombinases.items()) - # ) - # if self.recombinases else "" - # ), - # } - # if self.name: - # attribs["name"] = self.name - attribs = self.get_attribs() attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1]) # Format the source column - source = ("proMGE", f"proMGE_{source_db}")[bool(source_db)] - # if source_db: - # source = f"proMGE_{source_db}" - # else: - # source = "proMGE" + if source_db: + source = f"proMGE_{source_db}" + else: + source = "proMGE" print( self.contig, source, - MgeGenomicIsland.GFFTYPE, + "mobile_genetic_element", self.start, self.end, len(self), # Score field @@ -628,10 +630,10 @@ def to_gff( if write_genes: # GFF3 child term: genes - for gene in sorted(self.genes, key=lambda g: (g.start, g.end,)): + for gene in sorted(self.genes, key=lambda g: g.id): gene.to_gff( gff_outstream, - genomic_island_id=attribs["ID"], + genomic_island_id=island_id, add_functional_annotation=add_functional_annotation, ) @@ -662,12 +664,8 @@ def from_gff(cls, *cols): except: raise ValueError(f"mge string weird? 
{attribs['mge'].split(',')}") - if mges.get("is_tn"): - mges["c_tn"] = mges["is_tn"] - del mges["is_tn"] + genome_id, contig, start, end = cls.parse_mge_id(attribs["ID"]) - genome_id, *_ = GenomicIsland.parse_id(attribs["ID"]) - # TODO: check coordinates and ID overlap return cls( "", # TODO: where to get/ how to handle specI genome_id, @@ -676,11 +674,10 @@ def from_gff(cls, *cols): int(cols[3]), # start int(cols[4]), # end recombinases=recombinases, - # mge=mges, - **mges, - # mge_type=attribs["mge_type"], - # size=int(attribs["size"]), - # n_genes=int(attribs["n_genes"]), + mge=mges, + mge_type=attribs["mge_type"], + size=int(attribs["size"]), + n_genes=int(attribs["n_genes"]), genes=set(), ) diff --git a/mgexpose/mge_annotation.py b/mgexpose/mge_annotation.py new file mode 100644 index 0000000..7fcaa55 --- /dev/null +++ b/mgexpose/mge_annotation.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python + +# pylint: disable=R0912,R0914,R0915,R0913 + +""" Mobile genetic element annotation """ + +import contextlib +import logging +import os + +from gene_annotator import GeneAnnotator +from handle_args import handle_args +from island_processing import ( + generate_island_set, + annotate_islands, + evaluate_islands, + prepare_precomputed_islands +) +from islands import MgeGenomicIsland +from readers import read_prodigal_gff, read_mge_rules +from gffio import read_genomic_islands_gff + +MGE_TABLE_HEADERS = \ + ("is_tn",) + \ + MgeGenomicIsland.TABLE_HEADERS[1:6] + \ + MgeGenomicIsland.TABLE_HEADERS[8:14] + \ + ("mgeR", "name", "genes",) + +logging.basicConfig( + level=logging.INFO, + format='[%(asctime)s] %(message)s' +) + +logger = logging.getLogger(__name__) + + +def process_islands(genes, genome_id, single_island=None, island_file=None, output_dir=None,): + """ helper function to declutter main() """ + precomputed_islands = prepare_precomputed_islands( + single_island=single_island, + island_file=island_file + ) + + if output_dir: + pang_calls_out = open( + os.path.join( + 
def dump_islands(islands, out_prefix, db, write_genes=False, add_functional_annotation=False):
    """Dump genomic islands to an intermediate GFF3 file.

    Args:
        islands: iterable of island objects exposing `.contig` and `.to_gff()`.
        out_prefix: output path prefix; file is `<out_prefix>.unannotated_islands.gff3`.
        db: source-db format tag forwarded to `to_gff`.
        write_genes: also emit child gene lines.
        add_functional_annotation: forward functional annotation to `to_gff`.
    """
    with open(f"{out_prefix}.unannotated_islands.gff3", "wt", encoding="UTF-8") as _out:
        print("##gff-version 3", file=_out)
        # Sort by contig so the GFF is deterministic across runs.
        for island in sorted(islands, key=lambda isl: isl.contig):
            island.to_gff(
                _out, db, write_genes=write_genes,
                add_functional_annotation=add_functional_annotation,
                intermediate_dump=True,
            )


def identify_recombinase_islands(islands, genome_id, mge_rules, output_dir=None):
    """Identify MGE-islands according to a set of rules
    using various signals annotated in the corresponding gene set.

    When `output_dir` is given, per-step debug tables
    (`<genome_id>.assign_mge.step{1,2,3}.txt`) are written there; otherwise
    null context objects are passed downstream, as before.
    """
    with contextlib.ExitStack() as stack:
        if output_dir:
            # ExitStack closes every file even if a later open() fails;
            # the previous sequential opens leaked handles on error.
            step1_out, step2_out, step3_out = (
                stack.enter_context(
                    open(
                        os.path.join(output_dir, f"{genome_id}.assign_mge.{step}.txt"),
                        "wt", encoding="UTF-8",
                    )
                )
                for step in ("step1", "step2", "step3")
            )
        else:
            step1_out = step2_out = step3_out = contextlib.nullcontext()

        annotated_islands = list(annotate_islands(islands, outstream=step1_out))
        return list(
            evaluate_islands(
                annotated_islands,
                read_mge_rules(mge_rules),
                outstream=step2_out,
                outstream2=step3_out,
            )
        )


def write_final_results(
    recombinase_islands,
    output_dir,
    genome_id,
    output_suffix,
    dbformat=None,
    write_tsv=True,
    write_gff=True,
    write_genes_to_gff=True,
    add_functional_annotation=False,
):
    """Write final results.

    Produces `<genome_id>.<output_suffix>.txt` (one TSV line per recombinase
    island; gene IDs collapsed into a gene_list column) and/or
    `<genome_id>.<output_suffix>.gff3` (island parent lines with optional
    gene children).
    """
    out_prefix = os.path.join(output_dir, f"{genome_id}.{output_suffix}")

    with contextlib.ExitStack() as stack:
        outstream = gff_outstream = None
        if write_tsv:
            outstream = stack.enter_context(open(f"{out_prefix}.txt", "wt", encoding="UTF-8"))
            # TSV header
            print(*MGE_TABLE_HEADERS, sep="\t", file=outstream)
        if write_gff:
            gff_outstream = stack.enter_context(open(f"{out_prefix}.gff3", "wt", encoding="UTF-8"))
            # GFF3 header
            print("##gff-version 3", file=gff_outstream)

        # Deterministic output: islands ordered by contig name.
        for island in sorted(recombinase_islands, key=lambda isl: isl.contig):
            if write_tsv:
                island.to_tsv(outstream)
            if write_gff:
                island.to_gff(
                    gff_outstream,
                    source_db=dbformat,
                    write_genes=write_genes_to_gff,
                    add_functional_annotation=add_functional_annotation,
                )


def denovo_annotation(args, debug_dir=None):
    """Run de novo gene annotation and genomic-island detection.

    Returns the list of (unannotated) genomic islands.
    """
    annotator = GeneAnnotator(
        args.genome_id,
        args.speci,
        read_prodigal_gff(args.prodigal_gff),
        include_genome_id=args.include_genome_id,
        has_batch_data=args.allow_batch_data,
        dbformat=args.dbformat,
    )

    annotated_genes = annotator.annotate_genes(
        args.recombinase_hits,
        (
            args.phage_eggnog_data,
            args.phage_filter_terms,
        ),
        (
            args.txs_macsy_report,
            args.txs_macsy_rules,
        ),
        clusters=args.cluster_data,
        use_y_clusters=args.use_y_clusters,
        # Precomputed core genes disable the threshold (-1 sentinel).
        core_threshold=-1 if args.precomputed_core_genes else args.core_threshold,
        output_dir=args.output_dir,
    )

    out_prefix = os.path.join(args.output_dir, args.genome_id)

    genomic_islands = list(
        process_islands(
            annotated_genes,
            args.genome_id,
            single_island=args.single_island,
            island_file=args.precomputed_islands,
            output_dir=debug_dir,
        )
    )

    if args.dump_genomic_islands or args.skip_island_identification:
        dump_islands(
            genomic_islands,
            out_prefix,
            args.dbformat,
            write_genes=True,
            add_functional_annotation=args.add_functional_annotation,
        )

        # Round-trip check: re-read the dump and write it again as *.test.
        test_islands = list(read_genomic_islands_gff(f"{out_prefix}.unannotated_islands.gff3"))
        dump_islands(
            test_islands,
            out_prefix + ".test",
            args.dbformat,
            write_genes=True,
            add_functional_annotation=args.add_functional_annotation,
        )

    with open(
        os.path.join(args.output_dir, f"{args.genome_id}.gene_info.txt"),
        "wt",
        encoding="UTF-8",
    ) as _out:
        annotator.dump_genes(_out)

    return genomic_islands


def main():
    """Command-line entry point."""
    args = handle_args()
    logger.info("ARGS: %s", str(args))

    debug_dir = os.path.join(args.output_dir, "debug") if args.dump_intermediate_steps else None

    genomic_islands = None  # avoid NameError if no command branch matches
    if args.command == "denovo":
        genomic_islands = denovo_annotation(args, debug_dir=debug_dir)
    elif args.command == "annotate":
        raise NotImplementedError

    if not args.skip_island_identification:
        recombinase_islands = identify_recombinase_islands(
            genomic_islands,
            args.genome_id,
            args.mge_rules,
            output_dir=debug_dir,
        )

        if recombinase_islands:
            write_final_results(
                recombinase_islands,
                args.output_dir,
                args.genome_id,
                args.output_suffix,
                dbformat=args.dbformat,
                write_gff=args.write_gff,
                write_genes_to_gff=args.write_genes_to_gff,
                add_functional_annotation=args.add_functional_annotation,
            )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3

""" Module for parsing recombinase hmmsearch results. """


import argparse
import re


def parse_hmm_table(table_stream):
    """Parse an hmmsearch tabular output stream.

    Yields (fields, protein_id, full_sequence_score) for every non-empty,
    non-comment line; fields[0] is the query/protein id and fields[5] the
    full-sequence score column.
    """
    for line in table_stream:
        line = line.strip()
        if line and not line.startswith("#"):
            fields = re.split(r"\s+", line)
            yield fields, fields[0], float(fields[5])


def extract_best_hits(table_stream):
    """Extract the best-scoring hit line per protein.

    Bug fix: the previous version seeded the comparison with 0.0 and used
    a strict `>`, silently dropping proteins whose best full-sequence
    score is zero or negative (hmmsearch scores may legitimately be <= 0).
    """
    best = {}
    for fields, protein, score in table_stream:
        current = best.get(protein)
        if current is None or score > current[0]:
            best[protein] = (score, fields)
    return [fields for _, fields in best.values()]


def generate_output_table(best_hits, mge_rules):
    """Annotate recombinase hmm hits using MGE rules.

    Yields one annotated output row per hit.
    Raises ValueError when a hit has no matching rule.
    """
    # Local import keeps this module importable without the package context.
    from recombinases import MGE_ALIASES

    for hit in best_hits:
        hit[2] = hit[2].lower()
        for name, alias in MGE_ALIASES.items():
            hit[2] = hit[2].replace(name, alias)

        rule = mge_rules.get(hit[2])
        if rule is None:
            raise ValueError(f"Cannot find rule for {hit[2]}.")

        mges = rule.get_signals()
        # A rule mapping to exactly one signal is an unambiguous call.
        confidence = "high" if len(mges) == 1 else "low"

        yield (hit[0], hit[2], hit[3], ";".join(mges), hit[4], hit[5], confidence)


def main():
    """Command-line entry point."""
    from readers import read_mge_rules

    ap = argparse.ArgumentParser()
    ap.add_argument("hmmsearch_table", type=str)
    ap.add_argument("--mge_rules", type=str, default=None)
    ap.add_argument("--prefix", type=str, default="sample")
    args = ap.parse_args()

    with open(args.hmmsearch_table, "rt", encoding="UTF-8") as table_stream:
        best_hits = extract_best_hits(parse_hmm_table(table_stream))

    if best_hits:
        with open(
            f"{args.prefix}.recombinase_hmmsearch.besthits.out",
            "wt",
            encoding="UTF-8",
        ) as raw_table_out:
            for best_hit in best_hits:
                print("\t".join(best_hit), file=raw_table_out)

    if args.mge_rules:
        mge_rules = read_mge_rules(args.mge_rules, recombinase_scan=True)

        with open(
            f"{args.prefix}.recombinase_based_MGE_predictions.tsv",
            "wt",
            encoding="UTF-8",
        ) as mge_pred_out:

            header = (
                "#unigene", "recombinase_SMART_hmm_name", "PFAM_accession",
                "MGE_prediction", "hmmsearch_fullsequence_evalue",
                "hmmsearch_fullsequence_score", "MGE_prediction_confidence"
            )

            print(*header, sep="\t", file=mge_pred_out)

            for row in generate_output_table(best_hits, mge_rules):
                print(*row, sep="\t", file=mge_pred_out)


if __name__ == "__main__":
    main()
genome2speci_ids = json.load(file) + +def get_gtdb_id(identifier): + """Extract the GTDB ID from a genome identifier.""" + parts = identifier.split('_') + return f"{parts[0]}_{parts[1]}" # GCA_xxxxxx.x + +def connect(params_dic): + """ Connect to the PostgreSQL database server """ + conn = None + try: + # connect to the PostgreSQL server + print('Connecting to the PostgreSQL database...') + conn = psycopg2.connect(**params_dic) + except (Exception, psycopg2.DatabaseError) as error: + print(error) + sys.exit(1) + print("Connection created successfully") + return conn + +def parse_mge_id(id): + #'GCA_009102765.1_371601.SAMN11944272.WDCH01000111:267-2860' + mge_dict = {'gtdb_id': '', + 'contig' : '', + 'start' : '', + 'end' : '' + } + id = id.replace(':', '_').split('_') + mge_dict['gtdb_id'] = (id[0] + '_' + id[1]) #GCA_xxxxxx.x + mge_dict['contig'] = (id[2]) + coordinates = [int(c) for c in id[3].split('-')] + mge_dict['start'] = (coordinates[0]) + mge_dict['end'] = (coordinates[1]) + return mge_dict + +def query_mge_annotations(conn): + ''' Query the database to get all levels of taxonomy, mge_type and recombinases from the mge table. 
+ Args: + In: conn: psycopg connection + Out: result: df mge_id recombinase tax_domain tax_phylum tax_class tax_order tax_family tax_genus tax_species + + ''' + cursor = conn.cursor() + levels = ["clusters.{level}".format(level=level) for level in ['tax_domain', 'tax_phylum', 'tax_class', + 'tax_order', 'tax_family', 'tax_genus', + 'tax_species']] + levels_str = ', '.join(levels) + query = """ + SELECT contig || ':' || start_pos || '-' || end_pos AS contig_pos, + {levels_str} + FROM clusters AS clusters, pg3.mge AS mge + WHERE clusters.id = mge.cluster_id; + """.format(levels_str=levels_str) + cursor.execute(query) + + result = cursor.fetchall() + cursor.close() + columns = ['contig_pos'] + columns.extend([l.replace("clusters.", "") for l in levels]) + result = pd.DataFrame(result, columns=columns) + return result + +def get_taxa(speci_lst, cursor, level=None): + ''' Query the database to get taxonomy information + Args: + In: speci_lst (list): List of species names + cursor: psycopg cursor object + level (str, optional): Specific taxonomic level to query. If None, fetch all levels. + Out: result: (DataFrame) containing taxonomy information + ''' + levels = { + "tax_domain", "tax_phylum", "tax_class", "tax_order", "tax_family", "tax_genus", "tax_species" + } + + if level and level not in levels: + raise ValueError(f"Invalid level: {level}. 
Choose from {levels} or None for full taxonomy.") + + levels_str = f"clusters.{level}" if level else ', '.join(f"clusters.{lvl}" for lvl in levels) + + specI_str = ', '.join(['%s'] * len(speci_lst)) + + query = f""" + SELECT cluster_name, {levels_str} + FROM clusters AS clusters + WHERE clusters.cluster_name IN ({specI_str}); + """ + + cursor.execute(query, tuple(speci_lst)) + result = cursor.fetchall() + + columns = ['cluster_name'] + ([level] if level else list(levels)) + + return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) + +def get_gtdb_taxa(sample_ids, db, cursor, level=None): + levels = { + "d", "p", "c", "o", "f", "g", "s" + } + if level and level not in levels: + raise ValueError(f"Invalid level: {level}. Choose from {levels} or None for full taxonomy.") + + if db == "pg3": + tax_table = "pg3.gtdb_r220" + sample_table = "pg3.samples" + assembly = "genome_id" + levels_str = f"t.{level}" if level else ', '.join(f"t.{lvl}" for lvl in levels) + elif db == "spire": + tax_table = "gtdb_r220" + sample_table = "bins" + assembly = "bin_id" + levels_str = f"{tax_table}.{level}" if level else ', '.join(f"{tax_table}.{lvl}" for lvl in levels) + else: + raise ValueError(f"Invalid db specification: {db}. pg3 or spire are allowed.") + + + sample_ids_str = ', '.join(['%s'] * len(sample_ids)) + + if db == "pg3": + query = f""" + SELECT sample_name, {levels_str} + FROM {sample_table} AS s, {tax_table} AS t + WHERE (s.sample_name IN ({sample_ids_str})) AND (s.id = t.sample_id); + """ + elif db == "spire": + query = f""" + SELECT bin_name, {levels_str} + FROM {sample_table} AS {sample_table}, {tax_table} AS {tax_table} + WHERE ({sample_table}.bin_name IN ({sample_ids_str})) AND ({sample_table}.id = {tax_table}.bin_id); + """ + else: + raise ValueError(f"Invalid db specification: {db}. 
pg3 or spire are allowed.") + + cursor.execute(query, tuple(sample_ids)) + result = cursor.fetchall() + + columns = [assembly] + ([level] if level else list(levels)) + + return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) + + +def annotate_clustering_df(clustered_df, conn, level="tax_species"): + ''' Query taxonomy information and update DataFrame ''' + + print("Clustering df, nrows:", len(clustered_df)) + + genome2speci = {id: genome2speci_ids[get_gtdb_id(id)] for id in clustered_df.member_seq_100 if 'GCA_' in id} + clustered_df['speci'] = clustered_df['member_seq_100'].map(genome2speci) + specIs = list(set(genome2speci.values())) + print("# specI:", len(specIs)) + + cursor = conn.cursor() + + if level == "full": + result_df = get_taxa(specIs, cursor) + else: + result_df = get_taxa(specIs, cursor, level) + + print("Merging taxonomy") + clustered_df = clustered_df.merge(result_df, how="inner", left_on="speci", right_on="cluster_name") + + cursor.close() + return clustered_df + + +def get_speci_taxonomy_df(speci_lst, conn, level="tax_species"): + ''' For each specI cluster get taxonomy information and return as taxa_df dataframe''' + + specIs = list(set(speci_lst)) # Ensure that it is a list + print("# specI:", len(speci_lst)) + + cursor = conn.cursor() + + if level == "full": + taxa_df = get_taxa(speci_lst, cursor) + else: + taxa_df = get_taxa(speci_lst, cursor, level) + + cursor.close() + return taxa_df + + +def get_gtdb_taxonomy_df(sample_ids, db, conn, level="tax_species"): + ''' For each sample_id (bin_id or genome_id) get gtdb taxonomy information and return as taxa_df dataframe''' + + sample_ids= list(set(sample_ids)) # Ensure that it is a list + print("# samples_ids:", len(sample_ids)) + + cursor = conn.cursor() + + if level == "full": + taxa_df = get_gtdb_taxa(sample_ids, db, cursor) + else: + taxa_df = get_gtdb_taxa(sample_ids, db, cursor, level) + + cursor.close() + return taxa_df + + +def 
get_microontology(sample_names, conn): + ''' + Query the database to get microontology information. + + Args: + sample_names (list): List of sample names + cursor (psycopg cursor): Active DB cursor + + Returns: + pd.DataFrame: DataFrame with sample_name, sample_id, term_id, term, term_array + ''' + + if len(sample_names) == 0: + return pd.DataFrame(columns=["sample_name", "study_id", "sample_id", "term"]) + cursor = conn.cursor() + + samples_str = ', '.join(['%s'] * len(sample_names)) + + query = f""" + SELECT + s.sample_name, + s.study_id, + mv.sample_id, + mt.term + FROM samples s + JOIN microntology_v3 mv ON s.id = mv.sample_id + JOIN LATERAL unnest(mv.microntology_terms) AS term_id ON TRUE + JOIN microntology_terms mt ON mt.id = term_id + WHERE s.sample_name IN ({samples_str}); + """ + + cursor.execute(query, tuple(sample_names)) + result = cursor.fetchall() + cursor.close() + columns = ["sample_name", "study_id", "sample_id", "term"] + + return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mgexpose/readers.py b/mgexpose/readers.py index 23bf3be..5f22ce3 100644 --- a/mgexpose/readers.py +++ b/mgexpose/readers.py @@ -7,22 +7,8 @@ import re import sys -from .chunk_reader import get_lines_from_chunks -from .recombinases import MgeRule - - -def read_fasta(f): - header, seq = None, [] - for line in get_lines_from_chunks(f): - if line[0] == ">": - if seq: - yield header, "".join(seq) - seq.clear() - header = line.strip()[1:] - else: - seq.append(line.strip()) - if seq: - yield header, "".join(seq) +from chunk_reader import get_lines_from_chunks +from recombinases import MgeRule def read_prodigal_gff(f): @@ -44,7 +30,7 @@ def read_prodigal_gff(f): yield _id, line -def read_recombinase_hits(f, pyhmmer=True): +def read_recombinase_hits(f): """ Read hmmer output from recombinase scan. Returns (gene_id, mge_name) tuples via generator. 
@@ -53,10 +39,7 @@ def read_recombinase_hits(f, pyhmmer=True): for line in _in: line = line.strip() if line and line[0] != "#": - if pyhmmer: - gene_id, mge = line.split("\t")[:2] - else: - gene_id, _, mge, *_ = re.split(r"\s+", line) + gene_id, _, mge, *_ = re.split(r"\s+", line) yield gene_id, mge @@ -213,3 +196,4 @@ def filter_record(key, value, row): ) phage_signal = (None, eggnog_freetext)[is_phage] yield gene_id, phage_signal, eggnog_gene_ann + diff --git a/mgexpose/recombinases.py b/mgexpose/recombinases.py index 2fc86d6..c66eb48 100644 --- a/mgexpose/recombinases.py +++ b/mgexpose/recombinases.py @@ -2,8 +2,6 @@ """ Recombinase rules and aliases """ -import itertools as it - from dataclasses import dataclass @@ -107,10 +105,8 @@ def patch_c_tn_check(self, island): # return two_tn3 != two_ser_ce or mixed - if sum(island.recombinases.values()) == 2 and self.is_tn and self.ce and island.conj_man_count < 1: - # recombinase_types = ",".join(list(island.recombinases)) - recombinase_types = ",".join(it.chain(*it.chain((r,) * c for r, c in island.recombinases.items()))) - + if sum(island.recombinases.values()) == 2: + recombinase_types = ",".join(list(island.recombinases)) mixed = "tn3" in recombinase_types and "ser_ce" in recombinase_types two_tn3 = recombinase_types.count("tn3") == 2 diff --git a/mgexpose/test_mge_annotation.py b/mgexpose/test_mge_annotation.py new file mode 100644 index 0000000..7299b00 --- /dev/null +++ b/mgexpose/test_mge_annotation.py @@ -0,0 +1,129 @@ +# pylint: disable=C0301,E0401,W1510,W0621 +# flake8: noqa + +''' +mge_annotation.py +GCA_000012825.1.genomes +GCA_000012825.1.genomes.gff.gz +GCA_000012825.1.genomes.recombinase_hmmsearch.besthits.out +specI_v4_00061 +txsscan_rules.txt +all_systems.tsv +GCA_000012825.1.genomes.emapper.annotations +mge_rules_ms.txt +--cluster_data GCA_000012825.1.genomes_mmseqcluster.tsv.gz +--output_dir specI_v4_00061/GCA_000012825.1.genomes/ +--dump_intermediate_steps +--write_gff +''' +import os +import 
# pylint: disable=C0301,E0401,W1510,W0621
# flake8: noqa

'''
End-to-end test: runs mge_annotation.py as a subprocess, e.g.

mge_annotation.py
GCA_000012825.1.genomes
GCA_000012825.1.genomes.gff.gz
GCA_000012825.1.genomes.recombinase_hmmsearch.besthits.out
specI_v4_00061
txsscan_rules.txt
all_systems.tsv
GCA_000012825.1.genomes.emapper.annotations
mge_rules_ms.txt
--cluster_data GCA_000012825.1.genomes_mmseqcluster.tsv.gz
--output_dir specI_v4_00061/GCA_000012825.1.genomes/
--dump_intermediate_steps
--write_gff
'''
import os
import subprocess

import pytest

TEST_DATADIR = "../test_data/current/"
OUTPUT_DIR = "specI_v4_00061/GCA_000012825.1.genomes/"
GFF_FILENAME = "GCA_000012825.1.genomes.full_length_MGE_assignments.gff3"
TXT_FILENAME = "GCA_000012825.1.genomes.full_length_MGE_assignments.txt"
# Map of debug files to compare; entries are disabled until reference
# outputs are regenerated.
DEBUG_FILES = {
    # "GCA_000012825.1.genomes.assign_mge.step1.txt": "GCA_000012825.1.genomes.assign_mge.step1.txt",
    # "GCA_000012825.1.genomes.assign_mge.step2.txt": "GCA_000012825.1.genomes.assign_mge.step2.txt",
    # "GCA_000012825.1.genomes.assign_mge.step3.txt": "GCA_000012825.1.genomes.assign_mge.step3.txt",
    # "GCA_000012825.1.genomes.pan_genome_calls.txt": "GCA_000012825.1.genomes.pan_genome_calls.txt",
    # "GCA_000012825.1.genomes.pan_genome_islands.txt": "GCA_000012825.1.genomes.pan_genome_islands.txt",
}

TEST_OUT_GFF = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, GFF_FILENAME)
TEST_OUT_TXT = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, TXT_FILENAME)

# (argument, value) pairs; an empty value marks a flag or positional-only arg.
INPUT_ARGS = [
    ("genome_id", "GCA_000012825.1.genomes"),
    ("prodigal_gff", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.gff.gz")),
    ("recombinase_hits", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.recombinase_hmmsearch.besthits.out")),
    ("mge_rules", os.path.join(TEST_DATADIR, "mge_rules_ms.txt")),
    ("--speci", "specI_v4_00061"),
    ("--txs_macsy_rules", os.path.join(TEST_DATADIR, "txsscan_rules.txt")),
    ("--txs_macsy_report", os.path.join(TEST_DATADIR, "all_systems.tsv")),
    ("--phage_eggnog_data", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.emapper.annotations")),
    ("--cluster_data", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes_mmseqcluster.tsv.gz")),
    ("--output_dir", OUTPUT_DIR),
    ("--write_gff", ""),
    ("--dump_intermediate_steps", ""),
    ("--write_genes_to_gff", ""),
    ("--add_functional_annotation", ""),
]


@pytest.fixture(scope="module")
def run_mge_annotation(tmpdir_factory):
    """Fixture to run the mge_annotation.py script once per module and generate the output files."""
    tmpdir = tmpdir_factory.mktemp("mge_output")
    tmp_output_dir = os.path.join(tmpdir, OUTPUT_DIR)
    os.makedirs(tmp_output_dir, exist_ok=True)
    debug_dir = None

    # Prepare the command with input arguments
    command = ["python", "mge_annotation.py", "denovo"]
    for arg, val in INPUT_ARGS:
        if val:  # Append argument only if value is non-empty
            if arg == "--output_dir":
                command.extend(["--output_dir", tmp_output_dir])
            elif arg.startswith("--"):  # bug fix: was a substring test ('--' in arg)
                command.extend([arg, val])
            else:
                command.append(val)  # Obligatory (positional) input
        elif arg == "--dump_intermediate_steps":
            command.append(arg)
            debug_dir = os.path.join(tmp_output_dir, "debug")
            os.makedirs(debug_dir, exist_ok=True)
        else:
            command.append(arg)

    # Execute the command
    print("Running command:", command)
    result = subprocess.run(command, capture_output=True, text=True)

    # Ensure the script ran successfully
    assert result.returncode == 0, f"Command failed with error: {result.stderr}"
    return tmp_output_dir, debug_dir


def compare_output_files(generated_file_path, expected_file_path):
    """Helper function to compare a generated file with its expected output."""
    # Ensure the generated file exists
    assert os.path.exists(generated_file_path), f"Generated file not found at {generated_file_path}"

    # Byte-wise comparison catches both content and encoding differences.
    with open(expected_file_path, "rb") as f:
        expected_content = f.read()

    with open(generated_file_path, "rb") as f:
        generated_content = f.read()

    assert expected_content == generated_content, f"The generated file {generated_file_path} does not match the expected output."


def test_gff_output(run_mge_annotation):
    """Test to compare the generated GFF file with the expected output."""
    tmp_output_dir, _ = run_mge_annotation
    print("Temporary output directory: ", tmp_output_dir)
    generated_gff_path = os.path.join(tmp_output_dir, GFF_FILENAME)
    compare_output_files(generated_gff_path, TEST_OUT_GFF)


def test_txt_output(run_mge_annotation):
    """Test to compare the generated TXT file with the expected output."""
    tmp_output_dir, _ = run_mge_annotation
    generated_txt_path = os.path.join(tmp_output_dir, TXT_FILENAME)
    compare_output_files(generated_txt_path, TEST_OUT_TXT)


# Individual tests for each debug file
@pytest.mark.parametrize("debug_filename", DEBUG_FILES.keys())
def test_debug_file_output(run_mge_annotation, debug_filename):
    """Test to compare each file in the debug directory with the expected output."""
    _, debug_dir = run_mge_annotation
    generated_file_path = os.path.join(debug_dir, debug_filename)
    expected_file_path = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, "debug", debug_filename)
    compare_output_files(generated_file_path, expected_file_path)
#!/usr/bin/env python
# pylint: disable=R0912,R0914,R0915
''' Collection of functions to collect MGEs from batches of files'''
import json
import logging
import os
import traceback
from collections import Counter, defaultdict

# Module-level named logger (best practice); basicConfig is left to the
# application entry point. The unused dask imports (Client, progress,
# WorkerPlugin, dask.bag, from_delayed) were dropped; `delayed` is imported
# lazily where needed.
logger = logging.getLogger(__name__)


def get_genome2mges(genomes, mges):
    """Map each genome id to the list of MGE ids reported for it.

    `genomes` and `mges` are parallel sequences (same order and length).
    """
    genome2mge_id = defaultdict(list)
    for mge_id, genome_id in zip(mges, genomes):
        genome2mge_id[genome_id].append(mge_id)
    # Return a plain dict: lookups for unknown genomes must raise KeyError,
    # which collect_batch_mges() relies on (logs and skips the file).
    return dict(genome2mge_id)


def get_genome_id_from_path(path):
    """
    Extract the genome/bin ID (second-to-last path component) from a file path.

    Returns None (and logs) when the path has no parent directory component.
    """
    try:
        return path.split("/")[-2]
    except (AttributeError, IndexError) as err:
        logger.error("Error extracting genome/bin ID %s: %s", path, err)
        return None


def collect_batch_mges(gff_paths, i, relevant_ids=None):
    """
    Collect MGEs from a batch of GFF files.

    Parameters:
    - gff_paths: List of GFF file paths (one genome per parent directory).
    - i: Index of the batch (used for logging).
    - relevant_ids: Optional dictionary of relevant MGE IDs per genome ID.

    Returns:
    - List of MGE islands for all files in the batch; files that fail to
      parse are logged (with traceback) and skipped.
    """
    from .gffio import read_mge_genomic_islands_gff  # local: avoids hard import dependency

    islands = []
    for gff_path in gff_paths:
        genome_id = get_genome_id_from_path(gff_path)
        try:
            if relevant_ids:
                relevant_mges = list(read_mge_genomic_islands_gff(gff_path, relevant_ids[genome_id]))
            else:
                relevant_mges = list(read_mge_genomic_islands_gff(gff_path))
            islands.extend(relevant_mges)
        except Exception as err:
            logger.error("Error processing %s: %s", gff_path, err)
            logger.error(traceback.format_exc())  # full traceback

    logger.info("Batch %s completed, MGE islands found: %s", i, len(islands))
    return islands


def apply_per_batch(islands, funcs):
    """
    Calculate statistics for a batch of MGEs using a list of functions.

    Parameters:
    - islands: List of MGE objects in the batch.
    - funcs: List of functions to be applied to the batch.

    Returns:
    - Dictionary with function names as keys and their results as values.
    """
    return {func.__name__: func(islands) for func in funcs}


def apply_one_per_batch(islands, func):
    """
    Apply a single function to a batch of MGE objects.

    Raises RuntimeError (chained) if the function fails.
    """
    try:
        return func(islands)
    except Exception as err:
        raise RuntimeError(f"Error applying function '{func.__name__}' to batch: {err}") from err


def write_batch_json(batch_count, i, dir, base_filename):
    """
    Return a dask.delayed task that writes `batch_count` as JSON to
    `<dir>/<base_filename>_<i>.json`, creating `dir` if needed.
    """
    from dask.delayed import delayed  # local import: optional heavy dependency

    os.makedirs(dir, exist_ok=True)  # race-safe (replaces exists()+makedirs())

    def delayed_write(path, data):
        with open(path, 'w') as file:
            json.dump(data, file, indent=4)

    path = os.path.join(dir, f"{base_filename}_{i}.json")
    return delayed(delayed_write)(path, batch_count)


def write_batch_tsv(tsv_string, i, dir, base_filename):
    """
    Return a dask.delayed task that writes a TSV-formatted string to
    `<dir>/<base_filename>_<i>.tsv`, creating `dir` if needed.
    """
    from dask.delayed import delayed  # local import: optional heavy dependency

    os.makedirs(dir, exist_ok=True)

    def delayed_write(path, content):
        with open(path, 'w') as file:
            file.write(content)

    path = os.path.join(dir, f"{base_filename}_{i}.tsv")
    return delayed(delayed_write)(path, tsv_string)


def aggregate_attr(batches):
    """
    Aggregate string attributes across all batches.

    Parameters:
    - batches: List of batch results shaped {func: {mge_type: {cluster: value}}},
      e.g. cluster -> COG category strings.

    Returns:
    - {func: {mge_type: defaultdict(list) of cluster: value}}; later batches
      overwrite earlier values for the same cluster.
      TODO: replace the overwrite with a proper majority vote.

    Bug fix: results are merged with setdefault() instead of resetting
    `aggregated[func_name]` on every batch, which previously discarded all
    but the last batch per function.
    """
    aggregated = {}
    for batch in batches:
        for func_name, mge_dict in batch.items():
            func_agg = aggregated.setdefault(func_name, {})
            for mge_type, attr_dict in mge_dict.items():
                type_agg = func_agg.setdefault(mge_type, defaultdict(list))  # TODO: generalise
                for cluster_id, value in attr_dict.items():
                    type_agg[cluster_id] = value
    return aggregated


def aggregate_counts(batch_counts):
    """
    Aggregate statistics across all batches.

    Parameters:
    - batch_counts: List of batch results shaped {func: {mge_type: Counter}}.

    Returns:
    - {func: {mge_type: Counter}} with numeric values summed over batches;
      string values are overwritten by the latest batch.

    Bug fix: previously `aggregated[func_name]` was reset to {} for every
    batch, so only the final batch's counts survived.
    """
    aggregated = {}
    for batch in batch_counts:
        for func_name, mge_counter in batch.items():
            func_agg = aggregated.setdefault(func_name, {})
            for mge_type, nested_counter in mge_counter.items():
                type_agg = func_agg.setdefault(mge_type, Counter())
                for cluster_id, value in nested_counter.items():
                    if isinstance(value, str):
                        type_agg[cluster_id] = value  # latest batch wins
                    else:
                        type_agg[cluster_id] = type_agg.get(cluster_id, 0) + value
    return aggregated
"accessory_count": total - core, + } + +def stat_nested(islands): + """Calculate statistics on nested MGEs and return as a dictionary.""" + total = 0 + nested = 0 + for island in islands: + if island.mge_type == "nested": + nested += 1 + total += 1 + nested_percentage = (nested / total * 100) if total > 0 else 0 + non_nested_percentage = ((total - nested) / total * 100) if total > 0 else 0 + + return { + "total": total, + "nested_count": nested, + "non_nested_count": total - nested, + "nested_percentage": nested_percentage, + "non_nested_percentage": non_nested_percentage, + } + + +def count_nested(islands): + """ + Calculate the count of nested MGEs. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of nested MGEs. + """ + nested = sum(1 for island in islands if island.mge_type == "nested") + return nested + + +def count_core(islands): + """ + Calculate the count of MGEs in the core genome. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of MGEs in the core genome. + """ + core = sum(1 for island in islands if island.is_core) + return core + + +def count_total_islands(islands): + """ + Count the total number of MGE islands. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Integer count of total MGE islands. + """ + return len(list(islands)) + + +def stat_mge_type(islands): + """ + Calculate counts of each MGE type and return as a Counter object. + + Parameters: + - islands: List of MGE objects. + + Returns: + - Counter object with counts of each MGE type. 
+ """ + mge_counts = Counter() + for island in islands: + try: + if island.mge_type == "nested": + mge_counts["nested"] += 1 + else: + # Get the first key (assuming there's only one key) + mge = next(iter(island.mge.keys())) + mge_counts[mge] += 1 + except Exception as e: + raise ValueError(f"Unknown or absent MGE type: {e}") + + return mge_counts + + +def stat_mean_genes(islands): + """Calculate the mean number of genes per MGE and return as a dictionary.""" + genes_lst = [island.n_genes for island in islands] + mean_genes = (sum(genes_lst) / len(genes_lst)) if genes_lst else 0 + + # Return the result as a dictionary + return {"mean_genes_per_mge": mean_genes} + + +def extract_cargo(island): + cargo_genes = [] + for gene in island.genes: + if (gene.phage is None) and (gene.recombinase is None) and (gene.secretion_system is None): + cargo_genes.append(gene) + return cargo_genes + + +def get_kegg_ko(gene): + for key, value in gene.eggnog: + if key == "kegg_ko": + return value + + +def get_cazy(gene): + for key, value in gene.eggnog: + if key == "cazy": + return value + + +def get_cog_category(gene): + if gene.eggnog: + for key, value in gene.eggnog: + if key == "cog_fcat": + if value: + return value + else: + return '-' + else: + return 'no_cog_fcat' + else: + return 'no_eggnog' + + +# SP95 for SPIRE +def get_gene_cluster(gene): + return gene.cluster + + +def get_gene_id(gene): + id_lst = gene.id.split('.') # orginal ID e.g. 28901.SAMN15849311.GCA_014242155_04079 + return id_lst[2] # extract to match CY clustering IDs e.g. 
# Canonical MGE type keys used by the per-MGE cargo accumulators below.
_MGE_TYPES = (
    "nested", "phage", "phage_like", "is_tn", "ce", "mi", "integron", "cellular",
)


def _extract_feature_genes(island, attr, label):
    """Collect [gene_id, gene_cluster, feature] triples for every gene of
    `island` whose attribute `attr` is set (shared body of the three
    extract_* helpers below)."""
    annotated = []
    for gene in island.genes:
        feature = getattr(gene, attr)
        if feature:
            try:
                annotated.append([gene.id, get_gene_cluster(gene), feature])
            except Exception as e:
                # BUGFIX: `traceback` was used here but never imported by this
                # module (NameError in the error path) — import locally.
                import traceback
                logger.error(f"Error processing {label} gene {gene}: {e}")
                logger.error(traceback.format_exc())
    return annotated


# Extract MGE recombinases
def extract_mger(island):
    """Return [gene_id, cluster, recombinase] for each recombinase gene."""
    return _extract_feature_genes(island, "recombinase", "recombinase")


# Extract secretion system genes
def extract_secretion_system(island):
    """Return [gene_id, cluster, info] for each secretion-system gene."""
    return _extract_feature_genes(island, "secretion_system", "secretion system")


# Extract phage genes
def extract_phage(island):
    """Return [gene_id, cluster, info] for each phage gene."""
    return _extract_feature_genes(island, "phage", "phage")


def get_most_common_kegg_ko(genes):
    """
    Calculate the most common KEGG KO annotations for a list of genes.

    Parameters:
    - genes: List of gene objects.

    Returns:
    - Counter object with counts of KEGG KO annotations.
    """
    return Counter(get_kegg_ko(gene) for gene in genes)


def count_gene_clusters(genes, *_args, **_kwargs):
    """
    Count genes in gene clusters.

    Parameters:
    - genes: List of gene objects. Extra arguments are accepted and ignored
      so the function can be used as a `func` callback below.

    Returns:
    - Counter object with counts of cluster genes.
    """
    return Counter(get_gene_cluster(gene) for gene in genes)


def get_majority_cog_category(genes, *_args, **_kwargs):
    """Map each gene cluster to the majority COG category of its genes."""
    cluster_to_categories = defaultdict(list)
    for gene in genes:
        cluster_to_categories[get_gene_cluster(gene)].append(get_cog_category(gene))

    majority_cog_category = {}
    for cluster, categories in cluster_to_categories.items():
        # most_common(1) -> [(category, count)]; [0][0] is the top category.
        majority_cog_category[cluster] = Counter(categories).most_common(1)[0][0]
    return majority_cog_category


def get_gene_annotation(genes, func, *_args, **_kwargs):
    """Return {gene_id: func(gene)} for every gene in `genes`."""
    return {get_gene_id(gene): func(gene) for gene in genes}


# NOTE: get_per_mge_cargo() calls these wrappers as func(cargo, mge_id) with a
# POSITIONAL second argument. The original `**kwargs` signatures on the cog and
# cluster variants therefore raised TypeError — fixed to accept *_args.
def get_genes_cog_categories(genes, *_args, **_kwargs):
    return get_gene_annotation(genes, get_cog_category)


def get_genes_ko(genes, *_args):
    return get_gene_annotation(genes, get_kegg_ko)


def get_genes_cazy(genes, *_args):
    return get_gene_annotation(genes, get_cazy)


def get_genes_clusters(genes, *_args, **_kwargs):
    return get_gene_annotation(genes, get_gene_cluster)


def get_genes(genes, mge_id):
    """Return {mge_id: [gene_id, ...]} for the given genes."""
    return {mge_id: [get_gene_id(gene) for gene in genes]}


def count_per_mge_cargo(islands, func):
    """
    Extract and count cargo genes associated with each MGE type and return as
    a dictionary of Counter objects.

    Parameters:
    - islands: List of MGE objects.
    - func: function to extract cargo e.g. count KO terms or gene clusters

    Returns:
    - Dictionary where keys are MGE types, and values are Counter objects.
    """
    mge_cargo_counts = {mge_type: Counter() for mge_type in _MGE_TYPES}

    for island in islands:
        try:
            cargo = extract_cargo(island)
            if island.mge_type == "nested":
                mge_cargo_counts["nested"].update(func(cargo))
            else:
                # Get the first key (assuming there's only one key)
                mge = next(iter(island.mge.keys()))
                mge_cargo_counts[mge].update(func(cargo))
        except Exception as e:
            raise ValueError(f"Error processing cargo for island: {e}") from e

    return mge_cargo_counts


def get_per_mge_cargo(islands, func):
    """Like count_per_mge_cargo, but merges per-island dicts produced by
    func(cargo, mge_id) instead of summing Counters."""
    mge_cargo_annot = {mge_type: {} for mge_type in _MGE_TYPES}

    for island in islands:
        try:
            cargo = extract_cargo(island)
            mge_id = island.get_id()
            if island.mge_type == "nested":
                # Merge the per-island dictionary into the accumulator.
                mge_cargo_annot["nested"].update(func(cargo, mge_id))
            else:
                # Get the first key (assuming there's only one key)
                mge = next(iter(island.mge.keys()))
                mge_cargo_annot[mge].update(func(cargo, mge_id))
        except Exception as e:
            raise ValueError(f"Error processing cargo for island: {e}") from e

    return mge_cargo_annot


def get_machinery_genes_tsv(islands):
    """Render the MGE machinery genes (recombinase / secretion-system / phage)
    of all islands as one TSV string."""
    tsv_rows = ['\t'.join(
        ['mge_id', 'mge', 'n_genes', 'gene_id', 'gene_cluster', 'feature_type', 'feature_info']
    )]

    for island in islands:
        try:
            mge_id = island.get_id()
            n_genes = len(island.genes)
            mge = ",".join(f"{k}:{v}" for k, v in island.mge.items())

            feature_sets = (
                (extract_mger(island), "mgeR"),
                (extract_secretion_system(island), "secretion_system"),
                (extract_phage(island), "phage"),
            )
            for genes, feature_type in feature_sets:
                for gene_id, gene_cluster, info in genes:
                    tsv_rows.append(
                        f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\t{feature_type}\t{info}"
                    )
        except Exception as e:
            raise ValueError(f"Error processing machinery for island {island}: {e}") from e

    return '\n'.join(tsv_rows)


def get_cargo_genes_tsv(islands):
    """Render the cargo genes of all islands as one TSV string."""
    tsv_rows = ['\t'.join(['mge_id', 'mge', 'gene_ids', 'gene_clusters'])]

    for island in islands:
        try:
            mge_id = island.get_id()
            mge = ",".join(f"{k}:{v}" for k, v in island.mge.items())

            cargo_genes = extract_cargo(island)
            gene_ids = ';'.join(gene.id for gene in cargo_genes)
            gene_clusters = ';'.join(get_gene_cluster(gene) for gene in cargo_genes)
            tsv_rows.append(f"{mge_id}\t{mge}\t{gene_ids}\t{gene_clusters}")
        except Exception as e:
            raise ValueError(f"Error processing cargo for island {island}: {e}") from e

    return '\n'.join(tsv_rows)


# Counting works with aggregation since the objects are small. Getting only
# works with batch saving, since the output is huge.
def count_cargo_gene_clusters(islands):
    return count_per_mge_cargo(islands, count_gene_clusters)


def get_cargo_species_gene_clusters(islands):
    return get_per_mge_cargo(islands, func=get_genes_clusters)


# Output: for each mge_type output a dictionary with geneID: COG. geneID supposed to be unique -> overwriting is okay.
+def get_cargo_genes_cog(islands): + return get_per_mge_cargo(islands, func=get_genes_cog_categories) + + +def get_cargo_genes_ko(islands): + return get_per_mge_cargo(islands, func=get_genes_ko) + + +def get_cargo_genes_cazy(islands): + return get_per_mge_cargo(islands, func=get_genes_cazy) + + +# Output: for each mge_type output a dictionary mge_id: list(cargo_ids) +def get_cargo_genes(islands): + return get_per_mge_cargo(islands, func=get_genes) + + diff --git a/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py b/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py new file mode 100644 index 0000000..13829bd --- /dev/null +++ b/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py @@ -0,0 +1,67 @@ +from .gene import Gene +from .islands import GenomicIsland, MgeGenomicIsland + +from .base_logger import logger + +def read_genomic_islands_gff(fn): + with open(fn, "rt", encoding="UTF-8") as _in: + island = None + for line in _in: + line = line.strip() + if line and line[0] != "#": + cols = line.split("\t") + if cols[2] == "region": + if island is not None: + yield island + island = GenomicIsland.from_gff(*cols) + elif cols[2] == "gene": + gene = Gene.from_gff(*cols) + if island is not None: + island.genes.add(gene) + else: + raise ValueError("Found gene but no island.") + if island is not None: + yield island + +def read_mge_genomic_islands_gff(fn, relevant_ids=None): + """ + Generator function to read and parse MGEs and associated genes from a GFF file. + + Parameters: + - fn: Path to the GFF file. + - relevant_ids: Optional set of relevant MGE IDs to filter. If None, all MGEs are processed. + + Yields: + - MgeGenomicIsland objects that match the relevant IDs or all if None. 
+ """ + with open(fn, "rt", encoding="UTF-8") as _in: + island = None + for line in _in: + line = line.strip() + if line and line[0] != "#": + cols = line.split("\t") + attributes = {kv.split('=')[0]: kv.split('=')[1] for kv in cols[8].split(';') if '=' in kv} + + if cols[2] == "mobile_genetic_element": + mge_id = attributes.get("ID") + + if relevant_ids is None or mge_id in relevant_ids: + if island is not None: + yield island + island = MgeGenomicIsland.from_gff(*cols) + + elif cols[2] == "gene": + parent_id = attributes.get("Parent") + + if island is not None: + if relevant_ids is None or parent_id in relevant_ids: + gene = Gene.from_gff(*cols) + island.genes.add(gene) + else: + continue + else: + # This situation should not happen unless the GFF is malformed + raise ValueError("Found gene with no preceding island.") + + if island is not None: + yield island diff --git a/mgexpose/.ipynb_checkpoints/islands-checkpoint.py b/mgexpose/.ipynb_checkpoints/islands-checkpoint.py new file mode 100644 index 0000000..0304ea1 --- /dev/null +++ b/mgexpose/.ipynb_checkpoints/islands-checkpoint.py @@ -0,0 +1,708 @@ +# pylint: disable=C0116,C0301,R0902,R0916 +""" +Data Structures Module + +This module is designed to simplify the handling of different genomic sequences, +including but not limited to: + +- Genomic Island +- Annotated Genomic Island +- MGE Genomic Island +- Gene + +The end product of the pipeline is MGE Genomic Island, an Annotated Genomic Island +consisting of Genes. +It can be saved in a tsv or gff3 format together with its attributes and gene annotations. +The MGE type of each MGE Genomic Island is defined by applying MGE Rule. 
"""
import itertools as it
import logging
import sys
import re

from collections import Counter
from dataclasses import dataclass, field

from .gene import Gene
from .recombinases import MgeRule, MGE_ALIASES

logger = logging.getLogger(__name__)


@dataclass
class GenomicIsland:
    '''The following class describes a generic genomic region
    with one or more identified recombinases.
    This region is then referred as Recombinase Island.
    The Genomic Island is represented by contig, start and end
    coordinates, set of genes, some of which are recombinases and MGE machinery.
    Importantly, the set of genes does not include the non-coding regions.
    '''
    # Column headers for the raw (pre-annotation) island table dump.
    RAW_TABLE_HEADER = (
        "specI",
        "genome_accession",
        "panG",
        "contig",
        "start",
        "end",
        "gene_list",
    )

    speci: str = None
    genome: str = None
    is_core: bool = None
    contig: str = None
    start: int = None
    end: int = None
    name: str = "ISLAND"

    genes: set = field(default_factory=set)
    # recombinases: list = field(default_factory=list)
    # Maps recombinase name -> occurrence count within this island.
    recombinases: Counter = field(default_factory=Counter)

    @staticmethod
    def get_fieldnames():
        """ Returns column headers for island table. """
        return (
            "first_recombinase_gene",
            "first_recombinase",
            "island_size",
            "genome",
            "specI",
            "core_acc",
            "contig",
            "first_gene_start",
            "last_gene_end",
            "protid_gene_clusters",
        )

    @classmethod
    def from_region_string(cls, region):
        """ Creates island from a predefined region string.

        The region string is dot-separated with a 'start-end' coordinate
        component, e.g. '<x>.<y>.<contig>.<start>-<end>...'.
        """
        _, _, contig, start_end, *_ = region.strip().split(".")
        # NOTE(review): after splitting on '.', contig contains no dots, so
        # this split looks like a no-op — confirm the intended input format.
        contig = contig.split(".")[-1]
        start, end = map(int, start_end.split("-"))
        return cls(None, None, None, contig, start, end, region.strip())

    @classmethod
    def from_gene(cls, gene):
        """ Creates island from starting gene. """
        island = cls(
            gene.speci,
            gene.genome,
            gene.is_core,
            gene.contig,
            gene.start,
            gene.end,
        )
        island.add_gene(gene)
        return island

    def __len__(self):
        """ Calculates island length. """
        # Length is inclusive of both endpoints; 0 for an uninitialized island.
        if self.start is None or self.end is None:
            return 0
        return abs(self.end - self.start) + 1

    def __str__(self):
        """ String representation: tab-separated attributes followed by a
        comma-separated, position-sorted gene list. """
        genes = (
            f"{gene.id}.{gene.cluster}"
            for gene in sorted(
                self.genes, key=lambda g: (g.start, g.end, g.strand)
            )
        )

        # is_core is rendered via Gene.rtype (e.g. COR/ACC) instead of a bool.
        return "\t".join(
            [
                f"{v}" if (k != "is_core" or v is None) else Gene.rtype(self.is_core)
                for k, v in self.__dict__.items()
                if k not in ("genes", "recombinases")
            ] + [",".join(genes)]
        )

    def add_gene(self, gene):
        """ Adds a gene to the island, extending the island end and
        registering the gene's recombinase (if any). """
        if gene not in self.genes:
            self.end = max(self.end, gene.end)
            if gene.recombinase is not None:
                # self.recombinases.append(
                #     (f"{gene.id}.{gene.cluster}", gene.recombinase)
                # )
                self.recombinases[gene.recombinase] += 1
            self.genes.add(gene)

    def get_position(self):
        """ Return genomic position tuple. """
        return (self.contig, self.start, self.end)

    def get_recombinases(self):
        # Yields ('<gene_id>.<cluster>', recombinase) in gene start order.
        for g in sorted(self.genes, key=lambda x: x.start):
            if g.recombinase:
                yield f"{g.id}.{g.cluster}", g.recombinase

    def dump(self, seen_islands, raw_outstream=None, outstream=sys.stdout):
        """ Writes island to outstream.

        seen_islands is a set of position tuples used to deduplicate output;
        only islands with at least one recombinase are written.
        """
        if raw_outstream:
            print(self, file=raw_outstream)
        pos = self.get_position()
        if pos not in seen_islands and self.recombinases:
            seen_islands.add(pos)

            print(
                # *self.recombinases[0],
                # First recombinase (by gene position) leads the record.
                *tuple(self.get_recombinases())[0],
                len(self),
                str(self),
                sep="\t",
                file=outstream,
            )

    def get_id(self):
        # Stable island identifier, e.g. GIL_<genome>_<contig>:<start>-<end>
        return f"GIL_{self.genome}_{self.contig}:{self.start}-{self.end}"

    @classmethod
    def from_gff(cls, *cols):
        """ Reconstructs an island from the columns of a GFF 'region' row. """
        # NOTE(review): bare except clauses below swallow the original error
        # type; consider narrowing to ValueError.
        try:
            attribs = dict(item.split("=") for item in cols[-1].split(";"))
        except:
            raise ValueError(f"not enough cols? {cols}")

        try:
            # 'recombinases' attribute format: name:count,name:count,...
            recombinases = Counter(
                dict(
                    item.split(":")
                    for item in attribs["recombinases"].split(",")
                )
            )
        except:
            raise ValueError(f"recombinase string weird? {attribs['recombinases'].split(',')}")

        return cls(
            attribs["specI"],
            attribs["genome"],
            attribs["genome_type"] == "COR",
            cols[0],  # contig
            int(cols[3]),  # start
            int(cols[4]),  # end
            recombinases=recombinases,
            genes=set(),
        )

    def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False,
               intermediate_dump=False):
        """ Writes the island as a GFF3 'region' row; optionally followed by
        its genes as child rows. """
        island_id = self.get_id()
        attribs = {
            "ID": island_id,
            "genome": self.genome,
            "genome_type": Gene.rtype(self.is_core),
            "size": len(self),
            "n_genes": len(self.genes),
            # "mgeR": ",".join(sorted(r for _, r in self.recombinases)),
            # "mgeR": ",".join(sorted(self.recombinases)),
            "recombinases": (
                ",".join(
                    f"{k}:{v}"
                    for k, v in sorted(self.recombinases.items())
                )
                if self.recombinases else ""
            ),
            "specI": self.speci,  # TODO: does it work?
        }
        if self.name:
            attribs["name"] = self.name
        # Empty attribute values are dropped from the attribute string.
        attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1])
        # Format the source column
        if source_db:
            source = f"proMGE_{source_db}"
        else:
            source = "proMGE"
        print(
            self.contig,
            source,
            "region",
            self.start,
            self.end,
            len(self),  # Score field
            ".",  # Strand
            ".",  # Phase
            attrib_str,
            sep="\t",
            file=gff_outstream
        )

        if write_genes:
            # GFF3 child term: genes
            for gene in sorted(self.genes, key=lambda g: g.id):
                gene.to_gff(
                    gff_outstream,
                    genomic_island_id=island_id,
                    add_functional_annotation=add_functional_annotation,
                    intermediate_dump=intermediate_dump,
                )


@dataclass
class AnnotatedGenomicIsland(GenomicIsland):
    '''The following class extends generic Genomic Island with MGE machinery annotations.'''

    # Column headers of the annotated island table.
    TABLE_HEADERS = (
        "contig",
        "start",
        "end",
        "island_size",
        "prot_count",
        "mgeR_count",
        "Plasmid_PA",
        "phage_count",
        "all_conj_count",
        "CONJ_T4SS",
        "SS_present_mandatoryG",
        "entire_ss",
        "mgeR",
    )

    phage_count: int = 0      # genes with a phage annotation
    conj_count: int = 0       # genes with any secretion-system annotation
    conj_man_count: int = 0   # genes of CONJ/T4SS secretion systems
    valid_entire: bool = False
    valid_mandatory: bool = False
    valid_accessory: bool = False

    def __post_init__(self):
        """ Apply annotations: derive phage / conjugation counts and
        secretion-system validity flags from the island's genes. """
        secretion_systems = {}
        cm_counts = Counter()

        # self.recombinases = [r for _, r in self.recombinases]

        for gene in self.genes:
            self.phage_count += gene.phage is not None
            if gene.secretion_system is not None:
                self.conj_count += 1
                self.conj_man_count += (
                    gene.secretion_system.upper().startswith("CONJ") or
                    gene.secretion_system.upper().startswith("T4SS")
                )
                # NOTE(review): both the (system, False) and (system, True)
                # counters are incremented for every gene with a secretion
                # rule — indentation reconstructed from a mangled source;
                # confirm the True bucket shouldn't be mandatory-genes only.
                if gene.secretion_rule is not None:
                    cm_counts[(gene.secretion_system, False)] += 1
                    cm_counts[(gene.secretion_system, True)] += 1

                    secretion_systems[gene.secretion_system] = gene.secretion_rule

        for system, rule in secretion_systems.items():
            # At least half of the mandatory/accessory machinery present.
            self.valid_mandatory |= (cm_counts[(system, True)] >= rule["mandatory"] / 2)
            self.valid_accessory |= (cm_counts[(system, False)] >= rule["accessory"] / 2)
            # The complete machinery is present.
            self.valid_entire |= (
                cm_counts[(system, True)] == rule["mandatory"] and
                cm_counts[(system, False)] == rule["accessory"]
            )

    def __str__(self):
        """ String representation matching TABLE_HEADERS order. """
        return "\t".join(
            map(
                str, (
                    self.contig,
                    self.start,
                    self.end,
                    len(self),
                    len(self.genes),
                    # len(self.recombinases),
                    sum(self.recombinases.values()),
                    0,  # is still necessary?
                    self.phage_count,
                    self.conj_count,
                    self.conj_man_count,
                    self.valid_mandatory,
                    self.valid_entire,
                    ",".join(self.recombinases),
                )
            )
        )

    @classmethod
    def from_genomic_island(cls, g_island):
        """ Construct annotated island from genomic island. """
        return cls(
            **g_island.__dict__,
        )


@dataclass
class MgeGenomicIsland(AnnotatedGenomicIsland):
    '''The following class describes Anotated Genomic Islands with their assigned MGE type.
    Those are Mobile Genetic Elements (MGEs).
    The class attributes are used to describe the MGE properties.
    It also contains functionality to save the MGEs in gff3 or tsv formats.'''

    TABLE_HEADERS = (
        "tn",
        "phage",
        "phage_like",
        "ce",
        "integron",
        "mi",
        "nmi",
        "nov",
        "cellular",
        "contig",
        "start",
        "end",
        "size",
        "n_genes",
        "phage_count",
        "conj_man_count",
        "recombinases",
    )

    # Per-MGE-type counters assigned by evaluate_recombinases().
    integron: int = 0
    cellular: int = 0
    phage: int = 0
    c_mi: int = 0    # mobility island
    nov: int = 0     # novel
    c_pli: int = 0   # phage-like
    c_ce: int = 0    # conjugative element
    c_nmi: int = 0   # recombinase count (see __post_init__)
    c_tn: int = 0    # transposon

    tn3_found: bool = False
    ser_found: bool = False

    mge: Counter = field(default_factory=Counter)  # mge type -> count
    mge_type: str = None                           # "nested" / "non-nested"
    size: int = 0
    n_genes: int = 0

    def __post_init__(self):
        """ Apply annotations: normalize recombinase names via MGE_ALIASES
        and set the marker flags used by the MGE rules. """
        # NOTE(review): this override does not call super().__post_init__(),
        # so AnnotatedGenomicIsland's phage/conjugation counting does not run
        # when an MgeGenomicIsland is constructed directly — confirm intended.
        recombinases = (",".join(r for _, r in self.get_recombinases())).lower()
        for name, alias in MGE_ALIASES.items():
            recombinases = recombinases.replace(name, alias)

        self.tn3_found = "tn3" in recombinases
        self.ser_found = "c2_n1ser" in recombinases or "ser_ce" in recombinases

        # integron
        self.integron = int("integron" in recombinases)
        # tag recombinase island with more than 3 recombinases
        # self.c_nmi = int(len(self.recombinases) > 3)
        self.c_nmi = sum(self.recombinases.values())

        # self.recombinases = recombinases.split(",") if recombinases else []
        self.recombinases = Counter(recombinases.split(","))

    def __str__(self):
        """ String representation matching TABLE_HEADERS order. """
        return "\t".join(
            tuple(map(str, self.get_mge_metrics())) +
            (
                self.contig,
                f"{self.start}",
                f"{self.end}",
                f"{len(self)}",
                f"{len(self.genes)}",
                f"{self.phage_count}",
                f"{self.conj_man_count}",
                # ",".join(self.recombinases),
                ",".join(
                    f"{k}:{v}"
                    for k, v in sorted(self.recombinases.items())
                )
                if self.recombinases else "",
                self.name,
            )
        )

    @staticmethod
    def parse_mge_id(mge_id):
        """
        Generalized parser for MGE IDs.

        Returns:
            genome_id (str): parsed bin or genome identifier
            contig (str): contig name (usually something like k141_32063)
            start (int): start coordinate
            end (int): end coordinate
        """
        try:
            # Extract coordinates
            coord_match = re.search(r":(\d+)-(\d+)$", mge_id)
            if not coord_match:
                raise ValueError("No coordinates found in MGE ID.")
            start, end = map(int, coord_match.groups())

            # Remove leading MGE_ and SPIRE_ (if present)
            cleaned = mge_id
            if cleaned.startswith("MGE_"):
                cleaned = cleaned[4:]
            if cleaned.startswith("SPIRE_"):
                cleaned = cleaned[6:]

            # Strip coordinates
            core = cleaned.split(':')[0]

            # Assembly-style pattern (e.g., GCA_019800745.1)
            if re.match(r"GCA_[\d.]+", core):
                genome_id = core.split('_')[0] + '_' + core.split('_')[1]
                contig = core.split('.')[-1]
                return genome_id, contig, start, end

            # Bin-style pattern: extract contig and genome_id
            kmer_match = re.search(r"(_k\d+_\d+)$", core)
            if kmer_match:
                contig = kmer_match.group(1)[1:]  # remove leading underscore
                genome_id = core[: -len(kmer_match.group(1))]  # remove contig
                return genome_id, contig, start, end

            # Underscore split pattern e.g. MGE_NT12001_NC_000913.3:5234-20508
            underscore_split = core.split('_', 1)
            if len(underscore_split) == 2 and '.' in underscore_split[1]:
                genome_id, contig = underscore_split
                return genome_id, contig, start, end

            # Fallback for unknown formats
            raise ValueError(f"Unrecognized MGE ID format: {mge_id}")

        except Exception as e:
            raise ValueError(f"Failed to parse MGE ID '{mge_id}': {e}")

    def get_mge_metrics(self):
        """ Cast mge metrics to int. """
        return tuple(
            map(
                int,
                (
                    self.c_tn,
                    self.phage,
                    self.c_pli,
                    self.c_ce,
                    self.integron,
                    self.c_mi,
                    self.cellular,
                )
            )
        )

    def get_annotated_mge_metrics(self):
        """ Return (mge_type_name, count) pairs for all non-zero metrics. """
        metrics = list(self.get_mge_metrics())  # Get mge_type and counts
        mge_metrics = [
            (k, v)
            for k, v in zip(
                ("is_tn", "phage", "phage_like", "ce", "integron", "mi", "cellular",),
                metrics
            )
            if v  # Collect as long as metrics are not None
        ]
        return mge_metrics

    @staticmethod
    def is_nested(annotated_mge_metrics):
        # An island with more than one assigned MGE is "nested".
        n_mges = sum(v for _, v in annotated_mge_metrics)
        if not n_mges:
            raise UserWarning("No MGEs were assigned to recombinase island")
        # Solitary or nested MGE?
        return n_mges > 1

    @staticmethod
    def mge_num_island_type(is_nested):
        """ Returns nested vs solitary MGE-tag. """
        return ("non-nested", "nested")[is_nested]

    def has_annotation(self):
        """ (Sanity) Check if island has any mge annotation. """
        return sum((
            self.c_tn,
            self.phage,
            self.c_pli,
            self.c_ce,
            self.integron,
            self.c_mi,
            self.cellular,
        )) > 0

    def evaluate_recombinases(self, rules, outstream=None, outstream2=None):
        """ Annotate recombinases: apply the per-recombinase MGE rules and
        resolve type conflicts (phage/phage-like vs mobility island). """
        patch_c_tn = False

        # Expand the Counter back into one item per recombinase occurrence.
        recombinases = it.chain(*it.chain((r,) * c for r, c in self.recombinases.items()))

        for rec in recombinases:
            rule = rules.get(rec)
            if rule is None:
                print(f"WARNING: cannot find mge-rule for `{rec}`")
                rule = MgeRule()

            # cellular:Arch1/Cyan/Xer/Cand
            self.cellular |= rule.cellular

            self.c_tn = rule.c_tn_check(self)
            patch_c_tn |= rule.patch_c_tn_check(self)

            # Dispatch on phage vs conjugation machinery evidence.
            if self.phage_count >= 2 and self.conj_man_count < 1:
                self.phage, self.c_mi, self.nov = rule.phage_check(self)
            elif self.phage_count < 2 and self.conj_man_count < 1:
                self.c_pli, self.c_mi = rule.phage_like_check(
                    self,
                    "brujita" in rec
                )
            elif self.phage_count < 2 and self.conj_man_count >= 1:
                self.c_ce, self.nov = rule.conjug_element_check(self)
            elif self.phage_count >= 2 and self.conj_man_count >= 1:
                self.phage, self.c_mi, self.nov = rule.mobility_island_check(self)

        # counting multiple tn in Tn3 containing recombinase island
        # self.c_tn += (len(self.recombinases) > 2) * (self.tn3_found or self.ser_found)
        self.c_tn += (sum(self.recombinases.values()) > 2) * (self.tn3_found or self.ser_found)
        if not self.has_annotation():
            if not patch_c_tn:
                print(f"WARNING: No annotation found, but cannot patch either.\n{self}")
            self.c_tn = patch_c_tn

        if outstream:
            print(self, sep="\t", file=outstream, )

        # previous step in some cases generates overlap between Phage/Phage_like and Mobility island
        # this step specifically resolves such instances based on recombinase presence and presence/
        # absence of phage structural genes/conjugation machinery genes in the neighbourhood
        if self.c_mi and self.c_pli:
            self.c_mi = int(
                any(
                    pat in ",".join(self.recombinases).lower()
                    for pat in ('relaxase', 'rep_', 'mob_', 'trwc')
                )
            )
            self.c_pli = int(not self.c_mi)

        if self.phage and self.c_mi and self.phage_count >= 2:
            self.phage, self.c_mi = True, False

        if outstream2:
            print(self, sep="\t", file=outstream2, )

    @classmethod
    def from_annotated_genomic_island(cls, ag_island):
        """ Construct from annotated genomic island. """
        island = cls(
            **ag_island.__dict__
        )
        return island

    def get_id(self):
        # Stable MGE identifier, e.g. MGE_<genome>_<contig>:<start>-<end>
        return f"MGE_{self.genome}_{self.contig}:{self.start}-{self.end}"

    def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False):
        """ Writes the MGE as a GFF3 'mobile_genetic_element' row; optionally
        followed by its genes as child rows. """
        island_id = self.get_id()
        mge_metrics = self.get_annotated_mge_metrics()
        attribs = {
            "ID": island_id,
            "mge": ",".join(f"{k}:{v}" for k, v in mge_metrics),  # Count each mge type
            "genome_type": Gene.rtype(self.is_core),
            "mge_type": self.mge_num_island_type(self.is_nested(mge_metrics)),
            "size": len(self),
            "n_genes": len(self.genes),
            "mgeR": (
                ",".join(
                    f"{k}:{v}"
                    # for k, v in sorted(Counter(self.recombinases).items())
                    for k, v in sorted(self.recombinases.items())
                )
                if self.recombinases else ""
            ),
        }
        if self.name:
            attribs["name"] = self.name
        # Empty attribute values are dropped from the attribute string.
        attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1])
        # Format the source column
        if source_db:
            source = f"proMGE_{source_db}"
        else:
            source = "proMGE"
        print(
            self.contig,
            source,
            "mobile_genetic_element",
            self.start,
            self.end,
            len(self),  # Score field
            ".",  # Strand
            ".",  # Phase
            attrib_str,
            sep="\t",
            file=gff_outstream
        )

        if write_genes:
            # GFF3 child term: genes
            for gene in sorted(self.genes, key=lambda g: g.id):
                gene.to_gff(
                    gff_outstream,
                    genomic_island_id=island_id,
                    add_functional_annotation=add_functional_annotation,
                )

    @classmethod
    def from_gff(cls, *cols):
        """ Reconstructs an MGE island from the columns of a GFF
        'mobile_genetic_element' row (inverse of to_gff). """
        # NOTE(review): bare except clauses below swallow the original error
        # type; consider narrowing to ValueError.
        try:
            attribs = dict(item.split("=") for item in cols[-1].split(";"))
        except:
            raise ValueError(f"not enough cols? {cols}")

        try:
            # 'mgeR' attribute format: name:count,name:count,...
            recombinases = Counter(
                dict((key, int(value)) for key, value in
                     (item.split(":")
                      for item in attribs["mgeR"].split(","))
                     )
            )
        except:
            raise ValueError(f"recombinase string weird? {attribs['mgeR'].split(',')}")

        try:
            # 'mge' attribute format: type:count,type:count,...
            mges = Counter(
                dict((key, int(value)) for key, value in
                     (item.split(":")
                      for item in attribs["mge"].split(","))
                     )
            )
        except:
            raise ValueError(f"mge string weird? {attribs['mge'].split(',')}")

        genome_id, contig, start, end = cls.parse_mge_id(attribs["ID"])

        return cls(
            "",  # TODO: where to get/ how to handle specI
            genome_id,
            attribs["genome_type"] == "COR",
            cols[0],  # contig
            int(cols[3]),  # start
            int(cols[4]),  # end
            recombinases=recombinases,
            mge=mges,
            mge_type=attribs["mge_type"],
            size=int(attribs["size"]),
            n_genes=int(attribs["n_genes"]),
            genes=set(),
        )

    def to_tsv(self, outstream):
        """ Writes the MGE as a single TSV row (TABLE_HEADERS order plus
        name and gene list). """
        metrics = list(self.get_mge_metrics())
        print(
            *metrics,
            self.contig,
            self.start,
            self.end,
            len(self),  # size
            len(self.genes),  # n_genes
            ",".join(
                f"{k}:{v}"
                # for k, v in sorted(Counter(self.recombinases).items())
                for k, v in sorted(self.recombinases.items())
            ) if self.recombinases else "",
            (self.name if self.name else ""),
            ",".join(gene.id for gene in sorted(self.genes, key=lambda g: g.id)),  # gene_list
            sep="\t",
            file=outstream,
        )

diff --git a/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py b/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py
new file mode 100644
index 0000000..da886df
--- /dev/null
+++ b/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py
import psycopg2
import pandas as pd
import json
import sys

# NOTE(review): hard-coded absolute path to a user home directory — this
# will break for any other user/deployment; consider a CLI arg or config.
genome2speci_pg3_file = '/home/grekova/workspace/promge_website/data/genome_id2speci.tsv'

# NOTE(review): file I/O at import time — importing this module fails if
# the TSV is absent; consider lazy loading.
df = pd.read_csv(genome2speci_pg3_file, sep="\t")

genome2speci_ids = dict(zip(df["sample_name"], df["cluster_name"]))

def get_gtdb_id(identifier):
    """Extract the GTDB ID from a genome identifier."""
    parts = identifier.split('_')
    return f"{parts[0]}_{parts[1]}"  # GCA_xxxxxx.x

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
def parse_mge_id(id):
    """Split an MGE identifier into its components.

    Example id: 'GCA_009102765.1_371601.SAMN11944272.WDCH01000111:267-2860'

    Returns a dict with keys ``gtdb_id`` (GCA_xxxxxx.x), ``contig``,
    ``start`` and ``end`` (ints).

    Raises ValueError for ids without a ':start-end' coordinate part.
    """
    # 'GCA_009102765.1_371601.SAMN11944272.WDCH01000111:267-2860'
    core, sep, coords = id.partition(':')
    if not sep:
        raise ValueError(f"MGE id without coordinate part: {id}")
    # The GTDB accession is always the first two '_'-separated fields
    # (GCA_xxxxxx.x); everything after them is the contig name.  Splitting
    # with maxsplit=2 keeps contigs that themselves contain underscores
    # (e.g. 'NC_000913.3') intact — a plain split('_') mis-parsed those.
    prefix, accession, contig = core.split('_', 2)
    start_str, end_str = coords.split('-')
    return {
        'gtdb_id': f"{prefix}_{accession}",  # GCA_xxxxxx.x
        'contig': contig,
        'start': int(start_str),
        'end': int(end_str),
    }

def query_mge_annotations(conn):
    ''' Query the database to get all levels of taxonomy, mge_type and recombinases from the mge table.

    Args:
        In: conn: psycopg connection
        Out: result: df with columns contig_pos, tax_domain, tax_phylum,
             tax_class, tax_order, tax_family, tax_genus, tax_species
    '''
    cursor = conn.cursor()
    levels = ["clusters.{level}".format(level=level) for level in ['tax_domain', 'tax_phylum', 'tax_class',
                                                                   'tax_order', 'tax_family', 'tax_genus',
                                                                   'tax_species']]
    levels_str = ', '.join(levels)
    # Implicit join of the clusters table with pg3.mge on the cluster id;
    # the MGE key is reassembled as 'contig:start-end'.
    query = """
    SELECT contig || ':' || start_pos || '-' || end_pos AS contig_pos,
           {levels_str}
    FROM clusters AS clusters, pg3.mge AS mge
    WHERE clusters.id = mge.cluster_id;
    """.format(levels_str=levels_str)
    cursor.execute(query)

    result = cursor.fetchall()
    cursor.close()
    columns = ['contig_pos']
    columns.extend([l.replace("clusters.", "") for l in levels])
    result = pd.DataFrame(result, columns=columns)
    return result
def get_gtdb_taxa(sample_ids, db, cursor, level=None):
    ''' Query GTDB (r220) taxonomy for a list of sample/bin ids.

    Args:
        In: sample_ids (list): sample names (pg3) or bin names (spire)
            db (str): "pg3" or "spire" — selects table layout and join key
            cursor: psycopg cursor object
            level (str, optional): single GTDB rank code from
                {d, p, c, o, f, g, s}; None fetches all ranks.
        Out: DataFrame keyed by genome_id (pg3) / bin_id (spire).

    NOTE(review): ``levels`` is a set, so when level is None the column
    order in the SELECT and in the returned DataFrame follows set
    iteration order — consistent within one run, but not guaranteed
    across runs; confirm whether callers rely on a fixed order.
    '''
    levels = {
        "d", "p", "c", "o", "f", "g", "s"
    }
    if level and level not in levels:
        raise ValueError(f"Invalid level: {level}. Choose from {levels} or None for full taxonomy.")

    if db == "pg3":
        tax_table = "pg3.gtdb_r220"
        sample_table = "pg3.samples"
        assembly = "genome_id"
        levels_str = f"t.{level}" if level else ', '.join(f"t.{lvl}" for lvl in levels)
    elif db == "spire":
        tax_table = "gtdb_r220"
        sample_table = "bins"
        assembly = "bin_id"
        levels_str = f"{tax_table}.{level}" if level else ', '.join(f"{tax_table}.{lvl}" for lvl in levels)
    else:
        raise ValueError(f"Invalid db specification: {db}. pg3 or spire are allowed.")

    # One placeholder per id; the ids themselves are passed as parameters.
    sample_ids_str = ', '.join(['%s'] * len(sample_ids))

    if db == "pg3":
        query = f"""
        SELECT sample_name, {levels_str}
        FROM {sample_table} AS s, {tax_table} AS t
        WHERE (s.sample_name IN ({sample_ids_str})) AND (s.id = t.sample_id);
        """
    elif db == "spire":
        query = f"""
        SELECT bin_name, {levels_str}
        FROM {sample_table} AS {sample_table}, {tax_table} AS {tax_table}
        WHERE ({sample_table}.bin_name IN ({sample_ids_str})) AND ({sample_table}.id = {tax_table}.bin_id);
        """
    else:
        # Unreachable: db was validated above; kept for symmetry.
        raise ValueError(f"Invalid db specification: {db}. pg3 or spire are allowed.")

    cursor.execute(query, tuple(sample_ids))
    result = cursor.fetchall()

    columns = [assembly] + ([level] if level else list(levels))

    return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns)


def annotate_clustering_df(clustered_df, conn, level="tax_species"):
    ''' Query taxonomy information and update DataFrame.

    Maps each GCA member sequence of ``clustered_df.member_seq_100`` to its
    specI cluster (via the module-level genome2speci_ids table), queries
    the taxonomy for those clusters and inner-merges it onto the frame.

    level: one specI rank column, or "full" for all ranks.
    '''

    print("Clustering df, nrows:", len(clustered_df))

    # Only GCA_* members are mapped; others get NaN via .map() below.
    genome2speci = {id: genome2speci_ids[get_gtdb_id(id)] for id in clustered_df.member_seq_100 if 'GCA_' in id}
    clustered_df['speci'] = clustered_df['member_seq_100'].map(genome2speci)
    specIs = list(set(genome2speci.values()))
    print("# specI:", len(specIs))

    cursor = conn.cursor()

    if level == "full":
        result_df = get_taxa(specIs, cursor)
    else:
        result_df = get_taxa(specIs, cursor, level)

    print("Merging taxonomy")
    # Inner merge: rows whose specI has no taxonomy record are dropped.
    clustered_df = clustered_df.merge(result_df, how="inner", left_on="speci", right_on="cluster_name")

    cursor.close()
    return clustered_df
def get_gtdb_taxonomy_df(sample_ids, db, conn, level="s"):
    ''' For each sample_id (bin_id or genome_id) get gtdb taxonomy information and return as taxa_df dataframe.

    Args:
        sample_ids (list): sample/bin ids (deduplicated internally)
        db (str): "pg3" or "spire"
        conn: psycopg connection
        level (str): single GTDB rank code {d,p,c,o,f,g,s}, or "full".

    Note: the default is "s" (species); the previous default
    "tax_species" is not a valid GTDB rank code for get_gtdb_taxa and
    made every default call raise ValueError.
    '''

    sample_ids = list(set(sample_ids))  # Ensure unique ids
    print("# samples_ids:", len(sample_ids))

    cursor = conn.cursor()

    if level == "full":
        taxa_df = get_gtdb_taxa(sample_ids, db, cursor)
    else:
        taxa_df = get_gtdb_taxa(sample_ids, db, cursor, level)

    cursor.close()
    return taxa_df


def get_microontology(sample_names, conn):
    '''
    Query the database to get microontology information.

    Args:
        sample_names (list): List of sample names
        conn (psycopg connection): Active DB connection

    Returns:
        pd.DataFrame: DataFrame with sample_name, study_id, sample_id, term
        (one row per unnested microntology term).
    '''

    if len(sample_names) == 0:
        return pd.DataFrame(columns=["sample_name", "study_id", "sample_id", "term"])
    cursor = conn.cursor()

    samples_str = ', '.join(['%s'] * len(sample_names))

    # LATERAL unnest expands each sample's term-id array into rows, which
    # are then resolved to term labels via microntology_terms.
    query = f"""
    SELECT
        s.sample_name,
        s.study_id,
        mv.sample_id,
        mt.term
    FROM samples s
    JOIN microntology_v3 mv ON s.id = mv.sample_id
    JOIN LATERAL unnest(mv.microntology_terms) AS term_id ON TRUE
    JOIN microntology_terms mt ON mt.id = term_id
    WHERE s.sample_name IN ({samples_str});
    """

    cursor.execute(query, tuple(sample_names))
    result = cursor.fetchall()
    cursor.close()
    columns = ["sample_name", "study_id", "sample_id", "term"]

    return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns)
import batch_helpers, downstream +__all__ = ["batch_helpers", "downstream"] + + __version__ = "3.7.6" diff --git a/mgexpose/__main__.py b/mgexpose/__main__.py index c1cdb08..48c6402 100755 --- a/mgexpose/__main__.py +++ b/mgexpose/__main__.py @@ -20,7 +20,7 @@ ) from .islands import MgeGenomicIsland from .readers import read_fasta, read_prodigal_gff, read_mge_rules -from .gffio import read_genomic_islands_gff +from .gffio import read_genomic_islands_gff, read_mge_genomic_islands_gff MGE_TABLE_HEADERS = \ ("is_tn",) + \ diff --git a/mgexpose/batch_helpers.py b/mgexpose/batch_helpers.py index b764d9d..858f9c1 100644 --- a/mgexpose/batch_helpers.py +++ b/mgexpose/batch_helpers.py @@ -11,9 +11,9 @@ from dask.delayed import delayed import dask -from gffio import read_mge_genomic_islands_gff +from .gffio import read_mge_genomic_islands_gff -from base_logger import logger +from .base_logger import logger import traceback diff --git a/mgexpose/downstream.py b/mgexpose/downstream.py index 0fc86d2..2b94710 100644 --- a/mgexpose/downstream.py +++ b/mgexpose/downstream.py @@ -4,9 +4,9 @@ from collections import Counter, defaultdict -from gffio import read_mge_genomic_islands_gff +from .gffio import read_mge_genomic_islands_gff -from base_logger import logger +from .base_logger import logger def stat_nested(islands): """Calculate statistics on nested MGEs and return as a dictionary.""" diff --git a/mgexpose/gene.py b/mgexpose/gene.py index 6fc357d..5d1eb59 100644 --- a/mgexpose/gene.py +++ b/mgexpose/gene.py @@ -4,7 +4,7 @@ from dataclasses import dataclass -from readers import EggnogReader +from .readers import EggnogReader @dataclass diff --git a/mgexpose/gffio.py b/mgexpose/gffio.py index 2c268d3..7be11a1 100644 --- a/mgexpose/gffio.py +++ b/mgexpose/gffio.py @@ -1,7 +1,8 @@ -from gene import Gene -from islands import GenomicIsland, MgeGenomicIsland -from base_logger import logger +from .gene import Gene +from .islands import GenomicIsland, MgeGenomicIsland + +from 
.base_logger import logger def read_genomic_islands_gff(fn): with open(fn, "rt", encoding="UTF-8") as _in: @@ -23,13 +24,14 @@ def read_genomic_islands_gff(fn): if island is not None: yield island -def read_mge_genomic_islands_gff(fn, relevant_ids=None): +def read_mge_genomic_islands_gff(fn, relevant_ids=None, parse_mge_id=True): """ Generator function to read and parse MGEs and associated genes from a GFF file. Parameters: - fn: Path to the GFF file. - relevant_ids: Optional set of relevant MGE IDs to filter. If None, all MGEs are processed. + - parse_mge_id: Boolean indicating whether to parse the MGE ID into components. Yields: - MgeGenomicIsland objects that match the relevant IDs or all if None. @@ -48,7 +50,7 @@ def read_mge_genomic_islands_gff(fn, relevant_ids=None): if relevant_ids is None or mge_id in relevant_ids: if island is not None: yield island - island = MgeGenomicIsland.from_gff(*cols) + island = MgeGenomicIsland.from_gff(*cols, parse_mge_id=parse_mge_id) elif cols[2] == "gene": parent_id = attributes.get("Parent") @@ -64,4 +66,4 @@ def read_mge_genomic_islands_gff(fn, relevant_ids=None): raise ValueError("Found gene with no preceding island.") if island is not None: - yield island + yield island \ No newline at end of file diff --git a/mgexpose/islands.py b/mgexpose/islands.py index 86af044..611b0e1 100644 --- a/mgexpose/islands.py +++ b/mgexpose/islands.py @@ -23,8 +23,8 @@ from collections import Counter from dataclasses import dataclass, field -from gene import Gene -from recombinases import MgeRule, MGE_ALIASES +from .gene import Gene +from .recombinases import MgeRule, MGE_ALIASES logger = logging.getLogger(__name__) @@ -454,6 +454,12 @@ def parse_mge_id(mge_id): contig = kmer_match.group(1)[1:] # remove leading underscore genome_id = core[: -len(kmer_match.group(1))] # remove contig return genome_id, contig, start, end + + # Underscore split pattern e.g. 
MGE_NT12001_NC_000913.3:5234-20508 + underscore_split = core.split('_', 1) + if len(underscore_split) == 2 and '.' in underscore_split[1]: + genome_id, contig = underscore_split + return genome_id, contig, start, end # Fallback for unknown formats raise ValueError(f"Unrecognized MGE ID format: {mge_id}") @@ -638,7 +644,7 @@ def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_ann ) @classmethod - def from_gff(cls, *cols): + def from_gff(cls, *cols, parse_mge_id=True): try: attribs = dict(item.split("=") for item in cols[-1].split(";")) except: @@ -663,8 +669,15 @@ def from_gff(cls, *cols): ) except: raise ValueError(f"mge string weird? {attribs['mge'].split(',')}") + + if parse_mge_id: + genome_id, contig, start, end = cls.parse_mge_id(attribs["ID"]) + else: + genome_id = attribs.get("genome", "") + contig = cols[0] + start = int(cols[3]) + end = int(cols[4]) - genome_id, contig, start, end = cls.parse_mge_id(attribs["ID"]) return cls( "", # TODO: where to get/ how to handle specI diff --git a/mgexpose/query_db.py b/mgexpose/query_db.py index ae54e5c..d8bb94a 100644 --- a/mgexpose/query_db.py +++ b/mgexpose/query_db.py @@ -3,10 +3,11 @@ import json import sys -json_file = '/g/scb2/bork/grekova/results/envmge/f13_17072024/genome_id2speci.json' +genome2speci_pg3_file = '/home/grekova/workspace/promge_website/data/genome_id2speci.tsv' -with open(json_file, "r") as file: - genome2speci_ids = json.load(file) +df = pd.read_csv(genome2speci_pg3_file, sep="\t") + +genome2speci_ids = dict(zip(df["sample_name"], df["cluster_name"])) def get_gtdb_id(identifier): """Extract the GTDB ID from a genome identifier.""" @@ -187,7 +188,7 @@ def get_speci_taxonomy_df(speci_lst, conn, level="tax_species"): return taxa_df -def get_gtdb_taxonomy_df(sample_ids, db, conn, level="tax_species"): +def get_gtdb_taxonomy_df(sample_ids, db, conn, level="s"): ''' For each sample_id (bin_id or genome_id) get gtdb taxonomy information and return as taxa_df dataframe''' 
# Given a contig_name query for its length
# from this table
'''
spire=> SELECT * FROM pg3.contigs LIMIT 10;
 id | sample_id |           contig_name            | length
----+-----------+----------------------------------+--------
  1 |   1004653 | 938639.SAMN02441251.ATTB01000002 | 138806
'''
def get_contig_length(contig_name, conn):
    """Return the length of *contig_name* from pg3.contigs, or None if absent.

    Args:
        contig_name (str): contig name as stored in pg3.contigs.
        conn: psycopg connection.
    """
    cursor = conn.cursor()
    # Plain string (was a placeholder-free f-string); the contig name is
    # passed as a query parameter, never interpolated.
    query = "SELECT length FROM pg3.contigs WHERE contig_name = %s;"
    try:
        cursor.execute(query, (contig_name,))
        result = cursor.fetchone()
    finally:
        # Close the cursor even if execute/fetch raises.
        cursor.close()
    return result[0] if result else None
b/mgexpose/__init__.py @@ -1,7 +1,6 @@ """ MGExpose """ -from . import batch_helpers, downstream -__all__ = ["batch_helpers", "downstream"] - - __version__ = "3.7.6" + +# Don't import at package level to avoid import errors during installation +__all__ = ["batch_helpers", "downstream"] diff --git a/mgexpose/downstream.py b/mgexpose/downstream.py index 2b94710..821a2af 100644 --- a/mgexpose/downstream.py +++ b/mgexpose/downstream.py @@ -50,15 +50,15 @@ def stat_core(islands): if island.is_core: core += 1 total += 1 - core_percentage = (core / total * 100) if total > 0 else 0 - accessory_percentage = ((total - core) / total * 100) if total > 0 else 0 + core_prop = (core / total * 100) if total > 0 else 0 + acc_prop = ((total - core) / total * 100) if total > 0 else 0 return { "total": total, "core_count": core, "accessory_count": total - core, - "core_percentage": core_percentage, - "accessory_percentage": accessory_percentage, + "core_prop": core_prop, + "acc_prop": acc_prop, } @@ -444,5 +444,45 @@ def get_cargo_genes_cazy(islands): # Output: for each mge_type output a dictionary mge_id: list(cargo_ids) def get_cargo_genes(islands): return get_per_mge_cargo(islands, func=get_genes) + + +def get_mge_basic_info(islands): + """ + Extract basic information for each MGE island. + + Parameters: + - islands: List of MGE objects. + + Returns: + - List of dictionaries with mge_id, mge, mge_type, n_genes, and size for each island. 
+ """ + mge_info_list = [] + + for island in islands: + try: + mge_id = island.get_id() + mge = ",".join( + f"{k}:{v}" + for k, v in sorted(island.mge.items()) + ) if island.mge else "" + mge_type = island.mge_type if island.mge_type else "" + n_genes = island.n_genes + size = island.size + + mge_info = { + "mge_id": mge_id, + "mge": mge, + "mge_type": mge_type, + "n_genes": n_genes, + "size": size, + "is_core": int(island.is_core) # 1 = core, 0 = accessory + } + mge_info_list.append(mge_info) + + except Exception as e: + logger.error(f"Error processing island {island}: {e}") + continue + + return mge_info_list From fc716e5f6d9f2dbc5e09c7e1b952a9be2a0ae0fa Mon Sep 17 00:00:00 2001 From: grekova Date: Mon, 29 Dec 2025 16:29:24 +0100 Subject: [PATCH 4/4] Remove unused modules --- .../batch_helpers-checkpoint.py | 223 ------ .../downstream-checkpoint.py | 445 ----------- .../.ipynb_checkpoints/gffio-checkpoint.py | 67 -- .../.ipynb_checkpoints/islands-checkpoint.py | 708 ------------------ .../.ipynb_checkpoints/query_db-checkpoint.py | 280 ------- mgexpose/__main__.py | 354 --------- mgexpose/batch_helpers.py | 223 ------ mgexpose/chunk_reader.py | 31 - mgexpose/clean_workdir.py | 49 -- mgexpose/clustering_parser.py | 182 ----- mgexpose/db.py | 156 ---- mgexpose/gene_annotator.py | 218 ------ mgexpose/get_cluster_data.py | 67 -- mgexpose/get_db_seqs.py | 109 --- mgexpose/get_eggnog.py | 52 -- mgexpose/get_eggnog_f13.py | 74 -- mgexpose/handle_args.py | 191 ----- mgexpose/island_processing.py | 193 ----- mgexpose/mge_annotation.py | 317 -------- mgexpose/parse_hmmsearch.py | 98 --- mgexpose/phage.py | 126 ---- mgexpose/readers.py | 199 ----- mgexpose/recombinases.py | 182 ----- mgexpose/test_mge_annotation.py | 129 ---- 24 files changed, 4673 deletions(-) delete mode 100644 mgexpose/.ipynb_checkpoints/batch_helpers-checkpoint.py delete mode 100644 mgexpose/.ipynb_checkpoints/downstream-checkpoint.py delete mode 100644 mgexpose/.ipynb_checkpoints/gffio-checkpoint.py 
delete mode 100644 mgexpose/.ipynb_checkpoints/islands-checkpoint.py delete mode 100644 mgexpose/.ipynb_checkpoints/query_db-checkpoint.py delete mode 100755 mgexpose/__main__.py delete mode 100644 mgexpose/batch_helpers.py delete mode 100644 mgexpose/chunk_reader.py delete mode 100644 mgexpose/clean_workdir.py delete mode 100644 mgexpose/clustering_parser.py delete mode 100644 mgexpose/db.py delete mode 100644 mgexpose/gene_annotator.py delete mode 100644 mgexpose/get_cluster_data.py delete mode 100644 mgexpose/get_db_seqs.py delete mode 100644 mgexpose/get_eggnog.py delete mode 100644 mgexpose/get_eggnog_f13.py delete mode 100644 mgexpose/handle_args.py delete mode 100644 mgexpose/island_processing.py delete mode 100644 mgexpose/mge_annotation.py delete mode 100644 mgexpose/parse_hmmsearch.py delete mode 100644 mgexpose/phage.py delete mode 100644 mgexpose/readers.py delete mode 100644 mgexpose/recombinases.py delete mode 100644 mgexpose/test_mge_annotation.py diff --git a/mgexpose/.ipynb_checkpoints/batch_helpers-checkpoint.py b/mgexpose/.ipynb_checkpoints/batch_helpers-checkpoint.py deleted file mode 100644 index 858f9c1..0000000 --- a/mgexpose/.ipynb_checkpoints/batch_helpers-checkpoint.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=R0912,R0914,R0915 -''' Collection of functions to collect MGEs from batches of files''' -import os -from collections import Counter, defaultdict -import json - -from dask.distributed import Client, progress, WorkerPlugin -import dask.bag as db -from dask.bag import from_delayed -from dask.delayed import delayed -import dask - -from .gffio import read_mge_genomic_islands_gff - -from .base_logger import logger -import traceback - - -# Create a dictionary: genome: list of MGE IDs. This is needed to filter out only relevant MGEs per genome.The input is a corresponding list (same order) of genomes and MGE IDs. 
-def get_genome2mges(genomes, mges): - genome2mge_id = {} - for id, genome_id in zip(mges, genomes): - if genome_id not in genome2mge_id: - genome2mge_id[genome_id] = [] # Initialize the list - genome2mge_id[genome_id].append(id) - return genome2mge_id - -# Helper function to extract genome ID/bin ID from file path -def get_genome_id_from_path(path): - """ - Extract genome ID from a file path. - """ - genome_id = None - try: - genome_id = path.split("/")[-2] - except Exception as e: - logger.error(f"Error extracting genome/bin ID{path}: {e}") - return genome_id - -def collect_batch_mges(gff_paths, i, relevant_ids=None): - """ - Collect MGEs from a batch of GFF files. - - Parameters: - - gff_paths: List of GFF file paths. - - i: Index of the batch. - - relevant_ids: Optional dictionary of relevant MGE IDs per genome ID. - - Returns: - - List of MGE islands for all files in the batch. - """ - islands = [] - for gff_path in gff_paths: - genome_id = get_genome_id_from_path(gff_path) - #logger.info(f"Processing genome: {genome_id}") - - try: - if relevant_ids: - relevant_mges = list(read_mge_genomic_islands_gff(gff_path, relevant_ids[genome_id])) - else: - relevant_mges = list(read_mge_genomic_islands_gff(gff_path)) - - islands.extend(relevant_mges) - - except Exception as e: - logger.error(f"Error processing {gff_path}: {e}") - logger.error(traceback.format_exc()) # Full traceback - - logger.info(f"Batch {i} completed, MGE islands found: {len(islands)}") - return islands - - -def apply_per_batch(islands, funcs): - """ - Calculate statistics for a batch of MGEs using a list of functions. - - Parameters: - - islands: List of MGE objects in the batch. - - funcs: List of functions to be applied to the batch. - - Returns: - - Dictionary with function names as keys and their results as values. 
- """ - results = {} - for func in funcs: - func_name = func.__name__ # Get the function's name - results[func_name] = func(islands) # Apply the function and store the result - return results - -def apply_one_per_batch(islands, func): - """ - Apply a single function to a batch of MGE objects. - - Parameters: - - islands: List of MGE objects in the batch. - - func: A single function to be applied to the batch. - - Returns: - - The result of applying the function to the islands. - """ - try: - return func(islands) - except Exception as e: - raise RuntimeError(f"Error applying function '{func.__name__}' to batch: {e}") - - -def write_batch_json(batch_count, i, dir, base_filename): - """ - Saves batch statistics to a JSON file using a Dask delayed function. This allows the function to be part of a larger - Dask computation graph, potentially executed in parallel. - - Parameters: - - batch_count (dict): Batch statistics, typically a dictionary with `Counter` values. - - i (int): Index of the current batch. This is used to generate a unique filename for each batch. - - dir (str): Path to the directory where the batch JSON files will be stored. - - base_filename (str): Base name for the JSON files. The batch index will be appended to this base name to create the full filename. - - Ensures that the specified directory exists before writing the file. If it does not exist, the directory will be created. - - Returns: - - A Dask delayed object which, when executed, will write the batch statistics to the specified file path in JSON format. 
- """ - # Ensure directory exists - if not os.path.exists(dir): - os.makedirs(dir) - - # Delayed function to write JSON - def delayed_write(path, data): - with open(path, 'w') as file: - json.dump(data, file, indent=4) - - # Construct the full path with batch number - path = os.path.join(dir, f"{base_filename}_{i}.json") - return delayed(delayed_write)(path, batch_count) - - -def write_batch_tsv(tsv_string, i, dir, base_filename): - """ - Saves a TSV-formatted string to a file as part of a Dask computation graph. - - Parameters: - - tsv_string (str): TSV-formatted data returned from a processing function. - - i (int): Index of the current batch, used to make unique filenames. - - dir (str): Directory where TSV files will be saved. - - base_filename (str): Base name for the output files. - - Returns: - - A Dask delayed object that writes the TSV to disk when executed. - """ - # Ensure the output directory exists - if not os.path.exists(dir): - os.makedirs(dir) - - # Define the write function - def delayed_write(path, content): - with open(path, 'w') as file: - file.write(content) - - # Create the output file path - path = os.path.join(dir, f"{base_filename}_{i}.tsv") - - # Return the delayed write operation - return delayed(delayed_write)(path, tsv_string) - - -def aggregate_attr(batches): - """ - Aggregate string attributes across all batches. - - Parameters: - - batches: List of batch statistics (dictionaries with str values e.g. cluster: COG category). 
- - {'90371.SAMN11043730.GCA_007435405_02914': 'S', '90371.SAMN14863315.GCA_013264555_00909': 'S', '28150.SAMN09228819.GCA_007140965_00837': 'K', '28901.SAMN13391507.GCA_011477875_00875': 'no_cog_fcat', '1967657.SAMN09203654.GCA_010924635_01295': 'S', '28901.SAMN13057743.GCA_009231785_02754': 'no_cog_fcat', '90371.SAMN11355433.GCA_007687065_04488': 'S', '28901.SAMN06645026.GCA_009179045_04086': 'no_cog_fcat', '28901.SAMN15658059.GCA_013797405_02028': 'S', '796732.SAMN01805325.GCA_000272735_04407': 'no_cog_fcat', '28901.SAMN12571445.GCA_010939435_00055': 'S', '28901.SAMN13747386.GCA_010741235_03029': 'no_cog_fcat', '115981.SAMN14080650.GCA_011486465_01735': 'S', '28901.SAMN14341880.GCA_011465135_00878': 'H', '1151002.SAMN09403228.GCA_004177825_01940': 'no_cog_fcat', '1029990.SAMN02415182.GCA_000484355_00589': 'S', '28901.SAMN12287151.GCA_007468615_01067': 'S', '28901.SAMN13057273.GCA_009230165_00580': 'no_cog_fcat', '611.SAMN21335643.GCA_019899165_00890': 'EH', '28901.SAMN10095790.GCA_005443695_02309': 'no_cog_fcat', '340190.SAMN15147492.GCA_013661605_01048': 'S', '224729.SAMN19336595.GCA_018502705_02901': 'no_cog_fcat', '28901.SAMN16355443.GCA_015155595_01828': 'no_cog_fcat', '59201.SAMN10093771.GCA_007777665_02600': 'no_cog_fcat', '59201.SAMN17835677.GCA_017072195_01803': 'S'} - -2025-01-31 18:35:27,669 - {'611.SAMN17086052.GCA_016740915_04210': 'K', '611.SAMN07152477.GCA_007233055_03707': 'S', '1173835.SAMN01088029.GCA_000962395_02473': 'L', '1620419.SAMN03894126.GCA_001241425_04679': 'T', '568709.SAMEA2272227.GCA_000493535_02720': 'no_cog_fcat', '28901.SAMN18448990.GCA_017574325_01595': 'L', '90371.SAMEA6057931.GCA_016228905_04588': 'L', '28901.SAMN10177571.GCA_005772365_02802': 'N', '28901.SAMN14050865.GCA_011246635_04142': 'G', '90371.SAMN09387768.GCA_007158225_04690': 'N', '28144.SAMN07734943.GCA_003548115_02128': 'no_cog_fcat', '90105.SAMN09474912.GCA_004184575_03995': 'G', '59201.SAMN10756627.GCA_007583145_03925': 'S', 
'90371.SAMN03169328.GCA_008018515_03842': 'K', '1620419.SAMN04255380.GCA_010457935_04445': 'no_cog_fcat', '28901.SAMN16124589.GCA_014542005_01530': 'G', '28901.SAMN17005521.GCA_015838815_01302': 'no_cog_fcat', '28901.SAMN19285790.GCA_018468945_04349': 'G', '28901.SAMN10425133.GCA_010255835_04194': 'S', '28901.SAMN12344366.GCA_007726245_00622': 'S', '28901.SAMN12107692.GCA_006482085_01260': 'S', '440524.SAMN02867573.GCA_010663445_04243': 'M', '28901.SAMN20181473.GCA_020012815_04294': 'L', '28901.SAMEA6514879.GCA_011786425_00695': 'G', '90371.SAMN07279560.GCA_002260995_02309': 'K', '90371.SAMN19798225.GCA_018997055_00491': 'L', '28901.SAMN12823265.GCA_008717395_04569': 'V', '1173837.SAMN01088030.GCA_000962405_02064': 'L', '399584.SAMN13050934.GCA_009225065_04346': 'G', '28901.SAMN13057273.GCA_009230165_01317': 'no_eggnog'} - - Returns: - - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: COG_category}}} - """ - aggregated = {} - - for batch in batches: - for func_name, mge_dict in batch.items(): - aggregated[func_name] = {} - for mge_type, attr_dict in mge_dict.items(): - if mge_type not in aggregated[func_name]: - aggregated[func_name][mge_type] = defaultdict(list) # TODO: Generalise to function - # Update using the contents of the nested dictionary - for cluster_id, value in attr_dict.items(): - aggregated[func_name][mge_type][cluster_id] = value # TODO: replace with proper majority vote - # flatten the COG value per batch - return aggregated - -def aggregate_counts(batch_counts): - """ - Aggregate statistics across all batches. - - Parameters: - - batch_counts: List of batch statistics (dictionaries with Counter values). 
- - Returns: - - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: count}}} - """ - aggregated = {} - - for batch in batch_counts: - for func_name, mge_counter in batch.items(): - aggregated[func_name] = {} - for mge_type, nested_counter in mge_counter.items(): - if mge_type not in aggregated[func_name]: - aggregated[func_name][mge_type] = Counter() # Initialize if not already present - # Update using the contents of the nested dictionary - for cluster_id, value in nested_counter.items(): - if isinstance(value, str): - aggregated[func_name][mge_type][cluster_id] = value # Overwrite previous COG category with the latest batch - else: - if cluster_id in aggregated[func_name][mge_type]: - aggregated[func_name][mge_type][cluster_id] += value - else: - aggregated[func_name][mge_type][cluster_id] = value - return aggregated - diff --git a/mgexpose/.ipynb_checkpoints/downstream-checkpoint.py b/mgexpose/.ipynb_checkpoints/downstream-checkpoint.py deleted file mode 100644 index a3950ec..0000000 --- a/mgexpose/.ipynb_checkpoints/downstream-checkpoint.py +++ /dev/null @@ -1,445 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=R0912,R0914,R0915 -import os - -from collections import Counter, defaultdict - -from .gffio import read_mge_genomic_islands_gff # FIXME - -from .base_logger import logger - - -''' Quick and dirty implementation - actually does not belong here ''' # FIXME -def stat_core(gff_files): - """Calculate statistics on MGEs in the core genome and return as a dictionary.""" - total = 0 - core = 0 - for f in gff_files: - island = read_mge_genomic_islands_gff(f) - if island.is_core: - core += 1 - total += 1 - return { - "total": total, - "core_count": core, - "accessory_count": total - core, - } - -def stat_nested(islands): - """Calculate statistics on nested MGEs and return as a dictionary.""" - total = 0 - nested = 0 - for island in islands: - if island.mge_type == "nested": - nested += 1 - total += 1 - nested_percentage = (nested / 
total * 100) if total > 0 else 0 - non_nested_percentage = ((total - nested) / total * 100) if total > 0 else 0 - - return { - "total": total, - "nested_count": nested, - "non_nested_count": total - nested, - "nested_percentage": nested_percentage, - "non_nested_percentage": non_nested_percentage, - } - - -def count_nested(islands): - """ - Calculate the count of nested MGEs. - - Parameters: - - islands: List of MGE objects. - - Returns: - - Integer count of nested MGEs. - """ - nested = sum(1 for island in islands if island.mge_type == "nested") - return nested - - -def count_core(islands): - """ - Calculate the count of MGEs in the core genome. - - Parameters: - - islands: List of MGE objects. - - Returns: - - Integer count of MGEs in the core genome. - """ - core = sum(1 for island in islands if island.is_core) - return core - - -def count_total_islands(islands): - """ - Count the total number of MGE islands. - - Parameters: - - islands: List of MGE objects. - - Returns: - - Integer count of total MGE islands. - """ - return len(list(islands)) - - -def stat_mge_type(islands): - """ - Calculate counts of each MGE type and return as a Counter object. - - Parameters: - - islands: List of MGE objects. - - Returns: - - Counter object with counts of each MGE type. 
- """ - mge_counts = Counter() - for island in islands: - try: - if island.mge_type == "nested": - mge_counts["nested"] += 1 - else: - # Get the first key (assuming there's only one key) - mge = next(iter(island.mge.keys())) - mge_counts[mge] += 1 - except Exception as e: - raise ValueError(f"Unknown or absent MGE type: {e}") - - return mge_counts - - -def stat_mean_genes(islands): - """Calculate the mean number of genes per MGE and return as a dictionary.""" - genes_lst = [island.n_genes for island in islands] - mean_genes = (sum(genes_lst) / len(genes_lst)) if genes_lst else 0 - - # Return the result as a dictionary - return {"mean_genes_per_mge": mean_genes} - - -def extract_cargo(island): - cargo_genes = [] - for gene in island.genes: - if (gene.phage is None) and (gene.recombinase is None) and (gene.secretion_system is None): - cargo_genes.append(gene) - return cargo_genes - - -def get_kegg_ko(gene): - for key, value in gene.eggnog: - if key == "kegg_ko": - return value - - -def get_cazy(gene): - for key, value in gene.eggnog: - if key == "cazy": - return value - - -def get_cog_category(gene): - if gene.eggnog: - for key, value in gene.eggnog: - if key == "cog_fcat": - if value: - return value - else: - return '-' - else: - return 'no_cog_fcat' - else: - return 'no_eggnog' - - -# SP95 for SPIRE -def get_gene_cluster(gene): - return gene.cluster - - -def get_gene_id(gene): - id_lst = gene.id.split('.') # orginal ID e.g. 28901.SAMN15849311.GCA_014242155_04079 - return id_lst[2] # extract to match CY clustering IDs e.g. 
GCA_xxx_xxx - -# Extract MGE recombinases -def extract_mger(island): - mgeR_genes = [] - for gene in island.genes: - if gene.recombinase: - try: - gene_id = gene.id - gene_cluster = get_gene_cluster(gene) - mgeR = gene.recombinase - annot = [gene_id, gene_cluster, mgeR] - mgeR_genes.append(annot) - except Exception as e: - logger.error(f"Error processing recombinase gene {gene}: {e}") - logger.error(traceback.format_exc()) - return mgeR_genes - - -# Extract secretion system genes -def extract_secretion_system(island): - secretion_genes = [] - for gene in island.genes: - if gene.secretion_system: - try: - gene_id = gene.id - gene_cluster = get_gene_cluster(gene) - info = gene.secretion_system - annot = [gene_id, gene_cluster, info] - secretion_genes.append(annot) - except Exception as e: - logger.error(f"Error processing secretion system gene {gene}: {e}") - logger.error(traceback.format_exc()) - return secretion_genes - - -# Extract phage genes -def extract_phage(island): - phage_genes = [] - for gene in island.genes: - if gene.phage: - try: - gene_id = gene.id - gene_cluster = get_gene_cluster(gene) - info = gene.phage - annot = [gene_id, gene_cluster, info] - phage_genes.append(annot) - except Exception as e: - logger.error(f"Error processing phage gene {gene}: {e}") - logger.error(traceback.format_exc()) - return phage_genes - - -def get_most_common_kegg_ko(genes): - """ - Calculate the most common KEGG KO annotations for a list of genes. - - Parameters: - - genes: List of gene objects. - - Returns: - - Counter object with counts of KEGG KO annotations. - """ - kos = [get_kegg_ko(gene) for gene in genes] - return Counter(kos) - - -def count_gene_clusters(genes, **kwargs): - """ - Count genes in gene clusters. - - Parameters: - - genes: List of gene objects. - - Returns: - - Counter object with counts of cluster genes. 
- """ - gene_clusters = [get_gene_cluster(gene) for gene in genes] - return Counter(gene_clusters) - - -def get_majority_cog_category(genes, **kwargs): - cluster_to_categories = defaultdict(list) - - for gene in genes: - gene_cluster = get_gene_cluster(gene) - cog_category = get_cog_category(gene) - cluster_to_categories[gene_cluster].append(cog_category) - - majority_cog_category = {} - - # Determine the majority cog category for each cluster - for cluster, categories in cluster_to_categories.items(): - category_count = Counter(categories) - majority_cog_category[cluster] = category_count.most_common(1)[0][0] # a list of tuples where each tuple is a category and its count; [0][0] extracts the category with the highest count. - #logger.info(majority_cog_category) - return majority_cog_category - - -def get_gene_annotation(genes, func, **kwargs): - gene_annot_dict = {} - for gene in genes: - gene_id = get_gene_id(gene) - func_annot = func(gene) - gene_annot_dict[gene_id] = func_annot - - return gene_annot_dict - - -def get_genes_cog_categories(genes, **kwargs): - return get_gene_annotation(genes, get_cog_category) - - -def get_genes_ko(genes, *_args): - return get_gene_annotation(genes, get_kegg_ko) - - -def get_genes_cazy(genes, *_args): - return get_gene_annotation(genes, get_cazy) - - -def get_genes_clusters(genes, **kwargs): - return get_gene_annotation(genes, get_gene_cluster) - - -def get_genes(genes, mge_id): - gene_ids = [get_gene_id(gene) for gene in genes] - return {mge_id: gene_ids} - - -def count_per_mge_cargo(islands, func): - """ - Extract and count cargo genes associated with each MGE type and return as a dictionary of Counter objects. - - Parameters: - - islands: List of MGE objects. - - func: function to extract cargo e.g. count KO terms or gene clusters - - Returns: - - Dictionary where keys are MGE types, and values are Counter objects with KEGG KO counts. 
- """ - mge_cargo_counts = { - "nested": Counter(), - "phage": Counter(), - "phage_like": Counter(), - "is_tn": Counter(), - "ce": Counter(), - "mi": Counter(), - "integron": Counter(), - "cellular": Counter(), - } - - for island in islands: - try: - cargo = extract_cargo(island) - if island.mge_type == "nested": - mge_cargo_counts["nested"].update(func(cargo)) - else: - # Get the first key (assuming there's only one key) - mge = next(iter(island.mge.keys())) - mge_cargo_counts[mge].update(func(cargo)) - except Exception as e: - raise ValueError(f"Error processing cargo for island: {e}") - - return mge_cargo_counts - - -def get_per_mge_cargo(islands, func): - mge_cargo_annot = { - "nested": {}, - "phage": {}, - "phage_like": {}, - "is_tn": {}, - "ce": {}, - "mi": {}, - "integron": {}, - "cellular": {}, - } - - for island in islands: - try: - cargo = extract_cargo(island) - mge_id = island.get_id() - if island.mge_type == "nested": - mge_cargo_annot["nested"].update(func(cargo, mge_id)) # Merges another dictionary into existing one - else: - # Get the first key (assuming there's only one key) - mge = next(iter(island.mge.keys())) - mge_cargo_annot[mge].update(func(cargo, mge_id)) - except Exception as e: - raise ValueError(f"Error processing cargo for island: {e}") - - return mge_cargo_annot - - -def get_machinery_genes_tsv(islands): - tsv_rows = [] - header = ['mge_id', 'mge', 'n_genes', 'gene_id', 'gene_cluster', 'feature_type', 'feature_info'] - tsv_rows.append('\t'.join(header)) - - for island in islands: - try: - mge_id = island.get_id() - n_genes = len(island.genes) - mge = ",".join(f"{k}:{v}" for k, v in island.mge.items()) - - recombinases = extract_mger(island) # list of [gene_id, gene_cluster, info] - conj_machinery = extract_secretion_system(island) - phage_machinery = extract_phage(island) - - for gene in recombinases: - gene_id, gene_cluster, info = gene - tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tmgeR\t{info}") - - for 
gene in conj_machinery: - gene_id, gene_cluster, info = gene - tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tsecretion_system\t{info}") - - for gene in phage_machinery: - gene_id, gene_cluster, info = gene - tsv_rows.append(f"{mge_id}\t{mge}\t{n_genes}\t{gene_id}\t{gene_cluster}\tphage\t{info}") - - except Exception as e: - raise ValueError(f"Error processing machinery for island {island}: {e}") - - tsv_output = '\n'.join(tsv_rows) - return tsv_output - -def get_cargo_genes_tsv(islands): - tsv_rows = [] - header = ['mge_id', 'mge', 'gene_ids', 'gene_clusters'] - tsv_rows.append('\t'.join(header)) - - for island in islands: - gene_ids = [] - gene_clusters = [] - try: - mge_id = island.get_id() - mge = ",".join(f"{k}:{v}" for k, v in island.mge.items()) - - cargo_genes = extract_cargo(island) - - for gene in cargo_genes: - gene_ids.append(gene.id) - gene_clusters.append(get_gene_cluster(gene)) - gene_ids = ';'.join(gene_ids) - gene_clusters = ';'.join(gene_clusters) - tsv_rows.append(f"{mge_id}\t{mge}\t{gene_ids}\t{gene_clusters}") - - except Exception as e: - raise ValueError(f"Error processing cargo for island {island}: {e}") - - tsv_output = '\n'.join(tsv_rows) - return tsv_output - -# Counting works with aggregation since the objects are small. Getting only works with batch saving, since the output is huge. -def count_cargo_gene_clusters(islands): - return count_per_mge_cargo(islands, count_gene_clusters) - - -def get_cargo_species_gene_clusters(islands): - return get_per_mge_cargo(islands, func=get_genes_clusters) - - -# Output: for each mge_type output a dictionary with geneID: COG. geneID supposed to be unique -> overwriting is okay. 
-def get_cargo_genes_cog(islands): - return get_per_mge_cargo(islands, func=get_genes_cog_categories) - - -def get_cargo_genes_ko(islands): - return get_per_mge_cargo(islands, func=get_genes_ko) - - -def get_cargo_genes_cazy(islands): - return get_per_mge_cargo(islands, func=get_genes_cazy) - - -# Output: for each mge_type output a dictionary mge_id: list(cargo_ids) -def get_cargo_genes(islands): - return get_per_mge_cargo(islands, func=get_genes) - - diff --git a/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py b/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py deleted file mode 100644 index 13829bd..0000000 --- a/mgexpose/.ipynb_checkpoints/gffio-checkpoint.py +++ /dev/null @@ -1,67 +0,0 @@ -from .gene import Gene -from .islands import GenomicIsland, MgeGenomicIsland - -from .base_logger import logger - -def read_genomic_islands_gff(fn): - with open(fn, "rt", encoding="UTF-8") as _in: - island = None - for line in _in: - line = line.strip() - if line and line[0] != "#": - cols = line.split("\t") - if cols[2] == "region": - if island is not None: - yield island - island = GenomicIsland.from_gff(*cols) - elif cols[2] == "gene": - gene = Gene.from_gff(*cols) - if island is not None: - island.genes.add(gene) - else: - raise ValueError("Found gene but no island.") - if island is not None: - yield island - -def read_mge_genomic_islands_gff(fn, relevant_ids=None): - """ - Generator function to read and parse MGEs and associated genes from a GFF file. - - Parameters: - - fn: Path to the GFF file. - - relevant_ids: Optional set of relevant MGE IDs to filter. If None, all MGEs are processed. - - Yields: - - MgeGenomicIsland objects that match the relevant IDs or all if None. 
- """ - with open(fn, "rt", encoding="UTF-8") as _in: - island = None - for line in _in: - line = line.strip() - if line and line[0] != "#": - cols = line.split("\t") - attributes = {kv.split('=')[0]: kv.split('=')[1] for kv in cols[8].split(';') if '=' in kv} - - if cols[2] == "mobile_genetic_element": - mge_id = attributes.get("ID") - - if relevant_ids is None or mge_id in relevant_ids: - if island is not None: - yield island - island = MgeGenomicIsland.from_gff(*cols) - - elif cols[2] == "gene": - parent_id = attributes.get("Parent") - - if island is not None: - if relevant_ids is None or parent_id in relevant_ids: - gene = Gene.from_gff(*cols) - island.genes.add(gene) - else: - continue - else: - # This situation should not happen unless the GFF is malformed - raise ValueError("Found gene with no preceding island.") - - if island is not None: - yield island diff --git a/mgexpose/.ipynb_checkpoints/islands-checkpoint.py b/mgexpose/.ipynb_checkpoints/islands-checkpoint.py deleted file mode 100644 index 0304ea1..0000000 --- a/mgexpose/.ipynb_checkpoints/islands-checkpoint.py +++ /dev/null @@ -1,708 +0,0 @@ -# pylint: disable=C0116,C0301,R0902,R0916 -""" -Data Structures Module - -This module is designed to simplify the handling of different genomic sequences, -including but not limited to: - -- Genomic Island -- Annotated Genomic Island -- MGE Genomic Island -- Gene - -The end product of the pipeline is MGE Genomic Island, an Annotated Genomic Island -consisting of Genes. -It can be saved in a tsv or gff3 format together with its attributes and gene annotations. -The MGE type of each MGE Genomic Island is defined by applying MGE Rule. 
-""" -import itertools as it -import logging -import sys -import re - -from collections import Counter -from dataclasses import dataclass, field - -from .gene import Gene -from .recombinases import MgeRule, MGE_ALIASES - -logger = logging.getLogger(__name__) - - -@dataclass -class GenomicIsland: - '''The following class describes a generic genomic region - with one or more identified recombinases. - This region is then referred as Recombinase Island. - The Genomic Island is represented by contig, start and end - coordinates, set of genes, some of which are recombinases and MGE machinery. - Importantly, the set of genes does not include the non-coding regions. - ''' - RAW_TABLE_HEADER = ( - "specI", - "genome_accession", - "panG", - "contig", - "start", - "end", - "gene_list", - ) - - speci: str = None - genome: str = None - is_core: bool = None - contig: str = None - start: int = None - end: int = None - name: str = "ISLAND" - - genes: set = field(default_factory=set) - # recombinases: list = field(default_factory=list) - recombinases: Counter = field(default_factory=Counter) - - @staticmethod - def get_fieldnames(): - """ Returns column headers for island table. """ - return ( - "first_recombinase_gene", - "first_recombinase", - "island_size", - "genome", - "specI", - "core_acc", - "contig", - "first_gene_start", - "last_gene_end", - "protid_gene_clusters", - ) - - @classmethod - def from_region_string(cls, region): - """ Creates island from a predefined region string. """ - _, _, contig, start_end, *_ = region.strip().split(".") - contig = contig.split(".")[-1] - start, end = map(int, start_end.split("-")) - return cls(None, None, None, contig, start, end, region.strip()) - - @classmethod - def from_gene(cls, gene): - """ Creates island from starting gene. """ - island = cls( - gene.speci, - gene.genome, - gene.is_core, - gene.contig, - gene.start, - gene.end, - ) - island.add_gene(gene) - return island - - def __len__(self): - """ Calculates island length. 
""" - if self.start is None or self.end is None: - return 0 - return abs(self.end - self.start) + 1 - - def __str__(self): - """ String representation. """ - genes = ( - f"{gene.id}.{gene.cluster}" - for gene in sorted( - self.genes, key=lambda g: (g.start, g.end, g.strand) - ) - ) - - return "\t".join( - [ - f"{v}" if (k != "is_core" or v is None) else Gene.rtype(self.is_core) - for k, v in self.__dict__.items() - if k not in ("genes", "recombinases") - ] + [",".join(genes)] - ) - - def add_gene(self, gene): - """ Adds a gene to the island. """ - if gene not in self.genes: - self.end = max(self.end, gene.end) - if gene.recombinase is not None: - # self.recombinases.append( - # (f"{gene.id}.{gene.cluster}", gene.recombinase) - # ) - self.recombinases[gene.recombinase] += 1 - self.genes.add(gene) - - def get_position(self): - """ Return genomic position tuple. """ - return (self.contig, self.start, self.end) - - def get_recombinases(self): - for g in sorted(self.genes, key=lambda x: x.start): - if g.recombinase: - yield f"{g.id}.{g.cluster}", g.recombinase - - def dump(self, seen_islands, raw_outstream=None, outstream=sys.stdout): - """ Writes island to outstream. """ - if raw_outstream: - print(self, file=raw_outstream) - pos = self.get_position() - if pos not in seen_islands and self.recombinases: - seen_islands.add(pos) - - print( - # *self.recombinases[0], - *tuple(self.get_recombinases())[0], - len(self), - str(self), - sep="\t", - file=outstream, - ) - - def get_id(self): - return f"GIL_{self.genome}_{self.contig}:{self.start}-{self.end}" - - @classmethod - def from_gff(cls, *cols): - try: - attribs = dict(item.split("=") for item in cols[-1].split(";")) - except: - raise ValueError(f"not enough cols? {cols}") - - try: - recombinases = Counter( - dict( - item.split(":") - for item in attribs["recombinases"].split(",") - ) - ) - except: - raise ValueError(f"recombinase string weird? 
{attribs['recombinases'].split(',')}") - - return cls( - attribs["specI"], - attribs["genome"], - attribs["genome_type"] == "COR", - cols[0], # contig - int(cols[3]), # start - int(cols[4]), # end - recombinases=recombinases, - genes=set(), - ) - - def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False, - intermediate_dump=False): - island_id = self.get_id() - attribs = { - "ID": island_id, - "genome": self.genome, - "genome_type": Gene.rtype(self.is_core), - "size": len(self), - "n_genes": len(self.genes), - # "mgeR": ",".join(sorted(r for _, r in self.recombinases)), - # "mgeR": ",".join(sorted(self.recombinases)), - "recombinases": ( - ",".join( - f"{k}:{v}" - for k, v in sorted(self.recombinases.items()) - ) - if self.recombinases else "" - ), - "specI": self.speci, #TODO: does it work? - } - if self.name: - attribs["name"] = self.name - attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1]) - # Format the source column - if source_db: - source = f"proMGE_{source_db}" - else: - source = "proMGE" - print( - self.contig, - source, - "region", - self.start, - self.end, - len(self), # Score field - ".", # Strand - ".", # Phase - attrib_str, - sep="\t", - file=gff_outstream - ) - - if write_genes: - # GFF3 child term: genes - for gene in sorted(self.genes, key=lambda g: g.id): - gene.to_gff( - gff_outstream, - genomic_island_id=island_id, - add_functional_annotation=add_functional_annotation, - intermediate_dump=intermediate_dump, - ) - - -@dataclass -class AnnotatedGenomicIsland(GenomicIsland): - '''The following class extends generic Genomic Island with MGE machinery annotations.''' - - TABLE_HEADERS = ( - "contig", - "start", - "end", - "island_size", - "prot_count", - "mgeR_count", - "Plasmid_PA", - "phage_count", - "all_conj_count", - "CONJ_T4SS", - "SS_present_mandatoryG", - "entire_ss", - "mgeR", - ) - - phage_count: int = 0 - conj_count: int = 0 - conj_man_count: int = 0 - valid_entire: 
bool = False - valid_mandatory: bool = False - valid_accessory: bool = False - - def __post_init__(self): - """ Apply annotations. """ - secretion_systems = {} - cm_counts = Counter() - - # self.recombinases = [r for _, r in self.recombinases] - - for gene in self.genes: - self.phage_count += gene.phage is not None - if gene.secretion_system is not None: - self.conj_count += 1 - self.conj_man_count += ( - gene.secretion_system.upper().startswith("CONJ") or - gene.secretion_system.upper().startswith("T4SS") - ) - if gene.secretion_rule is not None: - cm_counts[(gene.secretion_system, False)] += 1 - cm_counts[(gene.secretion_system, True)] += 1 - - secretion_systems[gene.secretion_system] = gene.secretion_rule - - for system, rule in secretion_systems.items(): - self.valid_mandatory |= (cm_counts[(system, True)] >= rule["mandatory"] / 2) - self.valid_accessory |= (cm_counts[(system, False)] >= rule["accessory"] / 2) - self.valid_entire |= ( - cm_counts[(system, True)] == rule["mandatory"] and - cm_counts[(system, False)] == rule["accessory"] - ) - - def __str__(self): - """ String representation. """ - return "\t".join( - map( - str, ( - self.contig, - self.start, - self.end, - len(self), - len(self.genes), - # len(self.recombinases), - sum(self.recombinases.values()), - 0, # is still necessary? - self.phage_count, - self.conj_count, - self.conj_man_count, - self.valid_mandatory, - self.valid_entire, - ",".join(self.recombinases), - ) - ) - ) - - @classmethod - def from_genomic_island(cls, g_island): - """ Construct annotated island from genomic island. """ - return cls( - **g_island.__dict__, - ) - - -@dataclass -class MgeGenomicIsland(AnnotatedGenomicIsland): - '''The following class describes Anotated Genomic Islands with their assigned MGE type. - Those are Mobile Genetic Elements (MGEs). - The class attributes are used to describe the MGE properties. 
- It also contains functionality to save the MGEs in gff3 or tsv formats.''' - - TABLE_HEADERS = ( - "tn", - "phage", - "phage_like", - "ce", - "integron", - "mi", - "nmi", - "nov", - "cellular", - "contig", - "start", - "end", - "size", - "n_genes", - "phage_count", - "conj_man_count", - "recombinases", - ) - - integron: int = 0 - cellular: int = 0 - phage: int = 0 - c_mi: int = 0 - nov: int = 0 - c_pli: int = 0 - c_ce: int = 0 - c_nmi: int = 0 - c_tn: int = 0 - - tn3_found: bool = False - ser_found: bool = False - - mge: Counter = field(default_factory=Counter) - mge_type: str = None - size: int = 0 - n_genes: int = 0 - - def __post_init__(self): - """ Apply annotations. """ - recombinases = (",".join(r for _, r in self.get_recombinases())).lower() - for name, alias in MGE_ALIASES.items(): - recombinases = recombinases.replace(name, alias) - - self.tn3_found = "tn3" in recombinases - self.ser_found = "c2_n1ser" in recombinases or "ser_ce" in recombinases - - # integron - self.integron = int("integron" in recombinases) - # tag recombinase island with more than 3 recombinases - # self.c_nmi = int(len(self.recombinases) > 3) - self.c_nmi = sum(self.recombinases.values()) - - # self.recombinases = recombinases.split(",") if recombinases else [] - self.recombinases = Counter(recombinases.split(",")) - - def __str__(self): - """ String representation. """ - return "\t".join( - tuple(map(str, self.get_mge_metrics())) + - ( - self.contig, - f"{self.start}", - f"{self.end}", - f"{len(self)}", - f"{len(self.genes)}", - f"{self.phage_count}", - f"{self.conj_man_count}", - # ",".join(self.recombinases), - ",".join( - f"{k}:{v}" - for k, v in sorted(self.recombinases.items()) - ) - if self.recombinases else "", - self.name, - ) - ) - - @staticmethod - def parse_mge_id(mge_id): - """ - Generalized parser for MGE IDs. 
- - Returns: - genome_id (str): parsed bin or genome identifier - contig (str): contig name (usually something like k141_32063) - start (int): start coordinate - end (int): end coordinate - """ - try: - # Extract coordinates - coord_match = re.search(r":(\d+)-(\d+)$", mge_id) - if not coord_match: - raise ValueError("No coordinates found in MGE ID.") - start, end = map(int, coord_match.groups()) - - # Remove leading MGE_ and SPIRE_ (if present) - cleaned = mge_id - if cleaned.startswith("MGE_"): - cleaned = cleaned[4:] - if cleaned.startswith("SPIRE_"): - cleaned = cleaned[6:] - - # Strip coordinates - core = cleaned.split(':')[0] - - # Assembly-style pattern (e.g., GCA_019800745.1) - if re.match(r"GCA_[\d.]+", core): - genome_id = core.split('_')[0] + '_' + core.split('_')[1] - contig = core.split('.')[-1] - return genome_id, contig, start, end - - # Bin-style pattern: extract contig and genome_id - kmer_match = re.search(r"(_k\d+_\d+)$", core) - if kmer_match: - contig = kmer_match.group(1)[1:] # remove leading underscore - genome_id = core[: -len(kmer_match.group(1))] # remove contig - return genome_id, contig, start, end - - # Underscore split pattern e.g. MGE_NT12001_NC_000913.3:5234-20508 - underscore_split = core.split('_', 1) - if len(underscore_split) == 2 and '.' in underscore_split[1]: - genome_id, contig = underscore_split - return genome_id, contig, start, end - - # Fallback for unknown formats - raise ValueError(f"Unrecognized MGE ID format: {mge_id}") - - except Exception as e: - raise ValueError(f"Failed to parse MGE ID '{mge_id}': {e}") - - - def get_mge_metrics(self): - """ Cast mge metrics to int. 
""" - return tuple( - map( - int, - ( - self.c_tn, - self.phage, - self.c_pli, - self.c_ce, - self.integron, - self.c_mi, - self.cellular, - ) - ) - ) - - def get_annotated_mge_metrics(self): - metrics = list(self.get_mge_metrics()) # Get mge_type and counts - mge_metrics = [ - (k, v) - for k, v in zip( - ("is_tn", "phage", "phage_like", "ce", "integron", "mi", "cellular",), - metrics - ) - if v # Collect as long as metrics are not None - ] - return mge_metrics - - @staticmethod - def is_nested(annotated_mge_metrics): - n_mges = sum(v for _, v in annotated_mge_metrics) - if not n_mges: - raise UserWarning("No MGEs were assigned to recombinase island") - # Solitary or nested MGE? - return n_mges > 1 - - @staticmethod - def mge_num_island_type(is_nested): - """ Returns nested vs solitary MGE-tag. """ - return ("non-nested", "nested")[is_nested] - - def has_annotation(self): - """ (Sanity) Check if island has any mge annotation. """ - return sum(( - self.c_tn, - self.phage, - self.c_pli, - self.c_ce, - self.integron, - self.c_mi, - self.cellular, - )) > 0 - - def evaluate_recombinases(self, rules, outstream=None, outstream2=None): - """ Annotate recombinases. 
""" - patch_c_tn = False - - recombinases = it.chain(*it.chain((r,) * c for r, c in self.recombinases.items())) - - for rec in recombinases: - rule = rules.get(rec) - if rule is None: - print(f"WARNING: cannot find mge-rule for `{rec}`") - rule = MgeRule() - - # cellular:Arch1/Cyan/Xer/Cand - self.cellular |= rule.cellular - - self.c_tn = rule.c_tn_check(self) - patch_c_tn |= rule.patch_c_tn_check(self) - - if self.phage_count >= 2 and self.conj_man_count < 1: - self.phage, self.c_mi, self.nov = rule.phage_check(self) - elif self.phage_count < 2 and self.conj_man_count < 1: - self.c_pli, self.c_mi = rule.phage_like_check( - self, - "brujita" in rec - ) - elif self.phage_count < 2 and self.conj_man_count >= 1: - self.c_ce, self.nov = rule.conjug_element_check(self) - elif self.phage_count >= 2 and self.conj_man_count >= 1: - self.phage, self.c_mi, self.nov = rule.mobility_island_check(self) - - # counting multiple tn in Tn3 containing recombinase island - # self.c_tn += (len(self.recombinases) > 2) * (self.tn3_found or self.ser_found) - self.c_tn += (sum(self.recombinases.values()) > 2) * (self.tn3_found or self.ser_found) - if not self.has_annotation(): - if not patch_c_tn: - print(f"WARNING: No annotation found, but cannot patch either.\n{self}") - self.c_tn = patch_c_tn - - if outstream: - print(self, sep="\t", file=outstream, ) - - # previous step in some cases generates overlap between Phage/Phage_like and Mobility island - # this step specifically resolves such instances based on recombinase presence and presence/ - # absence of phage structural genes/conjugation machinery genes in the neighbourhood - if self.c_mi and self.c_pli: - self.c_mi = int( - any( - pat in ",".join(self.recombinases).lower() - for pat in ('relaxase', 'rep_', 'mob_', 'trwc') - ) - ) - self.c_pli = int(not self.c_mi) - - if self.phage and self.c_mi and self.phage_count >= 2: - self.phage, self.c_mi = True, False - - if outstream2: - print(self, sep="\t", file=outstream2, ) - - 
@classmethod - def from_annotated_genomic_island(cls, ag_island): - """ Construct from annotated genomic island. """ - island = cls( - **ag_island.__dict__ - ) - return island - - def get_id(self): - return f"MGE_{self.genome}_{self.contig}:{self.start}-{self.end}" - - - def to_gff(self, gff_outstream, source_db, write_genes=False, add_functional_annotation=False): - island_id = self.get_id() - mge_metrics = self.get_annotated_mge_metrics() - attribs = { - "ID": island_id, - "mge": ",".join(f"{k}:{v}" for k, v in mge_metrics), # Count each mge type - "genome_type": Gene.rtype(self.is_core), - "mge_type": self.mge_num_island_type(self.is_nested(mge_metrics)), - "size": len(self), - "n_genes": len(self.genes), - "mgeR": ( - ",".join( - f"{k}:{v}" - # for k, v in sorted(Counter(self.recombinases).items()) - for k, v in sorted(self.recombinases.items()) - ) - if self.recombinases else "" - ), - } - if self.name: - attribs["name"] = self.name - attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1]) - # Format the source column - if source_db: - source = f"proMGE_{source_db}" - else: - source = "proMGE" - print( - self.contig, - source, - "mobile_genetic_element", - self.start, - self.end, - len(self), # Score field - ".", # Strand - ".", # Phase - attrib_str, - sep="\t", - file=gff_outstream - ) - - if write_genes: - # GFF3 child term: genes - for gene in sorted(self.genes, key=lambda g: g.id): - gene.to_gff( - gff_outstream, - genomic_island_id=island_id, - add_functional_annotation=add_functional_annotation, - ) - - @classmethod - def from_gff(cls, *cols): - try: - attribs = dict(item.split("=") for item in cols[-1].split(";")) - except: - raise ValueError(f"not enough cols? {cols}") - - try: - recombinases = Counter( - dict((key, int(value)) for key, value in - (item.split(":") - for item in attribs["mgeR"].split(",")) - ) - ) - except: - raise ValueError(f"recombinase string weird? 
{attribs['mgeR'].split(',')}") - - try: - mges = Counter( - dict((key, int(value)) for key, value in - (item.split(":") - for item in attribs["mge"].split(",")) - ) - ) - except: - raise ValueError(f"mge string weird? {attribs['mge'].split(',')}") - - genome_id, contig, start, end = cls.parse_mge_id(attribs["ID"]) - - return cls( - "", # TODO: where to get/ how to handle specI - genome_id, - attribs["genome_type"] == "COR", - cols[0], # contig - int(cols[3]), # start - int(cols[4]), # end - recombinases=recombinases, - mge=mges, - mge_type=attribs["mge_type"], - size=int(attribs["size"]), - n_genes=int(attribs["n_genes"]), - genes=set(), - ) - - def to_tsv(self, outstream): - metrics = list(self.get_mge_metrics()) - print( - *metrics, - self.contig, - self.start, - self.end, - len(self), # size - len(self.genes), # n_genes - ",".join( - f"{k}:{v}" - # for k, v in sorted(Counter(self.recombinases).items()) - for k, v in sorted(self.recombinases.items()) - ) if self.recombinases else "", - (self.name if self.name else ""), - ",".join(gene.id for gene in sorted(self.genes, key=lambda g: g.id)), # gene_list - sep="\t", - file=outstream, - ) diff --git a/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py b/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py deleted file mode 100644 index da886df..0000000 --- a/mgexpose/.ipynb_checkpoints/query_db-checkpoint.py +++ /dev/null @@ -1,280 +0,0 @@ -import psycopg2 -import pandas as pd -import json -import sys - -genome2speci_pg3_file = '/home/grekova/workspace/promge_website/data/genome_id2speci.tsv' - -df = pd.read_csv(genome2speci_pg3_file, sep="\t") - -genome2speci_ids = dict(zip(df["sample_name"], df["cluster_name"])) - -def get_gtdb_id(identifier): - """Extract the GTDB ID from a genome identifier.""" - parts = identifier.split('_') - return f"{parts[0]}_{parts[1]}" # GCA_xxxxxx.x - -def connect(params_dic): - """ Connect to the PostgreSQL database server """ - conn = None - try: - # connect to the PostgreSQL server - 
print('Connecting to the PostgreSQL database...') - conn = psycopg2.connect(**params_dic) - except (Exception, psycopg2.DatabaseError) as error: - print(error) - sys.exit(1) - print("Connection created successfully") - return conn - -def parse_mge_id(id): - #'GCA_009102765.1_371601.SAMN11944272.WDCH01000111:267-2860' - mge_dict = {'gtdb_id': '', - 'contig' : '', - 'start' : '', - 'end' : '' - } - id = id.replace(':', '_').split('_') - mge_dict['gtdb_id'] = (id[0] + '_' + id[1]) #GCA_xxxxxx.x - mge_dict['contig'] = (id[2]) - coordinates = [int(c) for c in id[3].split('-')] - mge_dict['start'] = (coordinates[0]) - mge_dict['end'] = (coordinates[1]) - return mge_dict - -def query_mge_annotations(conn): - ''' Query the database to get all levels of taxonomy, mge_type and recombinases from the mge table. - Args: - In: conn: psycopg connection - Out: result: df mge_id recombinase tax_domain tax_phylum tax_class tax_order tax_family tax_genus tax_species - - ''' - cursor = conn.cursor() - levels = ["clusters.{level}".format(level=level) for level in ['tax_domain', 'tax_phylum', 'tax_class', - 'tax_order', 'tax_family', 'tax_genus', - 'tax_species']] - levels_str = ', '.join(levels) - query = """ - SELECT contig || ':' || start_pos || '-' || end_pos AS contig_pos, - {levels_str} - FROM clusters AS clusters, pg3.mge AS mge - WHERE clusters.id = mge.cluster_id; - """.format(levels_str=levels_str) - cursor.execute(query) - - result = cursor.fetchall() - cursor.close() - columns = ['contig_pos'] - columns.extend([l.replace("clusters.", "") for l in levels]) - result = pd.DataFrame(result, columns=columns) - return result - -def get_taxa(speci_lst, cursor, level=None): - ''' Query the database to get taxonomy information - Args: - In: speci_lst (list): List of species names - cursor: psycopg cursor object - level (str, optional): Specific taxonomic level to query. If None, fetch all levels. 
- Out: result: (DataFrame) containing taxonomy information - ''' - levels = { - "tax_domain", "tax_phylum", "tax_class", "tax_order", "tax_family", "tax_genus", "tax_species" - } - - if level and level not in levels: - raise ValueError(f"Invalid level: {level}. Choose from {levels} or None for full taxonomy.") - - levels_str = f"clusters.{level}" if level else ', '.join(f"clusters.{lvl}" for lvl in levels) - - specI_str = ', '.join(['%s'] * len(speci_lst)) - - query = f""" - SELECT cluster_name, {levels_str} - FROM clusters AS clusters - WHERE clusters.cluster_name IN ({specI_str}); - """ - - cursor.execute(query, tuple(speci_lst)) - result = cursor.fetchall() - - columns = ['cluster_name'] + ([level] if level else list(levels)) - - return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) - -def get_gtdb_taxa(sample_ids, db, cursor, level=None): - levels = { - "d", "p", "c", "o", "f", "g", "s" - } - if level and level not in levels: - raise ValueError(f"Invalid level: {level}. Choose from {levels} or None for full taxonomy.") - - if db == "pg3": - tax_table = "pg3.gtdb_r220" - sample_table = "pg3.samples" - assembly = "genome_id" - levels_str = f"t.{level}" if level else ', '.join(f"t.{lvl}" for lvl in levels) - elif db == "spire": - tax_table = "gtdb_r220" - sample_table = "bins" - assembly = "bin_id" - levels_str = f"{tax_table}.{level}" if level else ', '.join(f"{tax_table}.{lvl}" for lvl in levels) - else: - raise ValueError(f"Invalid db specification: {db}. 
pg3 or spire are allowed.") - - - sample_ids_str = ', '.join(['%s'] * len(sample_ids)) - - if db == "pg3": - query = f""" - SELECT sample_name, {levels_str} - FROM {sample_table} AS s, {tax_table} AS t - WHERE (s.sample_name IN ({sample_ids_str})) AND (s.id = t.sample_id); - """ - elif db == "spire": - query = f""" - SELECT bin_name, {levels_str} - FROM {sample_table} AS {sample_table}, {tax_table} AS {tax_table} - WHERE ({sample_table}.bin_name IN ({sample_ids_str})) AND ({sample_table}.id = {tax_table}.bin_id); - """ - else: - raise ValueError(f"Invalid db specification: {db}. pg3 or spire are allowed.") - - cursor.execute(query, tuple(sample_ids)) - result = cursor.fetchall() - - columns = [assembly] + ([level] if level else list(levels)) - - return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) - - -def annotate_clustering_df(clustered_df, conn, level="tax_species"): - ''' Query taxonomy information and update DataFrame ''' - - print("Clustering df, nrows:", len(clustered_df)) - - genome2speci = {id: genome2speci_ids[get_gtdb_id(id)] for id in clustered_df.member_seq_100 if 'GCA_' in id} - clustered_df['speci'] = clustered_df['member_seq_100'].map(genome2speci) - specIs = list(set(genome2speci.values())) - print("# specI:", len(specIs)) - - cursor = conn.cursor() - - if level == "full": - result_df = get_taxa(specIs, cursor) - else: - result_df = get_taxa(specIs, cursor, level) - - print("Merging taxonomy") - clustered_df = clustered_df.merge(result_df, how="inner", left_on="speci", right_on="cluster_name") - - cursor.close() - return clustered_df - - -def get_speci_taxonomy_df(speci_lst, conn, level="tax_species"): - ''' For each specI cluster get taxonomy information and return as taxa_df dataframe''' - - specIs = list(set(speci_lst)) # Ensure that it is a list - print("# specI:", len(speci_lst)) - - cursor = conn.cursor() - - if level == "full": - taxa_df = get_taxa(speci_lst, cursor) - else: - taxa_df = 
get_taxa(speci_lst, cursor, level) - - cursor.close() - return taxa_df - - -def get_gtdb_taxonomy_df(sample_ids, db, conn, level="tax_species"): - ''' For each sample_id (bin_id or genome_id) get gtdb taxonomy information and return as taxa_df dataframe''' - - sample_ids= list(set(sample_ids)) # Ensure that it is a list - print("# samples_ids:", len(sample_ids)) - - cursor = conn.cursor() - - if level == "full": - taxa_df = get_gtdb_taxa(sample_ids, db, cursor) - else: - taxa_df = get_gtdb_taxa(sample_ids, db, cursor, level) - - cursor.close() - return taxa_df - - -def get_microontology(sample_names, conn): - ''' - Query the database to get microontology information. - - Args: - sample_names (list): List of sample names - cursor (psycopg cursor): Active DB cursor - - Returns: - pd.DataFrame: DataFrame with sample_name, sample_id, term_id, term, term_array - ''' - - if len(sample_names) == 0: - return pd.DataFrame(columns=["sample_name", "study_id", "sample_id", "term"]) - cursor = conn.cursor() - - samples_str = ', '.join(['%s'] * len(sample_names)) - - query = f""" - SELECT - s.sample_name, - s.study_id, - mv.sample_id, - mt.term - FROM samples s - JOIN microntology_v3 mv ON s.id = mv.sample_id - JOIN LATERAL unnest(mv.microntology_terms) AS term_id ON TRUE - JOIN microntology_terms mt ON mt.id = term_id - WHERE s.sample_name IN ({samples_str}); - """ - - cursor.execute(query, tuple(sample_names)) - result = cursor.fetchall() - cursor.close() - columns = ["sample_name", "study_id", "sample_id", "term"] - - return pd.DataFrame(result, columns=columns) if result else pd.DataFrame(columns=columns) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/mgexpose/__main__.py b/mgexpose/__main__.py deleted file mode 100755 index 48c6402..0000000 --- a/mgexpose/__main__.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python - -# pylint: disable=R0912,R0914,R0915,R0913,R0917 - -""" Mobile genetic element annotation """ - -import contextlib 
-import gzip -import logging -import os -import pathlib - -from .gene_annotator import GeneAnnotator -from .handle_args import handle_args -from .island_processing import ( - generate_island_set, - annotate_islands, - evaluate_islands, - prepare_precomputed_islands -) -from .islands import MgeGenomicIsland -from .readers import read_fasta, read_prodigal_gff, read_mge_rules -from .gffio import read_genomic_islands_gff, read_mge_genomic_islands_gff - -MGE_TABLE_HEADERS = \ - ("is_tn",) + \ - MgeGenomicIsland.TABLE_HEADERS[1:6] + \ - MgeGenomicIsland.TABLE_HEADERS[8:14] + \ - ("mgeR", "name", "genes",) - -logging.basicConfig( - level=logging.INFO, - format='[%(asctime)s] %(message)s' -) - -logger = logging.getLogger(__name__) - - -def process_islands(genes, genome_id, single_island=None, island_file=None, output_dir=None,): - """ helper function to declutter main() """ - precomputed_islands = prepare_precomputed_islands( - single_island=single_island, - island_file=island_file, - genome_id=genome_id, - ) - - if output_dir: - pang_calls_out = open( - os.path.join( - output_dir, - f"{genome_id}.pan_genome_calls.txt"), - "wt", - encoding="UTF-8", - ) - - islands_out = open( - os.path.join( - output_dir, - f"{genome_id}.pan_genome_islands.txt", - ), - "wt", - encoding="UTF-8", - ) - - raw_islands_out = open( - os.path.join( - output_dir, - "..", # temporary! 
this is only until i know if this is final output or not - f"{genome_id}.pan_genome_islands_raw.txt", - ), - "wt", - encoding="UTF-8", - ) - else: - pang_calls_out, islands_out, raw_islands_out = [contextlib.nullcontext() for _ in range(3)] - - with pang_calls_out, islands_out, raw_islands_out: - yield from generate_island_set( - genes, - pang_calls_out=pang_calls_out, - raw_islands_out=raw_islands_out, - islands_out=islands_out, - precomputed_islands=precomputed_islands, - ) - - -def dump_islands(islands, out_prefix, db, write_genes=False, add_functional_annotation=False): - """ dump genomic islands to intermediate gff """ - with open( - f"{out_prefix}.unannotated_islands.gff3", - "wt", encoding="UTF-8" - ) as _out: - print("##gff-version 3", file=_out) - for island in sorted(islands, key=lambda isl: isl.contig): - island.to_gff( - _out, db, write_genes=write_genes, - add_functional_annotation=add_functional_annotation, - intermediate_dump=True, - ) - - -def identify_recombinase_islands(islands, genome_id, mge_rules, output_dir=None): - """Identify MGE-islands according to a set of rules - using various signals annotated in the corresponding gene set. 
""" - if output_dir: - step1_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step1.txt", - ), - "wt", - encoding="UTF-8", - ) - - step2_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step2.txt", - ), - "wt", - encoding="UTF-8", - ) - - step3_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step3.txt", - ), - "wt", - encoding="UTF-8", - ) - - else: - step1_out, step2_out, step3_out = [contextlib.nullcontext() for _ in range(3)] - - with step1_out: - annotated_islands = list(annotate_islands(islands, outstream=step1_out)) - with step2_out, step3_out: - return list( - evaluate_islands( - annotated_islands, - read_mge_rules(mge_rules), - outstream=step2_out, - outstream2=step3_out - ) - ) - - -def write_final_results( - recombinase_islands, - output_dir, - genome_id, - output_suffix, - dbformat=None, - write_tsv=True, - write_gff=True, - write_genes_to_gff=True, - add_functional_annotation=False, - genome_seqs=None, -): - """ write final results """ - - outstream = contextlib.nullcontext() - gff_outstream = contextlib.nullcontext() - - out_prefix = os.path.join( - output_dir, - f"{genome_id}.{output_suffix}" - ) - - if write_tsv: - outstream = open( - f"{out_prefix}.txt", - "wt", - encoding="UTF-8", - ) - if write_gff: - gff_outstream = open( - f"{out_prefix}.gff3", - "wt", - encoding="UTF-8", - ) - - # Sort the list of MGEGenomicIslands based on contig names - sorted_islands = sorted(recombinase_islands, key=lambda isl: isl.contig) - islands_by_contig = {} - - with outstream, gff_outstream: - # TSV header - if write_tsv: - print(*MGE_TABLE_HEADERS, sep="\t", file=outstream) - # GFF3 header - if write_gff: - print("##gff-version 3", file=gff_outstream) - - # Start recording the outputs - for island in sorted_islands: - islands_by_contig.setdefault(island.contig, []).append(island) - # TSV: ignore gene-wise annotations; each line is recombinase island, - # all gene IDs are stored in a gene_list column - # 
assert genome_id == island.genome - if write_tsv: - island.to_tsv(outstream) - # GFF3: add individual genes annotation; - # parent lines are recombinase islands, children lines are genes - # GFF3 parent term: recombinase island - if write_gff: - island.to_gff( - gff_outstream, - source_db=dbformat, - write_genes=write_genes_to_gff, - add_functional_annotation=add_functional_annotation, - ) - - if genome_seqs is not None: - with gzip.open( - f"{out_prefix}.ffn.gz", - "wt", - ) as _out: - for header, seq in read_fasta(genome_seqs): - seqid, *_ = header.split(" ") - for island in islands_by_contig.get(seqid, []): - attribs = island.get_attribs() - try: - del attribs["ID"] - except KeyError: - pass - try: - del attribs["name"] - except KeyError: - pass - attrib_str = ";".join(f"{item[0]}={item[1]}" for item in attribs.items() if item[1]) - print( - f">{island.get_id()} {attrib_str}", seq[island.start - 1: island.end], sep="\n", file=_out - ) - - - - -def denovo_annotation(args, debug_dir=None): - """ denovo annotation """ - annotator = GeneAnnotator( - args.genome_id, - args.speci, - read_prodigal_gff(args.prodigal_gff), - include_genome_id=args.include_genome_id, - has_batch_data=args.allow_batch_data, - dbformat=args.dbformat, - ) - - annotated_genes = annotator.annotate_genes( - args.recombinase_hits, - ( - args.phage_eggnog_data, - args.phage_filter_terms, - ), - ( - args.txs_macsy_report, - args.txs_macsy_rules, - # args.macsy_version, - ), - clusters=args.cluster_data, - use_y_clusters=args.use_y_clusters, - core_threshold=(args.core_threshold, -1)[args.precomputed_core_genes], - output_dir=args.output_dir, - pyhmmer=args.pyhmmer_input, - ) - - out_prefix = os.path.join(args.output_dir, args.genome_id) - - genomic_islands = list( - process_islands( - annotated_genes, - args.genome_id, - single_island=args.single_island, - island_file=args.precomputed_islands, - output_dir=debug_dir, - ) - ) - - if args.dump_genomic_islands or args.skip_island_identification: - - 
dump_islands( - genomic_islands, - out_prefix, - args.dbformat, - write_genes=True, - add_functional_annotation=args.add_functional_annotation, - ) - - # - # test_islands = list(read_genomic_islands_gff(f"{out_prefix}.unannotated_islands.gff3")) - # dump_islands( - # test_islands, - # out_prefix + ".test", - # args.dbformat, - # write_genes=True, - # add_functional_annotation=args.add_functional_annotation, - # ) - - with open( - os.path.join(args.output_dir, f"{args.genome_id}.gene_info.txt"), - "wt", - encoding="UTF-8", - ) as _out: - annotator.dump_genes(_out) - - return genomic_islands - - -def main(): - """ main """ - - args = handle_args() - logger.info("ARGS: %s", str(args)) - - debug_dir = None - cdir = args.output_dir - if args.dump_intermediate_steps: - cdir = debug_dir = os.path.join(args.output_dir, "debug") - pathlib.Path(cdir).mkdir(exist_ok=True, parents=True) - - genomic_islands = None - if args.command == "denovo": - genomic_islands = denovo_annotation(args, debug_dir=debug_dir) - - elif args.command == "annotate": - raise NotImplementedError - - if not args.skip_island_identification: - - recombinase_islands = identify_recombinase_islands( - genomic_islands, - args.genome_id, - args.mge_rules, - output_dir=debug_dir, - ) - - if recombinase_islands: - write_final_results( - recombinase_islands, - args.output_dir, - args.genome_id, - args.output_suffix, - dbformat=args.dbformat, - write_gff=args.write_gff, - write_genes_to_gff=args.write_genes_to_gff, - add_functional_annotation=args.add_functional_annotation, - genome_seqs=args.extract_islands, - ) - - -if __name__ == "__main__": - main() diff --git a/mgexpose/batch_helpers.py b/mgexpose/batch_helpers.py deleted file mode 100644 index 858f9c1..0000000 --- a/mgexpose/batch_helpers.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=R0912,R0914,R0915 -''' Collection of functions to collect MGEs from batches of files''' -import os -from collections import Counter, defaultdict 
-import json - -from dask.distributed import Client, progress, WorkerPlugin -import dask.bag as db -from dask.bag import from_delayed -from dask.delayed import delayed -import dask - -from .gffio import read_mge_genomic_islands_gff - -from .base_logger import logger -import traceback - - -# Create a dictionary: genome: list of MGE IDs. This is needed to filter out only relevant MGEs per genome.The input is a corresponding list (same order) of genomes and MGE IDs. -def get_genome2mges(genomes, mges): - genome2mge_id = {} - for id, genome_id in zip(mges, genomes): - if genome_id not in genome2mge_id: - genome2mge_id[genome_id] = [] # Initialize the list - genome2mge_id[genome_id].append(id) - return genome2mge_id - -# Helper function to extract genome ID/bin ID from file path -def get_genome_id_from_path(path): - """ - Extract genome ID from a file path. - """ - genome_id = None - try: - genome_id = path.split("/")[-2] - except Exception as e: - logger.error(f"Error extracting genome/bin ID{path}: {e}") - return genome_id - -def collect_batch_mges(gff_paths, i, relevant_ids=None): - """ - Collect MGEs from a batch of GFF files. - - Parameters: - - gff_paths: List of GFF file paths. - - i: Index of the batch. - - relevant_ids: Optional dictionary of relevant MGE IDs per genome ID. - - Returns: - - List of MGE islands for all files in the batch. 
- """ - islands = [] - for gff_path in gff_paths: - genome_id = get_genome_id_from_path(gff_path) - #logger.info(f"Processing genome: {genome_id}") - - try: - if relevant_ids: - relevant_mges = list(read_mge_genomic_islands_gff(gff_path, relevant_ids[genome_id])) - else: - relevant_mges = list(read_mge_genomic_islands_gff(gff_path)) - - islands.extend(relevant_mges) - - except Exception as e: - logger.error(f"Error processing {gff_path}: {e}") - logger.error(traceback.format_exc()) # Full traceback - - logger.info(f"Batch {i} completed, MGE islands found: {len(islands)}") - return islands - - -def apply_per_batch(islands, funcs): - """ - Calculate statistics for a batch of MGEs using a list of functions. - - Parameters: - - islands: List of MGE objects in the batch. - - funcs: List of functions to be applied to the batch. - - Returns: - - Dictionary with function names as keys and their results as values. - """ - results = {} - for func in funcs: - func_name = func.__name__ # Get the function's name - results[func_name] = func(islands) # Apply the function and store the result - return results - -def apply_one_per_batch(islands, func): - """ - Apply a single function to a batch of MGE objects. - - Parameters: - - islands: List of MGE objects in the batch. - - func: A single function to be applied to the batch. - - Returns: - - The result of applying the function to the islands. - """ - try: - return func(islands) - except Exception as e: - raise RuntimeError(f"Error applying function '{func.__name__}' to batch: {e}") - - -def write_batch_json(batch_count, i, dir, base_filename): - """ - Saves batch statistics to a JSON file using a Dask delayed function. This allows the function to be part of a larger - Dask computation graph, potentially executed in parallel. - - Parameters: - - batch_count (dict): Batch statistics, typically a dictionary with `Counter` values. - - i (int): Index of the current batch. This is used to generate a unique filename for each batch. 
- - dir (str): Path to the directory where the batch JSON files will be stored. - - base_filename (str): Base name for the JSON files. The batch index will be appended to this base name to create the full filename. - - Ensures that the specified directory exists before writing the file. If it does not exist, the directory will be created. - - Returns: - - A Dask delayed object which, when executed, will write the batch statistics to the specified file path in JSON format. - """ - # Ensure directory exists - if not os.path.exists(dir): - os.makedirs(dir) - - # Delayed function to write JSON - def delayed_write(path, data): - with open(path, 'w') as file: - json.dump(data, file, indent=4) - - # Construct the full path with batch number - path = os.path.join(dir, f"{base_filename}_{i}.json") - return delayed(delayed_write)(path, batch_count) - - -def write_batch_tsv(tsv_string, i, dir, base_filename): - """ - Saves a TSV-formatted string to a file as part of a Dask computation graph. - - Parameters: - - tsv_string (str): TSV-formatted data returned from a processing function. - - i (int): Index of the current batch, used to make unique filenames. - - dir (str): Directory where TSV files will be saved. - - base_filename (str): Base name for the output files. - - Returns: - - A Dask delayed object that writes the TSV to disk when executed. - """ - # Ensure the output directory exists - if not os.path.exists(dir): - os.makedirs(dir) - - # Define the write function - def delayed_write(path, content): - with open(path, 'w') as file: - file.write(content) - - # Create the output file path - path = os.path.join(dir, f"{base_filename}_{i}.tsv") - - # Return the delayed write operation - return delayed(delayed_write)(path, tsv_string) - - -def aggregate_attr(batches): - """ - Aggregate string attributes across all batches. - - Parameters: - - batches: List of batch statistics (dictionaries with str values e.g. cluster: COG category). 
- - {'90371.SAMN11043730.GCA_007435405_02914': 'S', '90371.SAMN14863315.GCA_013264555_00909': 'S', '28150.SAMN09228819.GCA_007140965_00837': 'K', '28901.SAMN13391507.GCA_011477875_00875': 'no_cog_fcat', '1967657.SAMN09203654.GCA_010924635_01295': 'S', '28901.SAMN13057743.GCA_009231785_02754': 'no_cog_fcat', '90371.SAMN11355433.GCA_007687065_04488': 'S', '28901.SAMN06645026.GCA_009179045_04086': 'no_cog_fcat', '28901.SAMN15658059.GCA_013797405_02028': 'S', '796732.SAMN01805325.GCA_000272735_04407': 'no_cog_fcat', '28901.SAMN12571445.GCA_010939435_00055': 'S', '28901.SAMN13747386.GCA_010741235_03029': 'no_cog_fcat', '115981.SAMN14080650.GCA_011486465_01735': 'S', '28901.SAMN14341880.GCA_011465135_00878': 'H', '1151002.SAMN09403228.GCA_004177825_01940': 'no_cog_fcat', '1029990.SAMN02415182.GCA_000484355_00589': 'S', '28901.SAMN12287151.GCA_007468615_01067': 'S', '28901.SAMN13057273.GCA_009230165_00580': 'no_cog_fcat', '611.SAMN21335643.GCA_019899165_00890': 'EH', '28901.SAMN10095790.GCA_005443695_02309': 'no_cog_fcat', '340190.SAMN15147492.GCA_013661605_01048': 'S', '224729.SAMN19336595.GCA_018502705_02901': 'no_cog_fcat', '28901.SAMN16355443.GCA_015155595_01828': 'no_cog_fcat', '59201.SAMN10093771.GCA_007777665_02600': 'no_cog_fcat', '59201.SAMN17835677.GCA_017072195_01803': 'S'} - -2025-01-31 18:35:27,669 - {'611.SAMN17086052.GCA_016740915_04210': 'K', '611.SAMN07152477.GCA_007233055_03707': 'S', '1173835.SAMN01088029.GCA_000962395_02473': 'L', '1620419.SAMN03894126.GCA_001241425_04679': 'T', '568709.SAMEA2272227.GCA_000493535_02720': 'no_cog_fcat', '28901.SAMN18448990.GCA_017574325_01595': 'L', '90371.SAMEA6057931.GCA_016228905_04588': 'L', '28901.SAMN10177571.GCA_005772365_02802': 'N', '28901.SAMN14050865.GCA_011246635_04142': 'G', '90371.SAMN09387768.GCA_007158225_04690': 'N', '28144.SAMN07734943.GCA_003548115_02128': 'no_cog_fcat', '90105.SAMN09474912.GCA_004184575_03995': 'G', '59201.SAMN10756627.GCA_007583145_03925': 'S', 
'90371.SAMN03169328.GCA_008018515_03842': 'K', '1620419.SAMN04255380.GCA_010457935_04445': 'no_cog_fcat', '28901.SAMN16124589.GCA_014542005_01530': 'G', '28901.SAMN17005521.GCA_015838815_01302': 'no_cog_fcat', '28901.SAMN19285790.GCA_018468945_04349': 'G', '28901.SAMN10425133.GCA_010255835_04194': 'S', '28901.SAMN12344366.GCA_007726245_00622': 'S', '28901.SAMN12107692.GCA_006482085_01260': 'S', '440524.SAMN02867573.GCA_010663445_04243': 'M', '28901.SAMN20181473.GCA_020012815_04294': 'L', '28901.SAMEA6514879.GCA_011786425_00695': 'G', '90371.SAMN07279560.GCA_002260995_02309': 'K', '90371.SAMN19798225.GCA_018997055_00491': 'L', '28901.SAMN12823265.GCA_008717395_04569': 'V', '1173837.SAMN01088030.GCA_000962405_02064': 'L', '399584.SAMN13050934.GCA_009225065_04346': 'G', '28901.SAMN13057273.GCA_009230165_01317': 'no_eggnog'} - - Returns: - - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: COG_category}}} - """ - aggregated = {} - - for batch in batches: - for func_name, mge_dict in batch.items(): - aggregated[func_name] = {} - for mge_type, attr_dict in mge_dict.items(): - if mge_type not in aggregated[func_name]: - aggregated[func_name][mge_type] = defaultdict(list) # TODO: Generalise to function - # Update using the contents of the nested dictionary - for cluster_id, value in attr_dict.items(): - aggregated[func_name][mge_type][cluster_id] = value # TODO: replace with proper majority vote - # flatten the COG value per batch - return aggregated - -def aggregate_counts(batch_counts): - """ - Aggregate statistics across all batches. - - Parameters: - - batch_counts: List of batch statistics (dictionaries with Counter values). 
- - Returns: - - Dictionary of aggregated statistics for all batches {func: {mge_type: {cluster: count}}} - """ - aggregated = {} - - for batch in batch_counts: - for func_name, mge_counter in batch.items(): - aggregated[func_name] = {} - for mge_type, nested_counter in mge_counter.items(): - if mge_type not in aggregated[func_name]: - aggregated[func_name][mge_type] = Counter() # Initialize if not already present - # Update using the contents of the nested dictionary - for cluster_id, value in nested_counter.items(): - if isinstance(value, str): - aggregated[func_name][mge_type][cluster_id] = value # Overwrite previous COG category with the latest batch - else: - if cluster_id in aggregated[func_name][mge_type]: - aggregated[func_name][mge_type][cluster_id] += value - else: - aggregated[func_name][mge_type][cluster_id] = value - return aggregated - diff --git a/mgexpose/chunk_reader.py b/mgexpose/chunk_reader.py deleted file mode 100644 index 34d7037..0000000 --- a/mgexpose/chunk_reader.py +++ /dev/null @@ -1,31 +0,0 @@ -""" Module docstring """ - -import gzip - - -def get_lines_from_chunks(f: str, bufsize: int = 800000000): - """ - Provides generator access to the lines of large text files. - File is read chunk-wise into a buffer of the specified size. - Support gzip-compressed files. 
- - inputs: - - f: str -- filename - - bufsize: int -- size of buffer - - """ - gzipped = f.endswith(".gz") - with (gzip.open if gzipped else open)(f, "r") as _in: - tail = "" - while 1: - chunk = _in.read(bufsize) - if gzipped: - chunk = chunk.decode() - chunk = "".join((tail, chunk)) - if not chunk: - break - chunk = chunk.split("\n") - *lines, tail = chunk - yield from lines - if tail: - yield tail diff --git a/mgexpose/clean_workdir.py b/mgexpose/clean_workdir.py deleted file mode 100644 index 56c89ff..0000000 --- a/mgexpose/clean_workdir.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python - -""" Module to clean nextflow workdir """ - -import argparse -import csv -import os -import re - - -def main(): - """ Main. """ - ap = argparse.ArgumentParser() - ap.add_argument("workdir") - ap.add_argument("tracefile") - args = ap.parse_args() - - with open(args.tracefile, "rt", encoding="UTF-8") as _in: - keep = set( - row["hash"] - for row in csv.DictReader(_in, delimiter="\t") - if row["status"] in ("CACHED", "COMPLETED") - ) - - # level2 = args.workdir.rstrip("/").count("/") + 2 - walk = os.walk(args.workdir) - wd, dirs, _ = next(walk) - for d in dirs: - if re.match(r'^[0-9a-f]{2}$', d): - _, subdirs, _ = next(os.walk(os.path.join(wd, d))) - for sd in subdirs: - wd_hash = f"{d}/{sd[:6]}" - if wd_hash in keep: - print(f"keeping {os.path.join((wd, d, sd))}") - - # for wd, _, _ in os.walk(args.workdir): - # if wd.count("/") == level2: - # # 03/717392 - # # work/81/9299777e91fb27fb6626980719cf1f - # wd_hash = wd[len(args.workdir.rstrip("/")) + 1:][:9] - # if wd_hash not in keep: - # # print(f"removing {wd}") - # ... 
- # else: - # print(f"keeping {wd}") - - -if __name__ == "__main__": - main() diff --git a/mgexpose/clustering_parser.py b/mgexpose/clustering_parser.py deleted file mode 100644 index e767efd..0000000 --- a/mgexpose/clustering_parser.py +++ /dev/null @@ -1,182 +0,0 @@ -# pylint: disable=R0902,R0914 - -""" Functions for gene cluster parsing """ - -import logging -import os - -from collections import Counter -from contextlib import nullcontext -from dataclasses import dataclass - -from chunk_reader import get_lines_from_chunks - - -logger = logging.getLogger(__name__) - - -def extract_genome_id(gene_id): - """ Extract genome id from gene id. """ - sep = "." if "." in gene_id else ("_" if "_" in gene_id else None) - if sep is None: - raise ValueError(f"gene `{gene_id}` does not seem to contain a genome_id.") - - return gene_id[:gene_id.rfind(sep)] - - -def parse_db_clusters(cluster_data): - """ Parse gene, cluster, is_core from tsv-stream. """ - return [ - tuple(line.strip().split("\t")) - for line in get_lines_from_chunks(cluster_data) - ] - - -def parse_full_seq_clusters( - genome_id_prefix, - genes, - cluster_data, - output_dir=None, -): - """ Parse data from linclust gene clustering. 
""" - - genomes = set() - cluster_genes = Counter() - gene_cluster_map = [] - - if output_dir is None: - write_data = True - cluster_genes_out = nullcontext() - gene_clusters_out = nullcontext() - else: - write_data = False - cluster_genes_out = open( - os.path.join(output_dir, f"{genome_id_prefix}.cluster_genes.txt"), - "wt", - encoding="UTF-8", - ) - gene_clusters_out = open( - os.path.join(output_dir, f"{genome_id_prefix}.gene_clusters.txt"), - "wt", - encoding="UTF-8", - ) - - with gene_clusters_out, cluster_genes_out: - for line in get_lines_from_chunks(cluster_data): - cluster_id, gene_id = line.split("\t") - if gene_id.startswith(genome_id_prefix): - gene_id = gene_id[len(genome_id_prefix) + 1:] - - gene = genes.get( - gene_id, - genes.get(gene_id[gene_id.rfind(".") + 1:]) - ) - if gene is not None: - logger.info("Adding cluster %s to gene %s", cluster_id, gene_id) - genome_id = gene.genome - gene_cluster_map.append((gene_id, cluster_id)) - else: - genome_id = extract_genome_id(gene_id) - - cluster_genes[cluster_id] += 1 - genomes.add(genome_id) - if write_data: - print(cluster_id, gene_id, sep="\t", file=cluster_genes_out) - print(gene_id, cluster_id, sep="\t", file=gene_clusters_out) - - if write_data: - with open( - os.path.join(output_dir, f"{genome_id_prefix}.genomes.txt"), - "wt", - encoding="UTF-8", - ) as genomes_out: - print(*sorted(genomes), sep="\n", file=genomes_out) - - n_genomes = len(genomes) - return n_genomes, cluster_genes, gene_cluster_map, genome_id_prefix in genomes - - -@dataclass -class RefGene: - """ Y-cluster reference gene class. """ - refset: str = None - speci: int = None - refset_id: int = None - is_core: bool = None - is_singleton: bool = None - is_unique: bool = None - prevalence: int = None - sp100_id: int = None - genome_id: int = None - gene_id: str = None - n_rep_genomes: int = None - n_rep_genes: int = None - - @classmethod - def from_string(cls, s): - """ Construct RefGene from cluster id string. 
""" - # proMGE095-00037-00002654-AN012-0000748630-000497735_01349-0000000104-0000000608 - fields = s.strip().split("-") - return cls( - fields[0], - int(fields[1]), - int(fields[2]), - fields[3][0] == "C", - fields[3][1] == "S", - fields[3][1] in "SU", - int(fields[3][2:]), - int(fields[4]), - int(fields[5][:fields[5].find("_")]), - fields[5], - int(fields[6]), - int(fields[7]), - ) - - -def evaluate_cluster(rep_id, cluster, genes): - """ Add cluster information from Y-cluster data. """ - ref_genes = {gene for gene in cluster if gene.startswith("proMGE") or gene[0] == "-"} - query_genes = cluster.difference(ref_genes) - - if query_genes: - if ref_genes: - if len(ref_genes) > 1: - # this is a heuristic -- - # we take the ref_gene with the largest represented genomes ([6]) - # to approximate the cluster's rep_genomes - ref_genes = sorted( - ((int(r.split("-")[6]), r) for r in ref_genes), - key=lambda x: x[0], reverse=True - ) - # print("MULTIREF-CLUSTER", ref_genes[0][1]) - # print(*(r[1] for r in ref_genes[1:]), sep="\n") - rep = ref_genes[0][1] - else: - rep = list(ref_genes)[0] - ref_gene = RefGene.from_string(rep) - else: - # this cluster doesn't contain any reference genes -> genes are accessory - ref_gene = RefGene(is_core=False, prevalence=0) - - for gene_id in query_genes: - gene = genes.get(gene_id) - if gene is not None: - gene.cluster = rep_id - gene.is_core = ref_gene.is_core - gene.prevalence = ref_gene.prevalence - - -def parse_y_clusters(cluster_data, genes): - """ Parse data from Y gene clustering approach. 
""" - - cluster, members = None, set() - for line in get_lines_from_chunks(cluster_data): - cluster_id, gene_id = line.split("\t") - if cluster_id != cluster: - if cluster is not None: - evaluate_cluster(cluster, members, genes) - cluster, members = cluster_id, set() - - members.add(gene_id) - - evaluate_cluster(cluster, members, genes) diff --git a/mgexpose/db.py b/mgexpose/db.py deleted file mode 100644 index b821dc3..0000000 --- a/mgexpose/db.py +++ /dev/null @@ -1,156 +0,0 @@ -# pylint: disable=W2301,R0903,E1101,C0103 - -""" Functions for database access. """ - -import json -import random -import time - -from functools import lru_cache - -from sqlalchemy import create_engine, MetaData, Table -from sqlalchemy.orm import mapper, sessionmaker, registry -from sqlalchemy.exc import OperationalError - - -class DbGene: - """ Placeholder Gene class""" - ... - - -class DbGeneCluster: - """ Placeholder GeneCluster class""" - ... - - -class DbEmapperResult: - """ Emapper results class""" - HEADERS = [ - "#query", - "seed_ortholog", - "evalue", - "score", - "eggNOG_OGs", - "max_annot_lvl", - "COG_category", - "Description", - "Preferred_name", - "GOs", - "EC", - "KEGG_ko", - "KEGG_Pathway", - "KEGG_Module", - "KEGG_Reaction", - "KEGG_rclass", - "BRITE", - "KEGG_TC", - "CAZy", - "BiGG_Reaction", - ] - - def __str__(self): - """ String representation. """ - return "\t".join(str(v) for k, v in self.__dict__.items() if k != "project_id") - - -def read_db_details(f): - """ Reads database credentials from JSON file. """ - with open(f, "rt", encoding="UTF-8") as _in: - return json.load(_in) - - -@lru_cache(maxsize=10000) -def get_cluster(db_session, cluster_id): - """ Queries GeneCluster table by cluster_id. """ - cluster = ( - db_session.query(DbGeneCluster).filter(DbGeneCluster.id == cluster_id).one_or_none() - ) - return cluster - - -def get_gene(db_session, gene_id): - """ Queries Gene table by gene_id. 
""" - gene = ( - db_session.query(DbGene).filter(DbGene.id == gene_id).one_or_none() - ) - return gene - - -def db_available(cluster_id): - """ Checks if cluster data is stored in db. - (Only the largest 10 clusters are.) - """ - return int(cluster_id.replace("specI_v4_", "")) in range(9) - - -def initialise_db(db_details, db_name, cluster_id=""): - """ Initialises database connection. """ - - db_access = read_db_details(db_details)[db_name] - - engine = create_engine( - f"postgresql+psycopg2://{db_access['username']}:" - f"{db_access['password']}@{db_access['host']}/{db_name}" - ) - - metadata = MetaData() - - if cluster_id.lower() == "speci_v4_00000": - cluster_id = "" - - # strips "_" in case of specI_v4_00000 - gene_table_name = f"{cluster_id}_gene".strip("_") - gene_cluster_table_name = f"{cluster_id}_gene_cluster".strip("_") - - while 1: - try: - gene_table = Table( - gene_table_name, - metadata, - autoload_with=engine - ) - - gene_cluster_table = Table( - gene_cluster_table_name, - metadata, - autoload_with=engine - ) - - # [mapper(DbGene, gene_table), mapper(DbGeneCluster, gene_cluster_table)] - mapper_registry = registry() - mapper_registry.map_imperatively(DbGene, gene_table) - mapper_registry.map_imperatively(DbGeneCluster, gene_cluster_table) - - Session = sessionmaker(bind=engine) - session = Session() - except OperationalError: - time.sleep(random.randint(1, 31)) - else: - break - - return session - - -def initialise_pg3_db(db_details, db_name): - """ Initialises connection to PG3 database for emapper queries. 
""" - db_access = read_db_details(db_details)[db_name] - - engine = create_engine( - f"postgresql+psycopg2://{db_access['username']}:" - f"{db_access['password']}@{db_access['host']}/{db_name}" - ) - - metadata = MetaData(engine) - - emapper_table = Table( - "eggnog", - metadata, - autoload=True - ) - - mapper(DbEmapperResult, emapper_table) - - Session = sessionmaker(bind=engine) - session = Session() - - return session diff --git a/mgexpose/gene_annotator.py b/mgexpose/gene_annotator.py deleted file mode 100644 index 1ff16ac..0000000 --- a/mgexpose/gene_annotator.py +++ /dev/null @@ -1,218 +0,0 @@ -# pylint: disable=R0912,R0913,R0914 - -""" Classes for integrating gene annotations. """ - -import logging - -from contextlib import nullcontext - -from clustering_parser import parse_full_seq_clusters, parse_y_clusters, parse_db_clusters -from gene import Gene -from phage import PhageDetection -from readers import ( - EggnogReader, - parse_macsyfinder_report, - read_recombinase_hits, -) - - -logger = logging.getLogger(__name__) - - -class GeneAnnotator: - """ GeneAnnotator class. 
""" - def __init__( - self, - genome_id, - speci, - genes, - include_genome_id=False, - has_batch_data=False, - dbformat=None - ): - logger.info("Creating new %s for genome=%s specI=%s", self.__class__, genome_id, speci) - self.genome_id = genome_id - self.speci = speci - self.genes = {} - self.has_batch_data = has_batch_data - self.include_genome_id = include_genome_id - - for gene_id, annotation in genes: - - if dbformat != "PG3": - gene_id = f'{annotation[0]}_{gene_id.split("_")[-1]}' - - logger.info("Adding gene %s", gene_id) - self.genes[gene_id] = Gene( - id=gene_id, - genome=self.genome_id, - speci=self.speci, - contig=annotation[0], - start=int(annotation[3]), - end=int(annotation[4]), - strand=annotation[6], - ) - - def add_recombinases(self, recombinases): - """ Add information from recombinase scan """ - for gene_id, recombinase in recombinases: - gene = self.genes.get(gene_id) - if gene is not None: - gene.recombinase = recombinase - - def add_cluster( - self, - cluster_data, - use_y_clusters=False, - core_threshold=0.95, - output_dir=None, - ): - """ Add information from gene clustering to allow for core/accessory gene classification """ - - if use_y_clusters: - parse_y_clusters(cluster_data, self.genes) - return None - - write_data = False - gene_clusters_out = nullcontext() - n_genomes = 0 - cluster_genes = {} - - with gene_clusters_out: - if cluster_data is not None: - - if core_threshold != -1: - n_genomes, cluster_genes, gene_cluster_map, _ = parse_full_seq_clusters( - self.genome_id, - self.genes, - cluster_data, - output_dir=output_dir, - ) - - logger.info( - "Parsed %s genomes with %s gene clusters.", - n_genomes, - len(cluster_genes), - ) - else: - gene_cluster_map = parse_db_clusters(cluster_data) - - n_genes = len(gene_cluster_map) - n_core_genes = sum(1 for _, _, is_core in gene_cluster_map if is_core) - logger.info( - "Parsed %s precomputed gene-cluster mappings with %s core genes (%s%%)", - n_genes, - n_core_genes, - round(n_core_genes / 
n_genes, 2), - ) - - for gene_id, *cluster in gene_cluster_map: - cluster, *is_core = cluster - is_core = is_core[0].lower() == "true" if is_core else None - if not self.include_genome_id or gene_id.startswith(self.genome_id): - gene = self.genes.get( - gene_id, - self.genes.get( - gene_id.replace(self.genome_id + ".", "") - ) - ) - logger.info( - "Checking cluster %s gene %s... %s", - str(cluster), - gene_id, - str(gene), - ) - if gene and gene.speci is not None: - gene.cluster = cluster - - if cluster_genes: - occ = cluster_genes[cluster] - gene.is_core = any(( - occ / n_genomes > core_threshold, - (n_genomes <= 20 and occ >= n_genomes - 1), - )) - elif core_threshold == -1: - gene.is_core = is_core - - if write_data: - print(gene.id, gene.cluster, sep="\t", file=gene_clusters_out) - - return None - - def add_eggnog_annotation(self, eggnog_annotation): - """ Add eggnog output and phage signals to each gene """ - for gene_id, phage_data, eggnog_data in eggnog_annotation: - gene = self.genes.get(gene_id) - if gene is not None: - gene.eggnog = eggnog_data - gene.phage = phage_data - - def add_secretion_system(self, secretion_annotation): - """ Add information from txsscan """ - for gene_id, secretion_data in secretion_annotation: - system, rule, *_ = secretion_data - gene = self.genes.get(gene_id) - if gene is not None: - gene.secretion_system = system - gene.secretion_rule = rule - - def annotate_genes( - self, - recombinases, - eggnog_annotation, - secretion_annotation, - clusters=None, - use_y_clusters=False, - core_threshold=None, - output_dir=None, - ): - """ Annotate genes with MGE-relevant data. 
""" - self.add_recombinases( - read_recombinase_hits(recombinases) - ) - if all(secretion_annotation): - self.add_secretion_system( - parse_macsyfinder_report( - *secretion_annotation[:2], - # macsy_version=secretion_annotation[-1], - ), - ) - if eggnog_annotation is not None: - phage_detection = PhageDetection(eggnog_annotation[1]) - - self.add_eggnog_annotation( - EggnogReader.parse_emapper( - eggnog_annotation[0], - phage_annotation=phage_detection, - ) - ) - if clusters is not None: - self.add_cluster( - clusters, - use_y_clusters=use_y_clusters, - core_threshold=core_threshold, - output_dir=output_dir, - ) - yield from self.genes.values() - - def dump_genes(self, outstream): - """ Write gene info to stream. """ - - headers = list(Gene().__dict__.keys()) - headers.remove("eggnog") - headers += EggnogReader.EMAPPER_FIELDS["v2.1.2"] - headers.remove("description") - - # print(*Gene().__dict__.keys(), sep="\t", file=outstream) - print(*headers, sep="\t", file=outstream) - for gene in self.genes.values(): - gene.stringify_speci() - eggnog_data = {} - if gene.eggnog: - eggnog_data = dict(gene.eggnog) - eggnog_cols = ( - eggnog_data.get(k) - for k in EggnogReader.EMAPPER_FIELDS["v2.1.2"] - if k != "description" - ) - print(gene, *eggnog_cols, sep="\t", file=outstream) diff --git a/mgexpose/get_cluster_data.py b/mgexpose/get_cluster_data.py deleted file mode 100644 index 8d8d240..0000000 --- a/mgexpose/get_cluster_data.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python - -""" Module for processing mmseqs2 linclust output. """ - -import argparse -import gzip -import os -import warnings - -from db import initialise_db, db_available, get_gene, get_cluster -from readers import read_prodigal_gff - - -def main(): - """ maaaaaaain... 
""" - ap = argparse.ArgumentParser() - ap.add_argument("genome_id", type=str) - ap.add_argument("speci", type=str) - ap.add_argument("prodigal_gff", type=str) - ap.add_argument("--cluster_db_credentials", type=str) - ap.add_argument("--output_dir", "-o", type=str, default=".") - ap.add_argument("--dump_intermediate_steps", action="store_true") - - args = ap.parse_args() - - db_session = None - if args.cluster_db_credentials: - if db_available(args.speci): - db_session = initialise_db( - args.cluster_db_credentials, - "mge_clusters", - cluster_id=args.speci.lower(), - ) - else: - warnings.warn( - "Could not connect to database.\n" - f"Check if {args.speci} database exists in specified " - f"database ({args.cluster_db_credentials}.)" - ) - return None - - print("DB_SESSION", db_session is not None) - - gene_clusters_out = gzip.open( - os.path.join(args.output_dir, f"{args.genome_id}.db_gene_clusters.txt.gz"), - "wt", - ) - - with gene_clusters_out: - for gene_id, _ in read_prodigal_gff(args.prodigal_gff): - db_gene = get_gene(db_session, gene_id) - if db_gene is not None: - db_cluster = get_cluster(db_session, db_gene.cluster_id) - if db_cluster is not None: - print( - gene_id, - db_cluster.name, - db_gene.is_core, - sep="\t", - file=gene_clusters_out, - ) - - return None - - -if __name__ == "__main__": - main() diff --git a/mgexpose/get_db_seqs.py b/mgexpose/get_db_seqs.py deleted file mode 100644 index 602a2b3..0000000 --- a/mgexpose/get_db_seqs.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=E0401,C0415,R0914,W0702,W0621,W0404,C0301 -# flake8: noqa - -""" Functions to load speci cluster data from cache/seq repo. 
""" - -import argparse -import gzip -import json -import os - -import pymongo - - -def get_sequences_from_cluster(mongo_db_str, cluster_id, seqfile): - """ Get cluster sequences from cache/seq repo """ - - try: - import pymongo - except ImportError: - return 0 - - client = pymongo.MongoClient(mongo_db_str,) - fr_db = client["progenomes"] - - n_genes = 0 - files = [] - with gzip.open(seqfile, "wt") as genes_out: - for record in fr_db.samples.find({'fr13_cluster': cluster_id}): - genes_file = f"{record['analysis_path']}/ref_genome_called_genes/{record['sample_id']}.genes.fa.gz" - files.append(genes_file) - for genes_file in files: - with gzip.open(genes_file, "rt") as genes_in: - genes_raw = genes_in.read() - n_genes += genes_raw.count(">") - print(genes_raw, file=genes_out, end="" if genes_raw[-1] == "\n" else "\n") - - return n_genes - - -def main(): - """ Main. """ - - ap = argparse.ArgumentParser() - ap.add_argument("dbname", type=str) - ap.add_argument("dbcred", type=str) - ap.add_argument("cluster_id", type=str) - ap.add_argument("outfile", type=str) - ap.add_argument("outfile_ids", type=str) - ap.add_argument("--cache", type=str) - args = ap.parse_args() - - try: - with open(args.dbcred, "rt", encoding="UTF-8") as _in: - db_d = json.load(_in).get(args.dbname) - except: - db_d = {} - - user = db_d.get("username") - host = db_d.get("host") - pw = db_d.get("password") - port = db_d.get("port") - - dbstr = f"mongodb://{user}:{pw}@{host}:{port}" if (user and host and pw and port) else None - - client = pymongo.MongoClient(dbstr,) - fr_db = client["progenomes"] - - n_genes = 0 - files = [] - - n_seqs = 0 - if args.cache and os.path.isdir(args.cache): - print("Looking up seq_cache...") - expected_files = [ - os.path.join(args.cache, f"{args.cluster_id}.{suffix}") - for suffix in ("genes.ffn.gz", "genes.nseqs", "genes.ids.gz") - ] - if all(os.path.isfile(f) and os.stat(f).st_size for f in expected_files): - with open(os.path.join(args.cache, 
f"{args.cluster_id}.genes.nseqs"), "rt", encoding="UTF-8") as _in: - n_seqs = int(_in.read().strip()) - print("Copying sequences from seq_cache:", args.cluster_id, args.outfile, "...", end="") - # shutil.copyfile(os.path.join(args.cache, f"{args.cluster_id}.genes.ffn.gz"), args.outfile) - - os.symlink(os.path.join(args.cache, f"{args.cluster_id}.genes.ffn.gz"), args.outfile) - os.symlink(os.path.join(args.cache, f"{args.cluster_id}.genes.ids.gz"), args.outfile_ids) - print(n_seqs) - - if not n_seqs: - with gzip.open(args.outfile, "wt") as genes_out, gzip.open(args.outfile_ids, "wt") as geneids_out: - for record in fr_db.samples.find({'fr13_cluster': args.cluster_id}): - genes_file = f"{record['analysis_path']}/ref_genome_called_genes/{record['sample_id']}.genes.fa.gz" - files.append(genes_file) - for genes_file in files: - with gzip.open(genes_file, "rt") as genes_in: - genes_raw = genes_in.read() - n_genes += genes_raw.count(">") - print(genes_raw, file=genes_out, end="" if genes_raw[-1] == "\n" else "\n") - - genes_raw = ( - line[1:].split(" ")[0] - for line in genes_raw.strip().split("\n") - if line[0] == ">" - ) - print(*genes_raw, file=geneids_out, sep="\n") - - -if __name__ == "__main__": - main() diff --git a/mgexpose/get_eggnog.py b/mgexpose/get_eggnog.py deleted file mode 100644 index a54dcb8..0000000 --- a/mgexpose/get_eggnog.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=C0301 -# flake8: noqa - -""" Get emapper data from database. """ - -import argparse - -import pandas as pd - -from db import read_db_details, DbEmapperResult - - -def main(): - """ Main. 
""" - ap = argparse.ArgumentParser() - ap.add_argument("database", type=str) - ap.add_argument("credentials", type=str) - ap.add_argument("--project_id", type=str) - ap.add_argument("--sample_name", type=str) - ap.add_argument("--bulk_file", type=str) - - args = ap.parse_args() - - db_access = read_db_details(args.credentials)[args.database] - - conn = f"postgresql://{db_access['host']}/{args.database}?user={db_access['username']}&password={db_access['password']}" - - if args.bulk_file: - with open(args.bulk_file, "rt", encoding="UTF-8") as _in: - query_list = [line.strip() for line in _in] - query_list_str = ", ".join(line for line in query_list) - query = f"SELECT * from eggnog WHERE project_id IN ({query_list_str});" - else: - column, column_value = ("project_id", args.project_id) if args.project_id else ("sample_name", args.sample_name) - query = f"SELECT * from eggnog WHERE {column} = '{column_value}';" - - eggnog = pd.read_sql(query, conn) - try: - eggnog = eggnog.drop("project_id", axis=1) - except KeyError: - pass - try: - eggnog = eggnog.drop(["sample_name", "contig_name", "pfams",], axis=1) - except KeyError: - pass - eggnog.columns = DbEmapperResult.HEADERS - eggnog.to_csv("emapper.annotations", sep="\t", index=False) - - -if __name__ == "__main__": - main() diff --git a/mgexpose/get_eggnog_f13.py b/mgexpose/get_eggnog_f13.py deleted file mode 100644 index 0129a52..0000000 --- a/mgexpose/get_eggnog_f13.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python -# pylint: disable=C0301,R0801 -# flake8: noqa - -""" Get emapper data from database. """ - -import argparse -import os -import pathlib - -import pandas as pd - -from db import read_db_details, DbEmapperResult - - -def main(): - """ Main. 
""" - ap = argparse.ArgumentParser() - ap.add_argument("database", type=str) - ap.add_argument("credentials", type=str) - ap.add_argument("--project_id", type=str) - ap.add_argument("--sample_name", type=str) - ap.add_argument("--bulk_file", type=str) - ap.add_argument("--sample_tax_map", type=str) - ap.add_argument( - "--fill_missing", - action="store_true", - help="Generate empty annotation file if genome does not have a database record.", - ) - - args = ap.parse_args() - - db_access = read_db_details(args.credentials)[args.database] - - conn = f"postgresql://{db_access['host']}/{args.database}?user={db_access['username']}&password={db_access['password']}" - - with open(args.sample_tax_map, "rt", encoding="UTF-8") as _in: - st_map = dict(line.strip().split("\t")[::-1] for line in _in) - with open(args.bulk_file, "rt", encoding="UTF-8") as _in: - query_list = [(st_map.get(".".join(line.strip().split("/")[1].split(".")[:-1])), line.strip()) for line in _in] - query_list_str = ", ".join(f"'{pid}'" for pid, _ in query_list if pid is not None) - query = f"SELECT * from eggnog WHERE project_id IN ({query_list_str});" - - st_map = {v: k for k, v in st_map.items()} - p_map = dict(query_list) - eggnog = pd.read_sql(query, conn) - annotated_genomes = [] - print(query_list[:10]) - for pid in eggnog["project_id"].unique(): - print(pid) - genome_id = st_map.get(pid) - annotated_genomes.append(genome_id) - eggnog[eggnog["project_id"] == pid].drop("project_id", axis=1).to_csv( - os.path.join(p_map.get(pid), f"{genome_id.split('/')[0]}.emapper_annotations"), - sep="\t", - header=DbEmapperResult.HEADERS, - index=False, - ) - - if args.fill_missing: - # write empty emapper annotations - # for rare cases where we neither have db records - # nor annotations on the file system - # genome will not have cargo/phage annotation - # but could still contain e.g. 
recombinase signals - for genome_id in set(v for _, v in query_list).difference(annotated_genomes): - path = pathlib.Path(genome_id) - path.mkdir(parents=True, exist_ok=True) - (path / f"{genome_id.split('/')[1]}.emapper_annotations").touch() - - - -if __name__ == "__main__": - main() diff --git a/mgexpose/handle_args.py b/mgexpose/handle_args.py deleted file mode 100644 index ff45882..0000000 --- a/mgexpose/handle_args.py +++ /dev/null @@ -1,191 +0,0 @@ -""" Module for argument handling """ - -import argparse - -from readers import EggnogReader - - -__version__ = "3.6.0" - - -def handle_args(): - """ Argument handling """ - ap = argparse.ArgumentParser( - prog="profile_me", - formatter_class=argparse.RawTextHelpFormatter, - ) - - ap.add_argument( - "--version", action="version", version="%(prog)s " + __version__ - ) - - # ap.add_argument("--output_dir", "-o", type=str, default=".") - # ap.add_argument("--dbformat", type=str, choices=("PG3", "SPIRE")) - # ap.add_argument("--write_gff", action="store_true") - # ap.add_argument("--write_genes_to_gff", action="store_true") - # ap.add_argument("--dump_intermediate_steps", action="store_true") - # ap.add_argument("--output_suffix", type=str, default="full_length_MGE_assignments") - # ap.add_argument("--debug", action="store_true") - - subparsers = ap.add_subparsers(dest="command", required=True) - - parent_subparser = argparse.ArgumentParser(add_help=False) - parent_subparser.add_argument("--output_dir", "-o", type=str, default=".") - parent_subparser.add_argument("--dbformat", type=str, choices=("PG3", "SPIRE")) - parent_subparser.add_argument("--write_gff", action="store_true") - parent_subparser.add_argument("--write_genes_to_gff", action="store_true") - parent_subparser.add_argument("--dump_intermediate_steps", action="store_true") - parent_subparser.add_argument( - "--output_suffix", type=str, default="full_length_MGE_assignments", - ) - parent_subparser.add_argument("--debug", action="store_true") - - denovo_ap = 
subparsers.add_parser( - "denovo", - help="Classify and annotate mobile genomic regions from annotated genes.", - parents=(parent_subparser,), - ) - denovo_ap.add_argument("genome_id", type=str) - denovo_ap.add_argument("prodigal_gff", type=str) - denovo_ap.add_argument("recombinase_hits", type=str) - denovo_ap.add_argument("mge_rules", type=str) - denovo_ap.add_argument("--speci", type=str, default="no_speci") - denovo_ap.add_argument("--txs_macsy_rules", type=str) - denovo_ap.add_argument("--txs_macsy_report", type=str) - denovo_ap.add_argument("--phage_eggnog_data", type=str) - denovo_ap.add_argument("--cluster_data", type=str) - denovo_ap.add_argument("--skip_island_identification", action="store_true") - denovo_ap.add_argument("--dump_genomic_islands", action="store_true") - denovo_ap.add_argument("--phage_filter_terms", type=str) - - denovo_ap.add_argument("--include_genome_id", action="store_true") - denovo_ap.add_argument("--core_threshold", type=float, default=0.95) - denovo_ap.add_argument( - "--allow_batch_data", - action="store_true", - help=( - "SPIRE annotation may have data that does not relate to the current bin." - " Ignore those data." - ), - ) - denovo_ap.add_argument( - "--use_y_clusters", - action="store_true", - help=( - "Gene clustering is performed against annotated" - " and redundancy-reduced reference sets." - ), - ) - denovo_ap.add_argument( - "--single_island", - action="store_true", - help="Input is genomic region, skips island computation." - ) - denovo_ap.add_argument( - "--precomputed_islands", - type=str, - help="Input is set of genomic regions, skips island computation." - ) - denovo_ap.add_argument( - "--precomputed_core_genes", - action="store_true", - help="Core/accessory gene sets were precomputed." - ) - - denovo_ap.add_argument( - "--add_functional_annotation", - action="store_true", - help="If specified, per gene emapper annotations are stored in the gff." 
- ) - # ensure newest eggnog version - - denovo_ap.set_defaults(func=None) # TODO - - identify_mobile_islands_ap = subparsers.add_parser( - "identify_mobile_islands", - help="Identify and classify genomic islands as mobile.", - parents=(parent_subparser,), - ) - - identify_mobile_islands_ap.add_argument("island_gff", type=str) - - identify_mobile_islands_ap.set_defaults(func=None) # TODO - - return ap.parse_args() - - -def handle_args_old(): - """ Argument handling """ - ap = argparse.ArgumentParser() - ap.add_argument("genome_id", type=str) - ap.add_argument("prodigal_gff", type=str) - ap.add_argument("recombinase_hits", type=str) - ap.add_argument("speci", type=str) - - ap.add_argument( - "txs_macsy_rules", - type=str, - help=( - "In macsyfinder v1, this is found in macsyfinder.summary(.txt)." - " In v2+, this is provided with the pipeline." - ), - ) - ap.add_argument("txs_macsy_report", type=str) - ap.add_argument("phage_eggnog_data", type=str) - ap.add_argument("mge_rules", type=str) - - ap.add_argument("--cluster_data", type=str) - ap.add_argument("--output_dir", "-o", type=str, default=".") - ap.add_argument("--phage_filter_terms", type=str) - ap.add_argument("--include_genome_id", action="store_true") - ap.add_argument("--core_threshold", type=float, default=0.95) - ap.add_argument("--macsy_version", type=int, choices=(1, 2), default=2) - ap.add_argument( - "--emapper_version", - type=str, - choices=EggnogReader.EMAPPER_FIELDS.keys(), - default="v2.1.2", - ) - ap.add_argument( - "--allow_batch_data", - action="store_true", - help=( - "SPIRE annotation may have data that does not relate to the current bin." - " Ignore those data." - ), - ) - ap.add_argument( - "--use_y_clusters", - action="store_true", - help=( - "Gene clustering is performed against annotated" - " and redundancy-reduced reference sets." - ), - ) - ap.add_argument( - "--single_island", - action="store_true", - help="Input is genomic region, skips island computation." 
- ) - ap.add_argument( - "--precomputed_islands", - type=str, - help="Input is set of genomic regions, skips island computation." - ) - ap.add_argument("--write_gff", action="store_true") - ap.add_argument("--write_genes_to_gff", action="store_true") - ap.add_argument("--add_functional_annotation", - action="store_true", - help="If specified, per gene emapper annotations are stored in the gff.") - # ensure newest eggnog version - ap.add_argument("--dump_intermediate_steps", action="store_true") - ap.add_argument("--output_suffix", type=str, default="full_length_MGE_assignments") - ap.add_argument("--dbformat", type=str, choices=("PG3", "SPIRE")) - ap.add_argument( - "--precomputed_core_genes", - action="store_true", - help="Core/accessory gene sets were precomputed." - ) - ap.add_argument("--skip_island_identification", action="store_true") - - return ap.parse_args() diff --git a/mgexpose/island_processing.py b/mgexpose/island_processing.py deleted file mode 100644 index 1bb51ed..0000000 --- a/mgexpose/island_processing.py +++ /dev/null @@ -1,193 +0,0 @@ -""" Module for processing mobile genetic islands """ - -import contextlib -import logging - - -from islands import GenomicIsland, AnnotatedGenomicIsland, MgeGenomicIsland - - -logger = logging.getLogger(__name__) - - -def is_valid_stream(stream): - """ Checks if a stream-variable represents a valid stream. """ - return stream is not None and not isinstance( - stream, contextlib.nullcontext - ) - - -def check_island_genes(genes, precomputed_islands=None): - """ Check if genes have valid annotation and belong to a precomputed island. 
""" - has_precomputed_islands = False - if precomputed_islands is not None: - logger.info("Precomputed islands: %s", len(precomputed_islands)) - has_precomputed_islands = True - - for gene in genes: - is_annotated = gene.has_basic_annotation(skip_core_gene_computation=has_precomputed_islands) - add_gene = False - if is_annotated and has_precomputed_islands: - gene.contig = gene.contig.split(".")[-1] - for island in precomputed_islands.get(gene.contig, []): - log_str = ( - f"Checking gene={gene.contig}:" - f"{gene.start}-{gene.end} against " - f"{island.contig}:{island.start}-{island.end}: " - ) - - if gene.is_in_interval(island.start, island.end): - add_gene = True - if gene.speci is None or gene.speci == "no_speci": - gene.speci = {island.name} - else: - gene.speci.add(island.name) - island.add_gene(gene) - - logger.info("%s %s", log_str, str(add_gene)) - else: - add_gene = is_annotated - - log_str = ( - f"Gene {gene}: {is_annotated=} {add_gene=} ->" - f" contig set = {is_annotated and add_gene}" - ) - logger.info(log_str) - if add_gene: - yield gene - - -def filter_precomputed_islands(precomputed_islands, raw_island_stream=None, island_stream=None): - """ Return precomputed islands with mge signals. 
""" - for _, islands in precomputed_islands.items(): - seen_islands = set() - for island in islands: - island.dump(seen_islands, raw_outstream=raw_island_stream, outstream=island_stream) - if island.recombinases: - logger.info("GenomicIsland %s created.", str(island)) - yield island - - -def compute_islands(contigs, raw_island_stream=None, island_stream=None): - """ Form genomic islands from stretches of core or accessory genes, - the return those with mge signals.""" - for _, genes in sorted(contigs.items()): - seen_islands = set() - current_island = None - for gene in sorted(genes, key=lambda g: (g.start, g.end, g.strand)): - if current_island is None or current_island.is_core != gene.is_core: - if current_island is not None: - current_island.dump( - seen_islands, - raw_outstream=raw_island_stream, - outstream=island_stream - ) - if current_island.recombinases: - yield current_island - - current_island = GenomicIsland.from_gene(gene) - - current_island.add_gene(gene) - - if current_island is not None: - current_island.dump( - seen_islands, - raw_outstream=raw_island_stream, - outstream=island_stream - ) - if current_island.recombinases: - yield current_island - - -def generate_island_set( - genes, - pang_calls_out=None, - raw_islands_out=None, - islands_out=None, - precomputed_islands=None, -): - """ Compute mge islands """ - contigs = {} - logger.info("generate_island_set: collecting genes") - - for gene in check_island_genes(genes, precomputed_islands=precomputed_islands): - logger.info("Adding gene %s to contig set.", str(gene)) - if is_valid_stream(pang_calls_out): - print(gene, file=pang_calls_out) - if precomputed_islands is None: - contigs.setdefault( - (gene.speci, gene.contig), [] - ).append(gene) - - if is_valid_stream(islands_out): - print(*GenomicIsland.get_fieldnames(), sep="\t", file=islands_out) - island_stream = islands_out - else: - island_stream = islands_out - if is_valid_stream(raw_islands_out): - print(*GenomicIsland.RAW_TABLE_HEADER, 
sep="\t", file=raw_islands_out) - raw_island_stream = raw_islands_out - else: - raw_island_stream = None - - if precomputed_islands is not None: - yield from filter_precomputed_islands( - precomputed_islands, - raw_island_stream=raw_island_stream, - island_stream=island_stream, - ) - else: - yield from compute_islands( - contigs, - raw_island_stream=raw_island_stream, - island_stream=island_stream, - ) - - -def annotate_islands(islands, outstream=None): - """ Adds annotation to previously computed islands. """ - do_print = is_valid_stream(outstream) - if do_print: - print(*AnnotatedGenomicIsland.TABLE_HEADERS, sep="\t", file=outstream) - for island in sorted(islands, key=lambda x: (x.contig, x.start, x.end)): - annotated_island = AnnotatedGenomicIsland.from_genomic_island(island) - if do_print: - print(annotated_island, file=outstream) - yield annotated_island - - -def evaluate_islands(islands, rules, outstream=None, outstream2=None): - """ Classify/annotate mge islands according to present signals. """ - if is_valid_stream(outstream): - print(*MgeGenomicIsland.TABLE_HEADERS, sep="\t", file=outstream) - if is_valid_stream(outstream2): - print(*MgeGenomicIsland.TABLE_HEADERS, sep="\t", file=outstream2) - for island in islands: - mge_island = MgeGenomicIsland.from_annotated_genomic_island(island) - mge_island.evaluate_recombinases( - rules, - outstream=outstream if not isinstance(outstream, contextlib.nullcontext) else None, - outstream2=outstream2 if not isinstance(outstream2, contextlib.nullcontext) else None, - ) - - yield mge_island - - -def prepare_precomputed_islands(single_island=None, island_file=None): - """ Helper function to deal with precomputed regions/islands. 
""" - precomputed_islands = None - if single_island and island_file: - raise ValueError("Both --single_island and --precomputed_islands set.") - if single_island and not island_file: - precomputed_islands = [GenomicIsland.from_region_string(single_island)] - elif not single_island and island_file: - with open(island_file, "rt", encoding="UTF-8",) as _in: - precomputed_islands = [GenomicIsland.from_region_string(line) for line in _in] - - if precomputed_islands is not None: - precomputed_islands_by_contig = {} - for island in precomputed_islands: - precomputed_islands_by_contig.setdefault(island.contig, []).append(island) - precomputed_islands = precomputed_islands_by_contig - - return precomputed_islands diff --git a/mgexpose/mge_annotation.py b/mgexpose/mge_annotation.py deleted file mode 100644 index 7fcaa55..0000000 --- a/mgexpose/mge_annotation.py +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env python - -# pylint: disable=R0912,R0914,R0915,R0913 - -""" Mobile genetic element annotation """ - -import contextlib -import logging -import os - -from gene_annotator import GeneAnnotator -from handle_args import handle_args -from island_processing import ( - generate_island_set, - annotate_islands, - evaluate_islands, - prepare_precomputed_islands -) -from islands import MgeGenomicIsland -from readers import read_prodigal_gff, read_mge_rules -from gffio import read_genomic_islands_gff - -MGE_TABLE_HEADERS = \ - ("is_tn",) + \ - MgeGenomicIsland.TABLE_HEADERS[1:6] + \ - MgeGenomicIsland.TABLE_HEADERS[8:14] + \ - ("mgeR", "name", "genes",) - -logging.basicConfig( - level=logging.INFO, - format='[%(asctime)s] %(message)s' -) - -logger = logging.getLogger(__name__) - - -def process_islands(genes, genome_id, single_island=None, island_file=None, output_dir=None,): - """ helper function to declutter main() """ - precomputed_islands = prepare_precomputed_islands( - single_island=single_island, - island_file=island_file - ) - - if output_dir: - pang_calls_out = open( - 
os.path.join( - output_dir, - f"{genome_id}.pan_genome_calls.txt"), - "wt", - encoding="UTF-8", - ) - - islands_out = open( - os.path.join( - output_dir, - f"{genome_id}.pan_genome_islands.txt", - ), - "wt", - encoding="UTF-8", - ) - - raw_islands_out = open( - os.path.join( - output_dir, - "..", # temporary! this is only until i know if this is final output or not - f"{genome_id}.pan_genome_islands_raw.txt", - ), - "wt", - encoding="UTF-8", - ) - else: - pang_calls_out, islands_out, raw_islands_out = [contextlib.nullcontext() for _ in range(3)] - - with pang_calls_out, islands_out, raw_islands_out: - yield from generate_island_set( - genes, - pang_calls_out=pang_calls_out, - raw_islands_out=raw_islands_out, - islands_out=islands_out, - precomputed_islands=precomputed_islands, - ) - - -def dump_islands(islands, out_prefix, db, write_genes=False, add_functional_annotation=False): - """ dump genomic islands to intermediate gff """ - with open( - f"{out_prefix}.unannotated_islands.gff3", - "wt", encoding="UTF-8" - ) as _out: - print("##gff-version 3", file=_out) - for island in sorted(islands, key=lambda isl: isl.contig): - island.to_gff( - _out, db, write_genes=write_genes, - add_functional_annotation=add_functional_annotation, - intermediate_dump=True, - ) - - -def identify_recombinase_islands(islands, genome_id, mge_rules, output_dir=None): - """Identify MGE-islands according to a set of rules - using various signals annotated in the corresponding gene set. 
""" - if output_dir: - step1_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step1.txt", - ), - "wt", - encoding="UTF-8", - ) - - step2_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step2.txt", - ), - "wt", - encoding="UTF-8", - ) - - step3_out = open( - os.path.join( - output_dir, - f"{genome_id}.assign_mge.step3.txt", - ), - "wt", - encoding="UTF-8", - ) - - else: - step1_out, step2_out, step3_out = [contextlib.nullcontext() for _ in range(3)] - - with step1_out: - annotated_islands = list(annotate_islands(islands, outstream=step1_out)) - with step2_out, step3_out: - return list( - evaluate_islands( - annotated_islands, - read_mge_rules(mge_rules), - outstream=step2_out, - outstream2=step3_out - ) - ) - - -def write_final_results( - recombinase_islands, - output_dir, - genome_id, - output_suffix, - dbformat=None, - write_tsv=True, - write_gff=True, - write_genes_to_gff=True, - add_functional_annotation=False, -): - """ write final results """ - - outstream = contextlib.nullcontext() - gff_outstream = contextlib.nullcontext() - - out_prefix = os.path.join( - output_dir, - f"{genome_id}.{output_suffix}" - ) - - if write_tsv: - outstream = open( - f"{out_prefix}.txt", - "wt", - encoding="UTF-8", - ) - if write_gff: - gff_outstream = open( - f"{out_prefix}.gff3", - "wt", - encoding="UTF-8", - ) - - # Sort the list of MGEGenomicIslands based on contig names - sorted_islands = sorted(recombinase_islands, key=lambda isl: isl.contig) - - with outstream, gff_outstream: - # TSV header - if write_tsv: - print(*MGE_TABLE_HEADERS, sep="\t", file=outstream) - # GFF3 header - if write_gff: - print("##gff-version 3", file=gff_outstream) - - # Start recording the outputs - for island in sorted_islands: - # TSV: ignore gene-wise annotations; each line is recombinase island, - # all gene IDs are stored in a gene_list column - # assert genome_id == island.genome - if write_tsv: - island.to_tsv(outstream) - # GFF3: add individual genes 
annotation; - # parent lines are recombinase islands, children lines are genes - # GFF3 parent term: recombinase island - if write_gff: - island.to_gff( - gff_outstream, - source_db=dbformat, - write_genes=write_genes_to_gff, - add_functional_annotation=add_functional_annotation, - ) - - -def denovo_annotation(args, debug_dir=None): - """ denovo annotation """ - annotator = GeneAnnotator( - args.genome_id, - args.speci, - read_prodigal_gff(args.prodigal_gff), - include_genome_id=args.include_genome_id, - has_batch_data=args.allow_batch_data, - dbformat=args.dbformat, - ) - - annotated_genes = annotator.annotate_genes( - args.recombinase_hits, - ( - args.phage_eggnog_data, - args.phage_filter_terms, - ), - ( - args.txs_macsy_report, - args.txs_macsy_rules, - # args.macsy_version, - ), - clusters=args.cluster_data, - use_y_clusters=args.use_y_clusters, - core_threshold=(args.core_threshold, -1)[args.precomputed_core_genes], - output_dir=args.output_dir - ) - - out_prefix = os.path.join(args.output_dir, args.genome_id) - - genomic_islands = list( - process_islands( - annotated_genes, - args.genome_id, - single_island=args.single_island, - island_file=args.precomputed_islands, - output_dir=debug_dir, - ) - ) - - if args.dump_genomic_islands or args.skip_island_identification: - - dump_islands( - genomic_islands, - out_prefix, - args.dbformat, - write_genes=True, - add_functional_annotation=args.add_functional_annotation, - ) - - test_islands = list(read_genomic_islands_gff(f"{out_prefix}.unannotated_islands.gff3")) - dump_islands( - test_islands, - out_prefix + ".test", - args.dbformat, - write_genes=True, - add_functional_annotation=args.add_functional_annotation, - ) - - with open( - os.path.join(args.output_dir, f"{args.genome_id}.gene_info.txt"), - "wt", - encoding="UTF-8", - ) as _out: - annotator.dump_genes(_out) - - return genomic_islands - - -def main(): - """ main """ - - args = handle_args() - logger.info("ARGS: %s", str(args)) - - debug_dir = 
os.path.join(args.output_dir, "debug") if args.dump_intermediate_steps else None - - if args.command == "denovo": - genomic_islands = denovo_annotation(args, debug_dir=debug_dir) - - elif args.command == "annotate": - genomic_islands = None - raise NotImplementedError - - if not args.skip_island_identification: - - recombinase_islands = identify_recombinase_islands( - genomic_islands, - args.genome_id, - args.mge_rules, - output_dir=debug_dir, - ) - - if recombinase_islands: - write_final_results( - recombinase_islands, - args.output_dir, - args.genome_id, - args.output_suffix, - dbformat=args.dbformat, - write_gff=args.write_gff, - write_genes_to_gff=args.write_genes_to_gff, - add_functional_annotation=args.add_functional_annotation, - ) - - -if __name__ == "__main__": - main() diff --git a/mgexpose/parse_hmmsearch.py b/mgexpose/parse_hmmsearch.py deleted file mode 100644 index 85ed6a3..0000000 --- a/mgexpose/parse_hmmsearch.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 - -""" Module for parsing recombinase hmmscan results. """ - - -import argparse -import re - -from recombinases import MGE_ALIASES -from readers import read_mge_rules - - -def parse_hmm_table(table_stream): - """ Parse hmm table. """ - for line in table_stream: - line = line.strip() - if line and line[0] != "#": - parsed = re.split(r"\s+", line) - yield parsed, parsed[0], float(parsed[5]) - - -def extract_best_hits(table_stream): - """ Extracts best hits from hmm-table stream. - - Returns best-scoring recombinase hmm hits. - """ - seen = {} - for line, protein, score in table_stream: - seen_score = seen.setdefault(protein, [0.0, ""])[0] - if score > seen_score: - seen[protein] = [score, line] - - return [line for _, line in seen.values()] - - -def generate_output_table(best_hits, mge_rules): - """ Annotates recombinase hmm hits using mge rules. - - Returns annotated table rows via generator. 
- """ - for hit in best_hits: - hit[2] = hit[2].lower() - for name, alias in MGE_ALIASES.items(): - hit[2] = hit[2].replace(name, alias) - - rule = mge_rules.get(hit[2]) - if not rule: - raise ValueError(f"Cannot find rule for {hit[2]}.") - - mges = rule.get_signals() - confidence = "high" if len(mges) == 1 else "low" - - yield (hit[0], hit[2], hit[3], ";".join(mges), hit[4], hit[5], confidence) - - -def main(): - """ Main function, duh. """ - ap = argparse.ArgumentParser() - ap.add_argument("hmmsearch_table", type=str) - ap.add_argument("--mge_rules", type=str, default=None) - ap.add_argument("--prefix", type=str, default="sample") - args = ap.parse_args() - - best_hits = [] - with open(args.hmmsearch_table, "rt", encoding="UTF-8") as table_stream: - best_hits = extract_best_hits(parse_hmm_table(table_stream)) - - if best_hits: - with open( - f"{args.prefix}.recombinase_hmmsearch.besthits.out", - "wt", - encoding="UTF-8", - ) as raw_table_out: - print(*("\t".join(bh) for bh in best_hits), sep="\n", file=raw_table_out) - - if args.mge_rules: - mge_rules = read_mge_rules(args.mge_rules, recombinase_scan=True) - - with open( - f"{args.prefix}.recombinase_based_MGE_predictions.tsv", - "wt", - encoding="UTF-8", - ) as mge_pred_out: - - header = ( - "#unigene", "recombinase_SMART_hmm_name", "PFAM_accession", - "MGE_prediction", "hmmsearch_fullsequence_evalue", - "hmmsearch_fullsequence_score", "MGE_prediction_confidence" - ) - - print(*header, sep="\t", file=mge_pred_out) - - for line in generate_output_table(best_hits, mge_rules): - print(*line, sep="\t", file=mge_pred_out) - - -if __name__ == "__main__": - main() diff --git a/mgexpose/phage.py b/mgexpose/phage.py deleted file mode 100644 index 1d8fc73..0000000 --- a/mgexpose/phage.py +++ /dev/null @@ -1,126 +0,0 @@ -# pylint: disable=R0903 - -""" Phage detection via keyword search """ - -import re - - -class PhageDetection: - """ Class to detect phage signals in freetext functional gene annotation. 
""" - # VIRAL_STRUCTURES = re.compile( - # "portal|fiber|collar|terminase|prohead" - # "|baseplate|sheath|base-plate|tail|head|capsid|tube" - # ) - # m/portal|tail_fiber|terminase|prohead|baseplate|tail_sheath|Tail_sheath| - # base-plate|tail_protein|capsid|tail_tube/ - VIRAL_STRUCTURES_KEYWORDS = ( - r"base-?plate", - r"capsid", - r"portal", - r"prohead", - r"terminase", - r"tail_(fiber|protein|sheath|tube)", - ) - VIRAL_STRUCTURES = re.compile(r"|".join(VIRAL_STRUCTURES_KEYWORDS)) - - # EXCLUDE_LIST = re.compile( - # "ribosome|ribosomal|30s|50s|sipc|tafi" - # "|post-translational|mycolic|macrophage" - # ) - # m/ribosome|ribosomal|30S|50S|SipC|Tafi|post-translational|mycolic| - # macrophage|peptidoglycan|sickle|Rhophilin|ATPase|myelin|Cysteine/i) - EXCLUDED_KEYWORDS = ( - r"ribosom(e|al)", - r"[35]0s", - r"sipc", - r"tafi", - r"post-?translational", - r"my(elin|colic)", - r"macrophage", - r"peptido-?glycan", - r"sickle", - r"rhophilin", - r"atpase", - r"cysteine", - ) - EXCLUDE_LIST = re.compile(r"|".join(EXCLUDED_KEYWORDS)) - - # EXPECTED_PHAGES = re.compile("phage|bacteriophage|prophage|lamboid|lambda") - # m/phage|bacteriophage|prophage|lamboid|lambda|\bMu\b|Mu-like - PHAGE_KEYWORDS = ( - r"(bacterio|pro)?phage", - r"lamb(da|oid)", - r"mu(-like)?" 
- ) - EXPECTED_PHAGES = re.compile(r"|".join(PHAGE_KEYWORDS)) - - # EXTENDED_VIRAL_STRUCTURES = re.compile("holi|dna-packaging|mu-like|lysis|associated|membrane") - # m/holi|DNA-packaging|portal|fiber|collar|terminase|prohead|baseplate| - # sheath|base-plate|lysis|membrane_protein|tail|head|capsid|tube - EXTENDED_VIRAL_STRUCTURES_KEYWORDS = ( - r"holi", - r"dna-packaging", - r"portal", - r"fiber", - r"terminase", - r"prohead", - r"base-?plate", - r"sheath", - r"lysis", - r"membrane_protein", - r"tail", - r"head", - r"capsid", - r"tube", - ) - EXTENDED_VIRAL_STRUCTURES = re.compile(r"|".join(EXTENDED_VIRAL_STRUCTURES_KEYWORDS)) - - # EXCLUDE_INTEGRASE = re.compile("[pP]hage[ _]integrase") - # m/Phage integrase|phage integrase/i - INTEGRASE = re.compile(r"phage[ _]integrase") - - def __init__(self, phage_filter_file=None): - self.phage_filter = set() - if phage_filter_file is not None: - with open(phage_filter_file, "rt", encoding="UTF-8") as _in: - self.phage_filter = set(line.strip().split("\t")[0] for line in _in) - - def is_phage(self, eggnog_freetext, eggnog_og): - """ Filters phage-related parsed eggnog mapper output. - Returns binary phage signal. 
- """ - if eggnog_og in self.phage_filter: - return False - - viral_structure = PhageDetection.VIRAL_STRUCTURES.search(eggnog_freetext) - excluded_term = PhageDetection.EXCLUDE_LIST.search(eggnog_freetext) - if viral_structure and not excluded_term: - return True - - phage_structure = PhageDetection.EXPECTED_PHAGES.search(eggnog_freetext) - ext_viral_structure = PhageDetection.EXTENDED_VIRAL_STRUCTURES.search(eggnog_freetext) - integrase = PhageDetection.INTEGRASE.search(eggnog_freetext) - - if phage_structure and ext_viral_structure and not integrase: - return True - - return False - - # if all(( - # PhageDetection.VIRAL_STRUCTURES.search(eggnog_freetext), - # not PhageDetection.EXCLUDE_LIST.search(eggnog_freetext), - # )): - # self.phage_annotated.add(gene_id) - # return True - # if all(( - # gene_id not in self.phage_annotated, - # PhageDetection.EXPECTED_PHAGES.search(eggnog_freetext), - # any(( - # PhageDetection.VIRAL_STRUCTURES.search(eggnog_freetext), - # PhageDetection.EXTENDED_VIRAL_STRUCTURES.search(eggnog_freetext), - # )), - # not PhageDetection.EXCLUDE_INTEGRASE.search(eggnog_freetext), - # )): - # self.phage_annotated.add(gene_id) - # return True - # return False diff --git a/mgexpose/readers.py b/mgexpose/readers.py deleted file mode 100644 index bed60be..0000000 --- a/mgexpose/readers.py +++ /dev/null @@ -1,199 +0,0 @@ -# pylint: disable=R0903 - -""" Module contains various reader/parser functions """ - -import csv -import gzip -import re -import sys - -from .chunk_reader import get_lines_from_chunks -from .recombinases import MgeRule - - -def read_prodigal_gff(f): - """ Prodigal gff output reader. - - Returns (gene_id, gff_line) tuples via generator. 
- """ - for line in get_lines_from_chunks(f): - line = line.strip() - if line and line[0] != "#": - line = line.split("\t") - _id = [ - item.split("=")[1] - for item in line[8].split(";") - if item.startswith("ID") - ][0] - # gene_id = f"{line[0]}_{_id.split('_')[1]}" - # yield gene_id, line - yield _id, line - - -def read_recombinase_hits(f): - """ Read hmmer output from recombinase scan. - - Returns (gene_id, mge_name) tuples via generator. - """ - with open(f, "rt", encoding="UTF-8") as _in: - for line in _in: - line = line.strip() - if line and line[0] != "#": - gene_id, _, mge, *_ = re.split(r"\s+", line) - yield gene_id, mge - - -# would love to add raw scan parsing to annotator, -# but then the upstream filtering doesn't work anymore... >:( -# def read_recombinase_scan(f): -# recombinase_hits = {} -# with open(f, "rt") as _in: -# for line in _in: -# line = line.strip() -# if line and line[0] != "#": -# gene_id, _, mge, pfam_acc, evalue, score, *_ = re.split(r"\s+", line) -# score = float(score) -# best_hit = recombinase_hits.get(gene_id) -# if best_hit is None or score > best_hit[0]: -# recombinase_hits[gene_id] = score, mge, pfam_acc, evalue - -# for gene_id, recombinase_annotation in recombinase_hits.items(): -# yield gene_id, recombinase_annotation - - -def parse_macsyfinder_rules(f, macsy_version=2): - """ Read macsyfinder rules. - - Returns dictionary {secretion_system: {mandatory: count, accessory: count}}. - """ - key_col, mandatory_col, accessory_col = (0, 1, 2) if macsy_version == 2 else (1, 5, 6) - - with open(f, "rt", encoding="UTF-8") as _in: - return { - row[key_col].replace("_putative", ""): { - "mandatory": int(row[mandatory_col]), - "accessory": int(row[accessory_col]), - } - for row_index, row in enumerate(csv.reader(_in, delimiter="\t")) - if row_index and row and not row[0].startswith("#") - } - - -def parse_macsyfinder_report(f, f_rules, macsy_version=2): - """ Read macsyfinder/txsscan results. 
- - Returns (gene_id, txsscan_results) tuples via generator. - """ - - rules = parse_macsyfinder_rules(f_rules, macsy_version=macsy_version) - - key_col, col1, col2 = (1, 4, 8) if macsy_version == 2 else (0, 6, 9) - - with open(f, "rt", encoding="UTF-8") as _in: - for line in _in: - line = line.strip() - if line and line[0] != "#": - line = re.split(r"\s+", line.strip()) - system = line[col1].replace("TXSS/", "") - rule = rules.get(system) - if rule is None: - print( - "WARNING: cannot find txsscan-rule for system:", - f"`{system}`", - file=sys.stderr, - ) - - if line and line[0] and line[0] != "replicon": - yield line[key_col], (system, rule, line[col2]) - - -def read_mge_rules(f, recombinase_scan=False): - """ Read MGE rules. - - Returns dictionary {mge: MgeRule}. - """ - with open(f, "rt", encoding="UTF-8") as _in: - rules = { - row[0].lower(): MgeRule(row[0], *(tuple(map(int, row[1:]))), recombinase_scan) - for i, row in enumerate(csv.reader(_in, delimiter="\t")) - if i != 0 - } - - # #special case for Tn3 since it can carry conjugative system# - # for rule_id, rule in rules.items(): - # if "tn3" in rule_id: - # rule.ce = 1 - - return rules - - -class EggnogReader: - """ - Class to read and parse Eggnog annotations. 
- Currently, phages are detected - based on the regex signals in the emapper 'description' field - """ - EMAPPER_FIELDS = { - "v1": {"cog_fcat": 11, "description": 12}, - "v2.0.0": {"cog_fcat": 20, "description": 21}, - "v2.0.2": {"cog_fcat": 20, "description": 21}, - "v2.1.0": {"cog_fcat": 9, "description": 10}, - "v2.1.2": {"cog_fcat": 6, "description": 7, - "seed_eggNOG_ortholog": 1, - "seed_ortholog_evalue": 2, - "seed_ortholog_score": 3, - "eggnog_ogs": 4, - "max_annot_lvl": 5, - "goes": 9, - "ec": 10, - "kegg_ko": 11, - "kegg_pathway": 12, - "kegg_module": 13, - "kegg_reaction": 14, - "kegg_rclass": 15, - "brite": 16, - "cazy": 18, - "bigg_reaction": 19, - "pfam": 20 - }, - } - - @staticmethod - def parse_emapper(f, emapper_version="v2.1.2", phage_annotation=None): - """ Parses emapper annotations output. - Returns (gene_id, phage_signal, eggnog) -> (str, boolean, tuple) tuples via generator. - """ - def filter_record(key, value, row): - return value < len(row) and row[value] and row[value] != "-" and key != "description" - - emapper_fields = EggnogReader.EMAPPER_FIELDS.get(emapper_version) - if emapper_fields is None: - raise ValueError(f"{emapper_version} is an unknown emapper annotation format.") - if f.endswith(".gz"): - emapper_stream = gzip.open(f, "rt") - else: - emapper_stream = open(f, "rt", encoding="UTF-8") - - with emapper_stream: - for row in csv.reader(emapper_stream, delimiter="\t"): - if row and row[0][0] != "#": - gene_id = row[0] - # Collect non-empty eggnog attributes - eggnog_gene_ann = tuple( - (key, row[value]) - for key, value in emapper_fields.items() - if filter_record(key, value, row) - ) - phage_signal = None - if phage_annotation is not None: - # note: freetext is converted to lower case here, - # so REs only have to match against lower! 
- eggnog_freetext = re.sub( - r"\s", "_", row[emapper_fields["description"]] - ).lower() - is_phage = phage_annotation.is_phage( - eggnog_freetext, row[emapper_fields["eggnog_ogs"]] - ) - phage_signal = (None, eggnog_freetext)[is_phage] - yield gene_id, phage_signal, eggnog_gene_ann - diff --git a/mgexpose/recombinases.py b/mgexpose/recombinases.py deleted file mode 100644 index c66eb48..0000000 --- a/mgexpose/recombinases.py +++ /dev/null @@ -1,182 +0,0 @@ -# pylint: disable=R0916 - -""" Recombinase rules and aliases """ - -from dataclasses import dataclass - - -MGE_ALIASES = { - "c1_n1ser": "ser_tn", - "c2_n1ser": "ser_ce", - "c3_n1ser": "ser_lsr", - "casposons": "cas1", -} - - -@dataclass -class MgeRule: - '''The following class defines the set of rules used to determine MGE type. - Type classification is based on two criteria: - 1. Recombinase subfamily i.e. - 2. Structural information - - MGE categories include - - IS_Tn(tn) - - Phage(ph) - - CE(conjugative elements) - - Integron(int) - - Cellular(cell) - As well as newly introduced categories: - - Phage_like(pli) - - Mobility island(mi)''' - subfamily: str = None - is_tn: bool = False - phage: bool = False - ce: bool = False - integron: bool = False - cellular: bool = False - recombinase_scan: bool = False - - def __post_init__(self): - """ Deal with special case for Tn3 - since it can carry conjugative system - - ignored if the rule is used during recombinase-scans - """ - if all( - ( - self.subfamily is not None, - "tn3" in self.subfamily.lower(), - not self.recombinase_scan, - ) - ): - self.ce = 1 - - def get_signals(self): - """ Returns MGE signals of rule. """ - return tuple( - k - for k, v in self.__dict__.items() - if v and k not in ("subfamily", "recombinase_scan") - ) - - def c_tn_check(self, island): - """ Tn check. 
""" - # c_tn, n_recombinases = island.c_tn, len(island.recombinases) - c_tn, n_recombinases = island.c_tn, sum(island.recombinases.values()) - if self.is_tn and not self.cellular and not self.ce and not self.phage: - # IS_Tn - c_tn += 1 - elif ( - self.is_tn and self.ce and island.conj_man_count < 1 and - n_recombinases == 2 and not island.tn3_found and not island.ser_found - ): - # c2_n1ser(considers solo c2_n1ser and Tn3) - c_tn = 1 - elif self.is_tn and self.ce and island.conj_man_count < 1 and n_recombinases == 1: - # c2_n1ser(considers c2_n1ser and Tn3 as one tn when not together) - c_tn += 1 - - # disentangles recombinase shared by tn and ph - if (self.is_tn and self.phage and island.phage_count < 2): - c_tn += 1 - - return c_tn - - def patch_c_tn_check(self, island): - """Deals with special case when 2 recombinases. - - c_tn = is_tn and ce and conj_man_count < 1 and |recombinases|=2 and !(tn3 or ser) - """ - # old check was: - # if rule.is_tn and rule.ce and self.conj_man_count < 1 - # and len(self.recombinases) == 2 and not self.tn3_found: - # self.c_tn = True - # elif rule.is_tn and rule.ce and self.conj_man_count < 1 - # and len(self.recombinases) == 2 and not self.ser_found: - # self.c_tn = True - # old: - # if len(island.recombinases) == 2 and self.is_tn and self.ce and island.conj_man_count < 1: - # recombinase_types = list(island.recombinases) - # two_tn3 = "tn3" in recombinase_types[0] and "tn3" in recombinase_types[1] - # two_ser_ce = "ser_ce" in recombinase_types[0] and "ser_ce" in recombinase_types[1] - - # mixed = "tn3" in recombinase_types[0] or "tn3" in recombinase_types[1] - # mixed |= "ser_ce" in recombinase_types[0] or "ser_ce" in recombinase_types[1] - - # return two_tn3 != two_ser_ce or mixed - - if sum(island.recombinases.values()) == 2: - recombinase_types = ",".join(list(island.recombinases)) - mixed = "tn3" in recombinase_types and "ser_ce" in recombinase_types - - two_tn3 = recombinase_types.count("tn3") == 2 - two_ser_ce = 
recombinase_types.count("ser_ce") == 2 - - return (two_tn3 != two_ser_ce) or mixed - - return False - - def phage_check(self, island): - """Phage annotation based on recombinase presence - and phage structural genes in the neighbourhood. - """ - phage, c_mi, nov = island.phage, island.c_mi, island.nov - phage |= (self.is_tn and self.phage) - phage |= (self.phage and not self.ce) - phage |= (self.phage and self.ce) - - c_mi |= (not self.phage and self.ce) - nov = c_mi - - return phage, c_mi, nov - - def phage_like_check(self, island, is_brujita): - """Annotate phage_like element - (presence of phage specific recombinase and absence of phage structural genes - in the neighbourhood) - and mobility island - (presence of recombinase common to phages and conjugative elements - and absence of phage structural genes in the neighbourhood) - """ - c_pli, c_mi = island.c_pli, island.c_mi - if not self.is_tn: - if self.phage and (not self.ce or is_brujita): - c_pli = 1 - elif self.ce and (not self.phage or not is_brujita): - c_mi = 1 - - return c_pli, c_mi - - def conjug_element_check(self, island): - """Conjugative element annotation based on presence of recombinase - and presence of conjugative machinery genes in the neighbourhood - """ - c_ce, nov = island.c_ce, island.nov - if self.ce: - c_ce = 1 - elif self.phage: - c_ce = nov = 1 - elif all( - ( - bool(self.is_tn), - bool(self.ce), - # len(island.recombinases) >= 3, - sum(island.recombinases.values()) >= 3, - (island.tn3_found or island.ser_found) - ) - ): - c_ce = 1 - - return c_ce, nov - - def mobility_island_check(self, island): - """Annotate MI(Mobility island) presence of both - phage structural genes and conjugation machinery genes in the neighbourhood - """ - phage, c_mi, nov = island.phage, island.c_mi, island.nov - if self.is_tn and self.phage: - phage = 1 - else: - c_mi = nov = 1 - - return phage, c_mi, nov diff --git a/mgexpose/test_mge_annotation.py b/mgexpose/test_mge_annotation.py deleted file mode 100644 
index 7299b00..0000000 --- a/mgexpose/test_mge_annotation.py +++ /dev/null @@ -1,129 +0,0 @@ -# pylint: disable=C0301,E0401,W1510,W0621 -# flake8: noqa - -''' -mge_annotation.py -GCA_000012825.1.genomes -GCA_000012825.1.genomes.gff.gz -GCA_000012825.1.genomes.recombinase_hmmsearch.besthits.out -specI_v4_00061 -txsscan_rules.txt -all_systems.tsv -GCA_000012825.1.genomes.emapper.annotations -mge_rules_ms.txt ---cluster_data GCA_000012825.1.genomes_mmseqcluster.tsv.gz ---output_dir specI_v4_00061/GCA_000012825.1.genomes/ ---dump_intermediate_steps ---write_gff -''' -import os -import subprocess -import pytest - -TEST_DATADIR = "../test_data/current/" -OUTPUT_DIR = "specI_v4_00061/GCA_000012825.1.genomes/" -GFF_FILENAME = "GCA_000012825.1.genomes.full_length_MGE_assignments.gff3" -TXT_FILENAME = "GCA_000012825.1.genomes.full_length_MGE_assignments.txt" -DEBUG_FILES = { - # "GCA_000012825.1.genomes.assign_mge.step1.txt": "GCA_000012825.1.genomes.assign_mge.step1.txt", - # "GCA_000012825.1.genomes.assign_mge.step2.txt": "GCA_000012825.1.genomes.assign_mge.step2.txt", - # "GCA_000012825.1.genomes.assign_mge.step3.txt": "GCA_000012825.1.genomes.assign_mge.step3.txt", - # "GCA_000012825.1.genomes.pan_genome_calls.txt": "GCA_000012825.1.genomes.pan_genome_calls.txt", - # "GCA_000012825.1.genomes.pan_genome_islands.txt": "GCA_000012825.1.genomes.pan_genome_islands.txt", -} - -TEST_OUT_GFF = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, GFF_FILENAME) -TEST_OUT_TXT = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, TXT_FILENAME) - -INPUT_ARGS = [ - ("genome_id", "GCA_000012825.1.genomes"), - ("prodigal_gff", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.gff.gz")), - ("recombinase_hits", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.recombinase_hmmsearch.besthits.out")), - ("mge_rules", os.path.join(TEST_DATADIR, "mge_rules_ms.txt")), - ("--speci", "specI_v4_00061"), - ("--txs_macsy_rules", os.path.join(TEST_DATADIR, "txsscan_rules.txt")), - ("--txs_macsy_report", 
os.path.join(TEST_DATADIR, "all_systems.tsv")), - ("--phage_eggnog_data", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes.emapper.annotations")), - ("--cluster_data", os.path.join(TEST_DATADIR, "GCA_000012825.1.genomes_mmseqcluster.tsv.gz")), - ("--output_dir", OUTPUT_DIR), - ("--write_gff", ""), - ("--dump_intermediate_steps", ""), - ("--write_genes_to_gff", ""), - ("--add_functional_annotation", ""), -] - - -@pytest.fixture(scope="module") -def run_mge_annotation(tmpdir_factory): - """Fixture to run the mge_annotation.py script once per module and generate the output files.""" - tmpdir = tmpdir_factory.mktemp("mge_output") - tmp_output_dir = os.path.join(tmpdir, OUTPUT_DIR) - os.makedirs(tmp_output_dir, exist_ok=True) - debug_dir = None - - # Prepare the command with input arguments - command = ["python", "mge_annotation.py", "denovo"] - for arg, val in INPUT_ARGS: - if val: # Append argument only if value is non-empty - if arg == "--output_dir": - command.extend(["--output_dir", tmp_output_dir]) - elif '--' in arg: - command.extend([arg, val]) - else: - command.append(val) # Obligatory input - elif arg == "--dump_intermediate_steps": - command.append(arg) - debug_dir = os.path.join(tmp_output_dir, "debug") - os.makedirs(debug_dir, exist_ok=True) - - else: - command.append(arg) - - # Execute the command - print("Running command:", command) - result = subprocess.run(command, capture_output=True, text=True) - - # Ensure the script ran successfully - assert result.returncode == 0, f"Command failed with error: {result.stderr}" - return tmp_output_dir, debug_dir - - -def compare_output_files(generated_file_path, expected_file_path): - """Helper function to compare a generated file with its expected output.""" - # Ensure the generated file exists - assert os.path.exists(generated_file_path), f"Generated file not found at {generated_file_path}" - - # Read both the expected file and the generated file - with open(expected_file_path, "rb") as f: - expected_content = 
f.read() - - with open(generated_file_path, "rb") as f: - generated_content = f.read() - - # Assert that the content of both files is identical - assert expected_content == generated_content, f"The generated file {generated_file_path} does not match the expected output." - - -def test_gff_output(run_mge_annotation): - """Test to compare the generated GFF file with the expected output.""" - tmp_output_dir, _ = run_mge_annotation - print("Temporary output directory: ", tmp_output_dir) - generated_gff_path = os.path.join(tmp_output_dir, GFF_FILENAME) - compare_output_files(generated_gff_path, TEST_OUT_GFF) - - -def test_txt_output(run_mge_annotation): - """Test to compare the generated TXT file with the expected output.""" - tmp_output_dir, _ = run_mge_annotation - generated_txt_path = os.path.join(tmp_output_dir, TXT_FILENAME) - compare_output_files(generated_txt_path, TEST_OUT_TXT) - - -# Individual tests for each debug file -@pytest.mark.parametrize("debug_filename", DEBUG_FILES.keys()) -def test_debug_file_output(run_mge_annotation, debug_filename): - """Test to compare each file in the debug directory with the expected output.""" - _, debug_dir = run_mge_annotation - generated_file_path = os.path.join(debug_dir, debug_filename) - expected_file_path = os.path.join(TEST_DATADIR, "output", OUTPUT_DIR, "debug", debug_filename) - compare_output_files(generated_file_path, expected_file_path)