diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 85d41c6..7e84097 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.1 +current_version = 0.7.1 commit = True tag = True diff --git a/annotation/__init__.py b/annotation/__init__.py index c161714..916b518 100644 --- a/annotation/__init__.py +++ b/annotation/__init__.py @@ -7,7 +7,7 @@ from Mikado.parsers import parser_factory from Mikado.transcripts import Gene, Transcript -VERSION = '0.6.1' +VERSION = '0.7.1' RUN_METADATA = "run_details.json" UTR_SELECTION_OPTIONS = ('augustus', 'gold', 'silver', 'bronze', 'all', 'hq_assembly', 'lq_assembly') diff --git a/annotation/prediction.py b/annotation/prediction.py index d965b39..d7efc96 100644 --- a/annotation/prediction.py +++ b/annotation/prediction.py @@ -59,6 +59,9 @@ def combine_arguments_prediction(cli_arguments): if cli_arguments.repeats: cromwell_inputs['ei_prediction.repeats_gff'] = cli_arguments.repeats.name + if cli_arguments.helixer_model: + cromwell_inputs['ei_prediction.helixer_model'] = cli_arguments.helixer_model + if cli_arguments.force_train: cromwell_inputs['ei_prediction.force_train'] = cli_arguments.force_train @@ -89,6 +92,9 @@ def combine_arguments_prediction(cli_arguments): if cli_arguments.mikado_utr_files: cromwell_inputs['ei_prediction.mikado_utr_files'] = ' '.join(cli_arguments.mikado_utr_files) + if cli_arguments.temp_dir: + cromwell_inputs['ei_prediction.temp_dir'] = cli_arguments.temp_dir + if cli_arguments.do_glimmer: cromwell_inputs['ei_prediction.do_glimmer'] = 'true' if cli_arguments.do_glimmer is not True and os.access(cli_arguments.do_glimmer, os.R_OK): @@ -103,7 +109,10 @@ def combine_arguments_prediction(cli_arguments): cromwell_inputs['ei_prediction.do_codingquarry'] = 'true' if cli_arguments.do_codingquarry is not True and os.access(cli_arguments.do_codingquarry, os.R_OK): cromwell_inputs['ei_prediction.codingquarry_training'] = cli_arguments.do_codingquarry - + + if 
cli_arguments.do_helixer: + cromwell_inputs['ei_prediction.do_helixer'] = 'true' + if cli_arguments.no_augustus and cli_arguments.no_augustus is False: cromwell_inputs['ei_prediction.do_augustus'] = 'false' @@ -222,8 +231,11 @@ def combine_arguments_prediction(cli_arguments): cromwell_inputs['ei_prediction.snap_extra_params'] = cli_arguments.snap_extra_params if cli_arguments.augustus_extra_params: cromwell_inputs['ei_prediction.augustus_extra_params'] = cli_arguments.augustus_extra_params + if cli_arguments.helixer_extra_params: + cromwell_inputs['ei_prediction.helixer_extra_params'] = cli_arguments.helixer_extra_params if cli_arguments.evm_extra_params: cromwell_inputs['ei_prediction.evm_extra_params'] = cli_arguments.evm_extra_params + if cli_arguments.mikado_config: cromwell_inputs['ei_prediction.mikado_config'] = cli_arguments.mikado_config.name @@ -246,7 +258,8 @@ def collect_prediction_output(run_metadata): if outputs['ei_prediction.glimmer']: symlink(outputs_path, outputs['ei_prediction.glimmer']) - + if outputs['ei_prediction.helixer']: + symlink(outputs_path, outputs['ei_prediction.helixer']) if outputs['ei_prediction.snap']: symlink(outputs_path, outputs['ei_prediction.snap']) if outputs['ei_prediction.codingquarry']: @@ -271,6 +284,7 @@ def collect_prediction_output(run_metadata): snap_prediction_path = os.path.join(predictions_path, "SNAP") codingquarry_prediction_path = os.path.join(predictions_path, "CodingQuarry") augustus_prediction_path = os.path.join(predictions_path, "Augustus") + helixer_prediction_path = os.path.join(predictions_path, "Helixer") if not os.path.exists(predictions_path): os.mkdir(predictions_path) @@ -303,6 +317,14 @@ def collect_prediction_output(run_metadata): else: if os.path.exists(glimmer_prediction_path): shutil.rmtree(glimmer_prediction_path) + + if outputs['ei_prediction.predictions_helixer']: + if not os.path.exists(helixer_prediction_path): + os.mkdir(helixer_prediction_path) + symlink(helixer_prediction_path, 
outputs['ei_prediction.predictions_helixer']) + else: + if os.path.exists(helixer_prediction_path): + shutil.rmtree(helixer_prediction_path) if outputs['ei_prediction.predictions_augustus']: if not os.path.exists(augustus_prediction_path): diff --git a/annotation/prediction_module/main.wdl b/annotation/prediction_module/main.wdl index 9598891..6db6d4d 100644 --- a/annotation/prediction_module/main.wdl +++ b/annotation/prediction_module/main.wdl @@ -15,7 +15,10 @@ workflow ei_prediction { Boolean do_glimmer = false Directory? glimmer_training Boolean do_snap = false + Boolean do_helixer = false File? snap_training + String? temp_dir + Boolean do_augustus = true Array[File]? transcriptome_models # Classify and divide into Gold, Silver and Bronze Array[File]? homology_models # Take as is @@ -31,6 +34,8 @@ workflow ei_prediction { File? repeats_gff # These are passed through to augustus File? extra_training_models # These models are taken as-is directly as results from the training model selection + File? helixer_model #model used for helixer gene model prediction + Int flank = 200 Int kfold = 8 @@ -51,7 +56,7 @@ workflow ei_prediction { String? snap_extra_params String? augustus_extra_params String? evm_extra_params - + String? 
helixer_extra_params RuntimeAttr augustus_resources } @@ -262,6 +267,18 @@ workflow ei_prediction { Boolean train_utr = select_first([train_utr_, false]) Int total_models = select_first([num_models, 0]) + #Run helixer predictions if GPU is available + if (do_helixer) { + call Helixer { + input: + genome = def_reference_genome, + model = helixer_model, + temp_dir = temp_dir, + species = species, + extra_params = helixer_extra_params + } + } + # Generate CodingQuarry predictions # Considers lowercase as masked by default if (total_models > 1500 && do_codingquarry) { @@ -421,6 +438,7 @@ workflow ei_prediction { augustus_predictions = def_augustus_predictions, snap_predictions = SNAP.predictions, glimmer_predictions = GlimmerHMM.predictions, + helixer_predictions = Helixer.predictions, codingquarry_predictions = CodingQuarry.predictions, codingquarry_fresh_predictions = CodingQuarryFresh.predictions, hq_protein_alignments = hq_protein.processed_gff, @@ -514,6 +532,7 @@ workflow ei_prediction { File? codingquarry = EVM.formatted_codingquarry_predictions File? codingquarry_fresh = EVM.formatted_codingquarry_fresh_predictions File? augustus_abinitio = EVM.formatted_augustus_abinitio_predictions + File? helixer = EVM.formatted_helixer_predictions Array[File]? augustus = EVM.formatted_augustus_runs_predictions File evm_predictions = CombineEVM.predictions File mikado_loci = MikadoPick.loci @@ -1157,6 +1176,56 @@ task GlimmerHMM { >>> } + +task Helixer { + input { + IndexedReference genome + File? model # optional, like the workflow's helixer_model + String? extra_params # extra Helixer.py flags, passed through verbatim + String species + String? temp_dir # optional, like the workflow's temp_dir + RuntimeAttr? 
resources + } + + RuntimeAttr default_attr = object { + constraints: "avx|avx2|sse4", + cpu_cores: 8, + mem_gb: 64, + max_retries: 1, + queue: "ei-a100,ei-gpu" + } + + RuntimeAttr runtime_attr = select_first([resources, default_attr]) + + Int cpus = select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + Int gpus = select_first([runtime_attr.gpu, 1]) # default_attr defines no gpu, so fall back to a single GPU + + runtime { + cpu: cpus + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GB" + constraints: select_first([runtime_attr.constraints, default_attr.constraints]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + queue: select_first([runtime_attr.queue, default_attr.queue]) + gpu: gpus + } + + + output { + File predictions = "helixer.predictions.gff3" + } + + command <<< + set -euxo pipefail + ln -s ~{genome.fasta} + Helixer.py ~{"--model-filepath " + model} \ + ~{"--temporary-dir " + temp_dir} \ + --species ~{species} \ + --fasta-path ~{genome.fasta} \ + ~{extra_params} \ + --gff-output-path helixer.predictions.gff3 + >>> +} + task GenerateModelProteins { input { IndexedReference genome @@ -1393,6 +1462,7 @@ task EVM { File? augustus_abinitio File? snap_predictions File? glimmer_predictions + File? helixer_predictions File? codingquarry_predictions File? codingquarry_fresh_predictions File? hq_protein_alignments @@ -1426,6 +1496,7 @@ task EVM { File? formatted_codingquarry_fresh_predictions = "codingquarry_fresh.predictions.gff" File? formatted_augustus_abinitio_predictions = "augustus_abinitio.predictions.gff" Array[File]? formatted_augustus_runs_predictions = glob("augustus_*.predictions.gff") + File? formatted_helixer_predictions = "helixer.predictions.gff" File? formatted_snap_predictions_stats = "snap.predictions.stats" File? formatted_glimmer_predictions_stats = "glimmer.predictions.stats" @@ -1433,6 +1504,7 @@ task EVM { File? formatted_codingquarry_fresh_predictions_stats = "codingquarry_fresh.predictions.stats" File? 
formatted_augustus_abinitio_predictions_stats = "augustus_abinitio.predictions.stats" Array[File]? formatted_augustus_runs_predictions_stats = glob("augustus_*.predictions.stats") + File? formatted_helixer_predictions_stats = "helixer.predictions.stats" } command <<< @@ -1487,6 +1559,11 @@ task EVM { mikado util stats augustus_abinitio.predictions.gff augustus_abinitio.predictions.stats fi + if [ "~{helixer_predictions}" != "" ]; then + cat ~{helixer_predictions} | gff_to_evm glimmer | tee helixer.predictions.gff >> predictions.gff + mikado util stats helixer.predictions.gff helixer.predictions.stats + fi + augustus_run=0 for i in ~{sep=" " augustus_predictions}; do ((augustus_run++)) diff --git a/cromwell_configuration/slurm.conf b/cromwell_configuration/slurm.conf index 26ec05a..42ba91e 100644 --- a/cromwell_configuration/slurm.conf +++ b/cromwell_configuration/slurm.conf @@ -73,6 +73,7 @@ backend { Int memory_mb = 8000 String? constraints String? queue = "ei-medium" + Int? gpu """ submit = """ @@ -84,6 +85,14 @@ backend { --mem ${memory_mb} \ --wrap "/bin/bash ${script}" + elif [[ "${queue}" == *ei-a100* || "${queue}" == *ei-gpu* ]] #gpu specific nodes in the cluster (queue may be a comma-separated list) + then + sbatch -J ${job_name} --constraint="${constraints}" -D ${cwd} -o ${out} -e ${err} -t ${runtime_minutes} \ + -p "${queue}" \ + ${"-c " + cpu} \ + --mem ${memory_mb} \ + ${"--gres=gpu:" + gpu} \ + --wrap "/bin/bash ${script}" else sbatch -J ${job_name} --constraint="${constraints}" -D ${cwd} -o ${out} -e ${err} -t ${runtime_minutes} \ -p ${queue} \ diff --git a/reat.yml b/reat.yml index d1c438f..fe4358b 100644 --- a/reat.yml +++ b/reat.yml @@ -32,3 +32,4 @@ dependencies: - codingquarry=2.0 - augustus=3.4.0 - evidencemodeler=1.1.1 + diff --git a/setup.py b/setup.py index 54e1bb1..8ed8915 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="reat", - version="0.6.1", + version="0.7.1", packages=find_packages(".", exclude=["tests"]), 
url="https://github.com/ei-corebioinformatics/reat", classifiers=[ @@ -19,8 +19,8 @@ "Programming Language :: Python :: 3.8", ], license="MIT", - author="Luis Yanes, Gemy Kaithakottil", - author_email="luis.yanes@earlham.ac.uk, gemy.kaithakottil@earlham.ac.uk", + author="Luis Yanes, Gemy Kaithakottil, Mariano Olivera Fedi", + author_email="luis.yanes@earlham.ac.uk, gemy.kaithakottil@earlham.ac.uk, mariano.olivera-fedi@earlham.ac.uk", description="Robust Eukaryotic Annotation Toolkit", zip_safe=False, keywords="gene annotation WDL pipeline workflow cromwell transcriptome homology", diff --git a/singularity/build_image.sh b/singularity/build_image.sh index e52908f..97a78e9 100644 --- a/singularity/build_image.sh +++ b/singularity/build_image.sh @@ -1,5 +1,5 @@ set -euxo -version=0.6.1 +version=0.7.1 rundir=$(dirname "$(realpath "$0")") cd "$(mktemp -d)" cp "${rundir}"/reat_singularity.def reat.def