From a0b1ea21d77521dfddf2d21a49a5e95f90455c60 Mon Sep 17 00:00:00 2001 From: roryk Date: Thu, 17 Aug 2017 12:25:09 -0400 Subject: [PATCH 1/4] Update README with gene collapsing and insert size commands. This also renames the two helper scripts to be prefixed with pizzly. This way we can load them as binaries with bioconda which will help with incorporation as part of analysis pipelines. Closes #17. --- README.md | 33 ++++++++++++++----- ...flatten_json.py => pizzly_flatten_json.py} | 2 +- ...ength.py => pizzly_get_fragment_length.py} | 2 +- 3 files changed, 26 insertions(+), 11 deletions(-) rename scripts/{flatten_json.py => pizzly_flatten_json.py} (94%) rename scripts/{get_fragment_length.py => pizzly_get_fragment_length.py} (91%) diff --git a/README.md b/README.md index 5f4bbd4..7ca05ae 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ ![pizzly](logo.jpg) -# pizzly +# pizzly Fast fusion detection using kallisto ### About -pizzly is a program for detecting gene fusions from RNA-Seq data of cancer samples. +pizzly is a program for detecting gene fusions from RNA-Seq data of cancer samples. It requires running [kallisto](https://pachterlab.github.io/kallisto) with the `--fusion` -parameter (available in version `0.43.1` or later). +parameter (available in version `0.43.1` or later). ### Building @@ -31,7 +31,7 @@ make install Pizzly requires the reference transcriptome in FASTA format as well as a GTF file describing the transcriptome. We recommend using the [Ensembl](http://www.ensembl.org/index.html) transcriptomes. -The example below assumes you have your transcriptome in FASTA format as `transcripts.fa.gz`, the GTF file `transcripts.gtf.gz` +The example below assumes you have your transcriptome in FASTA format as `transcripts.fa.gz`, the GTF file `transcripts.gtf.gz` and your paired-end RNA-Seq data sets in `r1.fastq.gz` and `r2.fastq.gz` ### Running @@ -48,19 +48,34 @@ Next we quantify using kallisto with fusion detection enabled kallisto quant -i index.idx --fusion -o output r1.fastq.gz r2.fastq.gz ``` -This creates the file `output/fusion.txt` which is used by pizzly, finally we run pizzly +This creates the file `output/fusion.txt` which is used by pizzly. Next, we +get the insert size + +``` +pizzly_get_fragment_length.py output/abundance.h5 +400 +``` + +Finally we run pizzly ``` pizzly -k 31 --gtf transcripts.gtf --cache index.cache.txt --align-score 2 \ --insert-size 400 --fasta transcripts.fa.gz --output test output/fusion.txt ``` -The parameters to set are +Optionally, pizzly transcript-level fusion calls can be collapsed down to gene +level fusion calls by using the `pizzly_flatten_json.py` script + +``` +pizzly_flatten_json.py test.json > test-fusion-genes.tsv +``` + +The parameters to set are * `--insert-size`, which should be the maximum insert size of the paired-end library (kallisto will estimate this from the reads, default used is 400) * `--align-score`, the number of mismatches allowed when aligning reads to a reference transcript (default used is 2) `--ignore-protein`, ignore any information about protein coding in the annotation, **warning** this will probably lead to an increase in the number of false positives reported. -* `--cache`, if this file does not exist, pizzly will parse the GTF (which can take up to a minute or two) and store the required data in the cached file. If the cache file exists (if you've run pizzly previously on the same GTF file), pizzly will parse this file instead, which is much faster than parsing the GTF. +* `--cache`, if this file does not exist, pizzly will parse the GTF (which can take up to a minute or two) and store the required data in the cached file. If the cache file exists (if you've run pizzly previously on the same GTF file), pizzly will parse this file instead, which is much faster than parsing the GTF. A more sophisticated example is in the `test` directory which contains a `snakemake` workflow to index, quantify, call fusions and requantify using `kallisto` and `pizzly`. @@ -75,8 +90,8 @@ The `--output test` parameter is used as a prefix and two files are created `tes The `scripts` subfolder contains useful Python scripts -- `get_fragment_length.py` examines an `abundance.h5` produced by `kallisto` and finds the 95th percentile of the fragment length distribution -- `flatten_json.py` reads the `.json` output and converts to a simple gene table +- `pizzly_get_fragment_length.py` examines an `abundance.h5` produced by `kallisto` and finds the 95th percentile of the fragment length distribution +- `pizzly_flatten_json.py` reads the `.json` output and converts to a simple gene table ### Annotations diff --git a/scripts/flatten_json.py b/scripts/pizzly_flatten_json.py similarity index 94% rename from scripts/flatten_json.py rename to scripts/pizzly_flatten_json.py index 3679011..5d7bd33 100644 --- a/scripts/flatten_json.py +++ b/scripts/pizzly_flatten_json.py @@ -26,7 +26,7 @@ def outputGeneTable(fusions, outf, filters = None): outf.write('\n') def usage(): - print("Usage: python flatten_json.py fusion.out.json [genetable.txt]") + print("Usage: python pizzly_flatten_json.py fusion.out.json [genetable.txt]") print("") print(" outputs a flat table listing all gene fusions, if the output file is not") print(" specified it prints to standard output") diff --git a/scripts/get_fragment_length.py b/scripts/pizzly_get_fragment_length.py similarity index 91% rename from scripts/get_fragment_length.py rename to scripts/pizzly_get_fragment_length.py index 5b48543..62503df 100644 --- a/scripts/get_fragment_length.py +++ b/scripts/pizzly_get_fragment_length.py @@ -12,7 +12,7 @@ def get_cumulative_dist(fn): if __name__ == "__main__": if len(sys.argv) < 2: - print("Usage: python get_fragment_length.py H5FILE [cutoff]") + print("Usage: python pizzly_get_fragment_length.py H5FILE [cutoff]") print("") print("Prints 95 percentile fragment length") print("") From dc0251daa931efaf5b737042b55ead536788041f Mon Sep 17 00:00:00 2001 From: roryk Date: Wed, 6 Sep 2017 09:56:08 -0400 Subject: [PATCH 2/4] Add shebang to top of python scripts to make them executable. --- scripts/pizzly_flatten_json.py | 1 + scripts/pizzly_get_fragment_length.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/pizzly_flatten_json.py b/scripts/pizzly_flatten_json.py index 5d7bd33..979b7da 100644 --- a/scripts/pizzly_flatten_json.py +++ b/scripts/pizzly_flatten_json.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python -Es import sys import json from collections import OrderedDict diff --git a/scripts/pizzly_get_fragment_length.py b/scripts/pizzly_get_fragment_length.py index 62503df..1a075e3 100644 --- a/scripts/pizzly_get_fragment_length.py +++ b/scripts/pizzly_get_fragment_length.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python -Es import h5py import numpy as np import sys From 85021827e8a44cc4c332474aec42843c3376ebaa Mon Sep 17 00:00:00 2001 From: roryk Date: Wed, 6 Sep 2017 13:46:01 -0400 Subject: [PATCH 3/4] Use cutoff properly when detecting the fragment length. --- scripts/pizzly_get_fragment_length.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pizzly_get_fragment_length.py b/scripts/pizzly_get_fragment_length.py index 1a075e3..2fd4903 100644 --- a/scripts/pizzly_get_fragment_length.py +++ b/scripts/pizzly_get_fragment_length.py @@ -28,5 +28,5 @@ def get_cumulative_dist(fn): else: cutoff = 0.95 y = get_cumulative_dist(fn) - fraglen = np.argmax(y > .95) + fraglen = np.argmax(y > cutoff) print(fraglen) From 8fe0c7d3462ab80e35347c7e7c601e2875d55c70 Mon Sep 17 00:00:00 2001 From: roryk Date: Wed, 6 Sep 2017 15:04:09 -0400 Subject: [PATCH 4/4] Use context manager to automatically close the h5 file. --- scripts/pizzly_get_fragment_length.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/pizzly_get_fragment_length.py b/scripts/pizzly_get_fragment_length.py index 2fd4903..22ab8ed 100644 --- a/scripts/pizzly_get_fragment_length.py +++ b/scripts/pizzly_get_fragment_length.py @@ -5,8 +5,8 @@ def get_cumulative_dist(fn): - f = h5py.File(fn) - x = np.asarray(f['aux']['fld'], dtype='float64') + with h5py.File(fn) as f: + x = np.asarray(f['aux']['fld'], dtype='float64') y = np.cumsum(x)/np.sum(x) f.close() return y