Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 98 additions & 105 deletions definitions/tools/unaligned_seq_fda_stats.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,65 @@ class: CommandLineTool
baseCommand: ['/usr/bin/perl', 'unaligned_stats.pl']
requirements:
- class: DockerRequirement
dockerPull: "mgibio/cle:v1.4.2"
dockerPull: "mgibio/samtools-cwl:1.16.1"
- class: ResourceRequirement
ramMin: 16000
ramMin: 4000
- class: StepInputExpressionRequirement
- class: InitialWorkDirRequirement
listing:
- entryname: 'unaligned_stats.pl'
entry: |
#!/usr/bin/perl

use strict;
use warnings;
use Carp;

use FileHandle;
use File::Basename;
use File::Spec::Functions;
use File::Spec::Functions;
use IO::Uncompress::AnyUncompress;


# sets global variables with the default
use vars qw/$filter $base $offset $samtools/;
use vars qw/$filter $offset $samtools/;

# defines the minimum base call quality score to filter
# inclusive, for example, Bases with >= Q30
$filter = 30;

# defines the offset to calculate the base call quality score from the Phred +33 encoded by ASCII characters
# For example, 33 = 63.06707094 - 30.06707094 (see below)
$offset = 33;

# Please replace count_N() with count_base() below to enable this option.
# defines a basecalling letter to count
# (default: "N")
$base = 'N'; # now inactivated, don't change this.

# specifies the program paths in docker(mgibio/cle:v1.4.2)
$samtools = "/opt/samtools/bin/samtools"; # samtools 1.3.1 using htslib 1.3.2



# specifies the program paths in docker
$samtools = "/opt/samtools/bin/samtools";


# main subroutine
Main();

# program exits here
exit 0;


sub Main {
# for the paths of input files
my @paths = @ARGV if @ARGV > 0;
croak "input file path required" unless @paths > 0;


# reads input files
my (%freq, %count, %base);
my $format = "fastq";
my $n = 0;
foreach my $path (@paths)
{
my $count_for_path = {}; # $count{$path};
my $freq_for_path = {}; # $freq{$path};

my $base_for_path = 0; # $base{$path};

my $fh;
if ($path =~ /\.gz$/ || $path =~ /\.bz2$/)
{
Expand All @@ -72,15 +72,15 @@ requirements:
{
# for MGI legacy instrument data
$format = "bam";

$fh = FileHandle->new("$samtools view $path |");
}
else
{
$fh = FileHandle->new($path, "r");
}


# reads an input file
croak "Cannot find a file: $path" unless -e $path;
croak "Cannot open a file: $path" unless defined $fh;
Expand All @@ -91,21 +91,19 @@ requirements:
if ($n % 4 == 3)
{
chomp $i;
update_hash($i, \%freq, \%count, $offset, $path);

update_hash($i, $freq_for_path, $count_for_path);
}
elsif ($n % 4 == 1)
{
chomp $i;

# instead of count_base($i, $base);

my $nbase = count_N($i);

$base{sum} += $nbase;
$base{$path} += $nbase;

$base_for_path += $nbase;
}


$n ++;
}
}
Expand All @@ -115,31 +113,42 @@ requirements:
{
next if index($i, '@') == 0;
chomp $i;

my @fields = split /\t/, $i;

update_hash($fields[10], \%freq, \%count, $offset, $path);

# instead of count_base($fields[9], $base);

update_hash($fields[10], $freq_for_path, $count_for_path);

my $nbase = count_N($fields[9]);

$base{sum} += $nbase;
$base{$path} += $nbase;


$base_for_path += $nbase;

$n ++;
}
}
else
{
croak "Invalid input file format: $format";
}


for my $freq (keys %$freq_for_path) {
$freq{sum}{$freq} += $freq_for_path->{$freq};
}
$freq{$path} = $freq_for_path;

for my $c (keys %$count_for_path) {
$count{sum}{$c} += $count_for_path->{$c};
}
$count{$path} = $count_for_path;

$base{sum} += $base_for_path;
$base{$path} = $base_for_path;

# closes the file handler
printf "\n%d lines (cumulative) read from %s", $n, $path;
$fh->close;
}


# prints out the source file information
printf "\n\n[Input file information: %d file(s)]", scalar(@paths);
print "\nfile\tdir";
Expand All @@ -148,122 +157,107 @@ requirements:
my ($name, $dir) = fileparse($path);
printf "\n%s\t%s", $name, $dir;
}

foreach my $key ("sum", @paths)
{
# calculates the total number of base calls and sequences
my ($nseq, $ncall) = (0, 0);
foreach my $len (sort {$a <=> $b} keys %{$count{$key}})
foreach my $len (keys %{$count{$key}})
{
$ncall += $len * $count{$key}->{$len};
$nseq += $count{$key}->{$len};
}

croak "Exception: Invalid number of sequences, $nseq" unless $nseq == ($format eq "fastq") ? $n / 4 : $n;


# calculates the median and mean from a frequency hash
my ($median, $mean, $ncall2) = stat_freq($freq{$key});
croak "Exception: invalid count number of base calls" unless $ncall == $ncall2;

# counts by the minimum base call quality score
my $nfilter = 0;
foreach my $score (keys %{$freq{$key}})
{
if ($filter <= $score)
if ($filter <= (ord($score) - $offset))
{
$nfilter += $freq{$key}->{$score};
}
}

# prints out the summary
printf "\n\n[Quality score summary from %s]", $key eq "sum" ? sprintf("%d file(s)", scalar(@paths)) : $key;
printf "\nTotal Number of Reads\t%s", $nseq; # total sequence number
printf "\nTotal Number of basecalls\t%s", $ncall;
printf "\nBases with %s\t%s\t%s (%%)", $base, $base{$key}, $base{$key} / $ncall * 100;
printf "\nBases with N\t%s\t%s (%%)", $base{$key}, $base{$key} / $ncall * 100;
printf "\nMedian Basecall Quality Score\t%s", $median;
printf "\nMean Basecall Quality Score\t%s", $mean;
printf "\nBases with >= Q%s\t%d\t%s (%%)", $filter, $nfilter, $nfilter / $ncall * 100;
print "\n";
}
}


sub update_hash {
my ($qual, $freq, $count, $offset, $path) = @_;
my ($qual, $freq, $count) = @_;

# converts characters to ASCII numbers
my @scores = map { unpack("C*", $_ ) - $offset } unpack('(a)*', $qual);
my @scores = unpack('(a)*', $qual);

# updates the frequency of basecalling score
foreach (@scores)
{
$freq->{sum}->{$_} ++;
$freq->{$path}->{$_} ++;
$freq->{$_} ++;
}

# counts the sequence length
$count->{sum}->{$#scores + 1} ++;
$count->{$path}->{$#scores + 1} ++;
my $length = scalar(@scores);
$count->{$length} ++;
}


sub count_N {
my ($str) = @_;

# calculates the frequency of "N"
my $count = 0;
if (index($str, 'N') > -1)
{
$count = $str =~ tr/N//;
}

return $count;
}


sub count_base {
my ($str, $base) = @_;

# calculates the frequency of a letter
my $count = 0;
if (index($str, $base) > -1)
{
$count = $str =~ s/$base//g;
}


return $count;
}



sub stat_freq {
my ($hash) = @_;

# frequency hash: {key => data value, value => data count}
croak "empty frequency hash" unless keys(%$hash) > 0;

# sorts the data values
my @sort = sort {$a <=> $b} keys %$hash;
my @sort = sort {ord($a) <=> ord($b)} keys %$hash;

# counts the total number of data
my $n = 0;
map { $n += $hash->{$_} } @sort;

# calculates the median index (0-based index)
my @midx = ($n % 2 == 1) ? ((1 + $n) / 2) - 1 : ((1 + $n) / 2 - 1.5, (1 + $n) / 2 - 0.5);

# calculates the median of base call quality score from the frequency
my ($median, $mean);
my $idx = 0;
for (my $i=0; $i<@sort; $i ++)
{
# gets the last index of the given score in the array (1-based)
my $score = $sort[$i];
$idx += $hash->{$score};

my $score_chr = $sort[$i];
my $score_num = ord($score_chr) - $offset;
$idx += $hash->{$score_chr};

# to calculate the mean
$mean += $score * $hash->{$score};
$mean += $score_num * $hash->{$score_chr};

# calculates the median
unless (defined $median)
{
Expand All @@ -273,24 +267,23 @@ requirements:
{
if ($midx[1] <= $idx - 1)
{
$median = $score;
$median = $score_num;
}
else
{
$median = ($score + $sort[$i + 1]) / 2;
$median = ($score_num + $sort[$i + 1]) / 2;
}
}
else
{
$median = $score;
$median = $score_num;
}
}
}
}

return ($median, $mean / $n, $n);
}

inputs:
input_files:
type: File[]
Expand Down