Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 25 additions & 17 deletions assemblathon_stats.pl
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@
#
###############################################

my $scaffolded_contigs = 0; # how many contigs that are part of scaffolds (sequences must have $n_limit consecutive Ns)
my $scaffolded_contigs = 0; # how many contigs that are part of scaffolds (sequences must have $n_limit consecutive Ns)
my $scaffolded_contig_length = 0; # total length of all scaffolded contigs
my $unscaffolded_contigs = 0; # how many 'orphan' contigs, not part of a scaffold
my $unscaffolded_contig_length = 0; # total length of all contigs not part of scaffold
my $w = 60; # formatting width for output
my %data; # data structure to hold all sequence info key is either 'scaffold', 'contig' or intermediate', values are seqs & length arrays
my (@results, @headers); # arrays to store results (for use with -csv option)
my $w = 60; # formatting width for output
my %data; # data structure to hold all sequence info key is either 'scaffold', 'contig' or intermediate', values are seqs & length arrays
my (@results, @headers); # arrays to store results (for use with -csv option)



Expand Down Expand Up @@ -140,14 +140,14 @@ sub process_FASTA{
$scaffolded_contig_length += $length;

# loop through all contigs that comprise the scaffold
foreach my $contig (split(/N{25,}/, $seq)){
$scaffolded_contigs++;
my $length = length($contig);
foreach my $contig (split(/N{$n_limit,}/, $seq)){
next unless my $length = length($contig);
$scaffolded_contigs++;
push(@{$data{contig}{seqs}},$contig);
push(@{$data{contig}{lengths}},$length);
}
} else {
# must be here if the scaffold is actually just a contig (or is a scaffold with < 25 Ns)
# must be here if the scaffold is actually just a contig (or is a scaffold with < $n_limit Ns)
$unscaffolded_contigs++;
$unscaffolded_contig_length += $length;
push(@{$data{contig}{seqs}},$seq);
Expand Down Expand Up @@ -201,10 +201,10 @@ sub sequence_statistics{
store_results($desc, $average_contigs_per_scaffold) if ($csv);

# now calculate average length of break between contigs
# just find all runs of Ns in scaffolds (>=25) and calculate average length
# just find all runs of Ns in scaffolds (>= $n_limit) and calculate average length
my @contig_breaks;
foreach my $scaffold (@{$data{scaffold}{seqs}}){
while($scaffold =~ m/(N{25,})/g){
while($scaffold =~ m/(N{$n_limit,})/g){
push(@contig_breaks, length($1));
}
}
Expand All @@ -213,11 +213,19 @@ sub sequence_statistics{

if(@contig_breaks == 0){
$average_break_length = 0;
} else{
$average_break_length = sum(@contig_breaks) / @contig_breaks;
} else{
$average_break_length = sum(@contig_breaks) / scalar(@contig_breaks);
}
if($n_limit == 1) {
$desc = "Mean length of breaks (>=${n_limit}N) between contigs in scaffold";
} else {
$desc = "Mean length of breaks (>=${n_limit}Ns) between contigs in scaffold";
}
if(length($n_limit)>=5) {
printf "%${w}s %9d\n", $desc, $average_break_length;
} else {
printf "%${w}s %10d\n", $desc, $average_break_length;
}
$desc = "Average length of break (>25 Ns) between contigs in scaffold";
printf "%${w}s %10d\n", $desc, $average_break_length;
store_results($desc, $average_break_length) if ($csv);
return();
}
Expand Down Expand Up @@ -298,7 +306,7 @@ sub sequence_statistics{
1000000 => '1M',
10000000 => '10M');

foreach my $size qw(1000 10000 100000 1000000 10000000){
foreach my $size (qw(1000 10000 100000 1000000 10000000)){
my $matches = grep { $_ > $size } @{$data{$type}{lengths}};
my $percent = sprintf("%.1f", ($matches / $count) * 100);

Expand Down Expand Up @@ -425,7 +433,7 @@ sub sequence_statistics{
$bases{N} = ($seq =~ tr/N/N/);

my $base_count = 0;
foreach my $base qw (A C G T N){
foreach my $base (qw(A C G T N)){
my $percent = sprintf("%.2f", ($bases{$base} / $length) * 100);
$desc = "$type %$base";
printf "%${w}s %10s\n", $desc, $percent;
Expand Down Expand Up @@ -517,4 +525,4 @@ sub write_csv{


close($out);
}
}