Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ff0e7bd
Delete .DS_Store
BournSupremacy Feb 25, 2022
c0d7506
Gene coverage scripts updates
Mar 7, 2022
b78c626
Quast step now analyses Minimap results
Mar 7, 2022
b477ec9
Update README.md
BournSupremacy Mar 8, 2022
357844a
Create test
BournSupremacy Mar 11, 2022
393661a
Update README.md
BournSupremacy Mar 14, 2022
0e87aab
Updated location of HUPAN library
BournSupremacy Mar 15, 2022
115047f
Updated name of 0_fastq directory
BournSupremacy Mar 15, 2022
dd2fee1
Fixed hupanSLURM geneExist command
BournSupremacy Mar 23, 2022
3bb0ef0
Update README.md
BournSupremacy Mar 23, 2022
81b44fe
Added descriptions of simulation functions
BournSupremacy Apr 5, 2022
72bbc93
Rename plottingScripts/test to dataAnalysis/JupyterNotebooks/test
BournSupremacy Apr 17, 2022
c610e29
Create temp
BournSupremacy Apr 17, 2022
b1bd1b0
Create temp
BournSupremacy Apr 17, 2022
fdd43a1
Rename dataAnalysis/multiQC/temp to dataAnalysis/multiQC/finalDataset…
BournSupremacy Apr 17, 2022
909ad84
Added MultiQC files for starting dataset
BournSupremacy Apr 17, 2022
2234bf4
Added MultiQC files of final dataset
BournSupremacy Apr 17, 2022
133992a
Delete temp
BournSupremacy Apr 17, 2022
acb7b0d
Delete temp
BournSupremacy Apr 17, 2022
f69d36d
Delete dataAnalysis directory
BournSupremacy May 8, 2022
3b52fca
Corrected ref genome file name
BournSupremacy May 19, 2022
867d7aa
Added step to fix Quast output
BournSupremacy May 31, 2022
6fc316a
Update README.md
BournSupremacy Jun 1, 2022
6344cdd
Update README.md
BournSupremacy Jun 3, 2022
9fe8a1e
Update README.md
BournSupremacy Jun 3, 2022
2fe2df5
Added details about Y chromosome annotations
BournSupremacy Jul 1, 2022
b21dc3b
Update README.md
BournSupremacy Jul 2, 2022
409d478
Update README.md
BournSupremacy Jul 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
259 changes: 150 additions & 109 deletions README.md

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions bin/adjCdhit.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/perl
use strict;
use warnings;

my $usage="$0 <fasta> <fasta.clstr> <conversion.txt> <out_fasta> <out.clstr>
";

die $usage if @ARGV!=5;

my ($fa,$facls,$con,$outfa,$outcls)=@ARGV;
my %h;
open(IN,$con);
while(<IN>){
chomp;
my ($a,$b)=split /\t/,$_;
$h{$b}=$a;
}
close IN;

open(OUT,">$outfa");
open(IN,$fa);
while(<IN>){
if(/^>([0-9]+)/){
print OUT ">",$h{$1},"\n";
}
else{
print OUT $_;
}
}
close IN;
close OUT;

open(OUT,">$outcls");
open(IN,$facls);
while(<IN>){
if(/^>/){
print OUT $_;
}
else{
my ($a,$b)=split /\.\.\./,$_;
my($c,$d)=split /\>/,$a;
print OUT $c,$h{$d},$b;
}
}
close IN;


Binary file added bin/bam2cov
Binary file not shown.
215 changes: 215 additions & 0 deletions bin/blastCluster.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/usr/bin/perl
use strict;
use warnings;

my $usage="$0 <origin.fa> <identity> <self-blast_output> <output_prefix>

$0 is used to remove redundant sequences from self-blastn output.
blastn should be run with \" -evalue 1e-5 -outfmt \"6 qseqid sseqid
qlen slen length qstart qend sstart send pident evalue\" -max_target_seqs 1000\"

";

die $usage if @ARGV!=4;
my ($fa,$IDEN,$blast_out,$prefix)=@ARGV;

my %seq=readfa($fa);
my %seqL=readLength($fa);

my @cluster;
my %fflag;
my %pflag;

my $index=0;
my %now;
my $pre_q="";

open(IN,$blast_out) || die "Cannot open $blast_out!\n";
while(<IN>){
chomp;
my ($qname,$tname,$qlen,$tlen, $len,$qstart,$qend,$tstart,$tend,$iden,$evalue)=split /\t/,$_;
next if $qname eq $tname;
next if defined $fflag{$qname};
$iden/=100;
if($qname eq $pre_q){
if ($qlen>=$tlen){
add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
}
else{
add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
}
}
else{
my $new_clus=init_item($qname);
$fflag{$qname}=1;
$pflag{$qname}=$index;
foreach my $k (keys(%now)){
if($now{$k}->{iden}>$IDEN){
if(defined $fflag{$k}){
next if $fflag{$k}>=$now{$k}->{iden};
$cluster[$pflag{$k}]->{$k}=0;
$new_clus->{$k}=1;
$fflag{$k}=$now{$k}->{iden};
$pflag{$k}=$index;
}
else{
$new_clus->{$k}=1;
$fflag{$k}=$now{$k}->{iden};
$pflag{$k}=$index;
}
}
}
push @cluster, $new_clus;
$index++;
%now=();
if ($qlen>=$tlen){
add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
}
else{
add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
}
}
$pre_q=$qname;
}
close IN;

my $outcl=$prefix.".cluster";
my $outfa=$prefix.".fa";
my %rep;

open(OUT,">$outcl");
my $i=0;
for($i=0;$i<@cluster;$i++){
my $l=0;
my $rep="";
foreach my $k (keys(%{$cluster[$i]})){
if($cluster[$i]->{$k}==1){
if($seqL{$k}>$l){
$rep=$k;
$l=$seqL{$k};
}
}
}
$rep{$rep}=1;
print OUT "Group ",$i+1,": ";
foreach my $k (keys(%{$cluster[$i]})){
print OUT " ",$k if $cluster[$i]->{$k}==1;
}
print OUT "\n";
}
foreach my $k (keys(%seq)){
if(!defined($fflag{$k})){
$rep{$k}=1;
print OUT "Group $i: $k\n";
$i++;
}
}
close OUT;

open(OUT,">$outfa");
open(IN,$fa);
my $f=0;
while(<IN>){
chomp;
if(/^>(.+)$/){
if(defined $rep{$1}){
$f=1;
print OUT $_,"\n";
}
}
else{
print OUT $_,"\n" if $f==1;
}
}
close IN;
close OUT;

################################################
sub readfa{
my %h;
open(IN,$_[0]);
while(<IN>){
chomp;
$h{$1}=1 if(/^>(.+)$/);
}
close IN;
return %h;
}

sub readLength{
my %h;
open(IN,$_[0]);
my $name="";
while(<IN>){
chomp;
if(/^>(.+)$/){
$name=$1;
$h{$name}=0;
}
else{
$h{$name}+=length($_);
}
}
close IN;
return %h;
}

sub init_item{
my %h;
$h{$_[0]}=1;
return \%h;
}

sub add_to_now{
my ($tname,$iden,$len,$tstart,$tend,$tlen)=@_;
if(defined $now{$tname}){
if(!check_overlap($now{$tname},$tstart,$tend)){
push @{$now{$tname}->{start}},$tstart;
push @{$now{$tname}->{end}},$tend;
$now{$tname}->{iden}=$now{$tname}->{iden}+$iden*$len/$tlen;
}
}
else{
$now{$tname}=init_h($iden*$len/$tlen,$tstart,$tend);
}
}

sub init_h{
my %h;
$h{iden}=$_[0];
$h{start}=init_array($_[1]);
$h{end}=init_array($_[2]);
return \%h;
}

sub init_array{
my @a;
$a[0]=$_[0];
return \@a;
}
sub check_overlap{
my ($ad,$s,$e)=@_;
my $f=0;
for(my $i=0;$i<@{$ad->{start}};$i++){
if(check_overlap_helper($s,$e,$ad->{start}->[$i],$ad->{end}->$i)){
$f=1;
last;
}
}
close $f;
}

sub check_overlap_helper{
my ($a,$b,$c,$d)=@_;
if($a<$c){
return 0 if $b<$c;
return 1 if $b>=$c;
}
elsif($a>$c){
return 0 if $d<$a;
return 1 if $d>=$a;
}
else{
return 1;
}
}
69 changes: 69 additions & 0 deletions bin/breakGCscaf.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/perl
#Created by Hu Zhiqiang, 2014-10-5
use strict;
use warnings;
use Getopt::Std;
use vars qw($opt_m);
getopts("m:");
my $usage="Usage: $0 [options] <gap-closed scaf> <gap-closed contig>
This script is to break gap-closed scaffolds into contigs at consecutive N.

Option:
-m <INT> the number of consecutive N to be broken down
default: 10
";
die $usage if @ARGV!=2;

my $con_N=10;
$con_N=$opt_m if defined($opt_m);

break_scaffolds(@ARGV,$con_N);

sub break_scaffolds{
my ($in,$out,$sepN)=@_;
my $sep="";
for(my $i=1;$i<$sepN;$i++){
$sep.="N";
}
open(SCAF,$in)||die("Error09: cannot read file:$in\n");
open(CONTIG,">$out")||die("Error10: cannot write file:$out\n");
my $init=1;
my ($header,$seq);
while(my $line=<SCAF>){
chomp $line;
if($line=~/^>/){
if(!$init){
split_and_print_seq($seq,$sep,$header);
($header)=split /\s+/,$line;
$seq="";
}
else{
($header)=split /\s+/,$line;
$seq="";
$init=0;
}
}
else{
$seq.=$line;
}
}
split_and_print_seq($seq,$sep,$header);
close SCAF;
close CONTIG;
}

sub split_and_print_seq{
my ($seq,$sep,$header)=@_;
my $i=0;
my @tmp=split /\Q$sep\EN+/,$seq;
foreach my $s (@tmp){
next if length($s)==0;
$i++;
print CONTIG $header,"_",$i,"\n";
for(my $j=0;$j<int(length($s)/100);$j++){
print CONTIG substr($s,100*$j,100),"\n";
}
print CONTIG substr($s,0-length($s)%100),"\n" if length($s)%100>0;
}
}

Binary file added bin/ccov
Binary file not shown.
Loading