Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
219 changes: 124 additions & 95 deletions README.md

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions bin/adjCdhit.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/perl
use strict;
use warnings;

my $usage="$0 <fasta> <fasta.clstr> <conversion.txt> <out_fasta> <out.clstr>
";

die $usage if @ARGV!=5;

my ($fa,$facls,$con,$outfa,$outcls)=@ARGV;
my %h;
open(IN,$con);
while(<IN>){
chomp;
my ($a,$b)=split /\t/,$_;
$h{$b}=$a;
}
close IN;

open(OUT,">$outfa");
open(IN,$fa);
while(<IN>){
if(/^>([0-9]+)/){
print OUT ">",$h{$1},"\n";
}
else{
print OUT $_;
}
}
close IN;
close OUT;

open(OUT,">$outcls");
open(IN,$facls);
while(<IN>){
if(/^>/){
print OUT $_;
}
else{
my ($a,$b)=split /\.\.\./,$_;
my($c,$d)=split /\>/,$a;
print OUT $c,$h{$d},$b;
}
}
close IN;


Binary file added bin/bam2cov
Binary file not shown.
215 changes: 215 additions & 0 deletions bin/blastCluster.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#!/usr/bin/perl
use strict;
use warnings;

my $usage="$0 <origin.fa> <identity> <self-blast_output> <output_prefix>

$0 is used to remove redundant sequences from self-blastn output.
blastn should be run with \" -evalue 1e-5 -outfmt \"6 qseqid sseqid
qlen slen length qstart qend sstart send pident evalue\" -max_target_seqs 1000\"

";

die $usage if @ARGV!=4;
my ($fa,$IDEN,$blast_out,$prefix)=@ARGV;

my %seq=readfa($fa);
my %seqL=readLength($fa);

my @cluster;
my %fflag;
my %pflag;

my $index=0;
my %now;
my $pre_q="";

open(IN,$blast_out) || die "Cannot open $blast_out!\n";
while(<IN>){
chomp;
my ($qname,$tname,$qlen,$tlen, $len,$qstart,$qend,$tstart,$tend,$iden,$evalue)=split /\t/,$_;
next if $qname eq $tname;
next if defined $fflag{$qname};
$iden/=100;
if($qname eq $pre_q){
if ($qlen>=$tlen){
add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
}
else{
add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
}
}
else{
my $new_clus=init_item($qname);
$fflag{$qname}=1;
$pflag{$qname}=$index;
foreach my $k (keys(%now)){
if($now{$k}->{iden}>$IDEN){
if(defined $fflag{$k}){
next if $fflag{$k}>=$now{$k}->{iden};
$cluster[$pflag{$k}]->{$k}=0;
$new_clus->{$k}=1;
$fflag{$k}=$now{$k}->{iden};
$pflag{$k}=$index;
}
else{
$new_clus->{$k}=1;
$fflag{$k}=$now{$k}->{iden};
$pflag{$k}=$index;
}
}
}
push @cluster, $new_clus;
$index++;
%now=();
if ($qlen>=$tlen){
add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
}
else{
add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
}
}
$pre_q=$qname;
}
close IN;

my $outcl=$prefix.".cluster";
my $outfa=$prefix.".fa";
my %rep;

open(OUT,">$outcl");
my $i=0;
for($i=0;$i<@cluster;$i++){
my $l=0;
my $rep="";
foreach my $k (keys(%{$cluster[$i]})){
if($cluster[$i]->{$k}==1){
if($seqL{$k}>$l){
$rep=$k;
$l=$seqL{$k};
}
}
}
$rep{$rep}=1;
print OUT "Group ",$i+1,": ";
foreach my $k (keys(%{$cluster[$i]})){
print OUT " ",$k if $cluster[$i]->{$k}==1;
}
print OUT "\n";
}
foreach my $k (keys(%seq)){
if(!defined($fflag{$k})){
$rep{$k}=1;
print OUT "Group $i: $k\n";
$i++;
}
}
close OUT;

open(OUT,">$outfa");
open(IN,$fa);
my $f=0;
while(<IN>){
chomp;
if(/^>(.+)$/){
if(defined $rep{$1}){
$f=1;
print OUT $_,"\n";
}
}
else{
print OUT $_,"\n" if $f==1;
}
}
close IN;
close OUT;

################################################
sub readfa{
my %h;
open(IN,$_[0]);
while(<IN>){
chomp;
$h{$1}=1 if(/^>(.+)$/);
}
close IN;
return %h;
}

sub readLength{
my %h;
open(IN,$_[0]);
my $name="";
while(<IN>){
chomp;
if(/^>(.+)$/){
$name=$1;
$h{$name}=0;
}
else{
$h{$name}+=length($_);
}
}
close IN;
return %h;
}

sub init_item{
my %h;
$h{$_[0]}=1;
return \%h;
}

sub add_to_now{
my ($tname,$iden,$len,$tstart,$tend,$tlen)=@_;
if(defined $now{$tname}){
if(!check_overlap($now{$tname},$tstart,$tend)){
push @{$now{$tname}->{start}},$tstart;
push @{$now{$tname}->{end}},$tend;
$now{$tname}->{iden}=$now{$tname}->{iden}+$iden*$len/$tlen;
}
}
else{
$now{$tname}=init_h($iden*$len/$tlen,$tstart,$tend);
}
}

sub init_h{
my %h;
$h{iden}=$_[0];
$h{start}=init_array($_[1]);
$h{end}=init_array($_[2]);
return \%h;
}

sub init_array{
my @a;
$a[0]=$_[0];
return \@a;
}
sub check_overlap{
my ($ad,$s,$e)=@_;
my $f=0;
for(my $i=0;$i<@{$ad->{start}};$i++){
if(check_overlap_helper($s,$e,$ad->{start}->[$i],$ad->{end}->$i)){
$f=1;
last;
}
}
close $f;
}

sub check_overlap_helper{
my ($a,$b,$c,$d)=@_;
if($a<$c){
return 0 if $b<$c;
return 1 if $b>=$c;
}
elsif($a>$c){
return 0 if $d<$a;
return 1 if $d>=$a;
}
else{
return 1;
}
}
69 changes: 69 additions & 0 deletions bin/breakGCscaf.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/perl
#Created by Hu Zhiqiang, 2014-10-5
use strict;
use warnings;
use Getopt::Std;
use vars qw($opt_m);
getopts("m:");
my $usage="Usage: $0 [options] <gap-closed scaf> <gap-closed contig>
This script is to break gap-closed scaffolds into contigs at consecutive N.

Option:
-m <INT> the number of consecutive N to be broken down
default: 10
";
die $usage if @ARGV!=2;

my $con_N=10;
$con_N=$opt_m if defined($opt_m);

break_scaffolds(@ARGV,$con_N);

sub break_scaffolds{
my ($in,$out,$sepN)=@_;
my $sep="";
for(my $i=1;$i<$sepN;$i++){
$sep.="N";
}
open(SCAF,$in)||die("Error09: cannot read file:$in\n");
open(CONTIG,">$out")||die("Error10: cannot write file:$out\n");
my $init=1;
my ($header,$seq);
while(my $line=<SCAF>){
chomp $line;
if($line=~/^>/){
if(!$init){
split_and_print_seq($seq,$sep,$header);
($header)=split /\s+/,$line;
$seq="";
}
else{
($header)=split /\s+/,$line;
$seq="";
$init=0;
}
}
else{
$seq.=$line;
}
}
split_and_print_seq($seq,$sep,$header);
close SCAF;
close CONTIG;
}

sub split_and_print_seq{
my ($seq,$sep,$header)=@_;
my $i=0;
my @tmp=split /\Q$sep\EN+/,$seq;
foreach my $s (@tmp){
next if length($s)==0;
$i++;
print CONTIG $header,"_",$i,"\n";
for(my $j=0;$j<int(length($s)/100);$j++){
print CONTIG substr($s,100*$j,100),"\n";
}
print CONTIG substr($s,0-length($s)%100),"\n" if length($s)%100>0;
}
}

Binary file added bin/ccov
Binary file not shown.
Loading