grbot · BournSupremacy · Feb 25, 2022 · Mar 7, 2022 · Mar 7, 2022 · Mar 8, 2022
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
diff --git a/bin/adjCdhit.pl b/bin/adjCdhit.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $usage="$0 <fasta> <fasta.clstr> <conversion.txt> <out_fasta> <out.clstr>
+";
+
+die $usage if @ARGV!=5;
+
+my ($fa,$facls,$con,$outfa,$outcls)=@ARGV;
+my %h;
+open(IN,$con);
+while(<IN>){
+    chomp;
+    my ($a,$b)=split /\t/,$_;
+    $h{$b}=$a;
+}
+close IN;
+
+open(OUT,">$outfa");
+open(IN,$fa);
+while(<IN>){
+    if(/^>([0-9]+)/){
+	print OUT ">",$h{$1},"\n";
+    }
+    else{
+	print OUT $_;
+    }
+}
+close IN;
+close OUT;
+
+open(OUT,">$outcls");
+open(IN,$facls);
+while(<IN>){
+    if(/^>/){
+	print OUT $_;
+    }
+    else{
+	my ($a,$b)=split /\.\.\./,$_;
+	my($c,$d)=split /\>/,$a;
+	print OUT $c,$h{$d},$b;
+    }
+}
+close IN;
+
+
diff --git a/bin/bam2cov b/bin/bam2cov
diff --git a/bin/blastCluster.pl b/bin/blastCluster.pl
@@ -0,0 +1,215 @@
+#!/usr/bin/perl 
+use strict;
+use warnings;
+
+my $usage="$0 <origin.fa> <identity> <self-blast_output> <output_prefix>
+
+$0 is used to remove redundant sequences from self-blastn output.
+blastn should be run with \" -evalue 1e-5  -outfmt \"6 qseqid sseqid 
+qlen slen length qstart qend sstart send pident evalue\" -max_target_seqs 1000\"
+
+";
+
+die $usage if @ARGV!=4;
+my ($fa,$IDEN,$blast_out,$prefix)=@ARGV;
+
+my %seq=readfa($fa);
+my %seqL=readLength($fa);
+
+my @cluster;
+my %fflag;
+my %pflag;
+
+my $index=0;
+my %now;
+my $pre_q="";
+
+open(IN,$blast_out) || die "Cannot open $blast_out!\n";
+while(<IN>){
+    chomp;
+    my ($qname,$tname,$qlen,$tlen, $len,$qstart,$qend,$tstart,$tend,$iden,$evalue)=split /\t/,$_;
+    next if $qname eq $tname;
+    next if defined $fflag{$qname};
+    $iden/=100;
+    if($qname eq $pre_q){
+	if ($qlen>=$tlen){
+	    add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
+	}
+	else{
+	    add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
+	}
+    }
+    else{
+	my $new_clus=init_item($qname);
+	$fflag{$qname}=1;
+	$pflag{$qname}=$index;
+	foreach my $k (keys(%now)){
+	    if($now{$k}->{iden}>$IDEN){
+		if(defined $fflag{$k}){
+		    next if $fflag{$k}>=$now{$k}->{iden};
+		    $cluster[$pflag{$k}]->{$k}=0;
+		    $new_clus->{$k}=1;
+		    $fflag{$k}=$now{$k}->{iden};
+		    $pflag{$k}=$index;
+		}
+		else{
+		    $new_clus->{$k}=1;
+		    $fflag{$k}=$now{$k}->{iden};
+		    $pflag{$k}=$index;
+		}
+	    }
+	}
+	push @cluster, $new_clus;
+	$index++;
+	%now=();
+	if ($qlen>=$tlen){
+	    add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
+	}
+	else{
+	    add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
+	}
+    }
+    $pre_q=$qname;
+}
+close IN;
+
+my $outcl=$prefix.".cluster";
+my $outfa=$prefix.".fa";
+my %rep;
+
+open(OUT,">$outcl");
+my $i=0;
+for($i=0;$i<@cluster;$i++){
+    my $l=0;
+    my $rep="";
+    foreach my $k (keys(%{$cluster[$i]})){
+	if($cluster[$i]->{$k}==1){
+	    if($seqL{$k}>$l){
+		$rep=$k;
+		$l=$seqL{$k};
+	    }
+	}
+    }
+    $rep{$rep}=1;
+    print OUT "Group ",$i+1,": ";
+    foreach my $k (keys(%{$cluster[$i]})){
+	print OUT " ",$k if $cluster[$i]->{$k}==1;
+    }
+    print OUT "\n";
+}
+foreach my $k (keys(%seq)){
+    if(!defined($fflag{$k})){
+	$rep{$k}=1;
+	print OUT "Group $i: $k\n";
+	$i++;
+    }
+}
+close OUT;
+
+open(OUT,">$outfa");
+open(IN,$fa);
+my $f=0;
+while(<IN>){
+    chomp;
+    if(/^>(.+)$/){
+	if(defined $rep{$1}){
+	    $f=1;
+	    print OUT $_,"\n";
+	}
+    }
+    else{
+	print OUT $_,"\n" if $f==1;
+    }
+}
+close IN;
+close OUT;
+
+################################################
+sub readfa{
+    my %h;
+    open(IN,$_[0]);
+    while(<IN>){
+	chomp;
+	$h{$1}=1 if(/^>(.+)$/);
+    }
+    close IN;
+    return %h;
+}
+
+sub readLength{
+    my %h;
+    open(IN,$_[0]);
+    my $name="";
+    while(<IN>){
+	chomp;
+	if(/^>(.+)$/){
+	    $name=$1;
+	    $h{$name}=0;
+	}
+	else{
+	    $h{$name}+=length($_);
+	}
+    }
+    close IN;
+    return %h;
+}
+
+sub init_item{
+    my %h;
+    $h{$_[0]}=1;
+    return \%h;
+}
+
+sub add_to_now{
+    my ($tname,$iden,$len,$tstart,$tend,$tlen)=@_;
+    if(defined $now{$tname}){
+	if(!check_overlap($now{$tname},$tstart,$tend)){
+	    push @{$now{$tname}->{start}},$tstart;
+	    push @{$now{$tname}->{end}},$tend;
+	    $now{$tname}->{iden}=$now{$tname}->{iden}+$iden*$len/$tlen;
+	}
+    }
+    else{
+	$now{$tname}=init_h($iden*$len/$tlen,$tstart,$tend);
+    }
+}
+
+sub init_h{
+    my %h;
+    $h{iden}=$_[0];
+    $h{start}=init_array($_[1]);
+    $h{end}=init_array($_[2]);
+    return \%h;
+}
+
+sub init_array{
+    my @a;
+    $a[0]=$_[0];
+    return \@a;
+}
+sub check_overlap{
+    my ($ad,$s,$e)=@_;
+    my $f=0;
+    for(my $i=0;$i<@{$ad->{start}};$i++){
+	if(check_overlap_helper($s,$e,$ad->{start}->[$i],$ad->{end}->$i)){
+	    $f=1;
+	    last;
+	}
+    }
+    close $f;
+}
+
+sub check_overlap_helper{
+    my ($a,$b,$c,$d)=@_;
+    if($a<$c){
+	return 0 if $b<$c;
+	return 1 if $b>=$c;
+    }
+    elsif($a>$c){
+	return 0 if $d<$a;
+	return 1 if $d>=$a;
+    }
+    else{
+	return 1;
+    }
+}
diff --git a/bin/breakGCscaf.pl b/bin/breakGCscaf.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+#Created by Hu Zhiqiang, 2014-10-5
+use strict;
+use warnings;
+use Getopt::Std;
+use vars qw($opt_m);
+getopts("m:");
+my $usage="Usage: $0 [options] <gap-closed scaf> <gap-closed contig>
+This script is to break gap-closed scaffolds into contigs at consecutive N.
+
+Option:
+   -m     <INT>    the number of consecutive N to be broken down
+                   default: 10
+";
+die $usage if @ARGV!=2;
+
+my $con_N=10;
+$con_N=$opt_m if defined($opt_m);
+
+break_scaffolds(@ARGV,$con_N);
+
+sub break_scaffolds{
+    my ($in,$out,$sepN)=@_;
+    my $sep="";
+    for(my $i=1;$i<$sepN;$i++){
+	$sep.="N";
+    }
+    open(SCAF,$in)||die("Error09: cannot read file:$in\n");
+    open(CONTIG,">$out")||die("Error10: cannot write file:$out\n");
+    my $init=1;
+    my ($header,$seq);
+    while(my $line=<SCAF>){
+	chomp $line;
+	if($line=~/^>/){
+	    if(!$init){
+		split_and_print_seq($seq,$sep,$header);
+		($header)=split /\s+/,$line;
+		$seq="";
+	    }
+	    else{
+		($header)=split /\s+/,$line;
+		$seq="";
+		$init=0;
+	    }
+	}
+	else{
+	    $seq.=$line;
+	}
+    }
+    split_and_print_seq($seq,$sep,$header);
+    close SCAF;
+    close CONTIG;
+}
+
+sub split_and_print_seq{
+    my ($seq,$sep,$header)=@_;
+    my $i=0;
+    my @tmp=split /\Q$sep\EN+/,$seq;
+    foreach my $s (@tmp){
+	next if length($s)==0;
+	$i++;
+	print CONTIG $header,"_",$i,"\n";
+	for(my $j=0;$j<int(length($s)/100);$j++){
+	    print CONTIG substr($s,100*$j,100),"\n";
+	}
+	print CONTIG substr($s,0-length($s)%100),"\n" if length($s)%100>0;
+    }
+}
+
diff --git a/bin/ccov b/bin/ccov