-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy path03.splitLocallyColinear.pl
More file actions
102 lines (95 loc) · 4.15 KB
/
03.splitLocallyColinear.pl
File metadata and controls
102 lines (95 loc) · 4.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl -w
=head1 Description
This script is used to make sure the homolog group coordinates with the order of the left to right, then split the crossed sequences to different homolog group so that make several locally colinear blocks.
=head1 Contact & Version
Author: Qi Fang, fangqi@genomics.cn
Version: 1.0, Date: 2019-02-14
Version: 2.0, Date: 2019-03-29, Modified: change the @c to @{$hash{$split_tag}} so that the coordinates would be correct among more than one splits.
Version: 2.1, Date: 2019-03-30, Modified: change the line45 from $seq to $seg{$split_tag} so that there no wrong dump with the sequence splits.
Version: 2.2, Date: 2019-12-10, Modified: Debug the coodinates redundancy of merging neighboring blocks when split the INDEL out which was reported by YuanDeng.
Version: 2.3, Date: 2020-02-28, Modified: Debug the split length ($s) error when the alignments could be merged.
Version: 3.0, Date: 2020-09-22, Modified: Further debug the split length ($s) error when the alignments could be merged.
=head1 Usage Exmples
03.splitLocallyColinear.pl Anc001.maf.sort.all.fasta > Anc001.maf.sort.all.s.fasta
=cut
use strict;
my $input = shift;
open IN,$input;
$/=">";<IN>;
while(<IN>){
my (%hash,%seg);
my ($split_tag,$tag,$s)=(1,0,0);
chomp;
my @a = split /\s+/;
my $seq = $a[3];
my @c = split(/,|;/,$a[1]);
$seg{$split_tag}=$seq;
if($#c==1){
push @{$hash{$split_tag}}, ($c[0],$c[1]);
}else{
for(my $i=0;$i<=$#c-2;$i+=2){
if($c[$i+2]>$c[$i+1]){
push @{$hash{$split_tag}}, ($c[$i],$c[$i+1]);
$s+=$c[$i+1]-$c[$i];
}elsif($c[$i+2]==$c[$i+1]){
push @{$hash{$split_tag}}, ($c[$i],$c[$i+3]);
$s+=$c[$i+3]-$c[$i];$i+=2;
}else{
$tag++;
my $sT=0; my $sE=0;
unshift @{$hash{$split_tag+$tag}}, ($c[$i],$c[$i+1]);
$sT+=$c[$i+1]-$c[$i];
for(my $j=$#{$hash{$split_tag}};$j>0;$j-=2){
if($hash{$split_tag}[$j]<$c[$i+2]){
last;
}elsif($hash{$split_tag}[$j]==$c[$i+2]){
$hash{$split_tag}[-1]=$c[$i+3];
$sE=$c[$i+3]-$c[$i+2]; ##debug by fangqi at 20200228; debug by fangqi at 20200922: if add $s here it would disturb the sequences substr in this run
$i+=2; ##debug by fangqi at 20191210
last;
}else{
unshift @{$hash{$split_tag+$tag}}, ($hash{$split_tag}[$j-1],$hash{$split_tag}[$j]);
$sT+=$hash{$split_tag}[$j]-$hash{$split_tag}[$j-1];
$s-=$hash{$split_tag}[$j]-$hash{$split_tag}[$j-1];
pop @{$hash{$split_tag}};
pop @{$hash{$split_tag}};
}
}
my $sN=0;
for(my $k=0;$k<length($seq);$k++){
my $nu = substr($seg{$split_tag},$k,1);
if($nu ne "-"){
$sN++;
if($sN>$s and $sN<=$s+$sT){
$seg{$split_tag+$tag}.=$nu;
substr($seg{$split_tag}, $k, 1, "-");
}else{
$seg{$split_tag+$tag}.="-";
}
}else{
$seg{$split_tag+$tag}.="-";
}
}
$s+=$sE;
}
}
push @{$hash{$split_tag}}, ($c[-2],$c[-1]);
}
for(my $k=0;$k<=$tag;$k++){
if($tag>0){
my $tmpTag = $split_tag+$k;
print ">$a[0]_p$tmpTag\t";
}else{
print ">$a[0]\t";
}
my ($pos,$len);
for(my $i=0;$i<=$#{$hash{$split_tag+$k}};$i+=2){
$pos.=$hash{$split_tag+$k}[$i].",".$hash{$split_tag+$k}[$i+1].";";
$len.=($hash{$split_tag+$k}[$i+1]-$hash{$split_tag+$k}[$i]).",";
}
if(!defined $pos){print STDERR "$a[0]\n";}
chop $pos;
chop $len;
print "$pos\t$len\n$seg{$split_tag+$k}\n";
}
}close IN;