-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtitanCNAprepare.pl
More file actions
executable file
·224 lines (190 loc) · 8.21 KB
/
titanCNAprepare.pl
File metadata and controls
executable file
·224 lines (190 loc) · 8.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
use strict;
use Data::Dumper;
use File::Glob ':glob';
use File::Basename;
# Positional command-line arguments, taken off the front of @ARGV in this order:
#   $data         tab-delimited variant table that is scanned line by line below
#   $somaticInfo  tab-delimited pairing file: tumor sample, then its normal sample
#   $pairedCall   when equal to 1, the original call is read from the paired tumor's column
#   $homoThred    upper bound on the normal's re-checked allele fraction (default 0.85)
#   $ndepthThred  minimum re-checked depth required in the normal (default 8)
#   $lohRegion    optional tab-delimited file of predetermined germline LOH segments
my ($data, $somaticInfo, $pairedCall, $homoThred, $ndepthThred, $lohRegion)
    = splice(@ARGV, 0, 6);

# Hard-wired switches: $split enables the main pass, $noNormal selects the
# tumor-only branch that the code marks as being for testing only.
my $split    = 1;
my $noNormal = 0;

# An omitted (or empty) threshold falls back to its default.
$homoThred   = 0.85 if $homoThred eq '';
$ndepthThred = 8    if $ndepthThred eq '';
# Sample pairing tables, filled from the optional $somaticInfo file
# (tab-delimited: tumor sample in column 1, its normal sample in column 2).
my %somatic;     # tumor sample  => normal sample
my %germline;    # normal sample => list of its tumors (may have multiple tumors)
if ($somaticInfo ne '' and -s "$somaticInfo") {
    # Three-argument open on a lexical handle; fail loudly instead of
    # silently continuing with empty pairing tables.
    open(my $pairs_fh, '<', $somaticInfo)
        or die "Cannot open somatic info file $somaticInfo: $!";
    while ( my $line = <$pairs_fh> ) {
        chomp $line;
        $line =~ s/[\s\n]$//;      # drop one trailing whitespace character (e.g. a stray "\r")
        next if $line eq '';       # a blank line would otherwise register an empty sample name
        my @columns = split /\t/, $line;
        my $tumor   = $columns[0];
        my $normal  = $columns[1];
        $somatic{$tumor} = $normal;
        # The literal string 'undef' in the normal column marks a tumor without a normal.
        push(@{$germline{$normal}}, $tumor) if $normal ne 'undef';
    }
    close $pairs_fh;
    print STDERR Dumper (\%somatic);
    print STDERR Dumper (\%germline);
}
# Per-sample output files go into a "titan" directory next to the input table.
my $outdir = dirname($data)."/titan/";
print STDERR "$outdir\n";
unless (-e "$outdir") {
    # List-form system() bypasses the shell, so spaces or shell metacharacters
    # in the path are passed to mkdir verbatim. A failure is reported here
    # rather than surfacing later as an unexplained open() error.
    system('mkdir', '-p', $outdir) == 0
        or die "Cannot create output directory $outdir (mkdir exit status $?)";
}
# %lohr: chromosome => list of "start,end,sample" strings, one entry per data
# row of the optional $lohRegion file. Only the first four tab-separated fields
# of each row (sample, chromosome, start, end) are used.
my %lohr;
if ($lohRegion ne '') {
    # Three-argument open on a lexical handle; an unreadable file is fatal
    # because silently skipping it would drop every LOH segment.
    open(my $loh_fh, '<', $lohRegion)
        or die "Cannot open LOH region file $lohRegion: $!";
    while ( my $line = <$loh_fh> ) {
        chomp $line;
        next if $line =~ /^ID/;    # header row
        my ($sample, $chr, $start, $end) = split /\t/, $line;
        push(@{$lohr{$chr}}, $start.','.$end.','.$sample);
    }
    close $loh_fh;
}
print STDERR "predetermined germline LOH region:\n";
print STDERR Dumper(\%lohr);
# Main pass over the variant table in $data. For every site that looks like a
# genuine call in a normal sample, the allele counts observed in each tumor
# paired with that normal are appended to "$outdir/<tumor>_titan".
#
# Column conventions relied on by the indexing below:
#   * the header line starts with "chr" or "#chr" and names every column;
#   * a column named "<sample>maf" holds the re-checked call of <sample> as
#       maf|endsRatio|cmean,cmedian|strandRatio,strandRatioRef,strandFisherP|badQualFrac
#     (a cell without "|" carries no such surrounding information);
#   * the column right after "<sample>maf" is the re-checked depth;
#   * the column right before it is the original call, whose third
#     "|"-separated field is compared against the genotype string '0/1'.
if ($split == 1) {
    my %colnames;    # column index => column name
    my %colindex;    # column name  => column index
    my %fhs;         # sample name  => output handle opened during this run

    open(my $in, '<', $data) or die "Cannot open $data: $!";
    LINE: while ( my $line = <$in> ) {
        chomp $line;

        if ($line =~ /^[\#]?chr\t/) {    # header line
            # Strip the leading '#' BEFORE splitting, so the first column is
            # registered as 'chr' rather than '#chr'.
            $line =~ s/^\#//;
            my @names = split /\t/, $line;
            for my $i (0 .. $#names) {
                $colnames{$i} = $names[$i];
                $colindex{$names[$i]} = $i;
            }
            next LINE;
        }

        my @cols = split /\t/, $line;
        my $chr = $cols[$colindex{'chr'}];
        my $pos = $cols[$colindex{'pos'}];
        my $ref = $cols[$colindex{'ref'}];
        my $alt = $cols[$colindex{'alt'}];
        next LINE if ($chr =~ /M(T)?$/);    # skip mitochondrial sites

        # Samples that have a predetermined LOH segment covering this coordinate.
        my %lohSample;
        if ( exists( $lohr{$chr} ) ) {
            foreach my $lregion ( @{$lohr{$chr}} ) {
                my ($lstart, $lend, $lsample) = split(',', $lregion);
                $lohSample{$lsample} = 1 if ($pos >= $lstart and $pos <= $lend);
            }
        }

        COLUMN: for my $i (0 .. $#cols) {
            next COLUMN unless ($colnames{$i} =~ /^(.+?)maf$/);    # only "<sample>maf" columns
            my $sample = $1;

            if ($noNormal == 1) {    # no normal available, for testing purpose only
                if ( $cols[$i] =~ /\|/ ) {
                    if (_titan_tumor_call_passes($cols[$i], $homoThred)) {
                        _titan_write_counts(\%fhs, $outdir, $sample, $chr, $pos, $ref, $alt, $cols[$i], $cols[$i+1]);
                    }
                    print STDERR "$chr\t$pos\t$ref\t$alt\t$sample\n";
                }
                next COLUMN;
            }

            # Only normal (blood/control) samples drive the output; each one
            # is then checked against all of its tumors.
            next COLUMN unless exists($germline{$sample});

            # Does a predetermined germline LOH segment of this normal cover the site?
            my $lohSamplePos = ($lohRegion ne '' && exists($lohSample{$sample})) ? 'yes' : 'no';
            if ($lohSamplePos eq 'yes') {
                print STDERR "GLOH: $sample\t$chr\t$pos\n";
            }

            # Original call: the column before "<normal>maf", or, with paired
            # calling, the column named after the first paired tumor.
            my $calledBlood = $cols[$i-1];
            if ( $pairedCall == 1 ) {
                $calledBlood = $cols[$colindex{${$germline{$sample}}[0]}];
            }
            next COLUMN unless ($calledBlood =~ /\|/);    # not originally called

            my @calledBloodInfo = split(/\|/, $calledBlood);
            # Only originally heterozygous sites, unless inside a germline LOH segment.
            next COLUMN if ($calledBloodInfo[2] ne '0/1' and $lohSamplePos eq 'no');

            my @calledBloodRecheck = split(/\|/, $cols[$i]);    # re-checked normal call
            my $calledBloodDepth   = $cols[$i+1];               # re-checked normal depth
            # NOTE(review): this skips the homozygosity filter whenever ANY LOH
            # file was supplied, not only inside an LOH segment - confirm intent.
            unless ($lohRegion ne '' or exists($somatic{$sample})) {
                # An allele fraction above the threshold suggests a wrong genotype.
                next COLUMN if ($calledBloodRecheck[0] > $homoThred);
            }
            next COLUMN if $calledBloodDepth < $ndepthThred;    # normal depth too low
            next COLUMN unless ($cols[$i] =~ /\|/);             # no surrounding information

            # Make sure the variant looks real in the normal.
            my @infos = split(/\|/, $cols[$i]);
            my $bendsratio = $infos[1];
            my ($bcmean, $bcmedian) = split(',', $infos[2]);
            my ($strandRatio) = split(',', $infos[3]);
            my $badQualFrac = $infos[4];
            next COLUMN unless ($bendsratio <= 0.9
                and ($strandRatio != 0 and $strandRatio != 1)
                and $badQualFrac < 0.6
                and (($bcmean+$bcmedian) < 5.5 or $bcmedian <= 2));

            TUMOR: foreach my $tumorSamp (@{$germline{$sample}}) {
                my $indexts = $colindex{$tumorSamp.'maf'};
                # A tumor without its own maf column must be skipped; an undef
                # index would otherwise silently address column 0.
                next TUMOR unless defined $indexts;
                next TUMOR unless _titan_tumor_call_passes($cols[$indexts], $homoThred);
                _titan_write_counts(\%fhs, $outdir, $tumorSamp, $chr, $pos, $ref, $alt, $cols[$indexts], $cols[$indexts+1]);
            }
        } #each col
    } #each line
    close $in;

    # Buffered write errors only surface at close, so check every output handle.
    foreach my $sample (sort keys %fhs) {
        close($fhs{$sample}) or die "Error closing $outdir/${sample}_titan: $!";
    }
} #split samples

# Decide whether a tumor "<sample>maf" cell may be reported. True when the cell
# carries surrounding information that passes the ends-ratio and mismatch
# filters, or when the cell is numerically zero and $homoThred is at least 0.85.
sub _titan_tumor_call_passes {
    my ($cell, $homoThred) = @_;
    my @tsinfo = split(/\|/, $cell);
    my $tsendsratio = $tsinfo[1];
    my ($tscmean, $tscmedian) = split(',', $tsinfo[2]);
    return 1 if ($cell =~ /\|/ and $tsendsratio <= 0.9 and (($tscmean+$tscmedian) < 5.5 or $tscmedian <= 2));
    return 1 if ($cell == 0 and $homoThred >= 0.85);
    return 0;
}

# Write one "chr pos ref refCount alt altCount" row for $sample, opening
# "$outdir/<sample>_titan" in append mode on first use in this run.
#   $fhs    hash ref caching the open handle of every sample
#   $cell   the sample's maf cell; its first "|"-separated field is the allele fraction
#   $depth  the sample's depth at this site
# Rows whose two counts sum to fewer than 5 are not written.
sub _titan_write_counts {
    my ($fhs, $outdir, $sample, $chr, $pos, $ref, $alt, $cell, $depth) = @_;

    # Track open handles in the cache rather than testing file existence: a
    # file left over from an earlier run must not leave the handle undefined.
    unless (exists $fhs->{$sample}) {
        my $path = "$outdir/${sample}_titan";
        my $is_empty = !(-s $path);
        warn "appending to existing file $path\n" unless $is_empty;
        open(my $fh, '>>', $path) or die "Cannot open $path for appending: $!";
        $fhs->{$sample} = $fh;
        print {$fh} "chr\tpos\tref\trefCount\talt\taltCount\n" if $is_empty;
    }

    my ($tsmaf) = split(/\|/, $cell);
    my $NrefCount = round($tsmaf*$depth);    # reads supporting the alt allele
    my $refCount  = $depth - $NrefCount;     # remaining reads
    if (($refCount + $NrefCount) >= 5) {
        print {$fhs->{$sample}} "$chr\t$pos\t$ref\t$refCount\t$alt\t$NrefCount\n";
    }
    return;
}
# round($number)
# Round $number to the nearest integer, with exact halves going up
# (2.5 -> 3, -1.5 -> -1). Returns the rounded integer.
sub round {
    my $number = shift;
    my $tmp = int($number);
    # int() truncates toward zero, which for a negative non-integer is ABOVE
    # the value; step down to the true floor before applying the half-up test.
    $tmp-- if $tmp > $number;
    if ($number >= ($tmp+0.5)){
        $tmp++;
    }
    return $tmp;
}