diff --git a/src/pindel2vcf.cpp b/src/pindel2vcf.cpp index ad93df9b..6a2733bf 100644 --- a/src/pindel2vcf.cpp +++ b/src/pindel2vcf.cpp @@ -760,6 +760,10 @@ void createHeader(ofstream &outFile, const string& sourceProgram, const string& outFile << "##INFO=" << endl; outFile << "##INFO=" << endl; outFile << "##INFO=" << endl; + if ( g_par.somatic && pindel024uOrLater ){ { + // The field name of VarScan's similar statistic is used here. + outFile << "##INFO=" << endl; + } outFile << "##FORMAT=" << endl; //outFile << "##ALT=" << endl; /*EWL040311: probably not needed, as our calls are precise so we should rather give the exact replacing sequence instead of a label. */ //outFile << "##ALT=" << endl; @@ -1516,21 +1520,20 @@ int FACT(int n) return fact; } -double fisher_test(int a, int c, int b, int d) -{ - double p = 0.0; - int n = a + b + c + d; - p = (FACT(a+b)*FACT(c+d)*FACT(a+c)*FACT(d+b)) / (double)(FACT(a)*FACT(b)*FACT(c)*FACT(d)*FACT(n)); - std::cout << a << " " << c << " " << b << " " << d << " " << p << std::endl; - return p; -} +//double fisher_test(int a, int c, int b, int d, double *_left, double *_right, double *_two) +//{ +// double p = 0.0, left = 0.0, right = 0.0, two = 0.0; +// int n = a + b + c + d; +// //p = (FACT(a+b)*FACT(c+d)*FACT(a+c)*FACT(d+b)) / (double)(FACT(a)*FACT(b)*FACT(c)*FACT(d)*FACT(n)); +// p = kt_fisher_exact (a, b, c, d, &left, &right, &two); +// std::cout << a << "|" << c << "|" << b << "|" << d << "|" << p << "|" << two << std::endl; +// return p; +//} ostream& operator<<(ostream& os, const SVData& svd) { - double somatic_p_value = 0.0; - - + double somatic_p_value, left, right, twotail; os << svd.d_chromosome << "\t"; os << svd.getPosition() << "\t"; @@ -1538,10 +1541,8 @@ ostream& operator<<(ostream& os, const SVData& svd) os << svd.getOutputFormattedReference() << "\t"; os << svd.getOutputFormattedAlternative() << "\t"; os << svd.d_quality << "\t"; - if (svd.d_format.size() == 2 && g_par.somatic) { - - somatic_p_value = fisher_test(svd.d_format[0].getTotalReads(), svd.d_format[0].getTotalRefSupport(), svd.d_format[1].getTotalReads(), svd.d_format[1].getTotalRefSupport()); - //if (somatic_p_value < 0.05) svd.d_filter = "PASS"; + if (svd.d_format.size() == 2 && g_par.somatic && pindel024uOrLater) { + somatic_p_value = kt_fisher_exact (svd.d_format[0].getTotalReads(), svd.d_format[0].getTotalRefSupport(), svd.d_format[1].getTotalReads(), svd.d_format[1].getTotalRefSupport(), &left, &right, &twotail); } if (somatic_p_value < 0.05) { os << "PASS\t"; @@ -1571,8 +1572,8 @@ ostream& operator<<(ostream& os, const SVData& svd) os << "," << svd.d_replaceLenTwo; } - if (svd.d_format.size() == 2 && g_par.somatic) { - os << ";" << somatic_p_value; + if (svd.d_format.size() == 2 && g_par.somatic && pindel024uOrLater ) { + os << ";SPV=" << twotail; } @@ -2035,8 +2036,8 @@ void createParameters() new IntParameter( &g_par.maxPostRepeatLength, "-pl", "--max_postindel_repeatlength", "Filters out all indels where the inserted/deleted sequence is followed by a repetition of the fundamental repeat unit of the inserted/deleted sequence; the maximum size of that 'fundamental unit' given by the value of -pl (default infinite) For example: TCAG->TCAGCAG has insertion CAG and post-insertion sequence CAG. This insertion would be filtered out if -pl has been set to 3 or above, but would be deemed 'sufficiently unrepetitive' if -pl is 2", false, -1 ) ); parameters.push_back( new BoolParameter( &g_par.onlyBalancedSamples, "-sb", "--only_balanced_samples", "Only count a sample as supporting an event if it is supported by reads on both strands, minimum reads per strand given by the -ss parameter. (default false)", false, 0 ) ); -// parameters.push_back( - // new BoolParameter( &g_par.somatic, "-so", "--somatic_p", "compute somatic p value when two samples are present, assume the order is normal and tumor. (default false)", false, 0 ) ); + parameters.push_back( + new BoolParameter( &g_par.somatic, "-so", "--somatic_p", "compute somatic p value when two samples are present, assume the order is normal and tumor. (default false)", false, 0 ) ); parameters.push_back( new IntParameter( &g_par.minimumStrandSupport, "-ss", "--minimum_strand_support", "Only count a sample as supporting an event if at least one of its strands is supported by X reads (default 1)", false, 1 ) ); @@ -2350,6 +2351,9 @@ int main(int argc, char* argv[]) showSet( sampleNames ); cout << "Chromosomes in which SVs have been found:\n"; showSet( chromosomeNames ); + if ( g_par.somatic && ! pindel024uOrLater){ + cout << "P-values for somatic variations can only be calculated for pindel outputs after 0.2.4u.\n" + } map< string, int > sampleMap; makeSampleMap( sampleNames, sampleMap );