From f91cd04b868dbe219a9cba45396028321c9ca1aa Mon Sep 17 00:00:00 2001 From: fhadinezhadUC <38734741+fhadinezhadUC@users.noreply.github.com> Date: Sun, 14 Feb 2021 17:45:48 -0800 Subject: [PATCH] Update README.md --- README.md | 275 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 258 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index e59e160..daa7e75 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # Installation 1. Download tSFM and example data using the below bash command or the zip link: ```bash @@ -43,6 +42,7 @@ As a quick introduction to the functionality of of tSFM we will be utilizing the ```shell tsfm -e NSB -x 5 -c tRNA_L_skel_Leish.sites74.struct.cove --logo -p 10 HOMO/HOMO ``` + 3. Lets break this command down so we can understand the options a. The below `-e` option sets the entropy estimator to NSB. This can also be set to `Miller` to use the Miller-Madow estimator. ```shell @@ -72,12 +72,20 @@ As a quick introduction to the functionality of of tSFM we will be utilizing the ```shell for d in */; do tsfm -e NSB -x 5 -c tRNA_L_skel_Leish.sites74.struct.cove --logo -p 10 $d${d%/}; done ``` -3. Recreating ID/KLD logos and data table for bubble plots for `HOMO` versus `MAJOR` +3. Recreating ID/KLD logos and data table for bubble plots for `HOMO` versus `MAJOR`. ```shell -tsfm -c tRNA_L_skel_Leish.sites74.struct.txt -e Miller -x 5 --idlogos --kldlogos --bt MAJOR/MAJOR HOMO/HOMO +tsfm -c tRNA_L_skel_Leish.sites74.struct.cove -e MM -x 5 --idlogos --kldlogos -B --clade HOMO MAJOR/MAJOR HOMO/HOMO ``` + a. The text output of this command will be two files `F_HOMO_B_MAJOR_Table.txt` and `F_MAJOR_B_HOMO_Table.txt` which can be used to create the bubble plots. + + b. For example, file F_MAJOR_B_HOMO_Table.txt can look like the following table which shows at feature 1A of ID logo: the height of stack-bar in column `gainbits`, and the height of symbol K in column `gainfht`. It also shows at feature 1A of KLD logo: the height of stack-bar in column `convbits` and the height of symbol K in column `convfht`. The columns `x`, `y` and `sprinzl` left as 0 will be filled later by mapping each row to the tRNA sprinzl coordinates prior to creating the bubble plots. + + aa | coord | state | fbits | fht | gainbits | gainfht | lossbits | lossfht | convbits | convfht | x | y | sprinzl + :-: | :-: | :-: | :--: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |:-: + K | 1 | A | 2.7602 | 0.0 | 0.7285 | 0.0 | 0.0 | 0.0 | 0.2913| 0.0 |0.0| 0.0| 0.0 + -# Statistical significance testing +# Statistical significance testing for CIFs tSFM implements statisitical significance testing using permutation based null distributions and corrects multiple testing using FDR and FWER methods. The `-P` option indicates the number of permutations generated for building the null distributions and the `-C` option indicates the method for multiple testing correction. 1. To calculate the significance of CIFs stack-heights with 100 permutations for the humans tRNA data using the NSB entropy estimator we can use this command: @@ -93,18 +101,45 @@ tsfm -e NSB -x 5 -c tRNA_L_skel_Leish.sites74.struct.cove --logo -T stacks -P 10 -P 100 ``` -2. To calculate the significance of KLD-values with 100 permutations for clade humans and clade MAJOR against each other we can use this command: -```shell -tsfm --kldperms 100 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 HOMO/HOMO MAJOR/MAJOR -``` - The `--kldperms` option will set the number of permutations to compute significance of Kullback-Leibler Divergences (KLD values) to 100 - ```shell - --kldperms 100 - ``` +# Statistical significance testing for CIFs Divergences (ID and KLD) +tSFM implements statistical significance testing for CIFs divergences using permutation based null distributions. Pvalues can be calculated with three methods `GPD`, `ECDF` and `ECDF_pseudo`. `ECDF_pseudo` is the Monte Carlo permutation based approach with pseudo-count. `ECDF` is similar to the ECDF_pseudo except that when number of permutation replicates that exceed the original stat exceeds the value of `--exceedances` the Pecdf will be returned instead of Pecdf with pseudo. `GPD` is implemented according to the "Peaks-over-Threshhold" (PoT) tail approximation approach described as Approximate algorithm in the manuscript. In addition to options `-P` and `-C` described before the `-m` option sets the algorithm used for calculation of significance. The default method is GPD. tSFM also calculates the confidence intervals for all the pvalues that are calculated eith method ECDF-without-pseudo and pvalues calculated with method GPD. + +1. To calculate the significance of KLD-values with 100 permutations for clade humans and clade MAJOR against each other we can use three methods of GPD, ECDF and ECDF_pseudo described with three following examples: + 1. Method `ECDF_pseudo` + ```shell + tsfm --kldperms 10 -m ECDF_pseudo -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 HOMO/HOMO MAJOR/MAJOR + ``` + a. The `--kldperms` option will set the number of permutations to compute significance of Kullback-Leibler Divergences (KLD values) to 100. + + + 2. Method `ECDF`. + ```shell + tsfm --kldperms 10 --exceedances 5 -m ECDF --alpha 0.03 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 HOMO/HOMO MAJOR/MAJOR + ``` + a. The `--alpha` option will set the significance level to compute the confidence interval of pvalues. Default is 0.05. + + + 3. Method `GPD`. In addition to the previous options, there are options `--targetperms` and `--peaks` that are used for when the pmethod is set to GPD. options targetperms and peaks are refered as variables T and U respectively in the algorithm Approximate. Also the option exceedances is reffered as parameter S and is used for both methods `ECDF` and `GPD`. + ```shell + tsfm --kldperms 100 -m GPD --targetperms 70 --exceedances 5 --peaks 50 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 HOMO/HOMO MAJOR/MAJOR + ``` + a. The default for `targetperms` option is 500. If the maximum permutation number with aption `--kldperms` or `--idperms` is set to a number less that 500, the targetperms needs to be set to a number less that maximum permutation number. + + b. The default for `exceedances` is 10. This number also needs to be less that maximum permutation number. + + c. The default for `peaks` is 250; However in the algorithm APROXIMATE the peak will be set to the minimum of 250 and one-third of the permutations. + +2. The output of KLD and ID logo significance from the example described above will be two text files named `KLD_HOMO_MAJOR_stats.txt` and `KLD_MAJOR_HOMO_stats.txt`. + a. For example file KLD_MAJOR_HOMO_stats.txt will have the columns same as the following table which shows the significance of the KLD statistic at feature 2U along with other information at this feature including: confidence interval as `CI.Lower` and `CI.Upper`, adjusted p-value as `Adjusted-P`, number of permutations with which the pvalue is calculated as `Permutations`, the method used for calculating the p-value as `P-Val-Method` which can take the values: `p_ecdf`, `p_ecdf_with_pseudo`, `p_ecdf_with_pseudo (p_gpd=0)` and `p_gpd`. If the pvalue is calculated with GPD, the parameters of GPD calculations will be set in columns `GPD-shape`, `GPD-scale`, `Peaks`. Also the column `ADtest-P-val` shows the pvalue of the goodness of fit of the extreme permutation values to GPD distribution. + + Coord|State|Statistic|Sample-Sz-Back|Sample-Sz-Fore|P-value|CI.Lower|CI.Upper|Adjusted-P|Permutations|P-Val-Method|GPD-shape|GPD-scale|Peaks|ADtest-P-val|Freqs-Back|Freqs-Fore + :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |:-:| :-: | :-: |:-: + 2|U|0.75|30|72|2.46e-14|0.0|3.22e-05|7.07e-13|70.0|p_gpd|0.15|0.00037|13.0|0.97|D13E16Y1|D24E24N24s -3. To calculate the significance of ID-values for clade HOMO against clades MAJOR and ENRIETTII with 100 permutations we can use this command: + +3. To calculate the significance of ID-values of single-site features only, for clade HOMO against clades MAJOR and ENRIETTII with 100 permutations using the method GPD use this command: (note that running this command can take about 30 minutes.) ```shell -tsfm --idperms 100 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 --clade HOMO HOMO/HOMO MAJOR/MAJOR ENRIETTII/ENRIETTII +tsfm --idperms 100 -m GPD --targetperms 50 --exceedances 5 --peaks 50 -s -e NSB -x 5 --clade HOMO HOMO/HOMO MAJOR/MAJOR ENRIETTII/ENRIETTII ``` The `--idperms` will set the number of permutations to compute significance of Information Differences (ID values) to 100 ```shell @@ -114,11 +149,14 @@ tsfm --idperms 100 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 --clade ```shell --clade HOMO ``` + The `--targetperms` option will set the permutation number at which GPD-pvalue will be calculated if the extreme permutation values fit the GPD distribution. + ```shell + --targetperms 50 + ``` -# Recreating the supplemental figure from the tSFM publication -Warning: this will take ~14 hours on 24 cores +4. To calculate the significance of KLD-values of paired features only for clade HOMO against clades MAJOR with 100 permutations using the method GPD use this command: ```shell -tsfm --kldperms 10000 --idperms 10000 -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 --clade HOMO HOMO/HOMO ENRIETTII/ENRIETTII MAJOR/MAJOR +tsfm --kldperms 100 -m GPD --targetperms 50 --exceedances 5 --peaks 50 -n -c tRNA_L_skel_Leish.sites74.struct.cove -e NSB -x 5 --clade HOMO HOMO/HOMO MAJOR/MAJOR ``` # Usage @@ -206,3 +244,206 @@ optional arguments: argument to the program, stripped of its path. Default is to compute contrasts for all pairs of clades. ``` + +# Recreating the supplemental figure from the tSFM publication +1. Creating the supplemental figure + + 1. Create the statistic table for KLD logos of clade ENRIETTII vs HOMO using the pvalue-calculation-method GPD with maximum permutation number = 10000, target permutation number 500, and calculate the Confidence Interval of p-values with 5% significance level. The running time on a Intel Core i7 Dell XPS was ~34 minutes. The outputs are two text files: KLD_ENRIETTII_HOMO_stats.txt and KLD_HOMO_ENRIETTII_stats.txt + ```shell + mkdir GPD + cd Kelly2020_data + tsfm --kldperms 10000 -m GPD -c tRNA_L_skel_Leish.sites74.struct.cove HOMO/HOMO ENRIETTII/ENRIETTII + mv KLD_ENRIETTII_HOMO_stats.txt KLD_HOMO_ENRIETTII_stats.txt GPD + ``` + + 2. Create the statistic table for `KLD logos` of clade `MAJOR` vs `HOMO` using the pvalue-calculation-method GPD with maximum permutation number = 10000, target permutation number 500, and calculate the Confidence Interval of p-values with 5% significance level. The running time on a Intel Core i7 Dell XPS was ~52.6 minutes. The outputs are two text files: KLD_MAJOR_HOMO_stats.txt and KLD_HOMO_MAJOR_stats.txt + ```shell + tsfm --kldperms 10000 -m GPD -c tRNA_L_skel_Leish.sites74.struct.cove HOMO/HOMO MAJOR/MAJOR + mv KLD_MAJOR_HOMO_stats.txt KLD_HOMO_MAJOR_stats.txt GPD + ``` + + 3. Create the figure using the KLD tables in GPD file by running the following R code: + ```R + library(latex2exp) + library(ggplot2) + library(ggpubr) + setwd("PATH/TO/Kelly2020_data") # make sure to set your working directory to folder Kelly2020_data + spetralc <-c("#D53E4F" ,"#F46D43" ,"#FDAE61" , "#66C2A5" ,"#3288BD","#313695") + breaksarray <- c(1,2,4,10,30,80,200,500) + legendbreaks <- logb(breaksarray, base = 2) + legendlables <- as.character(breaksarray) + KLD_paths <- + c( + "GPD/KLD_MAJOR_HOMO_stats.txt", + "GPD/KLD_HOMO_MAJOR_stats.txt", + "GPD/KLD_ENRIETTII_HOMO_stats.txt", + "GPD/KLD_HOMO_ENRIETTII_stats.txt" + ) + xlabels <- c("Kullback-Leibler Divergence","Kullback-Leibler Divergence","Kullback-Leibler Divergence","Kullback-Leibler Divergence") + clades <- c("MAJOR (n=664 sequences) against Homo (m=431 sequences)", "HOMO against MAJOR","ENRIETTII (n=160 sequences) against Homo (m=431 sequences)", "HOMO against ENRIETTII") + plotls <- list() + for (i in 1:length(KLD_paths)) { + signTable <- read.table(KLD_paths[i], header = TRUE, sep = "\t") + signTable <- signTable[signTable$Sample.Sz.Back != 0 & signTable$Sample.Sz.Fore != 0 & signTable$Statistic != 0, ] + signTable$harmonicmean <- 0 + for (j in 1:nrow(signTable)) { + signTable$harmonicmean[j] = 1 / mean(1 / c(signTable$Sample.Sz.Back[j], signTable$Sample.Sz.Fore[j])) + } + p <- ggplot(signTable, aes_string( x = round(signTable$Statistic, 4), y = -log(signTable$P.value, base = 2),colour = logb(signTable$harmonicmean, 2))) + + geom_point(aes_string( shape=signTable$P.Val.Method, colour = logb(signTable$harmonicmean, 2),size = (signTable$Permutations)))+ + scale_colour_gradientn( colours = spetralc, breaks = legendbreaks, labels = legendlables, guide = "colourbar", limits = c(0, 9.2)) + ylim(0,60)+# xlim(0,xlimits[i])+ + scale_size_continuous(limits = c(10,10001),breaks = c(10,2000,5000,10001),labels = c("10","2000","5000","10000"))+ + geom_errorbar(aes(ymin=-log(P.value + CI.Upper, base = 2), ymax=-log(P.value - CI.Lower, base = 2)))+ + scale_shape_manual( values = c(2,3,4,1), + breaks = c( "p_ecdf", "p_ecdf_with_pseudo", "p_ecdf_with_pseudo (p_gpd=0)", "p_gpd" ), + labels = c( "Pecdf", "Pecdf with pseudo", "Pecdf with_pseudo (pgpd=0)", "Pgpd" ))+ + ggtitle(clades[i]) + ylab(TeX("$Surprisal (-log_2(p))$")) + xlab(xlabels[i]) + labs(color = "Sample Size", shape = "PmethodType",size = "Permutation number") + + theme( + axis.text = element_text(size = 13, family = "serif"), + axis.title = element_text(size = 14, family = "serif"), + legend.text = element_text(size = 12.5, family = "serif"), + legend.title = element_text(size = 14, family = "serif"), + plot.title = element_text(hjust = 0.5, family = "serif", size = 15, face = "bold" ), + legend.key.height = unit(0.9, "cm"), + plot.margin=unit(c(0.2,0.2,0.5,0.2), "cm") + ) + plotls[[i]] <- p + } + setEPS() + postscript("KLD-Significance_ENRIETTII_MAJOR.eps", width = 16, height = 8,fonts=c("serif", "Palatino")) + p <- ggarrange( plotls[[1]], plotls[[2]], plotls[[3]], plotls[[4]],ncol = 2, nrow = 2, common.legend = TRUE,legend.grob = get_legend(plotls[[2]]), legend = "right",labels = c("A", "B", "C","D")) + print(p) + dev.off() + ``` + + + +2. Creating the supplemental figure + + 1. Create the statistic table for `ID logos` of clade `MAJOR` vs `HOMO` using the pvalue-calculation-method GPD with maximum permutation number = 10000, target permutation number 500, and calculate the Confidence Interval of p-values with 5% significance level. This will use the default entropy estimator NSB with default Maximum conditional sample size 5. The running time on a Intel Core i7 Dell XPS was ~4.76 hrs. The outputs are two text files: ID_MAJOR_HOMO_stats.txt and ID_HOMO_MAJOR_stats.txt + ```shell + tsfm --idperms 10000 -m GPD -c tRNA_L_skel_Leish.sites74.struct.cove HOMO/HOMO MAJOR/MAJOR + mv ID_MAJOR_HOMO_stats.txt ID_HOMO_MAJOR_stats.txt GPD + ``` + + 2. Create the statistic table for ID logos of clade ENRIETTII vs HOMO using the pvalue-calculation-method GPD with maximum permutation number = 10000, target permutation number 500, and calculate the Confidence Interval of p-values with 5% significance level. Default -e NSB -x 5. The running time on a Intel Core i7 Dell XPS was ~4.89 hrs. The outputs are two text files: ID_ENRIETTII_HOMO_stats.txt and ID_HOMO_ENRIETTII_stats.txt + ```shell + tsfm --idperms 10000 -m GPD -c tRNA_L_skel_Leish.sites74.struct.cove HOMO/HOMO ENRIETTII/ENRIETTII + mv ID_ENRIETTII_HOMO_stats.txt ID_HOMO_ENRIETTII_stats.txt GPD + ``` + + 3. Create the figure using the ID tables in GPD file by running the following R code: + ```R + library(latex2exp) + library(ggplot2) + library(ggpubr) + setwd("PATH/TO/Kelly2020_data") # make sure to set your working directory to folder Kelly2020_data + spetralc <-c("#D53E4F" ,"#F46D43" ,"#FDAE61" , "#66C2A5" ,"#3288BD","#313695") + breaksarray <- c(1,2,4,10,30,80,200,500) + legendbreaks <- logb(breaksarray, base = 2) + legendlables <- as.character(breaksarray) + ID_paths <- + c( + "GPD/ID_MAJOR_HOMO_stats.txt", + "GPD/ID_HOMO_MAJOR_stats.txt", + "GPD/ID_ENRIETTII_HOMO_stats.txt", + "GPD/ID_HOMO_ENRIETTII_stats.txt" + ) + xlabels <- c("Information Difference","Information Difference","Information Difference","Information Difference") + clades <- c("MAJOR (n=664 sequences) against Homo (m=431 sequences)", "HOMO against MAJOR", "ENRIETTII (n=160 sequences) against Homo (m=431 sequences)", "HOMO against ENRIETTII") + plotls <- list() + for (i in 1:length(ID_paths)) { + signTable <- read.table(ID_paths[i], header = TRUE, sep = "\t") + signTable <- signTable[signTable$Sample.Sz.Back != 0 & signTable$Sample.Sz.Fore != 0 & signTable$Statistic != 0, ] + signTable$harmonicmean <- 0 + for (j in 1:nrow(signTable)) { + signTable$harmonicmean[j] = 1 / mean(1 / c(signTable$Sample.Sz.Back[j], signTable$Sample.Sz.Fore[j])) + } + p <- ggplot(signTable, aes_string( x = round(signTable$Statistic, 4), y = -log(signTable$P.value, base = 2),colour = logb(signTable$harmonicmean, 2))) + + geom_point(aes_string( shape=signTable$P.Val.Method, colour = logb(signTable$harmonicmean, 2),size = (signTable$Permutations)))+ + scale_colour_gradientn( colours = spetralc, breaks = legendbreaks, labels = legendlables, guide = "colourbar", limits = c(0, 9.2)) + ylim(0,60)+# xlim(0,xlimits[i])+ + scale_size_continuous(limits = c(10,10001),breaks = c(10,2000,5000,10001),labels = c("10","2000","5000","10000"))+ + geom_errorbar(aes(ymin=-log(P.value + CI.Upper, base = 2), ymax=-log(P.value - CI.Lower, base = 2)))+ + scale_shape_manual( values = c(2,3,4,1), + breaks = c( "p_ecdf", "p_ecdf_with_pseudo", "p_ecdf_with_pseudo (p_gpd=0)", "p_gpd" ), + labels = c( "Pecdf", "Pecdf with pseudo", "Pecdf with_pseudo (pgpd=0)", "Pgpd" ))+ + ggtitle(clades[i]) + ylab(TeX("$Surprisal (-log_2(p))$")) + xlab(xlabels[i]) + labs(color = "Sample Size", shape = "PmethodType",size = "Permutation number") + + theme( + axis.text = element_text(size = 13, family = "serif"), + axis.title = element_text(size = 14, family = "serif"), + legend.text = element_text(size = 12.5, family = "serif"), + legend.title = element_text(size = 14, family = "serif"), + plot.title = element_text(hjust = 0.5, family = "serif", size = 15, face = "bold" ), + legend.key.height = unit(0.9, "cm"), + plot.margin=unit(c(0.2,0.2,0.5,0.2), "cm") + ) + plotls[[i]] <- p + } + setEPS() + postscript("ID-Significance_ENRIETTII_MAJOR.eps", width = 16, height = 8,fonts=c("serif", "Palatino")) + p <- ggarrange( plotls[[1]], plotls[[2]], plotls[[3]], plotls[[4]],ncol = 2, nrow = 2, common.legend = TRUE,legend.grob = get_legend(plotls[[2]]), legend = "right",labels = c("A", "B", "C","D")) + print(p) + dev.off() + ``` + +2. Creating the slope graph from tSFM manuscript. + + 1. Create a statistic table for KLD logos of clade ENRIETTII vs HOMO using the pvalue-calculation-method ECDF_pseudo (naive Monte Carlo method using pseudo-counts) with maximum permutation number = 10000. The running time on a Linux compute node with 20 cores at 2301 MHz and 120 GB RAM was ~7.72 hrs. The outputs are two text files: KLD_ENRIETTII_HOMO_stats.txt and KLD_HOMO_ENRIETTII_stats.txt + ```shell + mkdir ECDF_pseudo + tsfm --kldperms 10000 -m ECDF_pseudo -c tRNA_L_skel_Leish.sites74.struct.cove --clade HOMO HOMO/HOMO MAJOR/MAJOR + mv KLD_ENRIETTII_HOMO_stats.txt KLD_HOMO_ENRIETTII_stats.txt ECDF_pseudo + ``` + + 2. Create a statistic table for KLD logos of clade ENRIETTII vs HOMO using the pvalue-calculation-method ECDF (Monte Carlo method terminating after M = 10 exceedances) with maximum permutation number = 10000. The running time on a Linux compute node with 20 cores at 2301 MHz and 120 GB RAM was ~4.86 hrs. The outputs are two text files: KLD_ENRIETTII_HOMO_stats.txt and KLD_HOMO_ENRIETTII_stats.txt + ```shell + mkdir ECDF + tsfm --kldperms 10000 -m ECDF -c tRNA_L_skel_Leish.sites74.struct.cove --clade HOMO HOMO/HOMO MAJOR/MAJOR + mv KLD_ENRIETTII_HOMO_stats.txt KLD_HOMO_ENRIETTII_stats.txt ECDF + ``` + + 3. Create the figure using the KLD tables for ENRIETTII against HOMO from three folders GPD, ECDF and ECDF_pseudo + ```R + signTable0 <- read.table("ECDF_pseudo/KLD_ENRIETTII_HOMO_stats.txt", header = TRUE, sep = "\t") + signTable1 <- read.table("ECDF/KLD_ENRIETTII_HOMO_stats.txt", header = TRUE, sep = "\t") + signTable2 <- read.table("GPD/KLD_ENRIETTII_HOMO_stats.txt", header = TRUE, sep = "\t") + signTable0 <- signTable0[signTable0$Sample.Sz.Back != 0 & signTable0$Sample.Sz.Fore != 0 & signTable0$Statistic != 0,] + signTable0$method <- "ECDF-with-pseudo" + signTable1 <- signTable1[signTable1$Sample.Sz.Back != 0 & signTable1$Sample.Sz.Fore != 0 & signTable1$Statistic != 0,] + signTable1$method <- "ECDF" + signTable2 <- signTable2[signTable2$Sample.Sz.Back != 0 & signTable2$Sample.Sz.Fore != 0 & signTable2$Statistic != 0,] + signTable2$method <- "GPD with Initial-Target-Permnum = 500" + signTable0$P.Val.Method <- "p_ecdf_with_pseudo" + common_cols <- intersect(colnames(signTable0), colnames(signTable1)) + bindeddf <- rbind( + subset(signTable0, select = common_cols), + subset(signTable1, select = common_cols), + subset(signTable2, select = common_cols) + ) + bindeddf$coordstate <- paste(bindeddf$coord,bindeddf$state,sep = "") + bindeddf <- bindeddf[order(bindeddf$coordstate),] + bindeddf$method <- factor(bindeddf$method,levels = c("ECDF-with-pseudo","ECDF","GPD with Initial-Target-Permnum = 500")) + bindeddf$harmonicmean <- 0 + for (j in 1:nrow(bindeddf)) { + bindeddf$harmonicmean[j] = 1 / mean(1 / c(bindeddf$Sample.Sz.Back[j], bindeddf$Sample.Sz.Fore[j])) + } + spetralc <-c("#D53E4F" ,"#F46D43" ,"#FDAE61" , "#66C2A5" ,"#3288BD","#313695") + breaksarray <- c(1,2,4,10,30,80,200,500) + legendbreaks <- logb(breaksarray, base = 2) + legendlables <- as.character(breaksarray) + p <- ggplot(data = bindeddf, aes( x = method , y = jitter(-log(P.value, base = 2),factor = 80), group = coordstate )) + + geom_line(aes(color = logb(harmonicmean, base = 2)), size = 0.5) + + geom_point(aes(color = logb(harmonicmean, base = 2),shape=factor(P.Val.Method)), size = 2) + + scale_x_discrete(position = "top") + + scale_colour_gradientn( colours = spetralc, breaks = legendbreaks, labels = legendlables, guide = "colourbar", limits = c(0, 9.2) ) + + theme( legend.key.height = unit(0.7, 'in'), plot.title = element_text(hjust = 0.5), text = element_text(size = 24, family = "serif")) + + scale_shape_manual( values = c(2,3,4,1), breaks = c( "p_ecdf", "p_ecdf_with_pseudo", "p_ecdf_with_pseudo (p_gpd=0)", "p_gpd"), + labels = c( "pecdf", "pecdf with pseudo", "pecdf with pseudo (pgpd=0)", "pgpd" ))+ + labs(title = "" , colour = "Sample Size",shape="Pmethod", size = "Permutation number") + + xlab("") + ylab(TeX("$Surprisal (-log_2(p))$")) + setEPS() + postscript("KLD_ENRIETTII_Slopegraph_GPDvsECDF.eps", width = 18, height = 10,fonts=c("serif", "Palatino")) + print(p) + dev.off() + ``` +