diff --git a/README.md b/README.md index b49be6c..b0b48e4 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,28 @@ TCC === -R package for tag count comparison +Title +----- -Differential expression analysis for tag count data with robust normalization strategies ----------------------------------------------------------------------------------------- +TCC: Differential expression analysis for tag count data with robust normalization strategies -This package provides functions for performing differential expression -analysis using differentially expressed gene elimination strategy. -A simple unified interface is provided which encapsulates functions -to calculate normalization factors and estimate differentially -expressed genes defined in edgeR, baySeq, and DESeq packages. -The appropriate combination provided by TCC allows a more robust -and accurate estimation performed easily than directly using original packages. -Functions to produce simulation data under various conditions and to plot the data are also provided. +Description +----------- + +This package provides a series of functions for performing +differential expression analysis from RNA-seq count data using robust +normalization strategy (called DEGES). The basic idea of DEGES is that +potential differentially expressed genes or transcripts (DEGs) among +compared samples should be removed before data normalization to obtain +a well-ranked gene list where true DEGs are top-ranked and non-DEGs are +bottom ranked. This can be done by performing a multi-step normalization +strategy (called DEGES for DEG elimination strategy). A major +characteristic of TCC is to provide the robust normalization methods for +several kinds of count data (two-group with or without replicates, +multi-group/multi-factor, and so on) by virtue of the use of combinations +of functions in other sophisticated packages (especially edgeR, DESeq, +and baySeq). 
Documentation ------------- @@ -26,13 +34,14 @@ in addition to the documents included in the software package. Development versions -------------------- -The development version is now on the github repository at https://github.com/tomoakin/TCC +The development version is now on the github repository +at https://github.com/kohijiri/huaying.asagao Releases -------- -Release of this software is made through CRAN at http://cran.r-project.org/web/packages/TCC/ - +Release of this software is made through CRAN at +http://www.bioconductor.org/packages/release/bioc/html/TCC.html No warranty ----------- @@ -53,4 +62,4 @@ A copy of GPL-2 is accompanied in this directory in a file named COPYING. Authors ------- -Sun Jianqiang, Tomoaki Nishiyama, Kentaro Shimizu, and Koji Kadota +Jianqiang Sun, Tomoaki Nishiyama, Kentaro Shimizu, and Koji Kadota diff --git a/TCC/DESCRIPTION b/TCC/DESCRIPTION index 6d4d5b3..72900dc 100644 --- a/TCC/DESCRIPTION +++ b/TCC/DESCRIPTION @@ -1,19 +1,28 @@ Package: TCC Type: Package Title: TCC: Differential expression analysis for tag count data with - robust normalization strategies -Version: 1.0.0 -Date: 2013-01-10 -Author: Sun Jianqiang, Tomoaki Nishiyama, Kentaro Shimizu, and Koji Kadota -Maintainer: Tomoaki Nishiyama -Description: This package provides functions for performing differential expression analysis - using differentially expressed gene elimination strategy. A simple unified interface is - provided which encapsulates functions to calculate normalization factors and estimate - differentially expressed genes defined in edgeR, baySeq, and DESeq packages. - The appropriate combination provided by TCC allows a more robust and accurate estimation - performed easily than directly using original packages. - Functions to produce simulation data under various conditions and to plot the data are also provided. 
-Depends: R (>= 2.15), edgeR, baySeq, DESeq, ROC, methods + robust normalization strategies +Version: 1.1.99 +Author: Jianqiang Sun, Tomoaki Nishiyama, Kentaro Shimizu, and Koji Kadota +Maintainer: Jianqiang Sun , + Tomoaki Nishiyama +Description: This package provides a series of functions for performing + differential expression analysis from RNA-seq count data using robust + normalization strategy (called DEGES). The basic idea of DEGES is that + potential differentially expressed genes or transcripts (DEGs) among + compared samples should be removed before data normalization to obtain + a well-ranked gene list where true DEGs are top-ranked and non-DEGs are + bottom ranked. This can be done by performing a multi-step normalization + strategy (called DEGES for DEG elimination strategy). A major + characteristic of TCC is to provide the robust normalization methods for + several kinds of count data (two-group with or without replicates, + multi-group/multi-factor, and so on) by virtue of the use of combinations + of functions in other sophisticated packages (especially edgeR, DESeq, + and baySeq). +Depends: R (>= 2.15), methods, DESeq, edgeR, baySeq, ROC +Imports: EBSeq, samr +Suggests: RUnit, BiocGenerics +Enhances: snow +biocViews: HighThroughputSequencing, DifferentialExpression, RNAseq License: GPL-2 Copyright: Authors listed above -URL: http://www.iu.a.u-tokyo.ac.jp/~kadota/TCC diff --git a/TCC/NAMESPACE b/TCC/NAMESPACE index 40b540b..6f825ea 100644 --- a/TCC/NAMESPACE +++ b/TCC/NAMESPACE @@ -2,25 +2,33 @@ import(ROC) import(DESeq) import(baySeq) import(edgeR) +import(EBSeq) +import(samr) -# Export TCC class. exportClasses( "TCC" ) -# Export the public methods in TCC package. 
export( + show, + names, + length, + "[", + TCC, calcAUCValue, calcNormFactors, do_TbT, estimateDE, exactTestafterTbT, filterLowCountGenes, - generateSimulationData, + simulateReadCounts, getNormalizedData, getResult, MAplot, NBsample, plotFCPseudocolor, - plot.TCC + plot.TCC, + WAD, + ROKU ) + diff --git a/TCC/NEWS b/TCC/NEWS new file mode 100644 index 0000000..f102b8e --- /dev/null +++ b/TCC/NEWS @@ -0,0 +1,43 @@ +CHANGES IN VERSION 1.2.0 +------------------------ + +NEW FEATURES + + o This package was released as a Bioconductor package (previously CRAN). + o WAD method for identifying DEGs was added. + o ROKU method for identifying tissue-specific genes was added. + o 'increment' argument of 'calcNormFactors' function was added. + + +SIGNIFICANT USER-VISIBLE CHANGES + + o 'replicates' field of TCC class was deleted. + + + + +CHANGES IN VERSION 1.1.3 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o 'generateSimulationData' function was renamed to 'simulateReadCounts'. + + o 'names' field of TCC class was changed to 'gene_id'. + + o 'hypoData' was reduced to a smaller data set. + + o 'hypoData_mg' was created. This is the simulation dataset which consists + of 1,000 genes and 9 samples. + + + + +CHANGES IN VERSION 1.0.0 +------------------------ + +SIGNIFICANT USER-VISIBLE CHANGES + + o 'TCC' class was implemented as an R5 reference class. Wrapper functions + with functional programming semantics were provided. 
+ diff --git a/TCC/R/ROKU.R b/TCC/R/ROKU.R new file mode 100644 index 0000000..562e4ba --- /dev/null +++ b/TCC/R/ROKU.R @@ -0,0 +1,123 @@ +.outval <- function(y, upper.limit) { + if (all(is.na(y))) + y <- rep(0, length = length(y)) + z <- y[!is.na(y)] + + N <- length(z) + fN <- floor(N * upper.limit) + 1 + + z.sorted <- sort(z) + z.ordered <- order(z) + + df <- matrix(c(rep(1:fN, times = c(fN:1)), j = sequence(fN:1)), + ncol = 2, byrow = FALSE) + n <- N - df[, 2] - df[, 1] + 2 + s <- N - n + f <- rep(0, length = N) + + if (sd(z) != 0) { + ssd <- apply(df, 1, function(d, w = z.sorted, N = N) { + return(sd(w[d[1]:(N + 1 - d[2])])) + }, z.sorted, N) + u <- n * log(ssd * sqrt((n - 1) / n)) + + sqrt(2) * s * lfactorial(n) / n + min.u <- min(u) + d <- t(df[u == min.u, ])[1:2] + if (d[1] > 1) + f[z.ordered[1:(d[1] - 1)]] <- -1 + if (d[2] > 1) + f[z.ordered[(N + 1 - d[2] + 1):N]] <- 1 + } + return(replace(y, !is.na(y), f)) +} + + +.tbw <- function(y) { + y <- y[!is.na(y)] + y.m <- median(y) + y.u <- (y - y.m) / (5 * median(abs(y - y.m)) + 1e-04) + y.w <- rep(0, length(y)) + y.i <- abs(y.u) <= 1 + y.w[y.i] <- ((1 - y.u^2)^2)[y.i] + y.b <- sum(y.w * y) / sum(y.w) +} + +.entvalmod <- function(y) { + y <- y[!is.na(y)] + l <- length(y) + y <- y[y != 0] + if (is.na(sd(y))) { + return (0) + } else if (sum(y) <= 0 || sd(y) == 0) { + return (log2(l)) + } else { + y.m <- median(y) + y.u <- (y - y.m) / (5 * median(abs(y - y.m)) + 1e-04) + y.w <- rep(0, length(y)) + y.i <- abs(y.u) <= 1 + y.w[y.i] <- ((1 - y.u^2)^2)[y.i] + y.b <- sum(y.w * y) / sum(y.w) + p <- abs(y - y.b) + p <- p / sum(p) + e <- - sum(p * log2(p)) + if (is.na(e)) + e <- 0 + return (e) + } +} + +.entval <- function(y) { + y <- y[!is.na(y)] + l <- length(y) + y <- y[y != 0] + if (is.na(sd(y))) { + return (0) + } else if (sum(y) <= 0 || sd(y) == 0) { + return (log2(l)) + } else { + p <- y / sum(y) + e <- - sum(p * log2(p)) + if (is.na(e)) + e <- 0 + return (e) + } +} + +ROKU <- function(data, upper.limit = 0.25, sort = 
FALSE) { + rs <- NULL + if (is.vector(data)) { + data <- t(matrix(data)) + } else { + data <- as.matrix(data) + } + rs$outliers <- t(apply(t(scale(t(data))), 1, + function (y, upper.limit = upper.limit) { + .outval(y, upper.limit = upper.limit) + }, upper.limit)) + rs$H <- apply(data, 1, .entval) + rs$modH <- apply(data, 1, .entvalmod) + rs$rank <- rank(rs$modH) + rs$Tbw <- apply(data, 1, .tbw) + if (!is.null(colnames(data))) { + l <- colnames(data) + } else { + l <- paste("tissue", 1:ncol(data), sep = "_") + } + if (!is.null(rownames(data))) { + r <- rownames(data) + } else { + r <- 1:nrow(data) + } + colnames(rs$outliers) <- l + rownames(rs$outliers) <- r + if (sort) { + reindex <- order(rs$rank) + rs$outliers <- rs$outliers[reindex, ] + rs$H <- rs$H[reindex] + rs$modH <- rs$modH[reindex] + rs$rank <- rs$rank[reindex] + rs$Tbw <- rs$Tbw[reindex] + } + return (rs) +} + diff --git a/TCC/R/TCC.R b/TCC/R/TCC.R index da1a2c3..f70ba5a 100644 --- a/TCC/R/TCC.R +++ b/TCC/R/TCC.R @@ -1,488 +1,86 @@ -# TCC -# v2.21 -# -# SUN jianqiang -# 2013-01-03 -# -# Class name: -# TCC -# -# Required packagse: -# edgeR -# baySeq -# DESeq -# ROC - TCC <- setRefClass( - "TCC", - fields = list( - count = "matrix", # counts data of libraries. - names = "character", # gene names - group = "numeric", # group of libraries. - replicates = "numeric", # group of libraries. - norm.factors = "numeric", # normalization factors. - stat = "list", # the result of identify DE genes. - estimatedDEG = "numeric", # identified result by identifyDEG(). - simulation = "list", # the aurgument inputed. - private = "list" # intermediate data on DEGES process. - ), - - # Class Methods. - methods = list( - initialize = function(count=NULL, group=NULL, replicates=NULL, norm.factors=NULL, names=NULL) { - # If count data setted, fill it to TCC class object. 
- if (!is.null(count)) { - if(!is.null(group)){ - if (sum(group) != ncol(count)) - stop("TCC::ERROR: The sum of group has to be equal to the columns of count data.\n") - replicates <<- rep(1:length(group), times=group) - group <<- group - }else if(!is.null(replicates)){ - if (length(replicates) != ncol(count)) - stop("TCC::ERROR: The length of replicates has to be equal to the columns of count data.\n") - group <<- rep(0, length=max(replicates)) - for (i in 1:max(replicates)) { - group[i] <<- sum(replicates == i) - } - replicates <<- replicates - }else{ - stop("TCC::ERROR: group or replicate must be provided.\n") - } - # Fill count data. - if(!is.matrix(count)){ - count <<- as.matrix(count) - }else{ - count <<- count - } - # count is a matrix with or without colnames, rownames - if (is.null(rownames(count))){ - names <<- paste("gene_", c(1:nrow(count)), sep="") - rownames(count) <<- names - names <<- names - } else { - names <<- rownames(count) - } - if (is.null(colnames(count))) { - colnames(count) <<- paste("G", rep(1:length(group), times=group), "_rep", sequence(group), sep="") - } else { - # if the column is not unique, it occurs error on edgeR. - colns <- colnames(count) - if (sum(match(colns, colns)) != sum(1:length(colns))) { - message("TCC::INFO: Changed the column names of count data to unique.") - colnames(count) <<- paste(colns, 1:length(colns), sep="_") - } - } - # Fill normalization factors. - if (is.null(norm.factors)) { - normf <- rep(1, length=ncol(count)) - names(normf) <- colnames(count) - } else { - if (length(norm.factors) != ncol(count)) - stop("\nTCC::ERROR: The length of norm.factors has to be equal to the columns of cuont data.\n") - normf <- norm.factors - } - norm.factors <<- normf - } - private$estimated <<- FALSE - private$simulation <<- FALSE - }, - #/** - # * THE METHODS OF CALCULATE NORMALIZATION FACTORS. - # */ - # TMM normalization. 
(edgeR) - .normByTmm = function (count) { - #if (!("edgeR" %in% loadedNamespaces())) - # library(edgeR) - suppressMessages(d <- edgeR::DGEList(counts=count, group=replicates)) - suppressMessages(d <- edgeR::calcNormFactors(d)) - normf <- d$samples$norm.factors - names(normf) <- colnames(.self$count) - normf - }, - # DESeq normalization. (DESeq) // Each cols is divided by the genomic means of the rows. - .normByDeseq = function(count) { - suppressMessages(d <- newCountDataSet(countData=count, conditions=replicates)) - suppressMessages(d <- estimateSizeFactors(d)) - return(sizeFactors(d) / colSums(count)) - } - ) -) - #/** - # * THE METHODS OF INDENTIFY DE GENES. - # */ - # Parametric exact test by edgeR. -TCC$methods( .testByEdger = function () { - suppressMessages(d <- edgeR::DGEList(counts=count, group=replicates)) - suppressMessages(d <- edgeR::calcNormFactors(d)) - d$samples$norm.factors <- norm.factors - suppressMessages(d <- edgeR::estimateCommonDisp(d)) - suppressMessages(d <- edgeR::estimateTagwiseDisp(d)) - suppressMessages(d <- edgeR::exactTest(d)) - if (!is.null(d$table$PValue)) { - private$stat$p.value <<- d$table$PValue - } else { - private$stat$p.value <<- d$table$p.value - } - private$stat$rank <<- rank(private$stat$p.value) - private$stat$q.value <<- p.adjust(private$stat$p.value, method="BH") - } -) - # Parametric exact test by DESeq. 
-TCC$methods( .testByDeseq = function() { - suppressMessages(d <- newCountDataSet(countData=count, conditions=replicates)) - sizeFactors(d) <- norm.factors * colSums(count) - if (ncol(count) > 2) { - e <- try(suppressMessages(d <- estimateDispersions(d)), silent=TRUE) - if (class(e) == "try-error") { - message("TCC::WARN: An Error occurs when execute 'estimateDispersions' in DESeq.") - message("TCC::WARN: Changed 'fitType' to 'local' of 'estiamteDispersions'.") - suppressMessages(d <- estimateDispersions(d, fitType="local")) - } - } else { - e <- try(suppressMessages(d <- estimateDispersions(d, method="blind", sharingMode="fit-only")), silent=TRUE) - if (class(e) == "try-error") { - message("TCC::WARN: An Error occurs when execute 'estimateDispersions' in DESeq.") - message("TCC::WARN: Changed 'fitType' to 'local' of 'estiamteDispersions'.") - suppressMessages(d <- estimateDispersions(d, method="blind", sharingMode="fit-only", fitType="local")) - } - } - suppressMessages(d <- nbinomTest(d, 1, 2)) - d$pval[is.na(d$pval)] <- 1 - d$padj[is.na(d$padj)] <- 1 - private$stat$p.value <<- d$pval - private$stat$q.value <<- d$padj - private$stat$rank <<- rank(d$pval) - } -) - # Non-parametric exact test by baySeq. 
-TCC$methods( .testByBayseq = function(samplesize, processors) { - cl <- NULL - if (!is.null(processors)) { - if(!("snow" %in% loadedNamespaces())) - library(snow) - if (is.numeric(processors)) { - cl <- makeSOCKcluster(rep("localhost", length=processors)) - } else { - cl <- processors - } - } - suppressMessages(d <- new("countData", data = as.matrix(count), - replicates = replicates, - groups = list(NDE = rep(1, length = length(replicates)), DE = replicates), - libsizes = colSums(count) * norm.factors)) - suppressMessages(d <- getPriors.NB(d, samplesize=samplesize, estimation="QL", cl=cl)) - capture.output(suppressMessages(d <- getLikelihoods.NB(d, pET="BIC", cl=cl))) - stat.bayseq <- topCounts(d, group="DE", number=nrow(count)) - stat.bayseq <- stat.bayseq[rownames(count), ] - private$stat$rank <<- rank(- d@posteriors[, "DE"]) - # calculate p.value and q.value from likelihood values. - private$stat$likelihood <<- stat.bayseq$Likelihood - private$stat$p.value <<- 1 - stat.bayseq$Likelihood - private$stat$q.value <<- stat.bayseq$FDR - private$estimatedDEG <<- as.numeric(private$stat$rank < (nrow(count) * d@estProps[2])) - } -) - - # calculate normalization factors. 
-TCC$methods(calcNormFactors = function (norm.method = NULL, - test.method = NULL, - iteration = 1, - FDR = NULL, - floorPDEG = NULL, - samplesize = 10000, - processors = NULL) { - if (is.null(norm.method)) { - if (min(.self$group) == 1) { - norm.method = "deseq" - } else { - norm.method = "edger" - } - } - if (is.null(test.method)) { - if (ncol(count) < 4) { - test.method = "deseq" - } else { - test.method = "edger" - } - } - if (norm.method == "edger") - norm.method <- "tmm" - if (test.method != "bayseq" && is.null(FDR)) { - FDR <- 0.1 - } - if (test.method != "bayseq" && is.null(floorPDEG)) { - floorPDEG <- 0.05 - } - if (iteration) { - if (is.logical(iteration)) - iteration <- 1 - message(paste("TCC::INFO: Calculating normalization factors using DEGES")) - message(paste("TCC::INFO: (iDEGES pipeline :", norm.method, "- [", test.method, "-", norm.method, "] X", iteration, ")")) - } else { - message(paste("TCC::INFO: Calculating normalization factors using", norm.method, "...")) - } - # DEGES strategy STEP 1. (First normalization) - norm.factors <<- switch(norm.method, - "tmm" = .self$.normByTmm(count), - "deseq" = .self$.normByDeseq(count), - stop(paste("\nTCC::ERROR: The normalization method of ", norm.method, " doesn't supported.\n")) - ) - private$DEGES.threshold.type <<- 0 - # if DEGES not NULL then start DEGES strategy. - if (iteration) { - # if iteration > 0 then change to iterate DEGES strategy. - for (i in 1:iteration) { - # DEGES strategy STEP 2. (exact test and remove differential expression genes.) - private$stat <<- list() - switch(test.method, - "edger" = .self$.testByEdger(), - "deseq" = .self$.testByDeseq(), - "bayseq" = .self$.testByBayseq(samplesize, processors), - stop(paste("\nTCC::ERROR: The identifying method of ", test.method, " doesn't supported.\n")) - ) - # Remove the DEG from original count data. 
- deg.flg.FDR <- .self$.exactTest(FDR = FDR) - deg.flg.floorPDEG <- as.numeric(rank(private$stat$p.value, ties.method = "min") <= nrow(count) * floorPDEG) - if (is.null(FDR)) { - deg.flg <- deg.flg.FDR - private$DEGES.threshold.type <<- 5 + sum(private$estimatedDEG != 0) / length(private$estimatedDEG) - } else { - deg.flg <- deg.flg.FDR - private$DEGES.threshold.type <<- 3 + FDR - } - # super threshold. - if ((!is.null(floorPDEG)) && (sum(deg.flg != 0) < sum(deg.flg.floorPDEG != 0))) { - deg.flg <- deg.flg.floorPDEG - private$DEGES.threshold.type <<- 1 + floorPDEG - } - count.ndeg <- count[(deg.flg == 0), ] - if (nrow(count.ndeg) == 0) { - message ("TCC::INFO: No non-DE genes after eliminate DE genes. stop DEGES strategy.") - break - } - # DEGES strategy STEP 3. (Second normalization) - norm.factors <<- switch(norm.method, - "tmm" = .self$.normByTmm(count.ndeg), - "deseq" = .self$.normByDeseq(count.ndeg) - ) - norm.factors <<- norm.factors * colSums(count.ndeg) / colSums(count) - norm.factors <<- norm.factors / mean(norm.factors) - } - private$DEGES.potentialDEG <<- deg.flg - } else { - norm.factors <<- norm.factors / mean(norm.factors) - } - message("TCC::INFO: Done.") - } -) + "TCC", + fields = list( + count = "matrix", # counts data of libraries. + gene_id = "character", # gene names + group = "data.frame", # groups, libraries, conditions. + norm.factors = "numeric", # normalization factors. + stat = "list", # the result of identify DE genes. + estimatedDEG = "numeric", # identified result by identifyDEG(). + simulation = "list", # the aurgument inputed. + DEGES = "list", # detailed informations about DEGES . + private = "list" # intermediate data on DEGES process. 
+ ), -TCC$methods(.exactTest = function (FDR = NULL, significance.level = NULL) { - deg.flg <- rep(0, length = nrow(count)) - if (!is.null(significance.level)) { - deg.flg <- as.numeric(private$stat$p.value < significance.level) - private$estimate.type <<- 1 + significance.level - } else if (!is.null(FDR)) { - deg.flg <- as.numeric(private$stat$q.value < FDR) - private$estimate.type <<- 3 + FDR - } else { - # Only for TbT - deg.flg <- private$estimatedDEG - private$estimate.type <<- 5 + sum(private$estimatedDEG != 0) / length(private$estimatedDEG) - } - # decide group of DEG. - count.normed <- .self$getNormalizedCount() - mean.exp <- matrix(0, ncol=length(group), nrow=nrow(count)) - for (g in 1:length(group)) { - mean.exp[, g] <- log2(rowMeans(as.matrix(count.normed[, replicates == g]))) - } - for (i in 1:length(group)) { - for (j in i:length(group)) { - if (i != j) { - log2ration <- mean.exp[, j] - mean.exp[, i] - deg.flg[(deg.flg > 0) & (log2ration < 0)] <- i - deg.flg[(deg.flg > 0) & (log2ration > 0)] <- j - } - } - } - return (deg.flg) - } -) - - # exact test. -TCC$methods(estimateDE = function (test.method=NULL, - FDR = NULL, - significance.level = NULL, - samplesize=10000, - processors=NULL) { - if (is.null(test.method)) { - if (ncol(count) < 4) { - test.method = "deseq" - } else { - test.method = "edger" - } - } - if (test.method != "bayseq" && is.null(FDR) && is.null(significance.level)) { - FDR <- 0.1 - } - message(paste("TCC::INFO: Identifying DE genes using", test.method, "...")) - # calculate statistics values related DE gene. - private$stat <<- list() - switch(test.method, - "edger" = .self$.testByEdger(), - "deseq" = .self$.testByDeseq(), - "bayseq" = .self$.testByBayseq(samplesize, processors), - stop(paste("\nTCC::ERROR: The identifying method of ", test.method, " doesn't supported.\n")) - ) - # identify DE genes with the results of exact test. 
- estimatedDEG <<- .self$.exactTest(FDR = FDR, significance.level = significance.level) - if (!is.null(private$stat$likelihood)) - stat$likelihood <<- private$stat$likelihood - if (!is.null(private$stat$p.value)) - stat$p.value <<- private$stat$p.value - if (!is.null(private$stat$q.value)) - stat$q.value <<- private$stat$q.value - if (!is.null(private$stat$rank)) - stat$rank <<- private$stat$rank - private$estimated <<- TRUE - message("TCC::INFO: Done.") - } -) -TCC$methods(getNormalizedCount = function () { - effective.libsizes <- colSums(count) * norm.factors - return (sweep(count, 2, mean(effective.libsizes) / effective.libsizes, "*")) - } -) -TCC$methods(.getMACoordinates = function(g1, g2, floor = 0) { - m <- rep(0, length = nrow(count)) - a <- rep(0, length = nrow(count)) - g1.min.nonZero <- min(g1[g1 > 0]) - g2.min.nonZero <- min(g2[g2 > 0]) - filter <- as.logical(g1 <= floor | g2 <= floor) - g1[g1 <= floor] <- g1.min.nonZero - g2[g2 <= floor] <- g2.min.nonZero - a <- (log2(g1) + log2(g2)) / 2 - m <- log2(g2) - log2(g1) - a[filter] <- min(a) - 1 - return(list(m.value = m, a.value = a)) - } -) - -# plot M-A plotting. -TCC$methods(plotMA = function (FDR = NULL, - significance.level = NULL, - median.lines = FALSE, - floor = 0, - main = NULL, - xlab = expression(A == (log[2] * G2 + log[2] * G1 ) / 2), - ylab = expression(M == log[2] * G2 - log[2] * G1), - xlim = NULL, - ylim = NULL, - cex = 0.3, - pch = 19, - col = NULL, ...) { - # set up default arguments. 
- if (is.null(col)) { - if (private$estimated == TRUE) { - col <- c(1, rep(6, length=length(group))) - } else if (private$simulation == TRUE) { - col <- c(1, 4, 2, 4 + 1:(length(group) - 3)) - } else { - col <- rep(1, length=length(group)) - } - } else { - if (length(col) != length(group) + 1) { - if (length(col) == 1) - col <- c(col, col) - col <- c(col[1], rep(col[-1], length=length(group) - 1)) - } - } - m.values <- array(0, dim=c(length(group), length(group), nrow(count))) - a.values <- array(0, dim=c(length(group), length(group), nrow(count))) - - # calculate the average of count expression. - count.normed <- getNormalizedCount() - mean.exp <- matrix(0, ncol=length(group), nrow=nrow(count)) - mean.norm <- rep(0, length=length(group)) - for (g in 1:length(group)) { - mean.exp[, g] <- rowMeans(as.matrix(count.normed[, replicates == g])) - mean.norm[g] <- mean(norm.factors[replicates == g]) - } - # calculate m.values and a.values of each combinations of groups. - fig.tils <- length(group) - 1 - if (length(group) > 2) { - split.screen(figs = c(fig.tils, fig.tils)) - } - global.mar <- par("mar") - global.cex <- par("cex") - for (i in 1:length(group)) { - for (j in i:length(group)) { - if (i != j) { - ma.axes <- .self$.getMACoordinates(mean.exp[, i], mean.exp[, j], floor) - filter <- as.logical(mean.exp[, i] > 0 & mean.exp[, j] > 0) - a.values[i, j, ] <- ma.axes$a.value - m.values[i, j, ] <- ma.axes$m.value - a.values[j, i, ] <- ma.axes$a.value - m.values[j, i, ] <- ma.axes$m.value - a <- a.values[i, j, ] - m <- m.values[i, j, ] - if (length(group) > 2) { - screen(fig.tils * (i - 1) + j - 1) - } - if (length(group) > 2) { - par(cex = 0.7 * global.cex) - } - if (length(group) > 4) { - par(mar = global.mar / (length(group) - 1)) - } - if (is.null(xlim)) - xlim <- c(min(a), max(a)) - if (is.null(ylim)) - ylim <- c(min(m), max(m)) - if (is.null(main)) - main <- "MA plot" - if (length(group) > 2) { - gftitle <- paste(main, " (Group ", i, " - Group ", j,")", sep="") - } 
else { - gftitle <- main - } - plot(0, 0, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, type="n", ...) - title(main = list(gftitle)) - grid(col="gray", lty="dotted") - - col.tag <- rep(0, length = nrow(count)) - if (private$estimated == FALSE) { - if (private$simulation == TRUE) - col.tag <- simulation$trueDEG - } else { - if ((!is.null(estimatedDEG)) && (length(estimatedDEG != 0))) { - col.tag <- as.numeric(estimatedDEG) - } - if (!(is.null(FDR) && is.null(significance.level))) { - private$stat$q.value <<- stat$q.value - private$stat$p.value <<- stat$p.value - col.tag <- .self$.exactTest(FDR = FDR, significance.level = significance.level) - } - } - for (k in 0:max(col.tag)) { - points(a[col.tag == k], m[col.tag == k], col = col[k + 1], pch = pch, cex = cex) + ## Class Methods. + methods = list( + initialize = function(count = NULL, group = NULL, + norm.factors = NULL, gene_id = NULL) { + if (!is.null(count)) { + ## Set up group data. + if (is.null(group)) { + stop("TCC::ERROR: The group or replicates must be provided.\n") + #.self$group <<- data.frame(group = rep(1:length(replicates), times = replicates)) + } else { + if (!is.data.frame(group)) + .self$group <<- data.frame(group = group) + else + .self$group <<- group + } + ## Set up count data. + if (!is.matrix(count)) + .self$count <<- as.matrix(count) + else + .self$count <<- count + ## Set up names. 
+ if (is.null(rownames(.self$count))) { + .self$gene_id <<- paste("gene_", + c(1:nrow(count)), sep = "") + rownames(.self$count) <<- paste("gene_", + c(1:nrow(count)), sep = "") + } else { + .self$gene_id <<- rownames(count) + } + if (is.null(colnames(.self$count))) { + g <- as.numeric(table(group)) + colnames(.self$count) <<- paste("G", + rep(1:length(g), times = g), + "_rep", sequence(g), + sep = "") + } else { + colnm <- colnames(count) + if (sum(match(colnm, colnm)) != sum(1:length(colnm))) { + message("TCC::INFO: Changed the column names of count data to unique.") + colnames(count) <<- paste(colnm, 1:length(colnm), + sep = "_") + } + } + rownames(.self$group) <<- colnames(.self$count) + ## Set up normlization factors if it was given. + if (is.null(norm.factors)) { + .self$norm.factors <<- rep(1, length = ncol(count)) + } else { + if (length(norm.factors) != ncol(count)) + stop("\nTCC::ERROR: The length of norm.factors has to be equal to the columns of cuont data.\n") + .self$norm.factors <<- norm.factors + } + names(norm.factors) <<- norm.factors } - if (median.lines == TRUE) { - for (k in c(0, i, j)) { - med <- median(m.values[i, j, (col.tag == k & filter)]) - lines(c(min(a) + 1, max(a)), c(med, med), col = col[k + 1]) - text(xlim[2], med + 0.5, sprintf("%.3f", med), col = col[k + 1], - pos = 2, offset = 0) - } - } - } + ## Set private argument. 
+ private$estimated <<- FALSE + private$simulation <<- FALSE + private$normalized <<- FALSE + private$available$norm.method <<- c("tmm", "deseq") + private$available$test.method <<- data.frame( + TwoGroup_NonRep = c(T, T, F, F, F), + TwoGroup = c(T, T, T, T, T), + TwoGroup_Paired = c(F, F, F, F, F), + MultiGroup = c(T, T, T, T, T), + MultiFactor = c(T, T, F, T, F), + row.names = c("bayseq", "deseq", "ebseq", + "edger", "samseq") + ) } - } - if (length(group) > 2) { - par(mar = global.mar) - close.screen(all.screens = TRUE) - } - if (length(group) == 2) { - invisible(data.frame(m.value = m.values[1, 2, ], a.value = a.values[1, 2, ])) - } - } + ) ) + diff --git a/TCC/R/TCC.calcNormFactors.R b/TCC/R/TCC.calcNormFactors.R new file mode 100644 index 0000000..a4153b7 --- /dev/null +++ b/TCC/R/TCC.calcNormFactors.R @@ -0,0 +1,236 @@ +TCC$methods(.normByTmm = function(x){ + suppressMessages(d <- edgeR::DGEList(counts = round(x), + group = group[, 1])) + suppressMessages(d <- edgeR::calcNormFactors(d)) + normf <- d$samples$norm.factors + names(normf) <- colnames(.self$count) + return(normf) +}) + +TCC$methods(.normByDeseq = function(x){ + if (ncol(group) == 1) { + suppressMessages(d <- newCountDataSet(countData = round(x), + conditions = group[, 1])) + } else { + suppressMessages(d <- newCountDataSet(countData = round(x), + conditions = group)) + } + suppressMessages(d <- estimateSizeFactors(d)) + return(sizeFactors(d) / colSums(x)) +}) + +#TCC$methods(.normByTwad = function(x, refColumn = NULL, trimWAD, q, AD) { +# libsize <- colSums(x) +# +# allzero <- as.logical(rowSums(x) == 0) +# if (any(allzero)) +# x <- x[!allzero, , drop = FALSE] +# private$twad.trim <<- gene_id[!allzero] +# +# ## set reference column +# y <- sweep(x, 2, 1 / libsize, "*") +# f75 <- apply(y, 2, function(x) quantile(x, p = 0.75)) +# if (is.null(refColumn)) { +# refColumn <- which.min(abs(f75 - mean(f75))) +# if (length(refColumn) == 0 | refColumn < 1 | refColumn > ncol(x)) +# refColumn <- 1 +# } +# ## 
norm factors +# nf <- rep(1, length = ncol(x)) +# for (i in 1:length(nf)) { +# nf[i] <- .self$.twadcore(obs = x[, i], ref = x[, refColumn], +# obs.libsize = libsize[i], +# ref.libsize = libsize[refColumn], +# trimWAD = trimWAD, +# q = q, AD = AD) +# } +# nf <- nf / exp(mean(log(nf))) +# return (nf) +#}) + +##TCC$methods(.twadcore = function(obs, ref, obs.libsize, ref.libsize, +## trimWAD, q, AD) { +## if (all(obs == ref)) +## return (1) +## ## libsize +## obs <- as.numeric(obs) +## ref <- as.numeric(ref) +## lowcount <- as.logical(obs <= quantile(obs, q) | +## ref <= quantile(ref, q)) +## obs <- obs[!lowcount] +## ref <- ref[!lowcount] +## private$twad.trim <<- private$twad.trim[!lowcount] +## +## ## calculate wad +## wad <- .wad(x = cbind(obs, ref), g = c(1, 2), +## log.scale = TRUE, AD = AD) +## rnk <- rank(abs(wad)) +## rnk.sort <- rnk[rev(order(rnk))] +## min.idx <- min(rnk.sort[1:round(length(obs) * trimWAD)]) +## +## ## calculate normfactors +## v <- (obs.libsize - obs) / (obs.libsize * obs) + +## (ref.libsize - ref) / (ref.libsize * ref) +## v <- v[rnk <= min.idx] +## obs <- obs[rnk <= min.idx] +## ref <- ref[rnk <= min.idx] +## trimmed.geneid <- private$twad.trim[rnk <= min.idx] +## private$twad.trim <<- rep(0, length = nrow(.self$count)) +## names(private$twad.trim) <<- rownames(.self$count) +## private$twad.trim[trimmed.geneid] <<- 1 +## nf <- 2^(sum(log2((obs / obs.libsize) / (ref / ref.libsize)) / v, +## na.rm = TRUE) / (sum(1 / v, na.rm = TRUE))) +## return (nf) +##}) + +TCC$methods(calcNormFactors = function(norm.method = NULL, + test.method = NULL, + iteration = 1, + FDR = NULL, + floorPDEG = 0.05, + increment = FALSE, + ...) { + argus <- list(...) 
+ if ((increment == FALSE) || + (increment == TRUE && private$normalized == FALSE)) { + DEGES$iteration <<- 0 + } + ex.time <- proc.time() + if (is.null(norm.method)) { + if ((ncol(group) == 1) && (min(as.numeric(table(group))) == 1)) + norm.method = "deseq" + else + norm.method = "edger" + } + if (is.null(test.method)) { + if (!is.null(argus$paired) && argus$paired) + test.method = "bayseq" + else if ((ncol(group) == 1) && (min(as.numeric(table(group))) == 1)) + test.method = "deseq" + else + test.method = "edger" + } + if (norm.method == "edger") + norm.method <- "tmm" + if (test.method != "bayseq" && is.null(FDR)) + FDR <- 0.1 + if (iteration) { + if (is.logical(iteration)) + iteration <- 1 + if (iteration < 0 && 100 < iteration) + stop("TCC::ERROR: The iteration must be given within the limits of from '0' to '100'.") + message(paste("TCC::INFO: Calculating normalization factors using DEGES")) + message(paste("TCC::INFO: (iDEGES pipeline :", norm.method, + "- [", test.method, "-", norm.method, "] X", + iteration + DEGES$iteration, ")")) + DEGES$pipeline <<- paste(norm.method, "- [", test.method, + "-", norm.method, "] X", + iteration + DEGES$iteration) + } else { + message(paste("TCC::INFO: Calculating normalization factors using", norm.method, "...")) + DEGES$pipeline <<- norm.method + } + ## DEGES strategy STEP 1. (First normalization) + if ((increment == FALSE) || + (increment == TRUE && private$normalized == FALSE)) { + norm.factors <<- switch(norm.method, + "tmm" = .self$.normByTmm(count), + "deseq" = .self$.normByDeseq(count), + ##"twad" = .self$.normByTwad(count, + ## trimWAD = trimWAD, + ## q = q, AD = AD), + stop(paste("\nTCC::ERROR: The normalization method of ", + norm.method, " doesn't supported.\n"))) + } + norm.factors <<- norm.factors / mean(norm.factors) + DEGES$threshold <<- data.frame(type = "Unused", input = 0, PDEG = 0) + ## if DEGES not NULL then start DEGES strategy. 
+ if (iteration) { + ## if iteration > 0 then change to iterate DEGES strategy. + for (i in 1:iteration) { + ## DEGES strategy STEP 2. (exact test and remove DEGs.) + private$stat <<- list() + switch(test.method, + "edger" = .self$.testByEdger(...),#design = design, + #coef = coef, + #contrast = contrast, + #dispersion = dispersion), + "deseq" = .self$.testByDeseq(...),#fit1 = fit1, + #fit0 = fit0), + "bayseq" = .self$.testByBayseq(...),#samplesize = samplesize, + #cl = cl, + #comparison = comparison), + "noiseq" = .self$.testByNoiseq(...), + "ebseq" = .self$.testByEbseq(...), + "samseq" = .self$.testBySamseq(...), + "wad" = .self$.testByWad(...), + stop(paste("\nTCC::ERROR: The identifying method of ", test.method, " doesn't supported.\n")) + ) + ## Remove the DEG from original count data. + deg.flg <- rep(0, length = nrow(count)) + deg.flg.FDR <- .self$.exactTest(FDR = FDR) + deg.flg.floorPDEG <- rep(0, length = nrow(count)) + + if (is.null(.self$private$stat$testStat) && + is.null(.self$private$stat$prob)) { + deg.flg.floorPDEG <- as.numeric(rank(private$stat$p.value, + ties.method = "min") <= nrow(count) * floorPDEG) + if (is.null(FDR)) { + ## use TbT + deg.flg <- deg.flg.FDR + DEGES$threshold$type <<- "TbT" + DEGES$threshold$input <<- private$tbt$estProps + DEGES$threshold$PDEG <<- sum(deg.flg) / length(deg.flg) + private$DEGES.PrePDEG <<- deg.flg + } else { + ## use FDR + deg.flg <- deg.flg.FDR + DEGES$threshold$type <<- "FDR" + DEGES$threshold$input <<- FDR + DEGES$threshold$PDEG <<- sum(deg.flg) / length(deg.flg) + private$DEGES.PrePDEG <<- deg.flg + } + } else if (is.null(.self$private$stat$testStat) && + !is.null(.self$private$stat$prob)) { + deg.flg.floorPDEG <- as.numeric(rank(- abs(private$stat$prob), + ties.method = "min") <= nrow(count) * floorPDEG) + private$DEGES.PrePDEG <<- rep(0, length = nrow(count)) + } else { + deg.flg.floorPDEG <- as.numeric(rank(- abs(private$stat$testStat), + ties.method = "min") <= nrow(count) * floorPDEG) + 
private$DEGES.PrePDEG <<- rep(0, length = nrow(count)) + } + + + if (sum(deg.flg != 0) < sum(deg.flg.floorPDEG != 0)) { + ## use floorPDEG + deg.flg <- deg.flg.floorPDEG + DEGES$threshold$type <<- "floorPDEG" + DEGES$threshold$input <<- floorPDEG + DEGES$threshold$PDEG <<- sum(deg.flg) / length(deg.flg) + } + count.ndeg <- count[(deg.flg == 0), ] + if (nrow(count.ndeg) == 0) { + message ("TCC::INFO: No non-DE genes after eliminate DE genes. stop DEGES strategy.") + break + } + ## DEGES strategy STEP 3. (Second normalization) + norm.factors <<- switch(norm.method, + "tmm" = .self$.normByTmm(count.ndeg), + "deseq" = .self$.normByDeseq(count.ndeg) + ##"twad" = .self$.normByTwad(count.ndeg, + ## trimWAD = trimWAD, + ## q = q, AD = AD) + ) + norm.factors <<- norm.factors * colSums(count.ndeg) / colSums(count) + norm.factors <<- norm.factors / mean(norm.factors) + DEGES$iteration <<- DEGES$iteration + 1 + } + DEGES$potDEG <<- deg.flg + DEGES$prePotDEG <<- .self$private$DEGES.PrePDEG + } + message("TCC::INFO: Done.") + DEGES$execution.time <<- proc.time() - ex.time + private$normalized <<- TRUE +}) + diff --git a/TCC/R/TCC.estimateDE.DESeq.R b/TCC/R/TCC.estimateDE.DESeq.R new file mode 100644 index 0000000..1aeee46 --- /dev/null +++ b/TCC/R/TCC.estimateDE.DESeq.R @@ -0,0 +1,143 @@ +TCC$methods(.testByDeseq = function(fit1 = NULL, fit0 = NULL, ...) 
{ + +.testByDeseq.1 = function() { + suppressMessages(d <- newCountDataSet(countData = round(.self$count), + conditions = .self$group[, 1])) + sizeFactors(d) <- .self$norm.factors * colSums(.self$count) + if (min(as.numeric(table(.self$group[, 1]))) == 1) { + ## Single replicates + e <- try(suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only")), + silent = TRUE) + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"fit-only\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"local\" in DESeq was used instead.") + suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only", + fitType = "local")) + } + } else { + ## Multiple replicates + e <- try(suppressMessages(d <- estimateDispersions(d)), silent = TRUE) + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with method=\"pooled\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with method=\"blind\" in DESeq was used instead.") + e <- try(suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only")), + silent = TRUE) + ## try local mode if defaul occurs error + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"fit-only\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"local\" in DESeq was used instead.") + suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only", + fitType = "local")) + } + } + } + ug <- unique(.self$group[, 1]) + suppressMessages(d <- nbinomTest(d, ug[1], ug[2])) + d$pval[is.na(d$pval)] <- 1 + d$padj[is.na(d$padj)] <- 1 + private$stat$p.value <<- d$pval + private$stat$q.value <<- d$padj + private$stat$rank <<- rank(d$pval) +} +.testByDeseq.2 = function(fit1 = NULL, fit0 = NULL) { + suppressMessages(d <- newCountDataSet(countData = 
round(count), + conditions = group[, 1])) + sizeFactors(d) <- .self$norm.factors * colSums(.self$count) + if (min(as.numeric(table(.self$group[, 1]))) == 1) { + ## single replicates + e <- try(suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only")), + silent = TRUE) + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"fit-only\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"local\" in DESeq was used instead.") + suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only", + fitType = "local")) + } + } else { + ## Multiple replicates + e <- try(suppressMessages(d <- estimateDispersions(d)), silent = TRUE) + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with method=\"pooled\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with method=\"blind\" in DESeq was used instead.") + e <- try(suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only")), + silent = TRUE) + ## try local mode if detaul occurs error + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"fit-only\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"local\" in DESeq was used instead.") + suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only", + fitType = "local")) + } + } + } + ## GLM for multiple group comparison. 
+ if (is.null(fit1) && is.null(fit0)) { + fit1 <- count ~ condition + fit0 <- count ~ 1 + } + if (is.null(fit0)) + stop("TCC::ERROR: Need the formula('fit0') to create reduced model regresses for GLM.") + if (is.null(fit1)) + stop("TCC::ERROR: Need the formula('fit1') to create full model regresses for GLM.") + capture.output(f0 <- fitNbinomGLMs(d, fit0)) + capture.output(f1 <- fitNbinomGLMs(d, fit1)) + private$stat$p.value <<- nbinomGLMTest(f1, f0) + private$stat$p.value[is.na(private$stat$p.value)] <<- 1 + private$stat$q.value <<- p.adjust(private$stat$p.value, method = "BH") + private$stat$rank <<- rank(private$stat$p.value) +} + +.testByDeseq.3 = function(fit1 = NULL, fit0 = NULL) { + suppressMessages(d <- newCountDataSet(countData = round(.self$count), + conditions = .self$group)) + sizeFactors(d) <- .self$norm.factors * colSums(.self$count) + ## try default + e <- try(suppressMessages(d <- estimateDispersions(d)), silent = TRUE) + ## try blind method + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with method=\"pooled\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with method=\"blind\" in DESeq was used instead.") + e <- try(suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only")), + silent = TRUE) + ## try local mode + if (class(e) == "try-error") { + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"fit-only\" in DESeq could not be performed.") + message("TCC::WARN: 'estimateDispersions' with sharingMode=\"local\" in DESeq was used instead.") + suppressMessages(d <- estimateDispersions(d, + method = "blind", sharingMode = "fit-only", + fitType = "local")) + } + } + if (is.null(fit0)) + stop("TCC::ERROR: Need the formula('fit0') to create reduced model regresses for GLM.") + if (is.null(fit1)) + stop("TCC::ERROR: Need the formula('fit1') to create full model regresses for GLM.") + capture.output(f0 <- fitNbinomGLMs(d, fit0)) + capture.output(f1 <- 
fitNbinomGLMs(d, fit1)) + private$stat$p.value <<- nbinomGLMTest(f1, f0) + private$stat$p.value[is.na(private$stat$p.value)] <<- 1 + private$stat$q.value <<- p.adjust(private$stat$p.value, method = "BH") + private$stat$rank <<- rank(private$stat$p.value) +} + +ts <- .self$.testStrategy() +if (ts == 1) { + .testByDeseq.1() +} else if (ts == 2) { + .testByDeseq.2(fit1 = fit1, fit0 = fit0) +} else if (ts == 3) { + .testByDeseq.3(fit1 = fit1, fit0 = fit0) +} else { + stop() +} +}) + diff --git a/TCC/R/TCC.estimateDE.EBSeq.R b/TCC/R/TCC.estimateDE.EBSeq.R new file mode 100644 index 0000000..7dec8b4 --- /dev/null +++ b/TCC/R/TCC.estimateDE.EBSeq.R @@ -0,0 +1,65 @@ +TCC$methods(.testByEbseq = function(...) { + +.testByEbseq.1 = function(samplesize = NULL) { + g <- .self$group[, 1] + ug <- unique(g) + suppressMessages(EBout <- EBSeq::EBTest(Data = .self$count, + Conditions = as.factor(g), + sizeFactors = .self$norm.factors * colSums(.self$count), + maxround = samplesize)) + PP <- EBSeq::GetPPMat(EBout) + df <- matrix(1, ncol = 2, nrow = nrow(.self$count)) + rownames(df) <- rownames(.self$count) + df[rownames(PP), 1] <- PP[, 1] + df[rownames(PP), 2] <- PP[, 2] + df[is.na(df)] <- 0 + private$stat$prob <<- df[, 2] + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) + private$stat$q.value <<- df[, 1] + private$stat$rank <<- rank(- .self$private$stat$prob) +} + +.testByEbseq.2 = function(samplesize = NULL) { + g <- .self$group[, 1] + ug <- unique(g) + gp <- matrix(c(rep(1, length = length(ug)), 1:length(ug)), + nrow = 2, byrow = TRUE) + colnames(gp) <- ug + rownames(gp) <- c("Pattern1", "Pattern2") + suppressMessages(MultiOut <- EBSeq::EBMultiTest(.self$count, + NgVector = NULL, + Conditions = g, + AllParti = gp, + sizeFactors = .self$norm.factors * colSums(.self$count), + maxround = samplesize)) + PP <- EBSeq::GetMultiPP(MultiOut) + df <- matrix(1, ncol = 2, nrow = nrow(.self$count)) + rownames(df) <- rownames(.self$count) + df[rownames(PP$PP), 1] <- PP$PP[, 1] + 
df[rownames(PP$PP), 2] <- PP$PP[, 2] + df[is.na(df)] <- 0 + private$stat$prob <<- df[, 2] + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) + private$stat$q.value <<- df[, 1] + private$stat$rank <<- rank(- .self$private$stat$prob) +} + + +al <- list(...) +if (is.null(al$samplesize)) { + samplesize <- 5 +} else { + samplesize <- al$samplesize +} +ts <- .self$.testStrategy() +if (ts == 1) { + .testByEbseq.1(samplesize = samplesize) +} else if (ts == 2) { + .testByEbseq.2(samplesize = samplesize) +} else if (ts == 3) { + stop() +} else { + stop() +} +}) + diff --git a/TCC/R/TCC.estimateDE.NBPSeq.R b/TCC/R/TCC.estimateDE.NBPSeq.R new file mode 100644 index 0000000..59f6d9d --- /dev/null +++ b/TCC/R/TCC.estimateDE.NBPSeq.R @@ -0,0 +1,30 @@ +##TCC$methods(.testByNbpseq = function() { +## ts <- .self$.testStrategy() +## if (ts == 1) { +## .self$.testByNbpseq.1() +## } else if (ts == 2) { +## stop() +## } else if (ts == 3) { +## stop() +## } else { +## stop() +## } +##}) + + + +##TCC$methods(.testByNbpseq.1 = function() { +## g <- .self$group[, 1] +## ug <- unique(g) +## nbp <- NBPSeq::nbp.test(counts = .self$count, +## grp.ids = g, +## grp1 = ug[1], grp2 = ug[2], +## norm.factors = .self$norm.factors, +## print.level = 0) +## private$stat$p.values <<- nbp$p.values +## private$stat$q.values <<- nbp$q.values +## private$stat$rank <<- rank(.self$private$stat$p.value) +##}) + + + diff --git a/TCC/R/TCC.estimateDE.NOISeq.R b/TCC/R/TCC.estimateDE.NOISeq.R new file mode 100644 index 0000000..1142097 --- /dev/null +++ b/TCC/R/TCC.estimateDE.NOISeq.R @@ -0,0 +1,32 @@ +TCC$methods(.testByNoiseq = function(...) 
{ + +.testByNoiseq.1 = function() { + x <- .self$getNormalizedData() + gl <- data.frame(group = .self$group[, 1]) + nd <- NOISeq::readData(x, gl) + capture.output(suppressMessages(nr <- NOISeq::noiseq(nd, + k = 0.5, + norm = "n", + replicates = "biological", + factor = "group", + conditions = unique(.self$group[, 1])))) + prob <- nr@results[[1]]$prob + prob[is.na(prob)] <- 0 + private$stat$prob <<- prob + private$stat$p.values <<- rep(NA, length = nrow(.self$count)) + private$stat$q.values <<- rep(NA, length = nrow(.self$count)) + private$stat$rank <<- rank(- prob) +} + +ts <- .self$.testStrategy() +if (ts == 1) { + .testByNoiseq.1() +} else if (ts == 2) { + stop() +} else if (ts == 3) { + stop() +} else { + stop() +} + +}) diff --git a/TCC/R/TCC.estimateDE.R b/TCC/R/TCC.estimateDE.R new file mode 100644 index 0000000..fb7ac19 --- /dev/null +++ b/TCC/R/TCC.estimateDE.R @@ -0,0 +1,110 @@ +TCC$methods(.testStrategy = function () { + fc <- .self$group + og <- fc[, 1] + ug <- unique(og) + ts <- -1 + if (ncol(fc) > 1) { + ## Multi-factors + ts <- 3 + } else if (ncol(fc) == 1 & length(ug) > 2) { + ## Multi-groups & One-factor + ts <- 2 + } else if (ncol(fc) == 1 & length(ug) == 2) { + ## Two-groups & One-factor + ts <- 1 + } + return (ts) +}) + +TCC$methods(.exactTest = function (FDR = NULL, significance.level = NULL, + PDEG = NULL) { + deg.flg <- rep(0, length = nrow(count)) + if (!is.null(significance.level)) { + deg.flg <- as.numeric(private$stat$p.value < significance.level) + } else if (!is.null(FDR)) { + deg.flg <- as.numeric(private$stat$q.value < FDR) + } else if (!is.null(PDEG)) { + deg.flg <- as.numeric(private$stat$rank <= nrow(count) * PDEG) + } else { + deg.flg <- private$estimatedDEG #TbT + } + return (deg.flg) +}) + + +TCC$methods(estimateDE = function (test.method = NULL, + FDR = NULL, +# paired = FALSE, + PDEG = NULL, + significance.level = NULL, + dispersion = NULL, + fit0 = NULL, fit1 = NULL, + design = NULL, + contrast = NULL, coef = NULL, + 
comparison = NULL, + samplesize = NULL, + floor.value = 1, + cl = NULL) { + paired <- FALSE + if (is.null(test.method)) { + if (paired) + test.method = "bayseq" + else if ((ncol(group) == 1) && (min(as.numeric(table(group))) == 1)) + test.method = "deseq" + else + test.method = "edger" + } + pdeg.method <- c("wad", "noiseq", "samseq") + if (length(grep(test.method, pdeg.method)) > 0) { + PDEG <- 0.05 + } else if (test.method != "bayseq" && is.null(FDR) && + is.null(significance.level)) { + FDR <- 0.1 + } + message(paste("TCC::INFO: Identifying DE genes using", test.method, "...")) + ## calculate statistics values related DE gene. + private$stat <<- list() + stat <<- list() + switch(test.method, + "edger" = .self$.testByEdger(design = design, + coef = coef, + contrast = contrast, + dispersion = dispersion, + paired = paired), + "deseq" = .self$.testByDeseq(fit1 = fit1, + fit0 = fit0, + paired = paired), + "bayseq" = .self$.testByBayseq(samplesize = samplesize, + cl = cl, + comparison = comparison, + paired = paired), + "noiseq" = .self$.testByNoiseq(paired = paired), + "ebseq" = .self$.testByEbseq(samplesize = samplesize, + paired = paired), + "samseq" = .self$.testBySamseq(samplesize = samplesize, + paired = paired), + ##"nbpseq" = .self$.testByNbpseq(), + "wad" = .self$.testByWad(floor.value = floor.value), + stop(paste("\nTCC::ERROR: The identifying method of ", + test.method, " doesn't supported.\n")) + ) + ## identify DE genes with the results of exact test. 
+ estimatedDEG <<- .self$.exactTest(FDR = FDR, + significance.level = significance.level, + PDEG = PDEG) + if (!is.null(private$stat$testStat)) + stat$testStat <<- private$stat$testStat + if (!is.null(private$stat$prob)) + stat$prob <<- private$stat$prob + if (!is.null(private$stat$likelihood)) + stat$likelihood <<- private$stat$likelihood + if (!is.null(private$stat$p.value)) + stat$p.value <<- private$stat$p.value + if (!is.null(private$stat$q.value)) + stat$q.value <<- private$stat$q.value + if (!is.null(private$stat$rank)) + stat$rank <<- private$stat$rank + private$estimated <<- TRUE + message("TCC::INFO: Done.") +}) + diff --git a/TCC/R/TCC.estimateDE.SAMseq.R b/TCC/R/TCC.estimateDE.SAMseq.R new file mode 100644 index 0000000..5f7edff --- /dev/null +++ b/TCC/R/TCC.estimateDE.SAMseq.R @@ -0,0 +1,56 @@ +TCC$methods(.testBySamseq = function(...) { + +.testBySAMseq.1 = function(samplesize = NULL) { + c <- round(.self$getNormalizedData()) + s <- samr::SAMseq(x = c, y = .self$group[, 1], + resp.type = "Two class unpaired", + nperms = samplesize) + private$stat$testStat <<- s$samr.obj$tt + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) + private$stat$q.value <<- rep(NA, length = nrow(.self$count)) + private$stat$rank <<- rank(- abs(s$samr.obj$tt)) +} + +.testBySAMseq.1p = function(samplesize = NULL) { + c <- round(.self$getNormalizedData()) + s <- samr::SAMseq(x = c, y = .self$group[, 1], + resp.type = "Two class paired", + nperms = samplesize) + private$stat$testStat <<- s$samr.obj$tt + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) + private$stat$q.value <<- rep(NA, length = nrow(.self$count)) + private$stat$rank <<- rank(- abs(s$samr.obj$tt)) +} + +.testBySAMseq.2 = function(samplesize = NULL) { + c <- round(.self$getNormalizedData()) + s <- samr::SAMseq(x = c, y = .self$group[, 1], + resp.type = "Multiclass", + nperms = samplesize) + private$stat$testStat <<- s$samr.obj$tt + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) 
+ private$stat$q.value <<- rep(NA, length = nrow(.self$count)) + private$stat$rank <<- rank(- abs(s$samr.obj$tt)) +} + +al <- list(...) +if (is.null(al$samplesize)) samplesize <- 10 +else samplesize <- al$samplesize +if (is.null(al$paired)) al$paired <- FALSE + +ts <- .self$.testStrategy() +if (ts == 1) { + #if (al$paired) + # .testBySAMseq.1p(samplesize = samplesize) + #else + .testBySAMseq.1(samplesize = samplesize) +} else if (ts == 2) { + .testBySAMseq.2(samplesize = samplesize) +} else if (ts == 3) { + stop() +} else { + stop() +} +}) + + diff --git a/TCC/R/TCC.estimateDE.WAD.R b/TCC/R/TCC.estimateDE.WAD.R new file mode 100644 index 0000000..1eec5a2 --- /dev/null +++ b/TCC/R/TCC.estimateDE.WAD.R @@ -0,0 +1,14 @@ +TCC$methods(.testByWad = function(floor.value, ...) { + ef <- colSums(count) * norm.factors + x <- sweep(count, 2, mean(ef) / ef, "*") + s <- .wad(x = x, group = .self$group[, 1], + log.scale = TRUE, + floor.value = floor.value) + private$stat$rank <<- rank(- abs(s)) + private$stat$testStat <<- s + private$stat$p.value <<- rep(NA, length = nrow(.self$count)) + private$stat$q.value <<- rep(NA, length = nrow(.self$count)) + private$estimatedDEG <<- rep(0, length = nrow(count)) +}) + + diff --git a/TCC/R/TCC.estimateDE.baySeq.R b/TCC/R/TCC.estimateDE.baySeq.R new file mode 100644 index 0000000..80dbc83 --- /dev/null +++ b/TCC/R/TCC.estimateDE.baySeq.R @@ -0,0 +1,111 @@ +TCC$methods(.testByBayseq = function(...) 
{ + +.testByBayseq.1p = function(samplesize = NULL, cl = NULL, comparison = NULL) { + if (is.null(comparison)) { + comparison <- 1 + } else if(!is.numeric(comparison)) { + cn <- colnames(.self$group) + comparison <- (1:length(cn))[comparison == cn] + } + ug <- unique(.self$group[, 1]) + cd <- nrow(.self$group) / 2 + el <- colSums(.self$count) * .self$norm.factors + capture.output(suppressMessages(d <- new("pairedData", + data = round(.self$count[, 1:cd]), + pairData = round(.self$count[, (cd + 1):(cd + cd)]), + replicates = .self$group[1:cd, 2], + groups = list(NDE = rep(1, length = cd), + DE = .self$group[1:cd, 2]), + libsizes = el[1:cd], + pairLibsizes = el[(cd + 1):(cd * 2)] + ))) + capture.output(suppressMessages(d <- getPriors.BB(d, + samplesize = samplesize, cl = cl))) + capture.output(suppressMessages(d <- getLikelihoods.BB(d, + pET = "BIC", nullProps = 0.5, cl = cl))) + stat.bayseq <- topCounts(d, group = comparison, number = nrow(.self$count)) + stat.bayseq <- stat.bayseq[rownames(.self$count), ] + ## private$stat$rank <<- rank(- d@posteriors[, "DE"]) + private$stat$likelihood <<- stat.bayseq$Likelihood + private$stat$p.value <<- 1 - stat.bayseq$Likelihood + private$stat$p.value[is.na(private$stat$p.value)] <<- 1 + private$stat$rank <<- rank(private$stat$p.value) + private$stat$q.value <<- stat.bayseq$FDR + private$estimatedDEG <<- as.numeric(.self$private$stat$rank < + (nrow(.self$count) * d@estProps[2])) + private$tbt$estProps <<- d@estProps[2] +} + +.testByBayseq.2 = function(samplesize = NULL, cl = NULL) { + capture.output(suppressMessages(d <- new("countData", + data = round(.self$count), + replicates = .self$group[, 1], + groups = list(NDE = rep(1, length = nrow(.self$group)), + DE = .self$group[, 1]), + libsizes = colSums(.self$count) * .self$norm.factors))) + capture.output(suppressMessages(d <- getPriors.NB(d, + samplesize = samplesize, estimation = "QL", cl = cl))) + capture.output(suppressMessages(d <- getLikelihoods.NB(d, + pET = "BIC", cl = 
cl))) + stat.bayseq <- topCounts(d, group = "DE", number = nrow(.self$count)) + stat.bayseq <- stat.bayseq[rownames(.self$count), ] + private$stat$rank <<- rank(- d@posteriors[, "DE"]) + private$stat$likelihood <<- stat.bayseq$Likelihood + private$stat$p.value <<- 1 - stat.bayseq$Likelihood + private$stat$q.value <<- stat.bayseq$FDR + private$estimatedDEG <<- as.numeric(.self$private$stat$rank < + (nrow(.self$count) * d@estProps[2])) + private$tbt$estProps <<- d@estProps[2] +} + +.testByBayseq.3 = function(samplesize = NULL, cl = NULL, + comparison = NULL) { + if (is.null(comparison)) + comparison <- colnames(.self$group)[2] + gs <- .self$group + gs <- cbind(rep(1, length = nrow(.self$group)), gs) + colnames(gs)[1] <- "NDE" + suppressMessages(d <- new("countData", data = round(.self$count), + replicates = .self$group[, 1], + groups = gs, + libsizes = colSums(.self$count) * .self$norm.factors)) + capture.output(suppressMessages(d <- getPriors.NB(d, + samplesize = samplesize, estimation = "QL", cl = cl))) + capture.output(suppressMessages(d <- getLikelihoods.NB(d, + pET = "BIC", cl = cl))) + stat.bayseq <- topCounts(d, group = comparison, number = nrow(.self$count)) + stat.bayseq <- stat.bayseq[rownames(.self$count), ] + private$stat$rank <<- rank(- d@posteriors[, comparison]) + private$stat$likelihood <<- stat.bayseq$Likelihood + private$stat$p.value <<- 1 - stat.bayseq$Likelihood + private$stat$q.value <<- stat.bayseq$FDR + private$estimatedDEG <<- as.numeric(.self$private$stat$rank < + (nrow(.self$count) * d@estProps[2])) + private$tbt$estProps <<- d@estProps[2] +} + +al <- list(...) 
+if (is.null(al$samplesize)) samplesize <- 10000 +else samplesize <- al$samplesize +if (is.null(al$paired)) al$paired <- FALSE + +cl <- al$cl +comparison <- al$comparison +ts <- .self$.testStrategy() +if (al$paired) { + .testByBayseq.1p(samplesize = samplesize, cl = cl, + comparison = comparison) +} else if (ts == 1) { + .testByBayseq.2(samplesize = samplesize, cl = cl) +} else if (ts == 2) { + .testByBayseq.2(samplesize = samplesize, cl = cl) +} else if (ts == 3) { + .testByBayseq.3(samplesize = samplesize, cl = cl, + comparison = comparison) +} else { + stop() +} + +}) + + diff --git a/TCC/R/TCC.estimateDE.edgeR.R b/TCC/R/TCC.estimateDE.edgeR.R new file mode 100644 index 0000000..403e62e --- /dev/null +++ b/TCC/R/TCC.estimateDE.edgeR.R @@ -0,0 +1,85 @@ +TCC$methods(.testByEdger = function(design = NULL, coef = NULL, + contrast = NULL, dispersion = NULL, ...) { + +.testByEdger.1 = function(dispersion = NULL) { + suppressMessages(d <- edgeR::DGEList(counts = round(count), + group = group[, 1])) + suppressMessages(d <- edgeR::calcNormFactors(d)) + d$samples$norm.factors <- norm.factors + if (min(table(group[, 1])) > 1) { + suppressMessages(d <- edgeR::estimateCommonDisp(d)) + suppressMessages(d <- edgeR::estimateTagwiseDisp(d)) + } + if (is.null(dispersion)) { + suppressMessages(d <- edgeR::exactTest(d)) + } else { + suppressMessages(d <- edgeR::exactTest(d, dispersion = dispersion)) + } + if (!is.null(d$table$PValue)) { + private$stat$p.value <<- d$table$PValue + } else { + private$stat$p.value <<- d$table$p.value + } + private$stat$rank <<- rank(private$stat$p.value) + private$stat$q.value <<- p.adjust(private$stat$p.value, method = "BH") +} + +.testByEdger.2 = function(design = NULL, coef = NULL, + contrast = NULL){ + if (is.null(design)) + design <- model.matrix(~ as.factor(.self$group[, 1])) + if (is.null(coef) && is.null(contrast)) + coef <- 2:length(unique(.self$group[, 1])) + suppressMessages(d <- edgeR::DGEList(counts = round(.self$count), + group = 
.self$group[, 1])) + suppressMessages(d <- edgeR::calcNormFactors(d)) + d$samples$norm.factors <- .self$norm.factors + suppressMessages(d <- edgeR::estimateGLMCommonDisp(d, design)) + suppressMessages(d <- edgeR::estimateGLMTrendedDisp(d, design)) + suppressMessages(d <- edgeR::estimateGLMTagwiseDisp(d, design)) + suppressMessages(fit <- edgeR::glmFit(d, design)) + suppressMessages(lrt <- edgeR::glmLRT(fit, coef = coef, + contrast = contrast)) + s <- topTags(lrt, n = nrow(.self$count)) + s <- s$table[rownames(.self$count), ] + private$stat$p.value <<- s$PValue + private$stat$rank <<- rank(.self$private$stat$p.value) + private$stat$q.value <<- s$FDR +} + +.testByEdger.3 = function(design = NULL, coef = NULL, + contrast = NULL){ + if (is.null(design)) + stop("TCC::ERROR: Need the design matrix for GLM.") + suppressMessages(d <- edgeR::DGEList(counts = round(.self$count), + group = .self$group[, 1])) + suppressMessages(d <- edgeR::calcNormFactors(d)) + d$samples$norm.factors <- .self$norm.factors + suppressMessages(d <- edgeR::estimateGLMCommonDisp(d, design)) + suppressMessages(d <- edgeR::estimateGLMTrendedDisp(d, design)) + suppressMessages(d <- edgeR::estimateGLMTagwiseDisp(d, design)) + suppressMessages(fit <- edgeR::glmFit(d, design)) + suppressMessages(lrt <- edgeR::glmLRT(fit, coef = coef, + contrast = contrast)) + s <- topTags(lrt, n = nrow(.self$count)) + s <- s$table[rownames(.self$count), ] + private$stat$p.value <<- s$PValue + private$stat$rank <<- rank(.self$private$stat$p.value) + private$stat$q.value <<- s$FDR +} + +ts <- .self$.testStrategy() +if (ts == 1) { + .testByEdger.1(dispersion = dispersion) +} else if (ts == 2) { + .testByEdger.2(design = design, coef = coef, + contrast = contrast) +} else if (ts == 3) { + .testByEdger.3(design = design, coef = coef, + contrast = contrast) +} else { + stop() +} + +}) + diff --git a/TCC/R/TCC.generic.R b/TCC/R/TCC.generic.R new file mode 100644 index 0000000..27eefe3 --- /dev/null +++ b/TCC/R/TCC.generic.R @@ 
-0,0 +1,138 @@ +plot.TCC <- function(x, FDR = NULL, median.lines = FALSE, floor = 0, + groups = NULL, col.tag = NULL, normalize = TRUE, ...) { + invisible(x$plotMA(FDR = FDR, median.lines = median.lines, floor = floor, + groups = groups, col.tag = col.tag, + normalize = normalize, ...)) +} + +subset.TCC <- function(x, subset, ...){ + if(!is.logical(subset)){ + if(is.numeric(subset)){ + new_v = logical(length(x)) + new_v[subset] <- TRUE + return(subset(x, new_v)) + } + if(is.character(subset)){ + new_v = logical(length(x)) + names(new_v) <- x$gene_id + new_v[subset] <- TRUE + return(subset(x, new_v)) + } + message("subset called with unsupported type") + return(F); + } + new_tcc <- new("TCC", as.matrix(x$count[subset, ]), + x$group, x$norm.factors, + as.character(x$gene_id[subset])) + if (x$private$estimated == TRUE) { + new_tcc$stat$rank <- x$stat$rank[subset] + new_tcc$stat$p.value <- x$stat$p.value[subset] + new_tcc$stat$q.value <- x$stat$q.value[subset] + } + if (!is.null(x$estimatedDEG) && length(x$estimatedDEG) > 0){ + new_tcc$estimatedDEG <- x$estimatedDEG[subset] + } + if (!is.null(x$simulation)){ + if(length(x$simulation$trueDEG)>0) + new_tcc$simulation$trueDEG <- x$simulation$trueDEG[subset] + if(length(x$simulation$fold.change)>0) + new_tcc$simulation$fold.change <- x$simulation$fold.change[subset] + new_tcc$simulation$PDEG <- x$simulation$PDEG + } + new_tcc$private <- x$private + return(new_tcc) +} + +show.TCC <- function(object) { + ## Counts. + cat("Count:\n") + print(head(object$count)) + cat("\n") + ## Conditions and Annotations. + df <- data.frame( + norm.factors = object$norm.factors, + lib.sizes = object$norm.factors * colSums(object$count) + ) + rownames(df) <- colnames(object$count) + df <- cbind(object$group, df) + cat("Sample:\n") + print(df) + cat("\n") + ## Normalized results. 
+ if (object$private$normalized) { + cat("DEGES:\n") + cat(paste(" Pipeline : ", + object$DEGES$pipeline, + "\n", sep = "")) + cat(paste(" Execution time : ", + sprintf("%.1f", object$DEGES$execution.time[3]), + " sec\n", sep = "")) + cat(paste(" Threshold type : ", + object$DEGES$threshold$type, + " < ", + sprintf("%.2f", object$DEGES$threshold$input), + "\n", + " Potential PDEG : ", + sprintf("%.2f", sum(object$DEGES$potDEG != 0) / + length(object$DEGES$potDEG)), + "\n\n", sep = "")) + } + ## Esimated results. + if (object$private$estimated) { + df <- getResult(object) + cat("Results:\n") + print(head(df)) + cat("\n") + } +} + + + +setGeneric( + name = "calcNormFactors", + def = function(tcc, ...) tcc) +setMethod( + f = "calcNormFactors", + signature(tcc = "DGEList"), + definition = function(tcc, ...) { + return(edgeR::calcNormFactors(tcc, ...)) + } +) + +setMethod( + f = "names", + signature(x = "TCC"), + definition = function(x) { + return (c("count", "gene_id", "group", "norm.factors", + "DEGES", "stat", "estimatedDEG", "simulation")) + } +) + +setMethod( + f = "length", + signature(x = "TCC"), + definition = function(x) { + return (nrow(x$count)) + } +) + +setMethod( + f = "[", + signature(x = "TCC"), + definition = function(x, i){ + return(subset(x,i)) + } +) + +setMethod( + f = "subset", + signature(x = "TCC"), + definition = subset.TCC +) + +setMethod( + f = "show", + signature(object = "TCC"), + definition = show.TCC +) + diff --git a/TCC/R/TCC.getNormalizedData.R b/TCC/R/TCC.getNormalizedData.R new file mode 100644 index 0000000..a122624 --- /dev/null +++ b/TCC/R/TCC.getNormalizedData.R @@ -0,0 +1,7 @@ + +TCC$methods(getNormalizedData = function () { + effective.libsizes <- colSums(count) * norm.factors + return (sweep(count, 2, + mean(effective.libsizes) / effective.libsizes, "*")) +}) + diff --git a/TCC/R/TCC.plotMA.R b/TCC/R/TCC.plotMA.R new file mode 100644 index 0000000..bd601a0 --- /dev/null +++ b/TCC/R/TCC.plotMA.R @@ -0,0 +1,106 @@ + 
+TCC$methods(plotMA = function (FDR = NULL, + significance.level = NULL, + median.lines = FALSE, + floor = 0, + groups = NULL, + col.tag = NULL, + normalize = TRUE, ...) { + arglist <- list(...) + if (is.null(arglist$xlab)) + arglist$xlab <- expression(A == (log[2] * G2 + log[2] * G1 ) / 2) + if (is.null(arglist$ylab)) + arglist$ylab <- expression(M == log[2] * G2 - log[2] * G1) + if (is.null(arglist$cex)) + arglist$cex <- 0.3 + if (is.null(arglist$pch)) + arglist$pch <- 20 + if (is.null(arglist$main)) + arglist$main <- "MA plot" + + ## set up default arguments. + gro <- .self$group[, 1] + gru <- unique(as.vector(gro)) + if (is.null(groups)) { + groups <- c(gru[1], gru[2]) + } + if (is.null(arglist$col)) { + if (private$estimated == TRUE) { + arglist$col <- c(1, rep(6, length = length(gru))) + } else if (private$simulation == TRUE) { + arglist$col <- c(1, 4, 2, 4 + 1:(length(gru))) + } else { + arglist$col <- rep(1, length = length(gru)) + } + } + if (normalize) + count.normed <- .self$getNormalizedData() + else + count.normed <- .self$count + mean.i <- rowMeans(as.matrix(count.normed[, gro == groups[1]])) + mean.j <- rowMeans(as.matrix(count.normed[, gro == groups[2]])) + norm.i <- mean(norm.factors[gro == groups[1]]) + norm.j <- mean(norm.factors[gro == groups[2]]) + ma.axes <- .self$.getMACoordinates(mean.i, mean.j, floor) + filter <- as.logical(mean.i > 0 & mean.j > 0) + a <- ma.axes$a.value + m <- ma.axes$m.value + + if (is.null(arglist$xlim)) + arglist$xlim <- c(min(a), max(a)) + if (is.null(arglist$ylim)) + arglist$ylim <- c(min(m), max(m)) + arglist$x <- c(0, 0) + arglist$type <- "n" + do.call(plot, arglist) + grid(col = "gray", lty = "dotted") + col.tag.v <- rep(0, length = nrow(count)) + if (private$estimated == FALSE) { + if (private$simulation == TRUE) + col.tag.v <- simulation$trueDEG + } else { + if ((!is.null(estimatedDEG)) && (length(estimatedDEG != 0))) { + col.tag.v <- as.numeric(estimatedDEG) + } + if (!(is.null(FDR) && 
is.null(significance.level))) { + private$stat$q.value <<- stat$q.value + private$stat$p.value <<- stat$p.value + col.tag.v <- .self$.exactTest(FDR = FDR, + significance.level = significance.level) + } + } + if (is.null(col.tag)) + col.tag <- col.tag.v + 1 + if (length(col.tag) != nrow(count)) + stop("\nTCC::ERROR: The length of col.tag has to be equal to the number of genes.\n") + for (k in unique(col.tag)) { + points(a[col.tag == k], m[col.tag == k], + col = arglist$col[k], pch = arglist$pch, cex = arglist$cex) + } + if (median.lines == TRUE) { + for (k in unique(col.tag)) { + if (length(setdiff(gru, groups)) != 0 && k == setdiff(gru, groups)) + next + med <- median(m[(col.tag == k & filter)]) + lines(c(min(a) + 1, max(a)), c(med, med), col = arglist$col[k]) + text(arglist$xlim[2], med + 0.5, sprintf("%.3f", med), col = arglist$col[k], + pos = 2, offset = 0) + } + } + invisible(data.frame(a.value = a, m.value = m)) +}) + +TCC$methods(.getMACoordinates = function(g1, g2, floor = 0) { + m <- rep(0, length = nrow(count)) + a <- rep(0, length = nrow(count)) + g1.min.nonZero <- min(g1[g1 > 0]) + g2.min.nonZero <- min(g2[g2 > 0]) + filter <- as.logical(g1 <= floor | g2 <= floor) + g1[g1 <= floor] <- g1.min.nonZero + g2[g2 <= floor] <- g2.min.nonZero + a <- (log2(g1) + log2(g2)) / 2 + m <- log2(g2) - log2(g1) + a[filter] <- min(a) - 1 + return(list(m.value = m, a.value = a)) +}) + diff --git a/TCC/R/TCC.public.R b/TCC/R/TCC.public.R new file mode 100644 index 0000000..957a011 --- /dev/null +++ b/TCC/R/TCC.public.R @@ -0,0 +1,114 @@ +setMethod( + f = "calcNormFactors", + signature(tcc = "TCC"), + definition = function(tcc, norm.method = NULL, test.method = NULL, + iteration = TRUE, FDR = NULL, floorPDEG = 0.05, + increment = FALSE, ...) { + obj <- tcc$copy() + obj$calcNormFactors(norm.method = norm.method, + test.method = test.method, + iteration = iteration, FDR = FDR, + floorPDEG = floorPDEG, increment = increment, + ...) 
+ return(obj) + } +) + +estimateDE <- function(tcc, test.method = NULL, FDR = NULL, + dispersion = NULL, + fit0 = NULL, fit1 = NULL, design = NULL, contrast=NULL, + coef = NULL, comparison = NULL, samplesize = NULL, + floor.value = 1, cl = NULL) { + obj <- tcc$copy() + obj$estimateDE(test.method=test.method, FDR=FDR, + dispersion=dispersion, + fit0=fit0, fit1=fit1, + design=design, contrast=contrast, coef=coef, + comparison=comparison, samplesize=samplesize, + floor.value = floor.value, cl=cl) + return(obj) +} + +getResult <- function(tcc, sort = FALSE, floor = 0) { + if (length(tcc$stat) == 0) + stop("\nTCC::ERROR: There are no statistics in stat fields of TCC class tcc. Execute TCC.estiamteDE for calculating them.\n") + ## calculate M-A coordinates + gru <- unique(tcc$group[, 1]) + m.value <- rep(NA, length = nrow(tcc$count)) + a.value <- rep(NA, length = nrow(tcc$count)) + if ((length(gru) == 2) && (ncol(tcc$group) == 1)) { + count.normed <- tcc$getNormalizedData() + mean.exp <- matrix(0, ncol = length(gru), nrow = nrow(tcc$count)) + gru <- unique(as.vector(tcc$group[, 1])) + mean.i <- rowMeans(as.matrix(count.normed[, tcc$group[, 1] == gru[1]])) + mean.j <- rowMeans(as.matrix(count.normed[, tcc$group[, 1] == gru[2]])) + ma.axes <- tcc$.getMACoordinates(mean.i, mean.j, floor) + m.value <- ma.axes$m.value + a.value <- ma.axes$a.value + } + if (!is.null(tcc$stat$p.value)) { + ## show p-values if existed + df <- data.frame( + gene_id = rownames(tcc$count), + a.value = a.value, + m.value = m.value, + p.value = tcc$stat$p.value, + q.value = tcc$stat$q.value, + rank = tcc$stat$rank, + estimatedDEG = tcc$estimatedDEG + ) + } else if (!is.null(tcc$stat$testStat)) { + ## show probability if existed + df <- data.frame( + gene_id = rownames(tcc$count), + a.value = a.value, + m.value = m.value, + testStat = tcc$stat$testStat, + rank = tcc$stat$rank, + estimatedDEG = tcc$estimatedDEG + ) + } else if (!is.null(tcc$stat$prob)) { + ## show probability if existed + df <- 
data.frame( + gene_id = rownames(tcc$count), + a.value = a.value, + m.value = m.value, + prob = tcc$stat$prob, + rank = tcc$stat$rank, + estimatedDEG = tcc$estimatedDEG + ) + } + rownames(df) <- NULL + if (sort) + df <- df[order(df$rank), ] + return (df) +} + +filterLowCountGenes <- function(tcc, low.count = 0) { + obj <- tcc$copy() + gru <- unique(obj$group[, 1]) + filters <- matrix(0, ncol = length(gru), nrow = nrow(obj$count)) + for (i in 1:length(gru)) { + filters[, i] <- as.numeric(rowSums( + as.matrix(obj$count[, (obj$group[, 1] == gru[i])]) + ) <= low.count) + } + left.tag <- as.logical(rowSums(filters) != length(gru)) + obj$count <- obj$count[left.tag, ] + if (!is.null(obj$simulation$trueDEG) && length(obj$simulation$trueDEG) != 0) + obj$simulation$trueDEG <- obj$simulation$trueDEG[left.tag] + if (!is.null(obj$estimatedDEG) && length(obj$estimatedDEG) != 0) + obj$estimatedDEG <- obj$estimatedDEG[left.tag] + if (!is.null(obj$stat) && length(obj$stat) != 0) { + for (i in 1:length(obj$stat)) { + if (length(obj$stat[[i]]) == length(left.tag)) + obj$stat[[i]] <- obj$stat[[i]][left.tag] + } + } + return (obj) +} + +getNormalizedData <- function(tcc) { + return (tcc$getNormalizedData()) +} + diff --git a/TCC/R/TCC.simulation.R b/TCC/R/TCC.simulation.R new file mode 100644 index 0000000..9107099 --- /dev/null +++ b/TCC/R/TCC.simulation.R @@ -0,0 +1,205 @@ +simulateReadCounts <- function(Ngene = 10000, PDEG = 0.20, + DEG.assign = NULL, DEG.foldchange = NULL, + replicates = NULL, group = NULL) { + ## one-factor + if (is.null(group)) { + if (is.null(replicates)) + replicates <- c(3, 3) + cond.num <- length(replicates) + if (is.null(DEG.assign)) + DEG.assign <- c(0.9, rep(0.1 / (cond.num - 1), + length = cond.num - 1)) + if (is.null(DEG.foldchange)) + DEG.foldchange <- rep(4, length = cond.num) + group <- as.data.frame(matrix(1, nrow = sum(replicates), + ncol = cond.num)) + DEG.fc <- as.data.frame(matrix(1, nrow = sum(replicates), + ncol = cond.num)) + reps <- 
rep(1:cond.num, times = replicates) + for (i in 1:cond.num) { + group[(reps == i), i] <- 2 + DEG.fc[(reps == i), i] <- DEG.foldchange[i] + } + DEG.foldchange <- DEG.fc + } + ## required arguments + if (is.null(group)) + stop("TCC::ERROR: The 'group' argument is required.") + if (is.null(DEG.assign)) + stop("TCC::ERROR: The 'DEG.assign' argument is required.") + if (is.null(DEG.foldchange)) + stop("TCC::ERROR: The 'DEG.foldchange' argument is required.") + ## check correctly + if (!is.data.frame(group)) + stop("TCC::ERROR: The 'group' argument should be data.frame.") + if (!is.data.frame(DEG.foldchange)) + stop("TCC::ERROR: The 'DEG.foldchange' argument should be data.frame.") + if (nrow(group) != nrow(DEG.foldchange)) + stop("TCC::ERROR: The number of rows of 'group' and 'DEG.foldchange' must equal.") + if (sum(DEG.assign) > 1) + stop("TCC::ERROR: The total value of DEG.assign must less than one.") + if (length(DEG.assign) != ncol(DEG.foldchange)) + stop("TCC::ERROR: The length of 'DEG.assign' should equal to the number of columns of 'DEG.foldchange'.") + ## message + message("TCC::INFO: Generating simulation data under NB distribution ...") + message(paste("TCC::INFO: (genesizes : ", Ngene, ")")) + if (!is.null(replicates)) { + message(paste("TCC::INFO: (replicates : ", paste(replicates, collapse=", "), ")")) + message(paste("TCC::INFO: (PDEG : ", paste(PDEG * DEG.assign, collapse=", "), ")")) + } else { + message(paste("TCC::INFO: (samples : ", nrow(group), ")")) + message(paste("TCC::INFO: (factors : ", ncol(group), ")")) + message(paste("TCC::INFO: (PDEG : ", PDEG, ")")) + } + ## prepare mean and dispersion vectors + arab <- NULL + rm(arab) + data(arab) + rpm.a <- sweep(arab[, 1:3], 2, + median(colSums(arab[, 1:3])) / colSums(arab[, 1:3]), "*") + rpm.b <- sweep(arab[, 4:6], 2, + median(colSums(arab[, 4:6])) / colSums(arab[, 4:6]), "*") + rpm.a <- rpm.a[apply(rpm.a, 1, var) > 0, ] + rpm.b <- rpm.b[apply(rpm.b, 1, var) > 0, ] + mean.ab <- c(apply(rpm.a, 1, mean), 
apply(rpm.b, 1, mean)) + var.ab <- c(apply(rpm.a, 1, var), apply(rpm.b, 1, var)) + dispersion <- (var.ab - mean.ab) / (mean.ab * mean.ab) + population <- data.frame(mean = mean.ab, disp = dispersion) + population <- population[population$disp > 0, ] + resampling.vector <- sample(1:nrow(population), Ngene, replace = TRUE) + population <- population[resampling.vector, ] + ## make foldchange-matrix for sampling count data. + fc.matrix <- matrix(1, nrow = Ngene, ncol = nrow(group)) + fc.index <- unique(c(0, cumsum(round(Ngene * PDEG * DEG.assign)))) + trueDEG <- rep(0, length = Ngene) + for (i in 2:length(fc.index)) { + fc.matrix[(fc.index[i - 1] + 1):(fc.index[i]), ] <- + fc.matrix[(fc.index[i - 1] + 1):(fc.index[i]), ] * + matrix(rep(DEG.foldchange[, i - 1], + times = fc.index[i] - fc.index[i - 1]), + ncol = ncol(fc.matrix), byrow = TRUE) + if (is.null(replicates)) + trueDEG[(fc.index[i - 1] + 1):(fc.index[i])] <- 1 + else + trueDEG[(fc.index[i - 1] + 1):(fc.index[i])] <- i - 1 + } + ## sampling data + count <- matrix(0, ncol = ncol(group), nrow = Ngene) + count <- apply(fc.matrix, 2, function(x, pp = population) { + rnbinom(n = Ngene, + mu = x * pp$mean, + size = 1 / pp$disp) + }, population) + if (!is.null(replicates)) { + colnames(count) <- paste("G", rep(1:length(replicates), + times = replicates), + "_rep", sequence(replicates), sep = "") + } else { + repnm <- apply(group, 1, function(i){paste(i, collapse="")}) + colnm <- repnm + tb <- table(repnm) + tbm <- tb + 1 + for (i in 1:length(repnm)) { + colnm[i] <- paste(repnm[i], paste("rep", + tbm[repnm[i]] - tb[repnm[i]], sep = ""), sep = "_") + tb[repnm[i]] <- tb[repnm[i]] - 1 + } + colnames(count) <- colnm + } + rownames(count) <- paste("gene", 1:nrow(count), sep = "_") + ## TCC constructor + tcc <- new("TCC", count, + if(is.null(replicates)) group + else rep(1:length(replicates), times = replicates)) + tcc$simulation$trueDEG <- trueDEG + tcc$simulation$DEG.foldchange <- fc.matrix + tcc$simulation$PDEG <- PDEG * 
DEG.assign + tcc$simulation$params <- population + tcc$private$simulation.rep <- + if(is.null(replicates)) group + else rep(1:length(replicates), times = replicates) + tcc$private$simulation <- TRUE + tcc$private$estimated <- FALSE + return(tcc) +} + +calcAUCValue <- function(tcc) { + if (is.null(tcc$simulation$trueDE) || length(tcc$simulation$trueDE) == 0) + stop("\nTCC::ERROR: No true positive annotations about differential expression genes.\n ") + if (is.null(tcc$stat$rank) || length(tcc$stat$rank) == 0) + stop("\nTCC::ERROR: There are no rank informations in TCC tcc. It need run TCC.estimateDE().\n") + return(AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDE != 0), + data = - tcc$stat$rank))) +} + +plotFCPseudocolor <- function(tcc, main = "", + xlab = "samples", ylab = "genes") { + if (is.null(tcc$simulation$trueDEG) || length(tcc$simulation$trueDEG) == 0) + message("\nTCC::ERROR: There is no annotations about simulation data.\n") + d <- tcc$simulation$DEG.foldchange + layout(matrix(data = c(1, 2), nrow = 1, ncol = 2), + widths = c(4, 1), heights=c(1, 1)) + maxlevel <- ceiling(max(tcc$simulation$DEG.foldchange)) + minlevel <- ceiling(1 / min(tcc$simulation$DEG.foldchange)) + d[d < 1] <- - 1 / d[d < 1] + 2 + if (min(d) >= 1) { + colorRamp <- c( + "#FFFFFFFF", + cm.colors((maxlevel - 1) * 32)[((maxlevel - 1) + * 16):((maxlevel - 1) * 32 - 1)] + ) + } else if (max(d) <= 1) { + colorRamp <- c( + cm.colors((minlevel - 1) * 32)[2:((minlevel - 1) * 16)], + "#FFFFFFFF" + ) + } else { + colorRamp <- c( + cm.colors((minlevel - 1) * 32)[2:((minlevel - 1) * 16)], + "#FFFFFFFF", + cm.colors((maxlevel - 1) * 32)[((maxlevel - 1) + * 16):((maxlevel - 1) * 32 - 1)] + ) + } + + colorLevels <- seq(2 - minlevel, maxlevel, length = length(colorRamp)) + par(mar = c(3 + ncol(tcc$group) * 0.6, 4.5, 2.5, 2)) + image(1:ncol(d), 1:nrow(d), t(d[rev(1:nrow(d)), ]), + col = colorRamp, ylab = ylab, xlab = "", main = main, axes = FALSE, + zlim = range(2 - minlevel, maxlevel)) + 
title(xlab = xlab, line = 1 + ncol(tcc$group)) + for (i in 1:ncol(tcc$group)) { + axis(1, at = 1:nrow(tcc$group), labels = tcc$group[, i], + cex.axis = 0.8, + line = i * ifelse(i == 1, 1, 0.6) - ifelse(i == 1, 1, 0.6) , + tick = as.logical(i == 1), + lty = as.numeric(i != 0)) + mtext(colnames(tcc$group)[i], side = 1, at = -0, + cex = 0.8, adj = 1, + line = i * ifelse(i == 1, 1, 0.6) - ifelse(i == 1, 1, 0.6) + 1) + } + ycoor <- c(1, cumsum(nrow(tcc$count) * tcc$simulation$PDEG), + nrow(tcc$count) - 0.5) + yaxis <- sprintf("%d", c(1, cumsum(nrow(tcc$count) * tcc$simulation$PDEG), + nrow(tcc$count))) + axis(2, at = nrow(tcc$count) - ycoor, labels = yaxis, + cex.axis = 0.7, las = 1) + box() + par(mar = c(3 + ncol(tcc$group) * 0.6, 2.5, 2.5, 2)) + image(1, 0:length(colorRamp), + matrix(colorLevels, ncol = length(colorRamp), nrow = 1), + col = colorRamp, xlab = "", ylab = "", + xaxt = "n", yaxt="n") + lb <- seq(from = - minlevel + 2, to = maxlevel, by = 1) + lc <- lb + lc[lc < 1] <- 1 / (2 - lc[lc < 1]) + axis(2, + at = seq(from = 0, to = length(colorRamp), + by = length(colorRamp) / (length(lb) - 1)), + labels = c(rev(paste("1/", 1:minlevel, sep = "")[-1]), + sprintf("%d", 1:maxlevel)), + cex.axis = 0.8) + box() + layout(1) +} + diff --git a/TCC/R/TCC_0.4.R b/TCC/R/TCC_0.4.R index d2f8b9e..6339983 100644 --- a/TCC/R/TCC_0.4.R +++ b/TCC/R/TCC_0.4.R @@ -1,55 +1,61 @@ - -MAplot <- function(datalist, FDR_threshold=0.01){ - data <- datalist$counts - data.cl <- datalist$group - norm_f_TbT <- datalist$norm_f_TbT - x_axis <- datalist$Mval - y_axis <- datalist$Aval - plot(x_axis, y_axis, xlab="A = (log2(B)+log2(A))/2", ylab="M = log2(B)-log2(A)", pch=20, cex=.3) - grid(col="gray", lty="dotted") - points(x_axis[datalist$data$FDR < FDR_threshold], y_axis[datalist$data$FDR< FDR_threshold], col=2, pch=20, cex=0.3) - baseline_TbT <- log2(mean(norm_f_TbT[data.cl==2])/mean(norm_f_TbT[data.cl==1])) - abline(h=baseline_TbT, col="red", lwd=1) +MAplot <- function(datalist, FDR_threshold = 
0.01){ + data <- datalist$counts + data.cl <- datalist$group + norm_f_TbT <- datalist$norm_f_TbT + x_axis <- datalist$Mval + y_axis <- datalist$Aval + plot(x_axis, y_axis, xlab = "A = (log2(B)+log2(A))/2", + ylab = "M = log2(B)-log2(A)", pch = 20, cex = .3) + grid(col = "gray", lty = "dotted") + points(x_axis[datalist$data$FDR < FDR_threshold], + y_axis[datalist$data$FDR < FDR_threshold], + col = 2, pch = 20, cex = 0.3) + baseline_TbT <- log2(mean(norm_f_TbT[data.cl == 2]) / + mean(norm_f_TbT[data.cl == 1])) + abline(h = baseline_TbT, col = "red", lwd = 1) } -# generate negative binomial distributed datasets with different frequencies -NBsample <- function(DEG_foldchange = 4, repA = 3, repB = 3, Ngene = 3000, PDEG = 0.15, PA = 0.2){ - arab <- NULL;rm(arab) # to avoid note by R CMD check - data(arab) #arab dataset from NBPseq +## generate negative binomial distributed datasets with different frequencies +NBsample <- function(DEG_foldchange = 4, repA = 3, repB = 3, + Ngene = 3000, PDEG = 0.15, PA = 0.2){ + arab <- NULL;rm(arab) # to avoid note by R CMD check + data(arab) # arab dataset from NBPseq data.cl <- c(rep(1, 3), rep(2, 3)) - RPM <- sweep(arab, 2, 1000000/colSums(arab), "*") + RPM <- sweep(arab, 2, 1000000 / colSums(arab), "*") RPM_A <- RPM[,data.cl == 1] RPM_B <- RPM[,data.cl == 2] RPM_A <- RPM_A[apply(RPM_A, 1, var) > 0,] RPM_B <- RPM_B[apply(RPM_B, 1, var) > 0,] MEAN <- c(apply(RPM_A, 1, mean), apply(RPM_B, 1, mean)) VARIANCE <- c(apply(RPM_A, 1, var), apply(RPM_B, 1, var)) - DISPERSION <- (VARIANCE - MEAN)/(MEAN*MEAN) + DISPERSION <- (VARIANCE - MEAN) / (MEAN * MEAN) mean_disp_tmp <- cbind(MEAN, DISPERSION) mean_disp_tmp <- mean_disp_tmp[mean_disp_tmp[,2] > 0,] - resampling_vector <- sample(1:nrow(mean_disp_tmp), Ngene, replace=TRUE) + resampling_vector <- sample(1:nrow(mean_disp_tmp), Ngene, replace = TRUE) mean_disp <- mean_disp_tmp[resampling_vector,] mu <- mean_disp[,1] DEG_degree_A <- rep(1, Ngene) DEG_degree_A[1:(Ngene*PDEG*PA)] <- DEG_foldchange - 
mu_A <- mu*DEG_degree_A + mu_A <- mu * DEG_degree_A DEG_degree_B <- rep(1, Ngene) - DEG_degree_B[(Ngene*PDEG*PA+1):(Ngene*PDEG)] <- DEG_foldchange - mu_B <- mu*DEG_degree_B - DEG_posi_org <- (DEG_degree_A*DEG_degree_B) > 1 - nonDEG_posi_org <- (DEG_degree_A*DEG_degree_B) == 1 + DEG_degree_B[(Ngene * PDEG * PA + 1):(Ngene * PDEG)] <- DEG_foldchange + mu_B <- mu * DEG_degree_B + DEG_posi_org <- (DEG_degree_A * DEG_degree_B) > 1 + nonDEG_posi_org <- (DEG_degree_A * DEG_degree_B) == 1 outA <- NULL colnamev <-NULL for(i in 1:repA){ - outA <- cbind(outA, rnbinom(n=length(mu_A), mu=mu_A, size=1/mean_disp[,2])) - colnamev <-cbind(colnamev, paste("A", as.character(i), sep="")) + outA <- cbind(outA, rnbinom(n = length(mu_A), + mu = mu_A, size = 1 / mean_disp[,2])) + colnamev <-cbind(colnamev, paste("A", as.character(i), sep = "")) } outB <- NULL for(i in 1:repB){ - outB <- cbind(outB, rnbinom(n=length(mu_B), mu=mu_B, size=1/mean_disp[,2])) - colnamev <-cbind(colnamev, paste("B", as.character(i), sep="")) + outB <- cbind(outB, rnbinom(n = length(mu_B), + mu = mu_B, size = 1 / mean_disp[,2])) + colnamev <-cbind(colnamev, paste("B", as.character(i), sep = "")) } out <- cbind(outA, outB) colnames(out) <- colnamev @@ -64,49 +70,53 @@ NBsample <- function(DEG_foldchange = 4, repA = 3, repB = 3, Ngene = 3000, PDEG -##################################### -### TbT normalization methods ### -##################################### +## TbT normalization methods do_TbT <- function(data, data.cl, sample_num = 10000){ RAW <- data - ### Step 1: first normalization ### - d <- DGEList(counts=data, group=data.cl) + ## Step 1: first normalization + d <- DGEList(counts = data, group = data.cl) d <- calcNormFactors(d) norm_f_TMM <- d$samples$norm.factors names(norm_f_TMM) <- colnames(data) - ### Step 2: DEG identification ### - groups <- list(NDE=rep(1, length(data.cl)), DE=data.cl) - norm_f_RPM=1000000/colSums(data) + ## Step 2: DEG identification + groups <- list(NDE = rep(1, length(data.cl)), DE 
= data.cl) + norm_f_RPM = 1000000 / colSums(data) RPM <- sweep(data, 2, norm_f_RPM, "*") data <- round(RPM) - once_normalized <- new("countData", data=as.matrix(data), replicates=data.cl, libsizes=colSums(data)*norm_f_TMM, groups=groups) - once_normalized.NB <- getPriors.NB(once_normalized, samplesize=sample_num, estimation="QL", cl=NULL) - out <- getLikelihoods.NB(once_normalized.NB, pET="BIC", cl=NULL) - PDEG <- out@estProps[2] #proportion of differentially expressed genes + once_normalized <- new("countData", data = as.matrix(data), + replicates = data.cl, + libsizes = colSums(data) * norm_f_TMM, + groups = groups) + once_normalized.NB <- getPriors.NB(once_normalized, + samplesize = sample_num, + estimation = "QL", cl = NULL) + out <- getLikelihoods.NB(once_normalized.NB, pET = "BIC", cl = NULL) + PDEG <- out@estProps[2] # proportion of differentially expressed genes rank_bayseq <- rank(-out@posteriors[,2]) - NDEG <- (nrow(data) * PDEG)# number of differentially expressed genes + NDEG <- (nrow(data) * PDEG) # number of differentially expressed genes - ### Step 3: second normalization ### + ## Step 3: second normalization obj_DEGy <- (rank_bayseq < NDEG) obj_DEGn <- (rank_bayseq >= NDEG) data <- RAW[obj_DEGn,] - d <- DGEList(counts=data, group=data.cl) + d <- DGEList(counts = data, group = data.cl) d <- calcNormFactors(d) - norm_f_TbTorg <- d$samples$norm.factors*colSums(data)/colSums(RAW) - norm_f_TbT <- norm_f_TbTorg/mean(c(mean(norm_f_TbTorg[data.cl==1]),mean(norm_f_TbTorg[data.cl==2]))) - + norm_f_TbTorg <- d$samples$norm.factors * colSums(data) / colSums(RAW) + norm_f_TbT <- norm_f_TbTorg / mean(c(mean(norm_f_TbTorg[data.cl == 1]), + mean(norm_f_TbTorg[data.cl == 2]))) data <- RPM - meanA <- log2(apply(data[,data.cl==1], 1, mean)) - meanB <- log2(apply(data[,data.cl==2], 1, mean)) - Aval <- (meanA + meanB)/2 + meanA <- log2(apply(data[,data.cl == 1], 1, mean)) + meanB <- log2(apply(data[,data.cl == 2], 1, mean)) + Aval <- (meanA + meanB) / 2 Mval <- meanB - 
meanA - ### calculation of PA value (degree of biased expression) ### - RPM_TMM <- sweep(RPM, 2, 1/norm_f_TMM, "*") + ## calculation of PA value (degree of biased expression) ### + RPM_TMM <- sweep(RPM, 2, 1 / norm_f_TMM, "*") data <- RPM_TMM - logratio <- log2(apply(data[,data.cl==2], 1, mean)) - log2(apply(data[,data.cl==1], 1, mean)) - PA <- sum(logratio[rank_bayseq < NDEG] < 0)/NDEG + logratio <- log2(apply(data[,data.cl == 2], 1, mean)) - + log2(apply(data[,data.cl == 1], 1, mean)) + PA <- sum(logratio[rank_bayseq < NDEG] < 0) / NDEG retval <- list(norm_f_TbT, Aval, Mval, PDEG, PA, obj_DEGn, obj_DEGy, norm_f_TMM, norm_f_TbTorg, data.cl, data) names(retval) <- c("norm_f_TbT", "Mval", "Aval", "PDEG", "PA", "nonDEG_posi", "DEG_posi", "norm_f_TMM", "norm_f_TbTorg", "data.cl", "data") @@ -115,33 +125,36 @@ do_TbT <- function(data, data.cl, sample_num = 10000){ exactTestafterTbT <- function(names, counts, group, sample_num = 10000){ - #if (!("edgeR" %in% loadedNamespaces())) - # library(edgeR) - #edgeR_Version <-sessionInfo()$otherPkgs$edgeR$Version - #edgeR_v <- as.integer(strsplit(edgeR_Version, '.', fixed=TRUE)[[1]]) + ##if (!("edgeR" %in% loadedNamespaces())) + ## library(edgeR) + ##edgeR_Version <-sessionInfo()$otherPkgs$edgeR$Version + ##edgeR_v <- as.integer(strsplit(edgeR_Version, '.', fixed=TRUE)[[1]]) tbtout <- do_TbT(counts, group, sample_num) - d <- DGEList(counts=counts, group=group) + d <- DGEList(counts = counts, group = group) d$samples$norm.factors <- tbtout$norm_f_TbT d <- estimateCommonDisp(d) - #if (edgeR_v[[1]] == 2 & edgeR_v[[2]] <= 6) { - #if(is.null(span) == FALSE) prop.used <- span else if(is.null(prop.used)) prop.used <- 0.5 + ##if (edgeR_v[[1]] == 2 & edgeR_v[[2]] <= 6) { + ##if(is.null(span) == FALSE) prop.used <- + ## span else if(is.null(prop.used)) prop.used <- 0.5 d <- estimateTagwiseDisp(d) - #} - #if (edgeR_v[[1]] > 2 | edgeR_v[[2]] >= 7) - # if(is.null(prop.used) == FALSE) span <- prop.used - # d <- estimateTagwiseDisp(d, 
span=span, grid.length=grid.length) - #} + ##} + ##if (edgeR_v[[1]] > 2 | edgeR_v[[2]] >= 7) + ## if(is.null(prop.used) == FALSE) span <- prop.used + ## d <- estimateTagwiseDisp(d, span=span, grid.length=grid.length) + ##} out <- exactTest(d) - if(is.vector(out$table$PValue)){#for current edgeR - FDR <- p.adjust(out$table$PValue, method="BH") - }else if(is.vector(out$table$p.value)){#for older edgeR - FDR <- p.adjust(out$table$p.value, method="BH") - }else{#something strange + if(is.vector(out$table$PValue)){ # for current edgeR + FDR <- p.adjust(out$table$PValue, method = "BH") + }else if(is.vector(out$table$p.value)){ # for older edgeR + FDR <- p.adjust(out$table$p.value, method = "BH") + }else{ # something strange warning("PValue was not available") } rank_edgeR <- rank(FDR) retval <- cbind(names, out$table, FDR, rank_edgeR) - return(list(data=retval, norm_f_TbT=tbtout$norm_f_TbT, Mval=tbtout$Mval, Aval=tbtout$Aval, counts=counts, group = group)) + return(list(data = retval, norm_f_TbT = tbtout$norm_f_TbT, + Mval = tbtout$Mval, Aval = tbtout$Aval, + counts = counts, group = group)) } diff --git a/TCC/R/TCC_public.R b/TCC/R/TCC_public.R deleted file mode 100644 index 22da4ce..0000000 --- a/TCC/R/TCC_public.R +++ /dev/null @@ -1,290 +0,0 @@ -# getSimulationData -# sample the simulation data under NB model. -generateSimulationData <- function(Ngene=10000, PDEG=0.20, DEG.assign=c(0.9, 0.1), - DEG.model="uniform", DEG.foldchange=NULL, - group=c(3, 3)) { -# The method is for generating simulation data. -# 1) Make super dispersion from arab data for generating simulation data. -# 2) Adjust disersion$mean for resampling. -# If "uniform" model of DEG.model, then time foldchange to dispersion$mean. -# If "gamma" model of DEG.model, then time one to dispersion$mean. -# 3) Generate simulation data under NB dispersion with dispersion$mean. -# 4) Adjust simulation data. -# If "uniform" model of DEG.model, then times one to all simulation data. 
-# If "gamma" model of DEG.model, then times foldchange calculated from DEG.gamma parameters. -# 5) Return the simulation data as matrix object. - - # Prepare and adjust default paramaters. - max.len <- max(length(DEG.assign), length(group), length(DEG.foldchange)) - if (length(group) != max.len) { - g <- rep(group, length = max.len) - } else { - g <- group - } - if (length(DEG.assign) != max.len) { - def.num <- max.len - length(DEG.assign) - DEG.assign <- c(DEG.assign[1:(length(DEG.assign) - 1)], - rep(DEG.assign[length(DEG.assign)] / (def.num + 1), times=def.num + 1)) - } - if (is.null(DEG.foldchange)) { - if (DEG.model == "uniform") - DEG.foldchange <- list(c(4)) - if (DEG.model == "gamma") - DEG.foldchange <- list(c(1.2, 2.0, 0.5)) - } - if (DEG.model == "uniform") { - for (i in 1:length(DEG.foldchange)) { - if (length(DEG.foldchange[[i]]) != 1) - message ("TCC::INFO: DEG.foldchange has three element in the vectors, only the first element is used for fixed foldchange.") - } - } else if (DEG.model == "gamma") { - for (i in 1:length(DEG.foldchange)) { - if (length(DEG.foldchange[[i]]) != 3) - stop ("\nTCC::ERROR: It need three elements in each vectors when the DEG.mode is specified to gamma.\n") - } - } - if (length(DEG.foldchange) != max.len) { - DEG.foldchange <- rep(DEG.foldchange, length=max.len) - } - DEG.foldchange <- rep(DEG.foldchange, length = max.len) - if (sum(DEG.assign) > 1) - stop("TCC::ERROR: The total value of DEG.assign must less than one.\n") - message("TCC::INFO: Generating simulation data under NB distribution ...") - message(paste("TCC::INFO: (genesizes : ", paste(Ngene, collapse=", "), ")")) - message(paste("TCC::INFO: (groups : ", paste(g, collapse=", "), ")")) - message(paste("TCC::INFO: (foldhcange distribution : ", DEG.model, ")")) - message(paste("TCC::INFO: (PDEG : ", paste(PDEG * DEG.assign, collapse=", "), ")")) - - # 1) Prepare the super population for sampling. 
- arab <- NULL - rm(arab) - data(arab) - rpm.a <- sweep(arab[, 1:3], 2, median(colSums(arab[, 1:3])) / colSums(arab[, 1:3]), "*") - rpm.b <- sweep(arab[, 4:6], 2, median(colSums(arab[, 4:6])) / colSums(arab[, 4:6]), "*") - rpm.a <- rpm.a[apply(rpm.a, 1, var) > 0, ] - rpm.b <- rpm.b[apply(rpm.b, 1, var) > 0, ] - mean.ab <- c(apply(rpm.a, 1, mean), apply(rpm.b, 1, mean)) - var.ab <- c(apply(rpm.a, 1, var), apply(rpm.b, 1, var)) - dispersion <- (var.ab - mean.ab) / (mean.ab * mean.ab) - population <- data.frame(mean = mean.ab, disp = dispersion) - population <- population[population$disp > 0, ] - resampling.vector <- sample(1:nrow(population), Ngene, replace = TRUE) - population <- population[resampling.vector, ] # super dispersion - - # 2) Make foldchagen-matrix for sampling count data. - fc.matrix <- matrix(1, ncol=sum(g), nrow=Ngene) - DEG.index <- rep(0, length = nrow(population)) # The DEGs position. - reps <- rep(1:length(g), times=g) - if (DEG.model == "uniform") { - DEG.index[1:round(Ngene * PDEG)] <- - rep(1:length(DEG.assign), times = round(Ngene * PDEG * DEG.assign)) - for (i in 1:length(reps)) { - fc.matrix[, i] <- rep(1, length=Ngene) - fc.matrix[(DEG.index == reps[i]), i] <- DEG.foldchange[[reps[i]]][1] - } - } - - # 3) Sample simulation data from NB dispersion. - count <- matrix(0, ncol = sum(g), nrow = nrow(population)) - for (i in 1:length(reps)) { - count[, i] <- rnbinom(n = Ngene, - mu = fc.matrix[, i] * population$mean, - size = 1 / population$disp) - } - - # 4) Adjust count data with DEG.gamma paramaters only for "gamma" model. 
- if (DEG.model == "gamma") { - count.means <- matrix(0, ncol=length(g), nrow=Ngene) - for (i in 1:length(g)) { - if (is.null(ncol(count[, (reps == i)]))) { - count.means[, i] <- count[, (reps == i)] - } else { - count.means[, i] <- rowMeans(count[, (reps == i)]) - } - } - col.idx <- 1 - for (i in 1:length(g)) { - deg.num <- round(Ngene * PDEG * DEG.assign[i]) - if (is.null(ncol(count.means[, -i]))) { - deg.candidate <- (count.means[, i] > count.means[, -i]) - } else { - deg.candidate <- (count.means[, i] > apply(count.means[, -i], 1, max)) - } - DEG.index[(deg.candidate & cumsum(deg.candidate) <= deg.num)] <- i - for (j in 1:g[i]) { - fc.matrix[(DEG.index == i), col.idx] <- - DEG.foldchange[[i]][1] + rgamma(sum(DEG.index == i), shape=DEG.foldchange[[i]][2], scale=DEG.foldchange[[i]][3]) - count[(DEG.index == i), col.idx] <- - count[(DEG.index == i), col.idx] * fc.matrix[(DEG.index == i), col.idx] - col.idx <- col.idx + 1 - } - } - # sort by DEG.index . - DEG.index[(DEG.index == 0)] <- 100 - count <- count[order(DEG.index), ] - fc.matrix <- fc.matrix[order(DEG.index), ] - DEG.index <- DEG.index[order(DEG.index)] - DEG.index[(DEG.index == 100)] <- 0 - } - - # save the annotations for generating simulation data to TCC object. - colnames(count) <- paste("G", rep(1:length(g), times=g), "_rep", sequence(g), sep="") - rownames(count) <- paste("gene", 1:nrow(count), sep="_") - tcc <- new("TCC", count, group) - tcc$simulation$trueDEG <- DEG.index - tcc$simulation$DEG.foldchange <- fc.matrix - tcc$simulation$PDEG <- PDEG * DEG.assign - tcc$simulation$group <- g - tcc$replicates <- rep(1:length(g), times=g) - tcc$private$simulation <- TRUE - tcc$private$estimated <- FALSE - return(tcc) -} - - -# plotSimulationMap -# plot heat map with simulation conditions. 
-plotFCPseudocolor <- function(tcc, main="", - xlab="samples", ylab="genes") { - if (is.null(tcc$simulation$trueDEG) || length(tcc$simulation$trueDEG) == 0) - message("\nTCC::ERROR: There is no annotations about simulation data.\n") - # make matrix data for plot heatmap of foldchange. - d <- tcc$simulation$DEG.foldchange - # prepare layout. - layout(matrix(data=c(1,2), nrow=1, ncol=2), widths=c(4,1), heights=c(1,1)) - #colorRamp <- rgb(seq(0,1,length=256), seq(0,1,length=256), seq(1,0,length=256)) - maxlevel <- round(max(tcc$simulation$DEG.foldchange)) - colorRamp <- rgb(seq(1, 1, length=maxlevel), seq(1, 0, length=maxlevel), seq(1, 1, length=maxlevel)) - colorLevels <- seq(1, maxlevel, length=length(colorRamp)) - par(mar=c(5.5,4.5,2.5,2)) - image(1:ncol(d), 1:nrow(d), t(d[rev(1:nrow(d)), ]), col=colorRamp, - ylab=ylab, xlab="", main=main, axes=FALSE, zlim=c(1, max(tcc$simulation$DEG.foldchange))) - title(xlab=xlab, line=4) - axis(1, at=1:ncol(d), labels=paste("rep", sequence(tcc$simulation$group), sep=""), cex.axis=0.7, line=0) - axis(1, at=cumsum(tcc$simulation$group) - tcc$simulation$group + 1, - labels=paste("Group", c(1:length(tcc$simulation$group)), sep=" "), cex.axis=0.7, line=1, lty=0) - y.axis <- c(1, cumsum(nrow(tcc$count) * tcc$simulation$PDEG), nrow(tcc$count) - 0.5) - y.labels <- c(1, cumsum(nrow(tcc$count) * tcc$simulation$PDEG), nrow(tcc$count)) - axis(2, at=nrow(tcc$count) - y.axis, labels=y.labels, cex.axis=0.7, las=1) - box() - # colorbar. - par(mar = c(5.5, 2.5, 2.5, 2)) - image(1, colorLevels, matrix(colorLevels, ncol=length(colorLevels), nrow=1), - col = colorRamp, xlab="", ylab="", xaxt="n") - box() - layout(1) -} - - - -# calcNormFactors -# calculate normalization factors with TCC class tcc. -setGeneric(name = "calcNormFactors", def = function(tcc, ...) tcc) -setMethod( - f = "calcNormFactors", - signature(tcc = "DGEList"), - definition = function(tcc, ...) 
{ - return(edgeR::calcNormFactors(tcc, ...)) - } -) -setMethod( - f = "calcNormFactors", - signature(tcc = "TCC"), - definition = function(tcc, norm.method=NULL, test.method=NULL, iteration=TRUE, - FDR=NULL, floorPDEG=NULL, samplesize=10000, processors=NULL) { - ex.time <- proc.time() - obj <- tcc$copy() - obj$calcNormFactors(norm.method=norm.method, test.method=test.method, iteration=iteration, - FDR=FDR, floorPDEG=floorPDEG, samplesize=samplesize, processors=processors) - obj$stat$execution.time <- proc.time() - ex.time - return(obj) - } -) - -# estimateDE -# the method is for estimating DEGs. -estimateDE <- function(tcc, test.method=NULL, FDR=NULL, samplesize=10000, processors=NULL) { - obj <- tcc$copy() - obj$estimateDE(test.method=test.method, FDR=FDR, samplesize=samplesize, processors=processors) - return(obj) -} - -# plot -# plot MA-plot with TCC class tcc. -plot.TCC <- function(x, FDR=NULL, median.lines = FALSE, floor=0, main=NULL, - xlab = expression(A == (log[2] * G2 + log[2] * G1 ) / 2), - ylab = expression(M == log[2] * G2 - log[2] * G1), - xlim = NULL, ylim = NULL, cex = 0.3, pch = 19, col = NULL, ...) { - invisible(x$plotMA(FDR=FDR, median.lines=median.lines, floor=floor, main=main, xlab=xlab, ylab=ylab, - xlim=xlim, ylim=ylim, cex=cex, pch=pch, col=col, ...)) -} - -# getResult -# get p-value, FDR or the axes of MA-plot as data.frame. -getResult <- function(tcc, sort = FALSE, floor = 0) { - if (length(tcc$group) != 2) - stop("\nTCC::EEROR: This version doesn't support when the group more than two.\n") - if (length(tcc$stat) == 0) - stop("\nTCC::ERROR: There are no statistics in stat fields of TCC class tcc. 
Execute TCC.estiamteDE for calculating them.\n") - count.normed <- tcc$getNormalizedCount() - mean.exp <- matrix(0, ncol=length(tcc$group), nrow=nrow(tcc$count)) - for (g in 1:length(tcc$group)) { - mean.exp[, g] <- rowMeans(as.matrix(count.normed[, tcc$replicates == g])) - } - ma.axes <- tcc$.getMACoordinates(mean.exp[, 1], mean.exp[, 2], floor) - result.df <- data.frame( - id = rownames(tcc$count), - a.value = ma.axes$a.value, - m.value = ma.axes$m.value, - p.value = tcc$stat$p.value, - q.value = tcc$stat$q.value, - rank = tcc$stat$rank, - estimatedDEG = tcc$estimatedDEG - ) - if (sort) - result.df <- result.df[order(result.df$rank), ] - return (result.df) -} - -# filterData -# remove the low count data. -filterLowCountGenes <- function(tcc, low.count = 0) { - obj <- tcc$copy() - filters <- matrix(0, ncol=length(obj$group), nrow=nrow(obj$count)) - for (i in 1:length(obj$group)) { - if (obj$group[i] == 1) { - filters[, i] <- as.numeric(obj$count[, (obj$replicates == i)] <= low.count) - } else { - filters[, i] <- as.numeric(rowSums(obj$count[, (obj$replicates == i)]) <= low.count) - } - } - left.tag <- as.logical(rowSums(filters) != length(obj$group)) - obj$count <- obj$count[left.tag, ] - if (!is.null(obj$simulation$trueDEG) && length(obj$simulation$trueDEG) != 0) - obj$simulation$trueDEG <- obj$simulation$trueDEG[left.tag] - if (!is.null(obj$estimatedDEG) && length(obj$estimatedDEG) != 0) - obj$estimatedDEG <- obj$estimatedDEG[left.tag] - if (!is.null(obj$stat) && length(obj$stat) != 0) { - for (i in 1:length(obj$stat)) { - if (length(obj$stat[[i]]) == length(left.tag)) - obj$stat[[i]] <- obj$stat[[i]][left.tag] - } - } - return (obj) -} - -# calcAUCValue -# calculate AUC value with TCC class tcc. 
-calcAUCValue <- function(tcc) { - if (is.null(tcc$simulation$trueDE) || length(tcc$simulation$trueDE) == 0) - stop("\nTCC::ERROR: No true positive annotations about differential expression genes.\n ") - if (is.null(tcc$stat$rank) || length(tcc$stat$rank) == 0) - stop("\nTCC::ERROR: There are no rank informations in TCC tcc. It need run TCC.estimateDE().\n") - return(AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDE != 0), data = - tcc$stat$rank))) -} - -# getNormalizedData -# normalize count data with the normalization factors in TCC and return it. -getNormalizedData <- function(tcc) { - return (tcc$getNormalizedCount()) -} diff --git a/TCC/R/WAD.R b/TCC/R/WAD.R new file mode 100644 index 0000000..6d4c62e --- /dev/null +++ b/TCC/R/WAD.R @@ -0,0 +1,45 @@ +.wad <- function(x, group, log.scale = TRUE, floor.value = 1) { + AD <- FALSE + if (log.scale) { + x[x < floor.value] <- floor.value + x <- log2(x) + } + ug <- unique(group) + s <- combn(length(ug), 2, function(ij, x = x, g = group, ug = ug, AD = AD) { + g1 <- (g == ug[ij[1]]) + g2 <- (g == ug[ij[2]]) + m1 <- rowMeans(as.matrix(x[, g1])) + m2 <- rowMeans(as.matrix(x[, g2])) + if (AD) { + x_ave <- abs(m1 - m2) / 2 + } else { + x_ave <- (m1 + m2) / 2 + } + weight <- (x_ave - min(x_ave)) / (max(x_ave) - min(x_ave)) + s <- (m2 - m1) * weight + return (s) + }, TRUE, x, group, ug, AD) + s <- apply(s, 1, function(i) { + return (i[max(abs(i)) == abs(i)]) + }) + return(s) +} + +WAD <- function(data, group, log.scale = FALSE, floor.value = 1, sort = FALSE) { + data <- as.matrix(data) + wad <- .wad(x = data, group = group, + log.scale = log.scale, + floor.value = floor.value) + wad <- data.frame(wad = wad, + rank = rank(- abs(wad))) + if (!is.null(rownames(data))) { + rownames(wad) <- rownames(data) + } else { + rownames(wad) <- 1:nrow(data) + } + if (sort) { + wad <- wad[order(wad[, 2]), ] + } + return(wad) +} + diff --git a/TCC/data/hypoData.RData b/TCC/data/hypoData.RData index f933ce6..87426d6 100644 Binary files 
a/TCC/data/hypoData.RData and b/TCC/data/hypoData.RData differ diff --git a/TCC/data/hypoData_mg.RData b/TCC/data/hypoData_mg.RData new file mode 100644 index 0000000..b0e2701 Binary files /dev/null and b/TCC/data/hypoData_mg.RData differ diff --git a/TCC/data/hypoData_ts.RData b/TCC/data/hypoData_ts.RData new file mode 100644 index 0000000..c455ed2 Binary files /dev/null and b/TCC/data/hypoData_ts.RData differ diff --git a/TCC/data/nakai.RData b/TCC/data/nakai.RData new file mode 100644 index 0000000..afe5dd6 Binary files /dev/null and b/TCC/data/nakai.RData differ diff --git a/TCC/inst/CITATION b/TCC/inst/CITATION index 0a8239f..8d5db5e 100644 --- a/TCC/inst/CITATION +++ b/TCC/inst/CITATION @@ -1,29 +1,159 @@ -citHeader("When you use package 'TCC' in publications please cite:") - -year <- sub(".*(2[[:digit:]]{3})-.*", "\\1", meta$Date, perl = TRUE) -vers <- paste("R package version", meta$Version) -citEntry(entry="Article", - title = "A normalization strategy for comparing tag count data", - author = personList(as.person("Koji Kadota"), - as.person("Tomoaki Nishiyama"), - as.person("Kentaro Shimizu")), - year = 2012, - journal = "Algorithms for Molecular Biology", - volume = 7, - pages = 5, - textVersion = paste("Koji Kadota, Tomoaki Nishiyama, Kentaro Shimizu ", - "(2012). A normalization strategy for comparing tag count data. ", - "Algorithms for Molecular Biology 7:5", - sep="")) -citEntry(entry="Manual", - title = "DES:Differential expression genes Eliminating Strategy for calculating normalization factors with tag count data", - author = personList(as.person("Sun Jianqiang"), - as.person("Tomoaki Nishiyama"), - as.person("Kentaro Shimizu"), - as.person("Koji Kadota")), - year = year, - note = vers, - textVersion = - paste("Sun Jianqiang, Tomoaki Nishiyama, Kentaro Shimizu, Koji Kadota ", - "(", year, "). DES::Differential expression genes Eliminating Strategy for calculating normalization factors with tag count data. 
", - vers, ".", sep="")) +citHeader("Please cite appropriate references when you publish your results.") + + +citEntry( + entry = "article", + title = "Differential expression analysis for sequence count data", + author = "Anders S and Huber W", + journal = "Genome Biology", + volume = "11(10)", + pages = "R106", + year = 2010, + textVersion = "Anders S and Huber W. Differential expression analysis for sequence count data. Genome Biol. 2010, 11(10): R106" +) + + +citEntry( + entry = "article", + title = "The NBP negative binomial model for assessing differential gene expression from RNA-Seq", + author = "Di Y, Schafer DW, Cumbie JS and Chang JH", + journal = "Stat Appl Genet Mol Biol", + volume = "26(1)", + pages = 139-140, + year = 2010, + textVersion = "Di Y, Schafer DW, Cumbie JS, and Chang JH. The NBP negative binomial model for assessing differential gene expression from RNA-Seq. Stat Appl Genet Mol Biol. 2011, 10: art24" +) + + +citEntry( + entry = "article", + title = "baySeq: empirical Bayesian methods for identifying differential expression in sequence count data", + author = "Hardcastle TJ and Kelly KA", + journal = "BMC Bioinformatics", + volume = 11, + pages = 422, + year = 2010, + textVersion = "Hardcastle TJ and Kelly KA. baySeq: empirical Bayesian methods for identifying differential expression in sequence count data. BMC Bioinformatics 2010, 11: 422" +) + + +citEntry( + entry = "article", + title = "A normalization strategy for comparing tag count data", + author = "Kadota K, Nishiyama T and Shimizu K", + journal = "Algorithms for Molecular Biology", + volume = 7, + pages = 5, + year = 2012, + textVersion = "Kadota K, Nishiyama T, and Shimizu K. A normalization strategy for comparing tag count data. Algorithms Mol Biol. 
2012, 7: 5" +) + + +citEntry( + entry = "article", + title = "edgeR: a Bioconductor package for differential expression analysis of digital gene expression data", + author = "Robinson MD, McCarthy DJ and Smyth GK", + journal = "Bioinformatics", + volume = "26(1)", + pages = 139-140, + year = 2010, + textVersion = "Robinson MD, McCarthy DJ, and Smyth GK. edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics 2010, 26(1): 139-140" +) + + +citEntry( + entry = "article", + title = "A scaling normalization method for differential expression analysis of RNA-seq data", + author = "Robinson MD and Oshlack A", + journal = "Genome Biology", + volume = 11, + pages = "R25", + year = 2010, + textVersion = "Robinson MD and Oshlack A. A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biol. 2010, 11: R25" +) + + +citEntry( + entry = "article", + title = "Small-sample estimation of negative binomial dispersion, with applications to SAGE data", + author = "Robinson MD and Smyth GK", + journal = "Biostatistics", + volume = 9, + pages = 321-332, + year = 2008, + textVersion = "Robinson MD and Smyth GK. Small-sample estimation of negative binomial dispersion, with applications to SAGE data. Biostatistics 2008, 9: 321-332" +) + + +citEntry( + entry = "article", + title = "TCC: an R package for comparing tag count data with robust normalization strategies", + author = "Sun J, Nishiyama T, Shimizu K and Kadota K", + journal = "BMC Bioinformatics", + volume = 14, + pages = 219, + year = 2013, + textVersion = "Sun J, Nishiyama T, Shimizu K, and Kadota K. TCC: an R package for comparing tag count data with robust normalization strategies. 
BMC Bioinformatics 2013, 14: 219" +) + + +citEntry( + entry = "article", + title = "Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation", + author = "McCarthy DJ, Chen Y and Smyth GK", + journal = "Nucleic Acids Research", + volume = 40, + pages = 4288-4297, + year = 2012, + textVersion = "McCarthy DJ, Chen Y and Smyth GK. Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research 2012, 40: 4288-4297" +) + + +citEntry( + entry = "article", + title = "A weighted average difference method for detecting differentially expressed genes from microarray data", + author = "Kadota K, Nakai Y, Shimizu K", + journal = "Algorithms Mol Biol.", + volume = 3, + pages = 8, + year = 2008, + textVersion = "Kadota K, Nakai Y, Shimizu K: A weighted average difference method for detecting differentially expressed genes from microarray data. Algorithms Mol Biol. 2008, 3: 8" +) + + +citEntry( + entry = "article", + title = "ROKU: a novel method for identification of tissue-specific genes", + author = "Kadota K, Ye J, Nakai Y, Terada T, Shimizu K", + journal = "BMC Bioinformatics", + volume = 7, + pages = 294, + year = 2006, + textVersion = "Kadota K, Ye J, Nakai Y, Terada T, Shimizu K: ROKU: a novel method for identification of tissue-specific genes. BMC Bioinformatics 2006, 7: 294" +) + + +citEntry( + entry = "article", + title = "Detection of genes with tissue-specific expression patterns using Akaike's Information Criterion (AIC) procedure", + author = "Kadota K, Nishimura SI, Bono H, Nakamura S, Hayashizaki Y, Okazaki Y, Takahashi K", + journal = "Physiol Genomics", + volume = 12, + pages = 251-259, + year = 2003, + textVersion = "Kadota K, Nishimura SI, Bono H, Nakamura S, Hayashizaki Y, Okazaki Y, Takahashi K: Detection of genes with tissue-specific expression patterns using Akaike's Information Criterion (AIC) procedure. 
Physiol Genomics 2003, 12: 251-259" +) + + +citEntry( + entry = "article", + title = "Simple method for the detection of outliers", + author = "Ueda T", + journal = "Japanese J Appl Stat", + volume = 25, + pages = 17-26, + year = 1996, + textVersion = "Ueda T. Simple method for the detection of outliers. Japanese J Appl Stat 1996, 25: 17-26" +) + + diff --git a/TCC/inst/NEWS b/TCC/inst/NEWS deleted file mode 100644 index a2b75b5..0000000 --- a/TCC/inst/NEWS +++ /dev/null @@ -1,16 +0,0 @@ -TCC-class was implemented as a R5 reference class. -Wrapper functions with functional programming semantics -are provided. - -Implement DES for normalization - -Return value of the NBsample() method changed to a list - -MAplot() method added - -Support of edgeR_2.7 - -The paper describing TbT (TMM-baySeq-TMM) pipeline is published in -Algorithms for Molecular Biology 2012 Apr 5;7(1):5. PMID: 22475125. -You can read the paper at -http://www.almob.org/content/7/1/5 diff --git a/TCC/inst/doc/TCC.Rnw b/TCC/inst/doc/TCC.Rnw new file mode 100644 index 0000000..2a107c7 --- /dev/null +++ b/TCC/inst/doc/TCC.Rnw @@ -0,0 +1,2260 @@ +%\VignetteIndexEntry{TCC} +%\VignettePackage{TCC} + +\documentclass{article} +\usepackage[a4paper]{geometry} +\usepackage{color} +\usepackage{Sweave} +\SweaveOpts{pdf=TRUE} + + +\definecolor{TccBlue}{cmyk}{0.74,0.26,0,0.14} +\definecolor{TccRed}{cmyk}{0,1,0.78,0.1} +\definecolor{TccGreen}{cmyk}{0.99,0,0.29,0.47} +\definecolor{TccOrange}{cmyk}{0,0.27,1,0.03} + +\renewcommand{\floatpagefraction}{0.9} +\newcommand{\Robject}[1]{{\texttt{#1}}} +\newcommand{\Rfunction}[1]{{\Robject{#1}}} +\newcommand{\Rpackage}[1]{\textbf{\texttt{#1}}} +\newcommand{\Rclass}[1]{{\texttt{#1}}} + +\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=n,formatcom=\color{TccRed}} +\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontshape=n,formatcom=\color{TccBlue}} + +\author {\small Jianqiang Sun$^{1\S}$, Tomoaki Nishiyama$^{2\S}$, Kentaro Shimizu$^1$, and Koji Kadota$^1$\\ 
+\texttt{\footnotesize $^1$ The University of Tokyo, Tokyo, Japan}\\ +\texttt{\footnotesize $^2$ Kanazawa University, Kanazawa, Japan}\\ +\texttt{\footnotesize $^\S$ Maintainer: Jianqiang Sun (wukong@bi.a.u-tokyo.ac.jp),}\\ +\texttt{\footnotesize Tomoaki Nishiyama (tomoakin@staff.kanazawa-u.ac.jp)} +} +\title{TCC: Differential expression analysis for tag count data +with robust normalization strategies} + +\begin{document} + +\maketitle\thispagestyle{empty} + +\begin{abstract} +The R/Bioconductor package, \Rpackage{TCC}, provides users with a +robust and accurate framework to perform differential expression (DE) +analysis of tag count data. We recently developed a multi-step +normalization method (TbT; Kadota et al., 2012 \cite{kadota}) for +two-group RNA-seq data. The strategy (called DEGES) is to remove data +that are potential differentially expressed genes (DEGs) before +performing the data normalization. DEGES in \Rpackage{TCC} is essential +for accurate normalization of tag count data, especially when the up- +and down-regulated DEGs in one of the groups are extremely biased in +their number. A major characteristic of \Rpackage{TCC} is to provide +the DEGES-based normalization methods for several kinds of count data +(two-group with or without replicates, multi-group, and so on) +by virtue of the use of combinations of functions in other sophisticated +packages (especially \Rpackage{edgeR}, \Rpackage{DESeq}, and +\Rpackage{baySeq}). The appropriate combination provided by \Rpackage{TCC} +allows a more robust and accurate estimation to be performed more easily +than directly using original packages and \Rpackage{TCC} provides a simple +unified interface to perform the robust normalization. +\end{abstract} + +\newpage + +\tableofcontents + +\newpage + +\section{Introduction} + +Differential expression analysis based on tag count data has become a +fundamental task for identifying differentially expressed genes or +transcripts (DEGs). 
The \Rpackage{TCC} package (Tag Count Comparison; +Sun et al., 2013 \cite{sun}) provides users with a robust and accurate +framework to perform differential expression analysis of tag count data. +\Rpackage{TCC} provides integrated analysis pipelines with improved +data normalization steps, compared with other packages such as +\Rpackage{edgeR}, \Rpackage{DESeq}, and \Rpackage{baySeq}, +by appropriately combining their functionalities. +The package incorporates multi-step normalization methods whose strategy is +to remove data that are potential DEGs before performing +the data normalization. + +Kadota et al. (2012) \cite{kadota} recently reported that +the normalization methods implemented in R packages +(such as \Rpackage{edgeR} (Robinson et al., 2010 \cite{robinson}), +\Rpackage{DESeq} (Anders and Huber, 2010 \cite{anders}), +and \Rpackage{baySeq} (Hardcastle and Kelly, 2010 \cite{hardcastle})) for +differential expression (DE) analysis between samples +are inadequate when the up- and down-regulated DEGs in one of the samples +are extremely biased in their number (i.e., biased DE). This is because the +current methods implicitly assume a balanced DE, +wherein the numbers of highly and lowly expressed DE entities in +samples are (nearly) equal. As a result, +methods assuming unbiased DE will not work well on data with biased DE. +Although a major purpose of data normalization is to detect such DE entities, +their existence themselves consequently interferes with their opportunity to +be top-ranked. Conventional procedures for identifying DEGs from tag count +data consisting of two steps (i.e., data normalization and identification of +DEGs) cannot in principle eliminate the potential DE entities before +data normalization. 
+ +To normalize data that potentially has various scenarios (including unbiased +and biased DE), we recently proposed a multi-step normalization strategy +(called TbT, an acronym for the TMM-\Rpackage{baySeq}-TMM pipeline; Kadota +et al., 2012 \cite{kadota}), in which the TMM normalization method +(Robinson and Oshlack, 2010 \cite{robinson2}) is used in steps 1 and 3 +and an empirical Bayesian method implemented +in the \Rpackage{baySeq} package +(Hardcastle and Kelly, 2010 \cite{hardcastle}) is used in +step 2. Although this multi-step DEG elimination strategy (called "DEGES" +for short) can successfully remove potential DE entities identified in +step 2 prior to the estimation of the normalization factors using the TMM +normalization method in step 3, the \Rpackage{baySeq} package used in step +2 of the TbT method is much more computationally intensive than competing +packages like \Rpackage{edgeR} and \Rpackage{DESeq}. While the three-step +TbT normalization method performed best on simulated and real tag count data, +it is practically possible to make different choices for the methods in +each step. A more comprehensive study regarding better choices for DEGES +is needed. + +This package provides tools to perform multi-step normalization methods +based on DEGES and enables differential expression analysis of tag count +data without having to worry much about biased distributions of DEGs. +The DEGES-based normalization function implemented in \Rpackage{TCC} includes +the TbT method based on DEGES for two-group data with or without replicates, +much faster method, and methods for multi-group comparison. \Rpackage{TCC} +provides a simple unified interface to perform data normalization with +combinations of functions provided by \Rpackage{baySeq}, \Rpackage{DESeq}, +and \Rpackage{edgeR}. Functions to produce simulation data under various +conditions and to plot the data are also provided. 
+ +\subsection{Installation} +\label{section-1-1} + +This package is available from the Bioconductor website (http://bioconductor.org/). +To install the package, enter the following command after starting R: + + +\begin{Schunk} +\begin{Sinput} +> source("http://bioconductor.org/biocLite.R") +> biocLite("TCC") +\end{Sinput} +\end{Schunk} + + +\subsection{Citations} +\label{section-1-2} + +This package internally uses many of the functions implemented in the other +packages. This is because our normalization procedures consist, in part, of +combinations of existing normalization methods and differential expression +(DE) methods. + +For example, the TbT normalization method +(Kadota et al., 2012 \cite{kadota}), which is +a particular functionality of the \Rpackage{TCC} package +(Sun et al., 2013 \cite{sun}), +consists of the TMM normalization method +(Robinson and Oshlack, 2010 \cite{robinson2}) implemented +in the \Rpackage{edgeR} package +(Robinson et al., 2010 \cite{robinson}) and the empirical Bayesian +method implemented in the \Rpackage{baySeq} package +(Hardcastle and Kelly, 2010 \cite{hardcastle}). +Therefore, please cite the appropriate references +when you publish your results. + +\begin{Schunk} +\begin{Sinput} +> citation("TCC") +\end{Sinput} +\end{Schunk} + + +\subsection{Quick start} +\label{section-1-3} + +Let us begin by showing two examples (Cases1 and 2) of identifying DEGs +between two groups from tag count data consisting of $1,000$ +genes and a total of six samples +(each group has three biological replicates). The hypothetical +count data (termed "\Robject{hypoData}") is stored in this package +(for details, see section \ref{section-2-1}). We then describe the +DE analysis of count data without replicates (i.e., two samples), +using the data of the first and the fourth +column of \Robject{hypoData} (Case 3). We recommend the use of commands in +Cases 2 and 3. 
+ +Case 1: DE analysis of two-group count data with replicates by +using the exact test (Robinson and Smyth, 2008 \cite{robinson3}) +in \Rpackage{edgeR} coupled with TbT normalization +(termed the TbT-\Rpackage{edgeR} combination). +The \Rpackage{TCC} package was originally designed with the TbT +normalization method, and the original study (Kadota et al., 2012 +\cite{kadota}) recommended this analysis pipeline. +Note that a smaller sampling size (i.e., \Robject{samplesize = 100}) +is used here to reduce the computation time, but a larger sampling size of +around $10,000$ (i.e., \Robject{samplesize = 10000}) +is recommended (Hardcastle and Kelly, 2010 \cite{hardcastle}). +Suggested citations are as follows: \Rpackage{TCC} (Sun et al., +2013 \cite{sun}), TbT (Kadota et al., 2012 \cite{kadota}), +TMM (Robinson and Oshlack, 2010 \cite{robinson2}), +\Rpackage{baySeq} (Hardcastle and Kelly, 2010 \cite{hardcastle}), +and \Rpackage{edgeR} (Robinson et al., 2010 \cite{robinson}). +For details, see section \ref{section-3-1-1}. + +<>= +library(TCC) +data(hypoData) +samplesize <- 100 +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = samplesize) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +result <- getResult(tcc, sort = TRUE) +head(result) +@ + +Case 2: DE analysis for two-group count data with replicates by using +the exact test coupled with iterative DEGES/\Rpackage{edgeR} +normalization (i.e., the iDEGES/\Rpackage{edgeR}-\Rpackage{edgeR} +combination). This is an alternative pipeline designed to reduce +the runtime (approx. $20$ sec.), yet its performance is comparable to +the above pipeline. Accordingly, we recommend using this pipeline as +a default when analyzing tag count data with replicates. 
+A notable advantage of this pipeline is that the multi-step +normalization strategy only needs the methods +implemented in the \Rpackage{edgeR} package. The suggested citations are as +follows: \Rpackage{TCC} (Sun et al., 2013 \cite{sun}), +TMM (Robinson and Oshlack, 2010 \cite{robinson2}), +the exact test (Robinson and Smyth, 2008 \cite{robinson3}), +and \Rpackage{edgeR} (Robinson et al., 2010 \cite{robinson}). +For details, see section \ref{section-3-1-3}. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +result <- getResult(tcc, sort = TRUE) +head(result) +@ + +Case 3: DE analysis for two-group count data without replicates by using +the negative binomial (NB) test in \Rpackage{DESeq} coupled with +iDEGES/\Rpackage{DESeq} normalization +(i.e., the iDEGES/\Rpackage{DESeq}-\Rpackage{DESeq} combination). +A procedure using the data of the first and fourth columns of +\Robject{hypoData} is shown here. Similar to Case 2, this pipeline entirely +consists of methods implemented in the \Rpackage{DESeq} package. Suggested +citations are as follows: \Rpackage{TCC} (Sun et al., 2013 \cite{sun}) +and \Rpackage{DESeq} (Anders and Huber, 2010 \cite{anders}). For details, +see section \ref{section-3-2}. 
+ +<>= +library(TCC) +data(hypoData) +group <- c(1, 2) +tcc <- new("TCC", hypoData[,c(1,4)], group) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "deseq", FDR = 0.1) +result <- getResult(tcc, sort = TRUE) +head(result) +@ + +\newpage + +\section{Preparations} + +\subsection{Reading the count data} +\label{section-2-1} + +Similar to the other packages, \Rpackage{TCC} typically starts the DE +analysis with a count table matrix where each row indicates a gene +(or transcript), each column indicates a sample (or library), +and each cell indicates the number of counts for a gene in a sample. +Here, we assume a hypothetical count matrix consisting of 1,000 rows +(or genes) and a total of six columns (the first three columns are +produced from biological replicates of Group 1 and the remaining +columns are from Group 2); i.e., \{G1\_rep1, G1\_rep2, G1\_rep3\} +vs. \{G2\_rep1, G2\_rep2, G2\_rep3\}. We start by loading the +hypothetical data (\Robject{hypoData}) from \Rpackage{TCC} and giving +a numeric vector (\Robject{group}) indicating +which group each sample belongs to. + +<>= +library(TCC) +data(hypoData) +head(hypoData) +dim(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +@ + +If you want to analyze another count matrix consisting of nine columns +(e.g., the first four columns are produced from biological replicates +of G1, and the remaining five columns are from G2), the \Robject{group} +vector should be indicated as follows. + +<>= +group <- c(1, 1, 1, 1, 2, 2, 2, 2, 2) +@ + + +\subsection{Constructing TCC class object} +\label{section-2-2} + +The \Rfunction{new} function has to be used to perform the main +functionalities of \Rpackage{TCC}. +This function constructs a \Rpackage{TCC} class object, +and subsequent analyses are performed on this class object. 
The object +is constructed from i) a count matrix (\Robject{hypoData}) and ii) the +corresponding numeric vector (\Robject{group}) as follows. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc +@ + +The count matrix and group vector information can be retrieved from +the stored class object by using \Robject{tcc\$count} +and \Robject{tcc\$group}, respectively. + +<>= +head(tcc$count) +tcc$group +@ + +The subset of \Rpackage{TCC} class object can be taken by the +\Rfunction{subset} or \Rfunction{"["} functions. + + +<>= +dim(tcc$count) +tcc.sub1 <- subset(tcc, c(rep(TRUE, 20), rep(FALSE, 980))) +dim(tcc.sub1$count) +tcc.sub2 <- tcc[1:20] +dim(tcc.sub2$count) +@ + + +\subsection{Filtering low-count genes (optional)} +\label{section-2-3} + +The way to filter out genes with low-count tags across samples depends +on the user's philosophy. Although we recommend removing tags with zero +counts across samples as a minimum filtering, this effort is optional. +The \Rfunction{filterLowCountGenes} function performs this filtering. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- filterLowCountGenes(tcc) +dim(tcc$count) +@ + +It can be seen that $\Sexpr{nrow(hypoData) - nrow(tcc$count)} +(= \Sexpr{nrow(hypoData)} -\Sexpr{ nrow(tcc$count)})$ +genes were filtered as non-expressed. +The same procedure can be performed without +the \Rfunction{filterLowCountGenes} function, in which case the +filtering is performed before the \Rpackage{TCC} class object is constructed. + +<>= +filter <- as.logical(rowSums(hypoData) > 0) +dim(hypoData[filter, ]) +tcc <- new("TCC", hypoData[filter, ], group) +dim(tcc$count) +@ + +\newpage + +\section{Normalization} + +\subsection{Normalization of two-group count data with replicates} +\label{section-3-1} + +This package provides robust normalization methods based on DEGES +proposed by Kadota et al. (2012) \cite{kadota}. 
When obtaining normalization +factors from two-group data with replicates, users can select +a total of six combinations (two normalization methods $\times$ +three DEG identification methods) coupled with an arbitrary number +of iterations ($n = 0, 1, 2, \dots, 100$) in our DEGES-based +normalization pipeline. We show some of the practical combinations +below. + +Since the three-step TbT normalization method was originally designed +for normalizing tag count data with (biological) replicates, we will +first explain the TbT method (\ref{section-3-1-1} DEGES/TbT). +In relation to the other DEGES-based methods, we will call the method +"DEGES/TbT" for convenience. As mentioned in the original study, +DEGES/TbT needs a long computation time. Accordingly, we present +three shorter alternatives (\ref{section-3-1-2} DEGES/\Rpackage{edgeR}, +\ref{section-3-1-3} iDEGES/\Rpackage{edgeR}, and \ref{section-3-1-4} +DEGES/\Rpackage{DESeq}). Note that the purpose here is to obtain +accurate normalization factors to be used with statistical models +(e.g., the exact test or empirical Bayes) for the DE analysis described +in the next section (\ref{section-4} \textbf{Differential expression}). + +\subsubsection{DEGES/TbT} +\label{section-3-1-1} + +The DEGES/TbT (Kadota et al., 2012 \cite{kadota}) with default parameter +settings can be performed as follows. + +<>= +set.seed(1000) +library(TCC) +data(hypoData) +samplesize <- 100 +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = samplesize) +@ + +Note that a smaller sampling size (i.e., \Robject{samplesize = 100}) +is used here to reduce the computation time when performing +the empirical Bayesian method in step 2, +but a larger sampling size of around $10,000$ +(i.e., \Robject{samplesize = 10000}) is recommended +(Hardcastle and Kelly, 2010 \cite{hardcastle}). 
+This method estimates an empirical distribution of the parameters
+of the NB distribution by bootstrapping from the input data.
+While the sampling size can be made smaller to reduce the
+computation time (e.g., \Robject{samplesize = 40}), the resulting
+normalization factors will vary from trial to trial.
+In this vignette, we will call the \Rfunction{set.seed}
+function for obtaining reproducible results
+(i.e., the \Robject{tcc\$norm.factors} values) when
+using any random function. The calculated normalization factors
+and the computation time can be retrieved with the following commands.
+
+<>=
+tcc$norm.factors
+tcc$DEGES$execution.time
+@
+
+Of course, the procedure can be performed by using functions
+in \Rpackage{edgeR} and \Rpackage{baySeq},
+instead of using the \Rfunction{calcNormFactors}
+function in \Rpackage{TCC}. The \Rfunction{calcNormFactors} function
+together with the above parameter settings can be regarded as a wrapper
+function for the following commands.
+
+<>=
+set.seed(1000)
+library(TCC)
+data(hypoData)
+samplesize <- 100
+group <- c(1, 1, 1, 2, 2, 2)
+### STEP 1 ###
+d <- DGEList(count = hypoData, group = group)
+d <- calcNormFactors(d)
+norm.factors <- d$samples$norm.factors
+norm.factors <- norm.factors / mean(norm.factors)
+### STEP 2 ###
+cD <- new("countData", data = hypoData, replicates = group,
+          groups = list(NDE = rep(1, length = length(group)), DE = group),
+          libsizes = colSums(hypoData) * norm.factors)
+cD <- getPriors.NB(cD, samplesize = samplesize, estimation = "QL", cl = NULL)
+cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL)
+is.DEG <- as.logical(rank(-cD@posteriors[, "DE"]) <
+                     (nrow(hypoData) * cD@estProps[2]))
+### STEP 3 ###
+d <- DGEList(count = hypoData[!is.DEG, ], group = group)
+d <- calcNormFactors(d)
+norm.factors <- d$samples$norm.factors * colSums(hypoData[!is.DEG, ]) /
+                colSums(hypoData)
+norm.factors <- norm.factors / mean(norm.factors)
+norm.factors
+@
+
+
+\subsubsection{DEGES/edgeR}
+\label{section-3-1-2}
+
+Now let us describe an alternative approach that is roughly $200$-$400$ times
+faster than DEGES/TbT, yet has comparable performance. The
+TMM-\Rpackage{edgeR}-TMM pipeline (called DEGES/\Rpackage{edgeR})
+employs the exact test implemented in \Rpackage{edgeR} in step 2.
+To use this pipeline, we have to provide a reasonable threshold for
+defining potential DEGs in step 2. We will define the threshold as
+an arbitrary false discovery rate (FDR) with
+a floor value of $P_{\mbox{\tiny DEG}}$.
+The default FDR is $< 0.1$, and the default floor $P_{\mbox{\tiny DEG}}$ is
+$5\%$, but different choices are of course possible. For example,
+in case of the default settings, $x\% (x > 5\%)$ of the top-ranked
+potential DEGs are eliminated in step 2 if the percentage ($= x\%$) of
+genes satisfying FDR $< 0.1$ is over $5\%$. The DEGES/\Rpackage{edgeR}
+pipeline has an apparent advantage over TbT in computation time.
+It can be performed as follows:
+
+<>=
+library(TCC)
+data(hypoData)
+group <- c(1, 1, 1, 2, 2, 2)
+tcc <- new("TCC", hypoData, group)
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 1, FDR = 0.1, floorPDEG = 0.05)
+tcc$norm.factors
+tcc$DEGES$execution.time
+@
+
+The normalization factors calculated from the DEGES/\Rpackage{edgeR}
+are very similar to those of DEGES/TbT with the default parameter
+settings (i.e., \Robject{samplesize = 10000}). For \Rpackage{edgeR}
+users, we provide commands, consisting of functions in \Rpackage{edgeR},
+to perform the DEGES/\Rpackage{edgeR} pipeline without \Rpackage{TCC}.
+The \Rfunction{calcNormFactors} function together
+with the above parameter settings
+can be regarded as a wrapper function for the following commands.
+ +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +FDR <- 0.1 +floorPDEG <- 0.05 +d <- DGEList(counts = hypoData, group = group) +### STEP 1 ### +d <- calcNormFactors(d) +### STEP 2 ### +d <- estimateCommonDisp(d) +d <- estimateTagwiseDisp(d) +result <- exactTest(d) +q.value <- p.adjust(result$table$PValue, method = "BH") +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(result$table$PValue, ties.method = "min") <= + nrow(hypoData) * floorPDEG) +} +### STEP 3 ### +d <- DGEList(counts = hypoData[!is.DEG, ], group = group) +d <- calcNormFactors(d) +norm.factors <- d$samples$norm.factors * colSums(hypoData[!is.DEG, ]) / + colSums(hypoData) +norm.factors <- norm.factors / mean(norm.factors) +norm.factors +@ + + +\subsubsection{iDEGES/edgeR} +\label{section-3-1-3} + +Our multi-step normalization can be repeated until the calculated +normalization factors converge (Kadota et al., 2012 \cite{kadota}). +An iterative version of DEGES/TbT (i.e., iDEGES/TbT) can be described +as the TMM-(\Rpackage{baySeq}-TMM)$_{n}$ pipeline with $n \ge 2$. +Although the iDEGES/TbT would not be practical in terms of +the computation time, the TMM-(\Rpackage{edgeR}-TMM)$_{n}$ pipeline +(iDEGES/\Rpackage{edgeR}) is potentially superior to both the +DEGES/\Rpackage{edgeR} and the DEGES/TbT. A suggested +iDEGES/\Rpackage{edgeR} implementation ($n = 3$) +consists of seven steps, as follows: + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc$norm.factors +tcc$DEGES$execution.time +@ + +\subsubsection{DEGES/DESeq} +\label{section-3-1-4} + +The DEGES pipeline can also be performed by using only the functions +in the \Rpackage{DESeq} package. 
Similar to the \Rpackage{edgeR} +case above, this \Rpackage{DESeq}-\Rpackage{DESeq}-\Rpackage{DESeq} +pipeline (DEGES/\Rpackage{DESeq}) changes the corresponding arguments +of the \Robject{norm.method} and \Robject{test.method} as follows: + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +tcc$norm.factors +tcc$DEGES$execution.time +@ + +For \Rpackage{DESeq} users, we also provide commands, consisting of +functions in \Rpackage{DESeq}, to perform the DEGES/\Rpackage{DESeq} +pipeline without \Rpackage{TCC}. The \Rfunction{calcNormFators} function +together with the above arguments can be regarded as a wrapper function for +the following commands. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +FDR <- 0.1 +floorPDEG <- 0.05 +cds <- newCountDataSet(hypoData, group) +### STEP 1 ### +cds <- estimateSizeFactors(cds) +### STEP 2 ### +cds <- estimateDispersions(cds) +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +result$padj[is.na(result$padj)] <- 1 +q.value <- result$padj +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(result$pval, ties.method = "min") <= + nrow(hypoData) * floorPDEG) +} +### STEP 3 ### +cds <- newCountDataSet(hypoData[!is.DEG, ], group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(hypoData) +norm.factors <- norm.factors / mean(norm.factors) +norm.factors +@ + + +\subsection{Normalization of two-group count data without replicates} +\label{section-3-2} + +It is important to keep in mind that most R packages +(including \Rpackage{edgeR}, + \Rpackage{DESeq}, and \Rpackage{baySeq}) are primarily for analyzing data +including biological replications because the biological variability has +to be accurately estimated to avoid 
spurious DE calls +(Glaus et al., 2012 \cite{glaus}). +In fact, the functions for the DEG identification method implemented in +\Rpackage{edgeR} (i.e., the exact test; ver. 3.0.4) do not allow analysis +without replicates, though the TMM normalization method in the package can +be applied to data regardless of whether it has replicates. Although the +\Rpackage{edgeR} manual provides users with some ideas on how to perform +the DE analysis, it is difficult to customize the analysis with DEGES to +data without replicates. + +When obtaining normalization factors from two-group count data without +replicates, users can select a total of four combinations +(two normalization methods $\times$ two DEG identification methods) +coupled with an arbitrary number +of iterations ($n = 0, 1, 2, \dots, 100$) in our DEGES-based normalization +pipeline. That is, the \Rfunction{calcNormFators} function with the +\Robject{norm.method = "deseq"} or \Robject{"tmm"} and +\Robject{test.method = "deseq"} or \Robject{"bayseq"} can be indicated. +Let us explain the procedure by retrieving the data of the first and the +fourth columns of \Robject{hypoData}, i.e., + +<>= +library(TCC) +data(hypoData) +group <- c(1, 2) +tcc <- new("TCC", hypoData[, c(1, 4)], group) +head(tcc$count) +tcc$group +@ + +A DEGES pipeline (DEGES/\Rpackage{DESeq}) for obtaining normalization +factors is as follows. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +tcc$norm.factors +@ + +An advantage of this DEGES/\Rpackage{DESeq} pipeline is that the multi-step +normalization strategy only needs the methods in the \Rpackage{DESeq} package. +These factors should be the same as those produced by the following procedure +consisting of functions implemented in \Rpackage{DESeq}. 
+ +<>= +library(TCC) +data(hypoData) +group <- c(1, 2) +FDR <- 0.1 +floorPDEG <- 0.05 +cds <- newCountDataSet(hypoData[, c(1, 4)], group) +### STEP 1 ### +cds <- estimateSizeFactors(cds) +### STEP 2 ### +cds <- estimateDispersions(cds, method = "blind", sharingMode = "fit-only") +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +result$padj[is.na(result$padj)] <- 1 +q.value <- result$padj +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(result$pval, ties.method = "min") <= + nrow(hypoData) * floorPDEG) +} +### STEP 3 ### +cds <- newCountDataSet(hypoData[!is.DEG, c(1, 4)], group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(hypoData[, c(1, 4)]) +norm.factors <- norm.factors / mean(norm.factors) +norm.factors +@ + + + +\subsection{Normalization of multi-group count data with replicates} +\label{section-3-3} + +Many R packages (including \Rpackage{edgeR}, \Rpackage{DESeq}, +and \Rpackage{baySeq}) support DE analysis for multi-group tag count data. +\Rpackage{TCC} provides some prototypes of DEGES-based pipelines for +such data. Here, we analyze another hypothetical three-group count matrix, +the \Robject{hypoData\_mg} object, provided in \Rpackage{TCC}. +It consists of $1,000$ genes and a total of nine columns for +testing any difference among three groups that each have triplicates. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +tcc +dim(tcc$count) +@ + +Of the $1,000$ genes, the first $200$ genes are DEGs and the remaining $800$ +genes are non-DEGs. The breakdowns for the $200$ DEGs are as follows: +$140$, $40$, and $20$ DEGs are up-regulated in Groups 1, 2, and 3. 
+Below, we show some DEGES-based normalization pipelines for this +multi-group data (\ref{section-3-3-1} DEGES/TbT, \ref{section-3-3-2} +DEGES/\Rpackage{edgeR}, and \ref{section-3-3-3} DEGES/\Rpackage{DESeq}). + +\subsubsection{DEGES/TbT} +\label{section-3-3-1} + +The DEGES/TbT pipeline for multi-group data is essentially the same as +those for two-group data with/without replicates. Note that a smaller +sampling size (i.e., \Robject{samplesize = 100}) is used here to reduce +the computation time, but a larger sampling size of around $10,000$ (i.e., +\Robject{samplesize = 10000}) is recommended +(Hardcastle and Kelly, 2010 \cite{hardcastle}). + +<>= +set.seed(1000) +library(TCC) +data(hypoData_mg) +samplesize <- 100 +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = samplesize) +tcc$norm.factors +@ + +\subsubsection{DEGES/edgeR} +\label{section-3-3-2} + +\Rpackage{edgeR} employs generalized linear models (GLMs) to find DEGs +between any of the groups. The DEGES/\Rpackage{edgeR} normalization +pipeline in \Rpackage{TCC} internally uses functions for the GLM approach +that require two models (a full model and a null model). The full model +corresponds to a design matrix to describe sample groups. The null model +corresponds to the model coefficients. +The two models can be defined as follows: + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +design <- model.matrix(~ as.factor(group)) +coef <- 2:length(unique(group)) +@ + +The design matrix (\Robject{design}) can be constructed by using the +\Rfunction{model.matrix} function. +For the model coefficients (\Robject{coef}), +the user should specify all the coefficients except for the intercept term. 
+The two models (\Robject{design} and \Robject{coef}) will automatically be +generated when performing the following \Rfunction{calcNormFactors} function +if those models are not explicitly indicated. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +tcc$norm.factors +@ + +For \Rpackage{edgeR} users, we provide commands, consisting of functions in +\Rpackage{edgeR}, to perform the DEGES/\Rpackage{edgeR} pipeline without +\Rpackage{TCC}. The \Rfunction{calcNormFators} function together with the +above parameter settings can be regarded as a wrapper function for the +following commands. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +FDR <- 0.1 +floorPDEG <- 0.05 +design <- model.matrix(~ as.factor(group)) +coef <- 2:length(unique(group)) +d <- DGEList(counts = hypoData_mg, group = group) +### STEP 1 ### +d <- calcNormFactors(d) +### STEP 2 ### +d <- estimateGLMCommonDisp(d, design) +d <- estimateGLMTrendedDisp(d, design) +d <- estimateGLMTagwiseDisp(d, design) +fit <- glmFit(d, design) +lrt <- glmLRT(fit, coef = coef) +result <- topTags(lrt, n = nrow(hypoData_mg)) +result <- result$table[rownames(hypoData_mg), ] +if (sum(result$FDR < FDR) > (floorPDEG * nrow(hypoData_mg))) { + is.DEG <- as.logical(result$FDR < FDR) +} else { + is.DEG <- as.logical(rank(result$PValue, ties.method = "min") <= + nrow(hypoData_mg) * floorPDEG) +} +### STEP 3 ### +d <- DGEList(counts = hypoData_mg[!is.DEG, ], group = group) +d <- calcNormFactors(d) +norm.factors <- d$samples$norm.factors * colSums(hypoData_mg[!is.DEG, ]) / + colSums(hypoData_mg) +norm.factors <- norm.factors / mean(norm.factors) +norm.factors +@ + +\subsubsection{DEGES/DESeq} +\label{section-3-3-3} + +\Rpackage{DESeq} also employs GLMs for analyzing multi-group experiments. +Similar to the \Rpackage{edgeR} package, it requires two models (full +model and reduced model). 
The full model (\Robject{fit1}) +and reduced model (\Robject{fit0}) +can be created as follows: + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +fit1 <- count ~ condition +fit0 <- count ~ 1 +@ + +The two models (\Robject{fit1} and \Robject{fit0}) will automatically be generated +when performing the following \Robject{calcNormFactors} function if those models +are not explicitly indicated. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1) +tcc$norm.factors +@ + +For \Rpackage{DESeq} users, we provide commands, consisting of functions +in \Rpackage{DESeq}, to perform the DEGES/ \Rpackage{DESeq} pipeline without +\Rpackage{TCC}. The \Rfunction{calcNormFators} function together with the +above parameter settings can be regarded as a wrapper function for the +following commands. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +FDR <- 0.1 +floorPDEG <- 0.05 +tcc <- new("TCC", hypoData_mg, group) +fit1 <- count ~ condition +fit0 <- count ~ 1 +cds <- newCountDataSet(hypoData_mg, group) +### STEP 1 ### +cds <- estimateSizeFactors(cds) +### STEP 2 ### +cds <- estimateDispersions(cds) +reduced.model <- fitNbinomGLMs(cds, fit0) +full.model <- fitNbinomGLMs(cds, fit1) +p.value <- nbinomGLMTest(full.model, reduced.model) +p.value[is.na(p.value)] <- 1 +q.value <- p.adjust(p.value, method = "BH") +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData_mg))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(p.value, ties.method = "min") <= + nrow(hypoData_mg) * floorPDEG) +} +### STEP 3 ### +cds <- newCountDataSet(hypoData_mg[!is.DEG, ], group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(hypoData_mg) +norm.factors <- norm.factors / mean(norm.factors) +norm.factors +@ + + +\subsection{Retrieving normalized data} +\label{section-3-4} + +Similar functions for calculating 
normalization factors are the +
\Rfunction{calcNormFactors} function in \Rpackage{edgeR} and the
+\Rfunction{estimateSizeFactors} function in \Rpackage{DESeq}. Note that the
+terminology used in \Rpackage{DESeq} (i.e., size factors) is
+different from that used in \Rpackage{edgeR} (i.e., effective
+library sizes) and ours. The effective library size in \Rpackage{edgeR}
+is calculated as the library size multiplied by the normalization factor.
+The size factors in the \Rpackage{DESeq} package are comparable to
+the {\it normalized} effective library sizes wherein the summary statistics
+for the effective library sizes are adjusted to one. Our normalization
+factors, which can be obtained from \Robject{tcc\$norm.factors},
+have the same names as those in \Rpackage{edgeR}. Accordingly, the
+normalization factors calculated from \Rpackage{TCC} with arbitrary
+options should be manipulated together with the library sizes when
+normalized read counts are to be obtained. Since biologists are often
+interested in such information (Dillies et al., 2012 \cite{dillies}),
+we provide the \Rfunction{getNormalizedData} function
+for retrieving normalized data.
+
+Note that the \Robject{hypoData} consists of $1,000$ genes and a total
+of six samples (three biological replicates for G1 and three biological
+replicates for G2); i.e., \{G1\_rep1, G1\_rep2, G1\_rep3\} vs.
+\{G2\_rep1, G2\_rep2, G2\_rep3\}. These simulation data have basically
+the same conditions as shown in Fig. 1 of the TbT paper (Kadota et
+al., 2012 \cite{kadota}); i.e., (i) the first $200$ genes are
+DEGs ($P_{\mbox{\small DEG}} = 200/1000 = 20\%$),
+(ii) the first $180$ genes of the $200$ DEGs are higher in G1
+($P_{\mbox{\small G1}} = 180/200 = 90\%$),
+and the remaining $20$ DEGs are higher in G2, and (iii) the level of
+DE is four-fold. The last $800$ genes were designed to be non-DEGs. 
+The different normalization strategies can roughly be evaluated in terms
+of the similarity of their summary statistics for {\it normalized} data
+labeled as non-DEGs in one group (e.g., G1) to those of the other group
+(e.g., G2). The basic statistics for the non-DEGs are as follows.
+
+<>=
+library(TCC)
+data(hypoData)
+nonDEG <- 201:1000
+summary(hypoData[nonDEG, ])
+@
+
+From now on, we will display only the median values for simplicity, i.e.,
+
+<>=
+apply(hypoData[nonDEG, ], 2, median)
+@
+<>=
+hypoData.median <- apply(hypoData[nonDEG, ], 2, median)
+@
+
+In what follows, we show detailed examples using \Robject{hypoData}.
+Note, however, that the basic usage is simple.
+
+<>=
+normalized.count <- getNormalizedData(tcc)
+@
+
+\subsubsection{Retrieving two-group DEGES/edgeR-normalized data with replicates}
+\label{section-3-4-1}
+
+The \Rfunction{getNormalizedData} function can be applied to the
+\Rpackage{TCC} class object after the normalization factors have been
+calculated.
+
+<>=
+library(TCC)
+data(hypoData)
+nonDEG <- 201:1000
+group <- c(1, 1, 1, 2, 2, 2)
+tcc <- new("TCC", hypoData, group)
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 1, FDR = 0.1, floorPDEG = 0.05)
+normalized.count <- getNormalizedData(tcc)
+apply(normalized.count[nonDEG, ], 2, median)
+@
+
+The same procedure consisting of functions in \Rpackage{edgeR} is
+
+<>=
+library(TCC)
+data(hypoData)
+nonDEG <- 201:1000
+group <- c(1, 1, 1, 2, 2, 2)
+FDR <- 0.1
+floorPDEG <- 0.05
+d <- DGEList(counts = hypoData, group = group)
+### Step 1 ###
+d <- calcNormFactors(d)
+### Step 2 ###
+d <- estimateCommonDisp(d)
+d <- estimateTagwiseDisp(d)
+result <- exactTest(d)
+q.value <- p.adjust(result$table$PValue, method = "BH")
+if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) {
+  is.DEG <- as.logical(q.value < FDR)
+} else {
+  is.DEG <- as.logical(rank(result$table$PValue, ties.method = "min") <=
+                       nrow(hypoData) * floorPDEG)
+}
+### Step 3 ###
+d <- DGEList(counts 
= hypoData[!is.DEG, ], group = group) +d <- calcNormFactors(d) +norm.factors <- d$samples$norm.factors * colSums(hypoData[!is.DEG, ]) / + colSums(hypoData) +norm.factors <- norm.factors / mean(norm.factors) +effective.libsizes <- colSums(hypoData) * norm.factors +normalized.count <- sweep(hypoData, 2, + mean(effective.libsizes) / effective.libsizes, "*") +apply(normalized.count[nonDEG, ], 2, median) +@ + +It is obvious that the summary statistics (ranging from +$\Sexpr{sprintf("%.5f", min(apply(normalized.count[nonDEG,],2,median)))}$ +to $\Sexpr{sprintf("%.5f", max(apply(normalized.count[nonDEG,],2,median)))})$ +from DEGES/\Rpackage{edgeR}-normalized data are close to the truth (i.e., +ranging from $\Sexpr{sprintf("%.1f", min(hypoData.median))}$ to +$\Sexpr{sprintf("%.1f",max(hypoData.median))}$). For comparison, +the summary statistics for +TMM-normalized data produced using the original normalization method +(i.e., TMM) in \Rpackage{edgeR} are obtained as follows. + +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 1, 1, 2, 2, 2) +d <- DGEList(count = hypoData, group = group) +d <- calcNormFactors(d) +norm.factors <- d$samples$norm.factors +norm.factors <- norm.factors / mean(norm.factors) +effective.libsizes <- colSums(hypoData) * norm.factors +normalized.count <- sweep(hypoData, 2, + mean(effective.libsizes) / effective.libsizes, "*") +apply(normalized.count[nonDEG, ], 2, median) +@ + +This is the same as + +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", iteration = 0) +normalized.count <- getNormalizedData(tcc) +apply(normalized.count[nonDEG, ], 2, median) +@ + +From the viewpoint of the data distribution of non-DEGs, +these statistics (ranging from +$\Sexpr{sprintf("%.5f", min(apply(normalized.count[nonDEG,],2,median)))}$ +to $\Sexpr{sprintf("%.5f", max(apply(normalized.count[nonDEG,],2,median)))}$) +are not as good as 
those of DEGES/\Rpackage{edgeR}. + +\subsubsection{Retrieving two-group DEGES/DESeq-normalized data with replicates} +\label{section-3-4-2} + +Similar to the DEGES/\Rpackage{edgeR} case, +DEGES/\Rpackage{DESeq}-normalized data can be retrieved as follows. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +nonDEG <- 201:1000 +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +normalized.count <- getNormalizedData(tcc) +apply(normalized.count[nonDEG, ], 2, median) +@ + +The same procedure consisting of functions in \Rpackage{DESeq} is + +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 1, 1, 2, 2, 2) +FDR <- 0.1 +floorPDEG <- 0.05 +cds <- newCountDataSet(hypoData, group) +### Step 1 ### +cds <- estimateSizeFactors(cds) +### Step 2 ### +cds <- estimateDispersions(cds) +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +result$padj[is.na(result$padj)] <- 1 +q.value <- result$padj +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(result$pval, ties.method = "min") <= + nrow(hypoData) * floorPDEG) +} +### Step 3 ### +cds <- newCountDataSet(hypoData[!is.DEG, ], group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(hypoData) +norm.factors <- norm.factors / mean(norm.factors) +effective.libsizes <- colSums(hypoData) * norm.factors +normalized.count <- sweep(hypoData, 2, + mean(effective.libsizes) / effective.libsizes, "*") +apply(normalized.count[nonDEG, ], 2, median) +@ + +\subsubsection{Retrieving two-group DEGES/DESeq-normalized data without replicates} +\label{section-3-4-3} + +Similar to the case of count data with replicates, +the DEGES/\Rpackage{DESeq}-normalized data without +replicates can be retrieved as follows. 
+ +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 2) +tcc <- new("TCC", hypoData[, c(1, 4)], group) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +normalized.count <- getNormalizedData(tcc) +apply(normalized.count[nonDEG, ], 2, median) +@ + +The same procedure consisting of functions in \Rpackage{DESeq} is + +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 2) +FDR <- 0.1 +floorPDEG <- 0.05 +cds <- newCountDataSet(hypoData[,c(1, 4)], group) +### Step 1 ### +cds <- estimateSizeFactors(cds) +### Step 2 ### +cds <- estimateDispersions(cds, method = "blind", sharingMode = "fit-only") +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +result$padj[is.na(result$padj)] <- 1 +q.value <- result$padj +if (sum(q.value < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q.value < FDR) +} else { + is.DEG <- as.logical(rank(result$pval, ties.method = "min") <= + nrow(hypoData) * floorPDEG) +} +### Step 3 ### +cds <- newCountDataSet(hypoData[!is.DEG, c(1, 4)], group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(hypoData[, c(1, 4)]) +norm.factors <- norm.factors / mean(norm.factors) +effective.libsizes <- colSums(hypoData[, c(1, 4)]) * norm.factors +normalized.count <- sweep(hypoData[, c(1, 4)], 2, + mean(effective.libsizes) / effective.libsizes, "*") +apply(normalized.count[nonDEG, ], 2, median) +@ + +The above summary statistics from DEGES/\Rpackage{DESeq}-normalized data +are closer to the truth +(i.e., $\Sexpr{sprintf("%.1f", hypoData.median[1])}$ for G1\_rep1 +and $\Sexpr{sprintf("%.1f",hypoData.median[4])}$ for G2\_rep1) +than are the following summary statistics from data normalized using the +original normalization method implemented in \Rpackage{DESeq}. 
+ +<>= +library(TCC) +data(hypoData) +nonDEG <- 201:1000 +group <- c(1, 2) +cds <- newCountDataSet(hypoData[, c(1, 4)], group) +cds <- estimateSizeFactors(cds) +normalized.count <- counts(cds, normalized = TRUE) +apply(normalized.count[nonDEG, ], 2, median) +@ + +\subsubsection{Retrieving multi-group iDEGES/edgeR-normalized data with replicates} +\label{section-3-4-4} + +Here, we analyze another hypothetical three-group count matrix, +the \Robject{hypoData\_mg} object, provided in \Rpackage{TCC}. +It consists of $1,000$ genes and a total of nine columns for testing +any difference among three groups that each have triplicates. +Similar to the \Robject{hypoData} object, the first $200$ genes +are DEGs and the remaining $800$ genes are non-DEGs. +The basic statistics for the non-DEGs are as follows. + +<>= +library(TCC) +data(hypoData_mg) +nonDEG <- 201:1000 +summary(hypoData_mg[nonDEG, ]) +@ + +From now on, we will display only the median values for simplicity, i.e., + +<>= +apply(hypoData_mg[nonDEG, ], 2, median) +@ +<>= +hypoData_mg.median <- apply(hypoData_mg[nonDEG, ], 2, median) +@ + +The iDEGES/\Rpackage{edgeR}-normalized data can be retrieved as follows. + +<>= +library(TCC) +data(hypoData_mg) +nonDEG <- 201:1000 +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +design <- model.matrix(~ as.factor(group)) +coef <- 2:length(unique(group)) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3) +normalized.count <- getNormalizedData(tcc) +apply(normalized.count[nonDEG, ], 2, median) +range(apply(normalized.count[nonDEG, ], 2, median)) +@ +<>= +normByiDEGES <- range(apply(normalized.count[nonDEG, ], 2, median)) +@ + +For comparison, the summary statistics for TMM-normalized data +produced using the original normalization method (i.e., TMM) +in \Rpackage{edgeR} are obtained as follows. 
+ +<>= +library(TCC) +data(hypoData_mg) +nonDEG <- 201:1000 +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", iteration = 0) +normalized.count <- getNormalizedData(tcc) +apply(normalized.count[nonDEG, ], 2, median) +range(apply(normalized.count[nonDEG, ], 2, median)) +@ +<>= +normByTMM <- range(apply(normalized.count[nonDEG, ], 2, median)) +@ + +It is obvious that the summary statistics +(ranging from $\Sexpr{sprintf("%.5f", min(normByiDEGES))}$ +to $\Sexpr{sprintf("%.5f", max(normByiDEGES))}$) +from iDEGES/\Rpackage{edgeR}-normalized data are closer to the truth +(i.e., ranging from $\Sexpr{sprintf("%.1f", min(hypoData_mg.median))}$ to +$\Sexpr{sprintf("%.1f", max(hypoData_mg.median))}$) than those +(ranging from $\Sexpr{sprintf("%.5f", min(normByTMM))}$ +to $\Sexpr{sprintf("%.5f", max(normByTMM))}$) +from TMM-normalized data. + +\newpage + +\section{Differential expression (DE)} +\label{section-4} + +The particular feature of \Rpackage{TCC} is that it calculates +robust normalization factors. Moreover, end users would like to +have some accessory functions for subsequent analyses. Here, +we provide the \Rfunction{estimateDE} function for identifying DEGs. +Specifically, the function internally uses the corresponding functions +implemented in three packages: \Rfunction{exactTest} in \Rpackage{edgeR}, +\Rfunction{nbinomTest} in \Rpackage{DESeq}, +and \Rfunction{getLikelihoods.NB} in \Rpackage{baySeq}. +Similar to the usage in the \Rfunction{calcNormFators} function +with the \Rfunction{test.method} argument in \Rpackage{TCC}, +those DE methods in \Rpackage{edgeR}, \Rpackage{DESeq}, +and \Rpackage{baySeq} can be performed +by using the \Rfunction{estimateDE} function with +\Rfunction{test.method = "edger"}, \Rfunction{"deseq"}, +and \Rfunction{"bayseq"}, respectively. 
Here, we show some +examples of DE analysis for two-group data with replicates +(\ref{section-4-1}), two-group data without replicates (\ref{section-4-2}), +and multi-group data with replicates (\ref{section-4-3}). + +\subsection{DE analysis for two-group data with replicates} +\label{section-4-1} + +\subsubsection{edgeR coupled with iDEGES/edgeR normalization} +\label{section-4-1-1} + +We give a procedure for DE analysis using the exact test implemented +in \Rpackage{edgeR} together with iDEGES/\Rpackage{edgeR} normalization +factors (i.e., the iDEGES/\Rpackage{edgeR}-\Rpackage{edgeR} combination) +for the hypothetical two-group count data with replicates (i.e., +the \Robject{hypoData} object). If the user wants to determine the +genes having an FDR threshold of $< 10\%$ as DEGs, one can do as follows. + +<>= +library(TCC) +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +@ + +The results of the DE analysis are stored in the \Rpackage{TCC} +class object. The summary statistics for top-ranked genes can +be retrieved by using the \Rfunction{getResult} function. + +<>= +result <- getResult(tcc, sort = TRUE) +head(result) +@ + +The DE results can be broken down as follows. + +<>= +table(tcc$estimatedDEG) +@ + +This means $\Sexpr{sum(tcc$estimatedDEG == FALSE)}$ non-DEGs +and $\Sexpr{sum(tcc$estimatedDEG == TRUE)}$ DEGs satisfy FDR $< 0.1$. +The \Rfunction{plot} function generates an M-A plot, +where "M" indicates the log-ratio +(i.e., $\mbox{M} = log_{2}\mbox{G2} - log_{2}\mbox{G1}$) and "A" +indicates average read count +(i.e., $\mbox{A} = (log_{2}\mbox{G2} + log_{2}\mbox{G1}) / 2$), +from the normalized count data. +The magenta points indicate the identified DEGs at FDR $< 0.1$. 
+ +<>= +plot(tcc) +@ + + + + +\subsubsection{baySeq coupled with iDEGES/edgeR normalization} +\label{section-4-1-2} + +If the user wants to employ the empirical Bayesian method +in \Rpackage{baySeq} together with iDEGES/\Rpackage{edgeR} normalization +factors (i.e., the iDEGES/\Rpackage{edgeR}-\Rpackage{baySeq} combination), +one can do as follows. + + +<>= +set.seed(1000) +library(TCC) +data(hypoData) +samplesize <- 100 +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "bayseq", + FDR = 0.1, samplesize = samplesize) +result <- getResult(tcc, sort = TRUE) +head(result) +table(tcc$estimatedDEG) +@ + +Note that a smaller sampling size (i.e., \Robject{samplesize = 100}) is +used here to reduce the computation time, +but a larger sampling size of around +$10,000$ (i.e., \Robject{samplesize = 10000}) is recommended (Hardcastle +and Kelly, 2010 \cite{hardcastle}). +Note also that \Rpackage{baySeq} outputs posterior +likelihoods instead of the $p$-values obtained from \Rpackage{edgeR} +and \Rpackage{DESeq}. The $p$-value column stores the ($1 - likelihood$) +values when the \Rfunction{estimateDE} function is executed with the +empirical Bayes in \Rpackage{baySeq}. Now let us describe an alternative +procedure for \Rpackage{baySeq} users that corresponds to the +\Rfunction{estimateDE} function. The $likelihood$ values and +$p$-values (calculated as $1 - likelihood$) are retrieved as follows. 
+ +<>= +set.seed(1000) +library(TCC) +data(hypoData) +samplesize <- 100 +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +effective.libsizes <- colSums(tcc$count) * tcc$norm.factors +groups <- list(NDE = rep(1, length(group)), DE = group) +cD <- new("countData", data = tcc$count, replicates = group, + libsizes = effective.libsizes, groups = groups) +cD <- getPriors.NB(cD, samplesize = samplesize, + estimation = "QL", cl = NULL) +cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) +tmp <- topCounts(cD, group = "DE", number = nrow(tcc$count)) +tmp <- tmp[rownames(tcc$count), ] +p.value <- 1 - tmp$Likelihood +q.value <- tmp$FDR +result <- cbind(p.value, q.value) +rownames(result) <- rownames(tmp) +head(result) +@ + + +\subsection{DE analysis for two-group data without replicates} +\label{section-4-2} + +As described previously, the functions for the DEG identification +method implemented in \Rpackage{edgeR} (i.e., the exact test; ver. 3.0.4) +do not allow analysis without replicates. Currently, +the \Rfunction{estimateDE} function only allows the \Robject{"deseq"} or +\Robject{"bayseq"} options for the \Robject{test.method} argument. +Here, we show a procedure for DE analysis using +the NB test implemented in \Rpackage{DESeq} together with +iDEGES/\Rpackage{DESeq} normalization factors +(i.e., the iDEGES/\Rpackage{DESeq}-\Rpackage{DESeq} combination) +for the hypothetical two-group count data without replicates (i.e., +the \Robject{hypoData[, c(1, 4)]} object). If the user wants to determine +the genes having an FDR threshold of $< 10\%$ as DEGs, +one can do as follows. 
+ +<>= +library(TCC) +data(hypoData) +group <- c(1, 2) +tcc <- new("TCC", hypoData[, c(1, 4)], group) +head(tcc$count) +tcc$group +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) +tcc$norm.factors +tcc <- estimateDE(tcc, test.method = "deseq", + FDR = 0.1) +result <- getResult(tcc, sort = TRUE) +head(result) +table(tcc$estimatedDEG) +@ + +It can be seen that there is no DEG having FDR $< 0.1$. + +\subsection{DE analysis for multi-group data with replicates} +\label{section-4-3} + +Here, we give three examples of DE analysis coupled with +DEGES/\Rpackage{edgeR} normalization for the hypothetical three-group +data with replicates, i.e., the \Robject{hypoData\_mg} object. +The use of the DEGES/\Rpackage{edgeR} normalization +factors is simply for reducing the computation time. + + +\subsubsection{baySeq coupled with DEGES/edgeR normalization} +\label{section-4-3-1} + +The empirical Bayesian method implemented in \Rpackage{baySeq} +after executing the DEGES/\Rpackage{edgeR} normalization +(i.e., the DEGES/\Rpackage{edgeR}-\Rpackage{baySeq} combination) +can be performed as follows. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +### Normalization ### +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +### DE analysis ### +set.seed(1000) +samplesize <- 100 +tcc <- estimateDE(tcc, test.method = "bayseq", + FDR = 0.1, samplesize = samplesize) +result <- getResult(tcc, sort = TRUE) +head(result) +table(tcc$estimatedDEG) +@ + +It can be seen that the \Rpackage{baySeq} method identified +$\Sexpr{sum(tcc$estimatedDEG == TRUE)}$ DEGs +having FDR $< 0.1$. One can obtain the number of DEGs with another +threshold (e.g., FDR $< 0.2$) from the result object as follows. 
+ +<>= +sum(result$q.value < 0.2) +@ + +For \Rpackage{baySeq} users, we provide commands, consisting of +functions in \Rpackage{baySeq}, to perform the DEG identification +without the function in \Rpackage{TCC}. The \Rfunction{estimateDE} +function with \Robject{test.method = "bayseq"} can be regarded as a +wrapper function for the following commands after the +DEGES/\Rpackage{edgeR} normalization. + +<>= +set.seed(1000) +samplesize <- 100 +effective.libsizes <- colSums(tcc$count) * tcc$norm.factors +groups <- list(NDE = rep(1, length(group)), DE = group) +cD <- new("countData", data = tcc$count, replicates = group, + libsizes = effective.libsizes, groups = groups) +cD <- getPriors.NB(cD, samplesize = samplesize, + estimation = "QL", cl = NULL) +cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) +tmp <- topCounts(cD, group = "DE", number = nrow(tcc$count)) +tmp <- tmp[rownames(tcc$count), ] +p.value <- 1 - tmp$Likelihood +q.value <- tmp$FDR +result <- cbind(p.value, q.value) +rownames(result) <- rownames(tmp) +head(result) +sum(q.value < 0.1) +sum(q.value < 0.2) +@ + +\subsubsection{edgeR coupled with DEGES/edgeR normalization} +\label{section-4-3-2} + +The exact test implemented in \Rpackage{edgeR} after executing +the DEGES/\Rpackage{edgeR} normalization +(i.e., the DEGES/\Rpackage{edgeR}-\Rpackage{edgeR} combination) +can be performed as follows. 
+
+<>=
+library(TCC)
+data(hypoData_mg)
+group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3)
+tcc <- new("TCC", hypoData_mg, group)
+### Normalization ###
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 1)
+### DE analysis ###
+tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1)
+result <- getResult(tcc, sort = TRUE)
+head(result)
+table(tcc$estimatedDEG)
+@
+
+Note that these DEGs having FDR $< 0.1$ display DE between any of the groups
+because the two arguments indicated here
+(\Robject{design} and \Robject{coef}) correspond
+to an ANOVA-like test for any differences provided in \Rpackage{edgeR}, i.e.,
+
+<>=
+library(TCC)
+data(hypoData_mg)
+group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3)
+design <- model.matrix(~ as.factor(group))
+coef <- 2:length(unique(group))
+tcc <- new("TCC", hypoData_mg, group)
+### Normalization ###
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 1)
+### DE analysis ###
+d <- DGEList(tcc$count, group = group)
+d$samples$norm.factors <- tcc$norm.factors
+d <- estimateGLMCommonDisp(d, design)
+d <- estimateGLMTrendedDisp(d, design)
+d <- estimateGLMTagwiseDisp(d, design)
+fit <- glmFit(d, design)
+lrt <- glmLRT(fit, coef = coef)
+tmp <- topTags(lrt, n = nrow(tcc$count))
+p.value <- tmp$table$PValue
+q.value <- tmp$table$FDR
+result <- cbind(p.value, q.value)
+rownames(result) <- rownames(tmp)
+head(result)
+sum(q.value < 0.1)
+sum(q.value < 0.2)
+@
+
+As described in the \Rpackage{edgeR} manual, the second and third columns
+in the \Robject{design} object are relative to the baseline (i.e., Group 1 or G1):
+\Robject{coef = 2} means G2 vs. G1 and \Robject{coef = 3} means G3 vs. G1.
+The above procedure with the \Robject{coef} object
+(i.e., \Robject{2:length(unique(group))}) indicates
+both comparisons (i.e., G2 vs. G1 and G3 vs. G1) and
+identifies DEGs between any of the three groups. 
In other words, +one can do any two-group comparison of interest from multi-group data with +replicates. For example, the DE analysis for G3 vs. G1 together with +DEGES/\Rpackage{edgeR} normalization can be performed as follows. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +### Normalization ### +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +### DE analysis ### +coef <- 3 +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1, coef = coef) +result <- getResult(tcc, sort = TRUE) +head(result) +table(tcc$estimatedDEG) +@ + +\subsubsection{DESeq coupled with DEGES/edgeR normalization} +\label{section-4-3-3} + +The NB test implemented in \Rpackage{DESeq} after executing the +DEGES/\Rpackage{edgeR} normalization +(i.e., the DEGES/\Rpackage{edgeR}-\Rpackage{DESeq} combination) +can be performed as follows. + +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +### Normalization ### +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +### DE analysis ### +fit1 <- count ~ condition +fit0 <- count ~ 1 +tcc <- estimateDE(tcc, test.method = "deseq", + FDR = 0.1, fit0 = fit0, fit1 = fit1) +result <- getResult(tcc, sort = TRUE) +head(result) +table(tcc$estimatedDEG) +@ + +For \Rpackage{DESeq} users, we provide commands, consisting of +functions in \Rpackage{DESeq}, to perform the DEG identification +without the function in \Rpackage{TCC}. The \Rfunction{estimateDE} +function with \Robject{test.method = "deseq"} can be regarded +as a wrapper function for the following commands after the +DEGES/\Rpackage{edgeR} normalization. 
+ +<>= +library(TCC) +data(hypoData_mg) +group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) +tcc <- new("TCC", hypoData_mg, group) +### Normalization ### +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +### DE analysis ### +fit1 <- count ~ condition +fit0 <- count ~ 1 +cds <- newCountDataSet(tcc$count, group) +sizeFactors(cds) <- tcc$norm.factors * colSums(tcc$count) +cds <- estimateDispersions(cds) +reduced.model <- fitNbinomGLMs(cds, fit0) +full.model <- fitNbinomGLMs(cds, fit1) +p.value <- nbinomGLMTest(full.model, reduced.model) +p.value[is.na(p.value)] <- 1 +q.value <- p.adjust(p.value, method = "BH") +tmp <- cbind(p.value, q.value) +rownames(tmp) <- tcc$gene_id +result <- tmp[order(p.value), ] +head(result) +sum(q.value < 0.1) +sum(q.value < 0.2) +@ + + +\section{Generation of simulation data} + +\subsection{Introduction and basic usage} +\label{section-5-1} + +As demonstrated in our previous study (Kadota et al., 2012 \cite{kadota}), +the DEGES-based normalization methods implemented in \Rpackage{TCC} +theoretically outperform the other normalization methods when the numbers +of DEGs (G1 vs. G2) in the tag count data are biased. However, +it is difficult to determine whether the up- and down-regulated DEGs in +one of the groups are actually biased in their number when analyzing +real data (Dillies et al., 2012 \cite{dillies}). +This means we have to evaluate the +potential performance of our DEGES-based methods using mainly simulation +data. The \Rfunction{simulateReadCounts} function generates simulation +data under various conditions. This function can generate simulation +data analyzed in the TbT paper (Kadota et al., 2012 \cite{kadota}), +and that means it enables other +researchers to compare the methods they develop with our DEGES-based +methods. For example, the \Robject{hypoData} object, a hypothetical +count dataset provided in \Rpackage{TCC}, was generated by using this +function. 
The output of the \Rfunction{simulateReadCounts} function is stored +as a \Rpackage{TCC} class object and is therefore ready-to-analyze. + +Note that different trials of simulation analysis generally yield +different count data even under the same simulation conditions. +As mentioned in section \ref{section-3-1-1}, +we can call the \Rfunction{set.seed} +function in order to obtain reproducible results +(i.e., the \Robject{tcc\$count}) with the +\Rfunction{simulateReadCounts} function. + +<>= +set.seed(1000) +library(TCC) +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, + DEG.assign = c(0.9, 0.1), + DEG.foldchange = c(4, 4), + replicates = c(3, 3)) +dim(tcc$count) +head(tcc$count) +tcc$group +@ + +The simulation conditions for comparing two groups (G1 vs. G2) with +biological replicates are as follows: (i) the number of genes is +$1,000$ (i.e., \Robject{Ngene = 1000}), +(ii) the first $20\%$ of genes are DEGs +(\Robject{PDEG = 0.2}), (iii) the first $90\%$ of the DEGs are +up-regulated in G1, and the remaining $10\%$ are up-regulated in +G2 (\Robject{DEG.assign = c(0.9, 0.1)}), (iv) the levels of DE are four-fold +in both groups (\Robject{DEG.foldchange = c(4, 4)}), +and (v) there are a total of six samples (three biological +replicates for G1 and three biological replicates for G2) +(\Robject{replicates = c(3, 3)}). +The variance of the NB distribution can be modeled as + $V = \mu + \phi\mu^{2}$. +The empirical distribution of the read counts for producing +the mean ($\mu$) and dispersion ($\phi$) parameters of the +model was obtained from {\it Arabidopsis} data (three biological +replicates for each of the treated and non-treated groups) +in \Rpackage{NBPSeq} (Di et al., 2011 \cite{di}). + +The \Robject{tcc\$count} object is essentially the same as +the \Robject{hypoData} object of \Rpackage{TCC}. +The information about the simulation conditions can be viewed as follows. 
+ +<>= +str(tcc$simulation) +@ + +Specifically, the entries for $0, 1$, and $2$ in the +\Robject{tcc\$simulation\$trueDEG} +object are for non-DEG, DEGs up-regulated in G1, and DEGs up-regulated in G2, +respectively. The breakdowns for individual entries are the same as +stated above: $800$ entries are non-DEGs, $180$ DEGs are up-regulated in G1, +and $20$ DEGs are up-regulated in G2. + +<>= +table(tcc$simulation$trueDEG) +@ + +This information can be used to evaluate the performance of the DEGES-based +normalization methods in terms of the sensitivity and specificity of the +results of their DE analysis. A good normalization method coupled with a +DE method such as the exact test +(Robinson and Smyth, 2008 \cite{robinson3}) and the +empirical Bayes (Hardcastle and Kelly, 2010) should produce well-ranked +gene lists in which the true DEGs are top-ranked and non-DEGs are +bottom-ranked when all genes are ranked according to the degree of DE. +The ranked gene list after performing +the DEGES/\Rpackage{edgeR}-\Rpackage{edgeR} +combination can be obtained as follows. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +result <- getResult(tcc, sort = TRUE) +head(result) +@ + +We can now calculate the area under the \Rpackage{ROC} curve +(i.e., AUC; $0 \le $AUC$ \le 1$) between the ranked gene list and the +truth (i.e., DEGs or non-DEGs) and thereby evaluate the sensitivity +and specificity simultaneously. A well-ranked gene list should have +a high AUC value (i.e., high sensitivity and specificity). +The \Rfunction{calcAUCValue} function calculates the AUC value +based on the information stored in the \Rpackage{TCC} class object. 
+ +<>= +calcAUCValue(tcc) +@ +<>= +auc.degesedger <- calcAUCValue(tcc) +@ + +This is essentially the same as + +<>= +AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDEG != 0), + data = -tcc$stat$rank)) +@ + +The following classic \Rpackage{edgeR} procedure +(i.e., the TMM-\Rpackage{edgeR} combination) +make it clear that the DEGES-based normalization method (i.e., +the DEGES/\Rpackage{edgeR} pipeline) outperforms the default +normalization method (i.e., TMM) implemented in \Rpackage{edgeR}. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "tmm", iteration = 0) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +calcAUCValue(tcc) +@ + +The following is an alternative procedure for \Rpackage{edgeR} users. + +<>= +d <- DGEList(counts = tcc$count, group = tcc$group$group) +d <- calcNormFactors(d) +d$samples$norm.factors <- d$samples$norm.factors / + mean(d$samples$norm.factors) +d <- estimateCommonDisp(d) +d <- estimateTagwiseDisp(d) +result <- exactTest(d) +result$table$PValue[is.na(result$table$PValue)] <- 1 +AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDEG != 0), + data = -rank(result$table$PValue))) +@ +<>= +set.seed(1000) +samplesize <- 100 +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = samplesize) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +auc.degestbt <- calcAUCValue(tcc) +@ + +As can be expected from the similarity of the normalization +factors of DEGES/TbT (\ref{section-3-1-1}) and DEGES/\Rpackage{edgeR} +(\ref{section-3-1-2}), +the AUC value ($\Sexpr{sprintf("%.7f", auc.degesedger)}$) of +DEGES/\Rpackage{edgeR} is quite similar +to the AUC value ($\Sexpr{sprintf("%.7f", auc.degestbt)}$) of the +original TbT method (i.e., DEGES/TbT): + +<>= +set.seed(1000) +samplesize <- 100 +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = samplesize) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +calcAUCValue(tcc) 
+@
+
+
+\subsection{Two-group data without replicates}
+\label{section-5-2}
+
+Let us generate tag count data without replicates,
+such as those used in section \ref{section-3-2}. For simplicity,
+we first generate simulation data whose conditions are essentially
+the same as those in the previous section (i.e., \ref{section-5-1}),
+except for the number of replicates in each group: (i) the number of
+genes is $1,000$ (i.e., \Robject{Ngene = 1000}),
+(ii) the first $20\%$ of genes
+are DEGs (\Robject{PDEG = 0.2}),
+(iii) the first $90\%$ of the DEGs
+are up-regulated in G1, and the remaining $10\%$ are up-regulated in
+G2 (\Robject{DEG.assign = c(0.9, 0.1)}),
+(iv) the levels of DE are four-fold
+in both groups (\Robject{DEG.foldchange = c(4, 4)}),
+and (v) there are a total
+of two samples (one from G1 and the other from G2)
+(\Robject{replicates = c(1, 1)}).
+
+<>=
+set.seed(1000)
+library(TCC)
+tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2,
+                          DEG.assign = c(0.9, 0.1),
+                          DEG.foldchange = c(4, 4),
+                          replicates = c(1, 1))
+dim(tcc$count)
+head(tcc$count)
+tcc$group
+@
+
+Now let us see how the DEGES/\Rpackage{DESeq}-\Rpackage{DESeq} combination
+performs compared with the original \Rpackage{DESeq}-\Rpackage{DESeq}
+combination. First, we calculate the AUC value for the ranked
+gene list obtained from the DEGES/\Rpackage{DESeq}-\Rpackage{DESeq}
+combination.
+
+<>=
+tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq",
+                       iteration = 1, FDR = 0.1, floorPDEG = 0.05)
+tcc <- estimateDE(tcc, test.method = "deseq")
+calcAUCValue(tcc)
+@
+
+Next, we calculate the corresponding value
+using the original \Rpackage{DESeq} procedure
+(i.e., the \Rpackage{DESeq}-\Rpackage{DESeq} combination). 
+ +<>= +tcc <- calcNormFactors(tcc, norm.method = "deseq", iteration = 0) +tcc <- estimateDE(tcc, test.method = "deseq") +calcAUCValue(tcc) +@ + +It can be seen that the DEGES/\Rpackage{DESeq}-\Rpackage{DESeq} combination +outperforms the original procedure +under the given simulation conditions. The following is an alternative +approach for \Rpackage{DESeq} users. + +<>= +cds <- newCountDataSet(tcc$count, tcc$group$group) +cds <- estimateSizeFactors(cds) +norm.factors <- sizeFactors(cds) / colSums(tcc$count) +norm.factors <- norm.factors / mean(norm.factors) +sizeFactors(cds) <- colSums(tcc$count) * norm.factors +cds <- estimateDispersions(cds, method="blind", sharingMode="fit-only") +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDEG != 0), + data = -rank(result$pval))) +@ + +This procedure is completely the same as the one in \Rpackage{TCC} that +gives normalization factors corresponding to those in \Rpackage{edgeR} +for different packages. However, the following commands from the +\Rpackage{DESeq} manual are of practical value because they give +approximately the same AUC value as above. + +<>= +cds <- newCountDataSet(tcc$count, tcc$group$group) +cds <- estimateSizeFactors(cds) +cds <- estimateDispersions(cds, method="blind", sharingMode="fit-only") +result <- nbinomTest(cds, 1, 2) +result$pval[is.na(result$pval)] <- 1 +AUC(rocdemo.sca(truth = as.numeric(tcc$simulation$trueDEG != 0), + data = -rank(result$pval))) +@ + + +\subsection{Multi-group data with and without replicates} +\label{section-5-3} + +The \Rfunction{simulateReadCounts} function can generate simulation data +with a more complex design. First, we generate a dataset consisting +of three groups. 
The simulation conditions for this dataset are as +follows: (i) the number of genes is $1,000$ +(i.e., \Robject{Ngene = 1000}), +(ii) the first $30\%$ of genes are DEGs (\Robject{PDEG = 0.3}), +(iii) the breakdowns of the up-regulated DEGs are respectively +$70\%$, $20\%$, and $10\%$ in Groups 1-3 +(\Robject{DEG.assign = c(0.7, 0.2, 0.1)}), +(iv) the levels of DE are $3$-, $10$-, and $6$-fold in individual +groups (\Robject{DEG.foldchange = c(3, 10, 6)}), +and (v) there are a total of nine libraries (2, 4, and 3 +replicates for Groups 1-3) (\Robject{replicates = c(2, 4, 3)}). + +<>= +set.seed(1000) +library(TCC) +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.3, + DEG.assign = c(0.7, 0.2, 0.1), + DEG.foldchange = c(3, 10, 6), + replicates = c(2, 4, 3)) +dim(tcc$count) +tcc$group +head(tcc$count) +@ + +The pseudo-color image for the generated simulation data regarding +the DEGs can be obtained from the \Rfunction{plotFCPseudocolor} function. +The right bar (from white to magenta) indicates the degree of +fold-change (FC). As expected, it can be seen that the first +$210$, $60$, and $30$ genes are up-regulated in G1, G2, and G3, +respectively. + +<>= +plotFCPseudocolor(tcc) +@ + +Now let us see how the DEGES/\Rpackage{edgeR}-\Rpackage{edgeR} combination +with the original \Rpackage{edgeR}-\Rpackage{edgeR} combination performs. +First we calculate the AUC value for the ranked gene list obtained from the +DEGES/\Rpackage{edgeR}-\Rpackage{edgeR} combination. + +<>= +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +calcAUCValue(tcc) +@ + +Next, we calculate the corresponding value using the original \Rpackage{edgeR} +procedure for single factor experimental design +(i.e., the \Rpackage{edgeR}-\Rpackage{edgeR} combination). 
+
+<>=
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 0)
+tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1)
+calcAUCValue(tcc)
+@
+
+It can be seen that the DEGES/\Rpackage{edgeR}-\Rpackage{edgeR} combination
+outperforms the original \Rpackage{edgeR} procedure under the given simulation
+conditions. Note that the \Robject{test.method} argument will be ignored when
+\Robject{iteration = 0} is specified.
+
+Next, let us generate another dataset consisting of a total of eight groups.
+The simulation conditions for this dataset are as follows:
+(i) the number of genes is $10,000$
+(i.e., \Robject{Ngene = 10000}),
+(ii) the first $34\%$ of genes are DEGs (\Robject{PDEG = 0.34}),
+(iii) the breakdowns of the up-regulated DEGs are respectively
+$10\%$, $30\%$, $5\%$, $10\%$, $5\%$, $21\%$, $9\%$, and $10\%$ in Groups 1-8
+(\Robject{DEG.assign = c(0.1, 0.3, 0.05, 0.1, 0.05, 0.21, 0.09, 0.1)}),
+(iv) the levels of DE are $3.1$-, $13$-, $2$-, $1.5$-, $9$-, $5.6$-, $4$-,
+and $2$-fold in individual groups
+(\Robject{DEG.foldchange = c(3.1, 13, 2, 1.5, 9, 5.6, 4, 2)}),
+and (v) there are a total of nine libraries (except for G3,
+none of the groups have replicates)
+(\Robject{replicates = c(1, 1, 2, 1, 1, 1, 1, 1)}).
+
+<>=
+set.seed(1000)
+library(TCC)
+tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.34,
+                          DEG.assign = c(0.1, 0.3, 0.05, 0.1, 0.05, 0.21, 0.09, 0.1),
+                          DEG.foldchange = c(3.1, 13, 2, 1.5, 9, 5.6, 4, 2),
+                          replicates = c(1, 1, 2, 1, 1, 1, 1, 1))
+dim(tcc$count)
+tcc$group
+head(tcc$count)
+plotFCPseudocolor(tcc)
+@
+
+
+This kind of simulation data may be useful for evaluating methods
+aimed at identifying tissue-specific (or tissue-selective) genes.
+
+\subsection{Multi-factor data}
+\label{section-5-3-p1}
+
+The \Rfunction{simulateReadCounts} function can also generate simulation
+data in multi-factor experimental design. 
Different from above single-factor +experimental design, the \Robject{group} argument should be used instead of +\Robject{replicates} for specifying sample conditions (or factors) when +generating simulation data in multi-factor design. +In relation to the \Robject{group} specification, +the \Robject{DEG.foldchange} argument should also be specified as a data +frame object. + +We generate a dataset consisting of two factors for comparing +(i) two Groups (i.e., "WT" vs. "KO") as the first factor, +at (ii) two time points (i.e., "1d" vs. "2d") as the second factor, +with all samples obtained from independent subjects. +There are a total of four conditions +("WT\_1d", "WT\_2d", "KO\_1d", and "KO\_2d") each of which has two +biological replicates, comprising a total of eight samples. +The \Robject{group} argument for this experimental design can be described as follows: + +<>= +group <- data.frame( + GROUP = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"), + TIME = c("1d", "1d", "2d", "2d", "1d", "1d", "2d", "2d") +) +@ + +Next, we design the number of types of DEGs and the levels of fold-change +by the \Robject{DEG.foldchange} argument. We here introduce three types of +DEGs: (a) 2-fold up-regulation in the first four samples (i.e., "WT"), +(b) 3-fold up-regulation in the last four samples (i.e., "KO"), +and (c) 2-fold down-regulation at "2d" in "WT" and 4-fold up-regulation +at "2d" in "KO". This implies that the first two types of DEGs are related +to the first factor (i.e., "WT" vs. "KO") and the third type of DEG is related +to the second factor (i.e., "1d" vs. "2d"). 
+
+<>=
+DEG.foldchange <- data.frame(
+    FACTOR1.1 = c(2, 2, 2, 2, 1, 1, 1, 1),
+    FACTOR1.2 = c(1, 1, 1, 1, 3, 3, 3, 3),
+    FACTOR2 = c(1, 1, 0.5, 0.5, 1, 1, 4, 4)
+)
+@
+
+The other simulation conditions for this dataset are as follows:
+(1) the number of genes is 10,000 (i.e., \Robject{Ngene = 10000}),
+(2) the first 20\% of genes are DEGs (i.e., \Robject{PDEG = 0.2}),
+and (3) the breakdowns of the three types of DEGs are 50\%, 20\%,
+and 30\% (i.e., \Robject{DEG.assign = c(0.5, 0.2, 0.3)}).
+
+<>=
+set.seed(1000)
+tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.2,
+                          DEG.assign = c(0.5, 0.2, 0.3),
+                          DEG.foldchange = DEG.foldchange,
+                          group = group)
+@
+
+Since the first six rows in the dataset correspond to the first type
+of DEGs, we can see the 2-fold up-regulation in the first four columns
+(i.e., WT-related samples) compared to the last four columns
+(i.e., KO-related samples).
+
+<>=
+head(tcc$count)
+tcc$group
+plotFCPseudocolor(tcc)
+@
+
+
+\subsection{Other utilities}
+\label{section-5-4}
+
+Recall that the simulation framework can handle different levels
+of DE for DEGs in individual groups, and the shape of the distribution
+for these DEGs is the same as that of non-DEGs. Let us confirm those
+distributions by introducing more drastic simulation conditions for
+comparing two groups (G1 vs. G2) with biological replicates; i.e.,
+(i) the number of genes is $20,000$ (i.e., \Robject{Ngene = 20000}),
+(ii) the first $30\%$ of genes are DEGs (\Robject{PDEG = 0.30}),
+(iii) the first $85\%$ of the DEGs are up-regulated in G1 and the remaining
+$15\%$ are up-regulated in G2 (\Robject{DEG.assign = c(0.85, 0.15)}),
+(iv) the levels of DE are eight-fold in G1 and sixteen-fold in
+G2 (\Robject{DEG.foldchange = c(8, 16)}), and (v) there are a total of
+four samples (two biological replicates for G1 and two biological
+replicates for G2) (\Robject{replicates = c(2, 2)}). 
+ +<>= +set.seed(1000) +library(TCC) +tcc <- simulateReadCounts(Ngene = 20000, PDEG = 0.30, + DEG.assign = c(0.85, 0.15), + DEG.foldchange = c(8, 16), + replicates = c(2, 2)) +head(tcc$count) +@ + +An M-A plot for the simulation data can be viewed as follows; +the points for up-regulated DEGs in G1 and G2 are colored +blue and red, respectively. The non-DEGs are in black: + +<>= +plot(tcc) +@ + +This plot is generated from simulation data that has been scaled +in such a way that the library sizes of each sample are the same +as the mean library size of the original data. That is, + +<>= +normalized.count <- getNormalizedData(tcc) +colSums(normalized.count) +colSums(tcc$count) +mean(colSums(tcc$count)) +@ + +<>= +xy <- plot(tcc) +isnot.na <- as.logical(xy[, 1] != min(xy[, 1])) +median.G1 <- median(xy[(tcc$simulation$trueDEG == 1) & isnot.na, 2]) +median.G2 <- median(xy[(tcc$simulation$trueDEG == 2) & isnot.na, 2]) +median.nonDEG <- median(xy[(tcc$simulation$trueDEG == 0) & isnot.na, 2]) +@ + +The summary statistics for non-DEGs and up-regulated DEGs in G1 and G2 +are upshifted compared with the original intentions of the user (i.e., +respective M values of $0$, $-3$, and $4$ for non-DEGs and up-regulated +DEGs in G1 and G2). Indeed, the median values, indicated as horizontal lines, +are respectively $\Sexpr{sprintf("%.3f", median.nonDEG)}$, +$\Sexpr{sprintf("%.3f", median.G1)}$, +and $\Sexpr{sprintf("%.3f", median.G2)}$ for non-DEGs +and up-regulated DEGs in G1 and G2. 
+
+<>=
+plot(tcc, median.lines = TRUE)
+@
+
+<>=
+tcc <- calcNormFactors(tcc, "tmm", "edger", iteration = 3,
+                       FDR = 0.1, floorPDEG = 0.05)
+xy <- plot(tcc)
+isnot.na <- as.logical(xy[, 1] != min(xy[, 1]))
+median.nonDEG <- median(xy[(tcc$simulation$trueDEG == 0) & isnot.na, 2])
+@
+
+These upshifted M values for non-DEGs can be modified after performing the
+iDEGES/\Rpackage{edgeR} normalization, e.g., the median M value
+($= \Sexpr{sprintf("%.3f", median.nonDEG)}$)
+for non-DEGs based on the iDEGES/\Rpackage{edgeR}-normalized
+data is nearly zero.
+
+<>=
+tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger",
+                       iteration = 3, FDR = 0.1, floorPDEG = 0.05)
+plot(tcc, median.lines = TRUE)
+@
+
+The comparison of those values obtained from different normalization
+methods might be another evaluation metric.
+
+\newpage
+
+\section{Session info}
+
+<>=
+sessionInfo()
+@
+
+\newpage
+
+\section{References}
+\renewcommand{\refname}{}
+\renewcommand\refname{\vskip -1cm}
+
+\begin{thebibliography}{99}
+\bibitem{robinson} Robinson MD, McCarthy DJ, and Smyth GK. edgeR:
+                   a Bioconductor package for differential expression
+                   analysis of digital gene expression data. Bioinformatics
+                   2010, 26(1): 139-140
+\bibitem{dillies} Dillies MA, Rau A, Aubert J, Hennequet-Antier C,
+                  Jeanmougin M, Servant N, Keime C, Marot G, Castel D,
+                  Estelle J, Guernec G, Jagla B, Jouneau L, Lalo\"e D,
+                  Le Gall C, Scha\"effer B, Le Crom S, Guedj M,
+                  Jaffr\'ezic F; on behalf of The French StatOmique
+                  Consortium. A comprehensive evaluation of normalization
+                  methods for Illumina high-throughput RNA sequencing data
+                  analysis. Brief Bioinform, in press
+\bibitem{kadota} Kadota K, Nishiyama T, and Shimizu K. A normalization
+                 strategy for comparing tag count data. Algorithms Mol
+                 Biol. 2012, 7:5
+\bibitem{robinson2} Robinson MD and Oshlack A. A scaling normalization method
+                    for differential expression analysis of RNA-seq data.
+                    Genome Biol. 
2010, 11: R25
+\bibitem{wad} Kadota K, Nakai Y, Shimizu K: A weighted average difference
+              method for detecting differentially expressed genes from
+              microarray data. Algorithms Mol Biol. 2008, 3: 8
+\bibitem{hardcastle} Hardcastle TJ and Kelly KA. baySeq:
+                     empirical Bayesian methods for identifying differential
+                     expression in sequence count data. BMC Bioinformatics
+                     2010, 11: 422
+\bibitem{aicentropy} Kadota K, Nishimura SI, Bono H, Nakamura S,
+                     Hayashizaki Y, Okazaki Y, Takahashi K:
+                     Detection of genes with tissue-specific expression
+                     patterns using Akaike's Information Criterion (AIC)
+                     procedure. Physiol Genomics 2003, 12: 251-259
+\bibitem{anders} Anders S and Huber W. Differential expression analysis
+                 for sequence count data. Genome Biol. 2010, 11(10): R106
+\bibitem{mccarthy} McCarthy DJ, Chen Y, Smyth GK. Differential expression
+                   analysis of multifactor RNA-Seq experiments with respect
+                   to biological variation. Nucleic Acids Res. 2012, 40(10):
+                   4288-4297
+\bibitem{glaus} Glaus P, Honkela A, and Rattray M. Identifying differentially
+                expressed transcripts from RNA-seq data with biological
+                variation. Bioinformatics 2012, 28(13): 1721-1728
+\bibitem{roku} Kadota K, Ye J, Nakai Y, Terada T, Shimizu K: ROKU: a novel
+               method for identification of tissue-specific genes.
+               BMC Bioinformatics 2006, 7: 294
+\bibitem{ueda} Ueda T. Simple method for the detection of outliers.
+               Japanese J Appl Stat 1996, 25: 17-26
+\bibitem{robinson3} Robinson MD and Smyth GK. Small-sample estimation of
+                    negative binomial dispersion, with applications to SAGE
+                    data. Biostatistics 2008, 9: 321-332
+\bibitem{sun} Sun J, Nishiyama T, Shimizu K, and Kadota K. TCC: an R package
+              for comparing tag count data with robust normalization
+              strategies. BMC Bioinformatics 2013, 14: 219
+\bibitem{di} Di Y, Schafer DW, Cumbie JS, and Chang JH. The NBP negative
+             binomial model for assessing differential gene expression from
+             RNA-Seq. Stat Appl Genet Mol Biol. 
2011, 10: art24 +\end{thebibliography} +\end{document} + diff --git a/TCC/inst/unitTests/test_ROKU.R b/TCC/inst/unitTests/test_ROKU.R new file mode 100644 index 0000000..24a284b --- /dev/null +++ b/TCC/inst/unitTests/test_ROKU.R @@ -0,0 +1,101 @@ +## Kadota original code +kadota_2006_bmc_bioinformatics <- function(x) { + x <- x[(!is.na(x))] + x_length <- length(x) + x <- x[(x != 0)] + if(sum(x) <= 0) { + return(log(x_length, base = 2)) + } else if(sd(x) == 0) { + return(log(x_length, base = 2)) + } else { + y <- x + y.m <- median(y) + y.u <- (y - y.m) / (5 * median(abs(y - y.m)) + 1e-04) + y.w <- rep(0, length(y)) + y.i <- abs(y.u) <= 1 + y.w[y.i] <- ((1 - y.u^2)^2)[y.i] + y.b <- sum(y.w * y) / sum(y.w) + x_prime <- abs(y - y.b) + p <- x_prime / sum(x_prime) + e <- sum(p * log(p, base = 2)) + return(-e) + } +} +## Kadota original code +kadota_2003_physiol_genomics_0.25 <- function(x) { + if(length(x) == sum(is.na(x))){ + x <- c(rep(0, length(x))) + } else if(length(x) == sum(is.nan(x))){ + x <- c(rep(0, length(x))) + } + x_org <- x + x <- x[(!is.na(x))] + x <- x[(!is.nan(x))] + n_plus_s <- length(x) + x.sort <- sort(x) + x.order <- order(x) + maice_Ut <- 0 + maice_i <- 0 + maice_j <- 0 + flag <- c(rep(0, length = n_plus_s)) + if (!is.na(sd(x)) | sd(x) != 0) { + for (i in 1:(n_plus_s * 0.25 + 1)) { + for (j in 1:(n_plus_s - i)) { + if ((i + j - 2) <= n_plus_s * 0.25) { + n <- (n_plus_s + 1 - j) - i + 1 + s <- n_plus_s - n + set_sd <- sd(x.sort[i:(n_plus_s + 1 - j)]) * + sqrt((n - 1) / n) + Ut <- n * log(set_sd) + sqrt(2) * s * lfactorial(n) / n + if (maice_Ut > Ut) { + maice_Ut <- Ut + maice_i <- i + maice_j <- j + } + } + } + } + if (maice_i > 1) { + flag[x.order[1:(maice_i - 1)]] <- -1 + } + if (maice_j > 1) { + flag[x.order[(n_plus_s + 1 - maice_j + 1):n_plus_s]] <- 1 + } + tmp <- replace(x_org, ((!is.nan(x_org)) & (!is.na(x_org))), flag) + return(tmp) + } else { + tmp <- replace(x_org, ((!is.nan(x_org)) & (!is.na(x_org))), flag) + return(tmp) + } +} + + 
+ +test_ROKU <- function() { + x <- abs(matrix(rnorm(100), ncol = 10)) + y <- t(apply(x, 1, scale)) + + roku.tcc <- ROKU(x) + roku.kdt <- t(apply(y, 1, kadota_2003_physiol_genomics_0.25)) + checkEqualsNumeric(roku.kdt, roku.tcc$outlier) + + outl.kdt <- apply(x, 1, kadota_2006_bmc_bioinformatics) + checkEqualsNumeric(outl.kdt, roku.tcc$modH) + + colnames(x) <- paste("t", 1:ncol(x)) + rownames(x) <- paste("g", 1:nrow(x)) + roku.tccnm <- ROKU(x) + checkEqualsNumeric(roku.tcc$outlier, roku.tccnm$outlier) + checkEqualsNumeric(roku.tcc$H, roku.tccnm$H) + checkEqualsNumeric(roku.tcc$modH, roku.tccnm$modH) +} + +test_ROKU_vector <- function() { + x <- abs(matrix(rnorm(100), ncol = 10)) + roku.all <- ROKU(x) + roku.one <- ROKU(x[1, ]) +} + +test_ROKU_sort <- function() { + data(hypoData_ts) + x <- ROKU(hypoData_ts, sort = TRUE) +} diff --git a/TCC/inst/unitTests/test_WAD.R b/TCC/inst/unitTests/test_WAD.R new file mode 100644 index 0000000..5cf5e30 --- /dev/null +++ b/TCC/inst/unitTests/test_WAD.R @@ -0,0 +1,42 @@ +## Kadota original code +kadota_WAD <- function(data = NULL, data.cl = NULL){ + x <- data + cl <- data.cl + mean1 <- rowMeans(as.matrix(x[, cl == 1])) + mean2 <- rowMeans(as.matrix(x[, cl == 2])) + x_ave <- (mean1 + mean2) / 2 + weight <- (x_ave - min(x_ave)) / (max(x_ave) - min(x_ave)) + statistic <- (mean2 - mean1) * weight + return(statistic) +} + +test_WAD_value <- function() { + g <- c(1, 1, 2, 2) + h <- c("A", "A", "B", "B") + x <- matrix(rnorm(100, 10, 2), ncol = 4) + ef <- colSums(x) + x <- sweep(x, 2, mean(ef) / ef, "*") + y <- x + y[y < 1] <- 1 + y <- log2(y) + + kdt <- kadota_WAD(y, g) + wad.x <- WAD(x, g, log.scale = TRUE, floor.value = 1) + wad.y <- WAD(y, g) + + checkEqualsNumeric(as.matrix(kdt), as.matrix(wad.x[, 1])) + checkEqualsNumeric(as.matrix(kdt), as.matrix(wad.y[, 1])) + + wad.h <- WAD(y, h) + checkEqualsNumeric(as.matrix(kdt), as.matrix(wad.h[, 1])) + + + tcc.g <- new("TCC", x, g) + tcc.g <- estimateDE(tcc.g, test.method = "wad") + tcc.h <- 
new("TCC", x, h) + tcc.h <- estimateDE(tcc.h, test.method = "wad") + checkEqualsNumeric(kdt, tcc.g$stat$testStat) + checkEqualsNumeric(kdt, tcc.h$stat$testStat) +} + + diff --git a/TCC/inst/unitTests/test_calcNormFactors.R b/TCC/inst/unitTests/test_calcNormFactors.R new file mode 100644 index 0000000..7e7eb71 --- /dev/null +++ b/TCC/inst/unitTests/test_calcNormFactors.R @@ -0,0 +1,164 @@ +test_calcNormFactors_DEGESedgeR_1 <- function() { + data(hypoData) + FDR <- 0.1 + floorPDEG <- 0.05 + count <- hypoData + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", count, group) + + tcc_edgeR <- calcNormFactors(tcc, norm.method = "tmm", iteration = 0) + tcc_DEGES_edgeR <- calcNormFactors(tcc, norm.method = "tmm", + test.method = "edger", iteration = 1) + d <- DGEList(counts = hypoData, group = group) + d <- calcNormFactors(d) + nf.1 <- d$samples$norm.factors + nf.1 <- nf.1 / mean(nf.1) + d <- estimateCommonDisp(d) + d <- estimateTagwiseDisp(d) + r <- exactTest(d) + q <- p.adjust(r$table$PValue, method = "BH") + if (sum(q < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(q < FDR) + } else { + is.DEG <- as.logical(rank(r$table$PValue, + ties.method = "min") <= + nrow(hypoData) * floorPDEG) + } + d <- DGEList(counts = hypoData[!is.DEG, ], group = group) + d <- calcNormFactors(d) + nf.2 <- d$samples$norm.factors * colSums(hypoData[!is.DEG, ]) / + colSums(hypoData) + nf.2 <- nf.2 / mean(nf.2) + checkEqualsNumeric(nf.1, tcc_edgeR$norm.factors) + checkEqualsNumeric(nf.2, tcc_DEGES_edgeR$norm.factors) +} + +test_calcNormFactors_DEGESDESeq_1 <- function() { + data(hypoData) + FDR <- 0.1 + floorPDEG <- 0.05 + count <- hypoData + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", count, group) + + tcc_DESeq <- calcNormFactors(tcc, norm.method = "deseq", iteration = 0) + tcc_DEGES_DESeq <- calcNormFactors(tcc, norm.method = "deseq", + test.method = "deseq", iteration = 1, + FDR = FDR, floorPDEG = floorPDEG) + cds <- newCountDataSet(hypoData, group) + cds <- 
estimateSizeFactors(cds) + cds <- estimateDispersions(cds) + nf.1 <- sizeFactors(cds) / colSums(hypoData) + nf.1 <- nf.1 / mean(nf.1) + r <- nbinomTest(cds, 1, 2) + r$pval[is.na(r$pval)] <- 1 + r$padj[is.na(r$padj)] <- 1 + if (sum(r$padj < FDR) > (floorPDEG * nrow(hypoData))) { + is.DEG <- as.logical(r$padj < FDR) + } else { + is.DEG <- as.logical(rank(r$pval, + ties.method = "min") <= + nrow(hypoData) * floorPDEG) + } + cds <- newCountDataSet(hypoData[!is.DEG, ], group) + cds <- estimateSizeFactors(cds) + nf.2 <- sizeFactors(cds) / colSums(hypoData) + nf.2 <- nf.2 / mean(nf.2) + checkEqualsNumeric(nf.1, tcc_DESeq$norm.factors) + checkEqualsNumeric(nf.2, tcc_DEGES_DESeq$norm.factors) +} + +test_calcNormFactors_DEGESTbT <- function() { + data(hypoData) + count <- hypoData + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", count, group) + set.seed(1) + tcc_DEGES_baySeq <- calcNormFactors(tcc, norm.method = "tmm", + test.method = "bayseq", + iteration = 1, samplesize = 10) + d <- DGEList(count = hypoData, group = group) + d <- calcNormFactors(d) + nf.1 <- d$samples$norm.factors + nf.1 <- nf.1 / mean(nf.1) + cD <- new("countData", data = hypoData, replicates = group, + groups = list(NDE = rep(1, length = length(group)), DE = group), + libsizes = colSums(hypoData) * nf.1) + set.seed(1) + cD <- getPriors.NB(cD, samplesize = 10, estimation = "QL", cl = NULL) + cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) + is.DEG <- as.logical(rank(-cD@posteriors[, "DE"]) < + (nrow(hypoData) * cD@estProps[2])) + d <- DGEList(count = hypoData[!is.DEG, ], group = group) + d <- calcNormFactors(d) + nf.2 <- d$samples$norm.factors * colSums(hypoData[!is.DEG, ]) / + colSums(hypoData) + nf.2 <- nf.2 / mean(nf.2) + + checkEqualsNumeric(nf.2, tcc_DEGES_baySeq$norm.factors) +} + +test_calcNormFactors_DEGESDESeq_classic_single <- function() { + data(hypoData) + floorPDEG <- 0.05 + count <- hypoData[, c(1, 4)] + group <- c(1, 2) + tcc <- new("TCC", count, group) + + tcc <- 
calcNormFactors(tcc, norm.method = "tmm", iteration = 0) + tcc <- calcNormFactors(tcc, norm.method = "deseq", iteration = 0) + tcc <- calcNormFactors(tcc, norm.method = "deseq", + test.method = "deseq", iteration = 1) + tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq", + iteration = 1, samplesize = 10) +} + + +test_calcNormFactors_DEGESedgeR_glm <- function() { + data(hypoData_mg) + count <- hypoData_mg + group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) + tcc <- new("TCC", count, group) + + design <- model.matrix(~ 0 + factor(group)) + coef <- 2:3 + tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + design = design, iteration = 1) +} + + +test_calcNormFactors_DEGESDESeq_glm <- function() { + data(hypoData_mg) + count <- hypoData_mg + group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) + tcc <- new("TCC", count, group) + + fit1 <- count ~ condition + fit0 <- count ~ 1 + tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + fit1 = fit1, fit0 = fit0, iteration = 1) +} + +test_calcNormFactors_increment <- function() { + data(hypoData) + tcc <- new("TCC", hypoData, c(1, 1, 1, 2, 2, 2)) + tcc.0 <- calcNormFactors(tcc, iteration = 0) + tcc.1 <- calcNormFactors(tcc, iteration = 1) + tcc.0.1 <- calcNormFactors(tcc, increment = TRUE) + checkEqualsNumeric(tcc.1$norm.factors, tcc.0.1$norm.factors) + + + tcc.3 <- calcNormFactors(tcc, iteration = 3) + tcc.1 <- calcNormFactors(tcc, increment = TRUE) + tcc.1.1 <- calcNormFactors(tcc.1, increment = TRUE) + tcc.1.1.1 <- calcNormFactors(tcc.1.1, increment = TRUE) + checkEqualsNumeric(tcc.3$norm.factors, tcc.1.1.1$norm.factors) + + + tcc.1 <- calcNormFactors(tcc, iteration = 1) + tcc.1.2 <- calcNormFactors(tcc.1, iteration = 2, increment = TRUE) + checkEqualsNumeric(tcc.3$norm.factors, tcc.1.2$norm.factors) +} + + + diff --git a/TCC/inst/unitTests/test_estimateDE.R b/TCC/inst/unitTests/test_estimateDE.R new file mode 100644 index 0000000..4daf95d --- /dev/null +++ 
b/TCC/inst/unitTests/test_estimateDE.R @@ -0,0 +1,416 @@ +test_estimateDE_EBSeq_1 <- function() { + library(EBSeq) + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "ebseq", samplesize = 10) + auc <- calcAUCValue(tcc) + + set.seed(1) + x <- EBTest(Data = tcc$count, + Conditions = as.factor(tcc$group[, 1]), + sizeFactors = tcc$norm.factors * colSums(tcc$count), + maxround = 10) + PP <- GetPPMat(x) + df <- matrix(1, ncol = 2, nrow = nrow(tcc$count)) + rownames(df) <- rownames(tcc$count) + df[rownames(PP), 1] <- PP[, 1] + df[rownames(PP), 2] <- PP[, 2] + df[is.na(df)] <- 0 + + checkEqualsNumeric(df[, 2], tcc$stat$prob) + checkTrue(auc > 0.80) +} + +test_estimateDE_EBSeq_2 <- function() { + library(EBSeq) + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "ebseq", samplesize = 10) + auc <- calcAUCValue(tcc) + + g <- tcc$group[, 1] + ug <- unique(g) + gp <- matrix(c(rep(1, length = length(ug)), 1:length(ug)), + nrow = 2, byrow = TRUE) + colnames(gp) <- ug + rownames(gp) <- c("Pattern1", "Pattern2") + set.seed(1) + x <- EBMultiTest(Data = tcc$count, + NgVector = NULL, + Conditions = g, + AllParti = gp, + sizeFactors = tcc$norm.factors * colSums(tcc$count), + maxround = 10) + PP <- GetMultiPP(x) + df <- matrix(1, ncol = 2, nrow = nrow(tcc$count)) + rownames(df) <- rownames(tcc$count) + df[rownames(PP$PP), 1] <- PP$PP[, 1] + df[rownames(PP$PP), 2] <- PP$PP[, 2] + df[is.na(df)] <- 0 + + checkEqualsNumeric(df[, 2], tcc$stat$prob) + checkTrue(auc > 0.80) +} + + +test_estimateDE_SAMseq_1 <- function() { + library(samr) + samplesize <- 10 + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "samseq", samplesize = samplesize) + auc <- 
calcAUCValue(tcc) + + x <- round(getNormalizedData(tcc)) + set.seed(1) + d <- SAMseq(x = x, y = tcc$group[, 1], + resp.type = "Two class unpaired", + nperms = samplesize) + + checkEqualsNumeric(d$samr.obj$tt, tcc$stat$testStat) + checkTrue(auc > 0.80) +} + +##test_estimateDE_SAMseq_1p <- function() { +## library(samr) +## samplesize <- 10 +## tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) +## tcc <- calcNormFactors(tcc, iteration = FALSE) +## set.seed(1) +## tcc <- estimateDE(tcc, test.method = "samseq", paired = TRUE, +## samplesize = samplesize) +## auc <- calcAUCValue(tcc) +## +## x <- round(getNormalizedData(tcc)) +## set.seed(1) +## d <- SAMseq(x = x, y = tcc$group[, 1], +## resp.type = "Two class paired", +## nperms = samplesize) +## +## checkEqualsNumeric(d$samr.obj$tt, tcc$stat$testStat) +## checkTrue(auc > 0.80) +##} + +test_estimateDE_SAMseq_2 <- function() { + library(samr) + samplesize <- 10 + tcc <- simulateReadCounts(replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "samseq", samplesize = samplesize) + auc <- calcAUCValue(tcc) + + x <- round(getNormalizedData(tcc)) + set.seed(1) + d <- SAMseq(x = x, y = tcc$group[, 1], + resp.type = "Multiclass", + nperms = samplesize) + checkEqualsNumeric(d$samr.obj$tt, tcc$stat$testStat) + checkTrue(auc > 0.80) +} + +##test_estimateDE_NOISeq_1 <- function() { +## library(NOISeq) +## tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) +## tcc <- calcNormFactors(tcc, iteration = FALSE) +## tcc <- estimateDE(tcc, test.method = "noiseq") +## auc <- calcAUCValue(tcc) +## +## x <- getNormalizedData(tcc) +## gl <- data.frame(group = tcc$group[, 1]) +## nd <- NOISeq::readData(x, gl) +## nr <- noiseq(nd, k = 0.5, norm = "n", replicates = "biological", +## factor = "group", conditions = unique(tcc$group[, 1])) +## prob <- nr@results[[1]]$prob +## prob[is.na(prob)] <- 0 +## checkEqualsNumeric(prob, tcc$stat$prob) +## 
checkTrue(auc > 0.80) +##} + +test_estimateDE_baySeq_1 <- function() { + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "bayseq", samplesize = 10) + auc <- calcAUCValue(tcc) + + group <- c(1, 1, 1, 2, 2, 2) + el <- colSums(tcc$count) * tcc$norm.factors + groups <- list(NDE = rep(1, length(group)), DE = group) + cD <- new("countData", data = tcc$count, replicates = group, + libsizes = colSums(tcc$count) * tcc$norm.factors, + groups = groups) + set.seed(1) + cD <- getPriors.NB(cD, samplesize = 10, + estimation = "QL", cl = NULL) + cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) + tmp <- topCounts(cD, group = "DE", number = nrow(tcc$count)) + tmp <- tmp[rownames(tcc$count), ] + p <- 1 - tmp$Likelihood + + checkEqualsNumeric(p, tcc$stat$p.value) + checkTrue(auc > 0.60) +} + +##test_estimateDE_baySeq_1p <- function() { +## tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.3, +## group = data.frame(A = c(1, 1, 1, 1, 2, 2, 2, 2), +## B = c(1, 1, 2, 2, 1, 1, 2, 2)), +## DEG.foldchange = data.frame(F1 = c(4, 4, 4, 4, 1, 1, 1, 1), +## F2 = c(1, 1, 1, 1, 4, 4, 4, 4), +## F3 = c(1, 1, 1/4, 1/4, 1, 1, 4, 4)), +## DEG.assign = c(0.2, 0.2, 0.6)) +## tcc <- calcNormFactors(tcc, iteration = FALSE) +## set.seed(1) +## tcc <- estimateDE(tcc, test.method = "bayseq", paired = TRUE, +## samplesize = 10) +## auc <- calcAUCValue(tcc) +## +## group <- c(1, 1, 2, 2) +## el <- colSums(tcc$count) * tcc$norm.factors +## groups <- list(NDEG = c(1, 1, 1, 1), DE = c(1, 1, 2, 2)) +## cD <- new("pairedData", +## data = tcc$count[, 1:4], +## pairData = tcc$count[, 5:8], +## replicates = group, +## groups = groups, +## libsizes = el[1:4], +## pairLibsizes = el[5:8]) +## set.seed(1) +## cD <- getPriors.BB(cD, samplesize = 300, +## estimation = "QL", cl = NULL) +## cD <- getLikelihoods.BB(cD, pET = "BIC", nullProps = 0.5, cl = NULL) +## +## ## DE between replicate groups +## tmp <- 
topCounts(cD, group = 1, number = nrow(tcc$count)) +## tmp <- tmp[rownames(tcc$count), ] +## p <- 1 - tmp$Likelihood +##AUC(rocdemo.sca(truth=c(rep(0, 2700),rep(1, 300),rep(0, 7000)), data = -rank(p))) +##AUC(rocdemo.sca(truth=c(rep(1, 2700),rep(0, 300),rep(0, 7000)), data = -rank(p))) +## +## tmp <- topCounts(cD, group = 2, number = nrow(tcc$count)) +## tmp <- tmp[rownames(tcc$count), ] +## p <- 1 - tmp$Likelihood +##AUC(rocdemo.sca(truth=c(rep(0, 2700),rep(1, 300),rep(0, 7000)), data = -rank(p))) +##AUC(rocdemo.sca(truth=c(rep(1, 2700),rep(0, 300),rep(0, 7000)), data = -rank(p))) +## +## +## checkEqualsNumeric(p, tcc$stat$p.value) +## checkTrue(auc > 0.70) +##} + +test_estimateDE_baySeq_2 <- function() { + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "bayseq", samplesize = 10) + auc <- calcAUCValue(tcc) + + group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) + el <- colSums(tcc$count) * tcc$norm.factors + groups <- list(NDE = rep(1, length(group)), DE = group) + cD <- new("countData", data = tcc$count, replicates = group, + libsizes = colSums(tcc$count) * tcc$norm.factors, + groups = groups) + set.seed(1) + cD <- getPriors.NB(cD, samplesize = 10, + estimation = "QL", cl = NULL) + cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) + tmp <- topCounts(cD, group = "DE", number = nrow(tcc$count)) + tmp <- tmp[rownames(tcc$count), ] + p <- 1 - tmp$Likelihood + + checkEqualsNumeric(p, tcc$stat$p.value) + checkTrue(auc > 0.70) +} + +test_estimateDE_baySeq_3 <- function() { + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(4, 4)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + tcc$group <- data.frame(GROUP = c(1, 1, 1, 1, 2, 2, 2, 2 ), + TIME = c(1, 1, 2, 2, 1, 1, 2, 2)) + set.seed(1) + tcc <- estimateDE(tcc, test.method = "bayseq", samplesize = 10, + comparison = "GROUP") + auc <- calcAUCValue(tcc) + + groups <- tcc$group + groups <- cbind(rep(1, length 
= nrow(tcc$group)), groups) + colnames(groups)[1] <- "NDE" + el <- colSums(tcc$count) * tcc$norm.factors + cD <- new("countData", data = tcc$count, + replicates = c(1, 1, 1, 1, 2, 2, 2, 2), + libsizes = colSums(tcc$count) * tcc$norm.factors, + groups = groups) + set.seed(1) + cD <- getPriors.NB(cD, samplesize = 10, + estimation = "QL", cl = NULL) + cD <- getLikelihoods.NB(cD, pET = "BIC", cl = NULL) + tmp <- topCounts(cD, group = "GROUP", number = nrow(tcc$count)) + tmp <- tmp[rownames(tcc$count), ] + p <- 1 - tmp$Likelihood + + checkEqualsNumeric(p, tcc$stat$p.value) + checkTrue(auc > 0.70) +} + + +test_estimateDE_DESeq_1 <- function() { + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + tcc <- estimateDE(tcc, test.method = "deseq") + auc <- calcAUCValue(tcc) + + d <- newCountDataSet(tcc$count, tcc$group[, 1]) + sizeFactors(d) <- tcc$norm.factors * colSums(tcc$count) + d <- estimateDispersions(d) + r <- nbinomTest(d, 1, 2) + r$pval[is.na(r$pval)] <- 1 + + checkEqualsNumeric(r$pval, tcc$stat$p.value) + checkTrue(auc > 0.80) +} + +test_estimateDE_DESeq_2 <- function() { + fit1 <- count ~ condition + fit0 <- count ~ 1 + + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + t1 <- estimateDE(tcc, test.method = "deseq") + t2 <- estimateDE(tcc, test.method = "deseq", fit0 = fit0, fit1 = fit1) + auc <- calcAUCValue(t1) + + d <- newCountDataSet(tcc$count, tcc$group[, 1]) + sizeFactors(d) <- tcc$norm.factors * colSums(tcc$count) + d <- estimateDispersions(d) + f0 <- fitNbinomGLMs(d, fit0) + f1 <- fitNbinomGLMs(d, fit1) + p <- nbinomGLMTest(f1, f0) + p[is.na(p)] <- 1 + + checkEqualsNumeric(p, t1$stat$p.value) + checkEqualsNumeric(p, t2$stat$p.value) + checkTrue(auc > 0.80) +} + +test_estimateDE_DESeq_3 <- function() { + group <- data.frame( + COND = as.factor(c(1, 1, 1, 1, 2, 2, 2, 2)), + TIME = as.factor(c(1, 1, 2, 2, 1, 1, 2, 2)) + ) + fit1 <- 
count ~ TIME + COND + fit0 <- count ~ 1 + + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(4, 4)) + tcc$group <- group + tcc <- calcNormFactors(tcc, iteration = FALSE) + tcc <- estimateDE(tcc, test.method = "deseq", fit0 = fit0, fit1 = fit1) + auc <- calcAUCValue(tcc) + + d <- newCountDataSet(tcc$count, tcc$group) + sizeFactors(d) <- tcc$norm.factors * colSums(tcc$count) + d <- estimateDispersions(d) + f0 <- fitNbinomGLMs(d, fit0) + f1 <- fitNbinomGLMs(d, fit1) + p <- nbinomGLMTest(f1, f0) + p[is.na(p)] <- 1 + + checkEqualsNumeric(p, tcc$stat$p.value) + checkTrue(auc > 0.80) +} + + +test_estimateDE_edgeR_1 <- function() { + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + tcc <- estimateDE(tcc, test.method = "edger") + auc <- calcAUCValue(tcc) + + d <- DGEList(counts = tcc$count, group = tcc$group[, 1]) + d$samples$norm.factors <- tcc$norm.factors + d <- estimateCommonDisp(d) + d <- estimateTagwiseDisp(d) + r <- exactTest(d) + checkEqualsNumeric(r$table$PValue, tcc$stat$p.value) + checkTrue(auc > 0.80) +} + +test_estimateDE_edgeR_2 <- function() { + group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) + coef <- 2:3 + design <- model.matrix(~ as.factor(group)) + + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + t1 <- estimateDE(tcc, test.method = "edger") + t2 <- estimateDE(tcc, test.method = "edger", design = design, coef = coef) + auc <- calcAUCValue(t1) + + d <- DGEList(counts = tcc$count, group = tcc$group[, 1]) + d$samples$norm.factors <- tcc$norm.factors + d <- estimateGLMCommonDisp(d, design) + d <- estimateGLMTrendedDisp(d, design) + d <- estimateGLMTagwiseDisp(d, design) + fit <- glmFit(d, design) + lrt <- glmLRT(fit, coef = coef) + r <- topTags(lrt, n = nrow(tcc$count)) + r <- r$table[rownames(tcc$count), ] + + checkEqualsNumeric(r$PValue, t1$stat$p.value) + checkEqualsNumeric(r$PValue, t2$stat$p.value) + checkTrue(auc > 0.80) +} + 
+test_estimateDE_edgeR_3 <- function() { + group <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) + contrast <- c(-1, 0, 1) + design <- model.matrix(~ 0 + as.factor(group)) + + tcc <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + tcc <- calcNormFactors(tcc, iteration = FALSE) + tcc <- estimateDE(tcc, test.method = "edger", design = design, contrast = contrast) + auc <- calcAUCValue(tcc) + + d <- DGEList(counts = tcc$count, group = tcc$group[, 1]) + d$samples$norm.factors <- tcc$norm.factors + d <- estimateGLMCommonDisp(d, design) + d <- estimateGLMTrendedDisp(d, design) + d <- estimateGLMTagwiseDisp(d, design) + fit <- glmFit(d, design) + lrt <- glmLRT(fit, contrast = contrast) + r <- topTags(lrt, n = nrow(tcc$count)) + r <- r$table[rownames(tcc$count), ] + + checkEqualsNumeric(r$PValue, tcc$stat$p.value) + checkTrue(auc > 0.80) +} + +test_estimateDE_crossvalidate <- function() { + tcc <- new("TCC") + av <- tcc$private$available$test.method + ty <- colnames(av) + pk <- rownames(av) + for (i in 1:length(ty)) { + for (j in 1:length(pk)) { + if (av[j, i]) { + if (ty[i] == "UnRepTwoGroup") + x <- simulateReadCounts(Ngene = 1000, replicates = c(1, 1)) + else if (ty[i] == "TwoGroup" || ty[i] == "PairedTwoGroup") + x <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3)) + else + x <- simulateReadCounts(Ngene = 1000, replicates = c(3, 3, 3)) + x <- calcNormFactors(x, norm.method = "tmm", + test.method = pk[j], samplesize = 10) + x <- estimateDE(x, test.method = pk[j], samplesize = 10) + } + } + } +} + + + + diff --git a/TCC/inst/unitTests/test_filterLowCountGenes.R b/TCC/inst/unitTests/test_filterLowCountGenes.R new file mode 100644 index 0000000..44f1e41 --- /dev/null +++ b/TCC/inst/unitTests/test_filterLowCountGenes.R @@ -0,0 +1,13 @@ +test_filterLowCountGenes <- function() { + data(hypoData) + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", hypoData, group) + + tcc <- filterLowCountGenes(tcc) + + filter <- as.logical(rowSums(hypoData) > 0) + hypoData.filtered <- 
hypoData[filter, ] + + checkEqualsNumeric(tcc$count, hypoData.filtered) +} + diff --git a/TCC/inst/unitTests/test_getResult.R b/TCC/inst/unitTests/test_getResult.R new file mode 100644 index 0000000..a0a2f32 --- /dev/null +++ b/TCC/inst/unitTests/test_getResult.R @@ -0,0 +1,16 @@ +test_getResult <- function() { + data(hypoData) + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", hypoData, group) + tcc <- calcNormFactors(tcc) + tcc <- estimateDE(tcc, test.method = "edger") + result <- getResult(tcc) + ma <- plot(tcc) + + checkEqualsNumeric(result$p.value, tcc$stat$p.value) + checkEqualsNumeric(result$q.value, tcc$stat$q.value) + checkEqualsNumeric(result$rank, tcc$stat$rank) + checkEqualsNumeric(result$a.value, ma$a.value) + checkEqualsNumeric(result$m.value, ma$m.value) +} + diff --git a/TCC/inst/unitTests/test_new.R b/TCC/inst/unitTests/test_new.R new file mode 100644 index 0000000..0566f8e --- /dev/null +++ b/TCC/inst/unitTests/test_new.R @@ -0,0 +1,16 @@ +test_new <- function() { + data(hypoData) + groupNum <- c(1, 1, 1, 2, 2, 2) + groupStr <- c("G1", "G1", "G1", "G2", "G2", "G2") + + matrixObj <- as.matrix(hypoData) + dataframeObj <- as.matrix(hypoData) + + tccMatrixObj <- new("TCC", matrixObj, groupNum) + tccDataframeObj <- new("TCC", dataframeObj, groupNum) + checkEquals(tccMatrixObj, tccDataframeObj) + + tccMatrixObj <- new("TCC", matrixObj, groupStr) + tccDataframeObj <- new("TCC", dataframeObj, groupStr) + checkEquals(tccMatrixObj, tccDataframeObj) +} diff --git a/TCC/inst/unitTests/test_plot.R b/TCC/inst/unitTests/test_plot.R new file mode 100644 index 0000000..1ff34eb --- /dev/null +++ b/TCC/inst/unitTests/test_plot.R @@ -0,0 +1,17 @@ +test_plot <- function() { + data(hypoData) + group <- c(1, 1, 1, 2, 2, 2) + tcc <- new("TCC", hypoData, group) + plot(tcc) + + tcc <- calcNormFactors(tcc) + plot(tcc) + + tcc <- estimateDE(tcc, test.method = "edger") + plot(tcc) + + group <- c("A", "A", "A", "B", "B", "B") + tcc <- new("TCC", hypoData, group) + plot(tcc) +} 
+ diff --git a/TCC/inst/unitTests/test_plotFCPseudocolor.R b/TCC/inst/unitTests/test_plotFCPseudocolor.R new file mode 100644 index 0000000..e7a214b --- /dev/null +++ b/TCC/inst/unitTests/test_plotFCPseudocolor.R @@ -0,0 +1,8 @@ +test_plotFCPseudocolor <- function() { + tcc <- simulateReadCounts() + plotFCPseudocolor(tcc) + + tcc <- simulateReadCounts(replicates = c(3, 3, 3)) + plotFCPseudocolor(tcc) +} + diff --git a/TCC/inst/unitTests/test_simulateReadCounts.R b/TCC/inst/unitTests/test_simulateReadCounts.R new file mode 100644 index 0000000..a173e58 --- /dev/null +++ b/TCC/inst/unitTests/test_simulateReadCounts.R @@ -0,0 +1,13 @@ +test_simulateReadCounts <- function() { + tcc <- simulateReadCounts() + + tcc <- simulateReadCounts(replicates = c(2, 3, 4)) + + tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.1, + DEG.assign = c(0.6, 0.4)) + + tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.1, + DEG.assign = c(0.6, 0.4), + DEG.foldchange = c(2, 6)) +} + diff --git a/TCC/man/MAplot.Rd b/TCC/man/MAplot.Rd index 271400d..f8a05fe 100644 --- a/TCC/man/MAplot.Rd +++ b/TCC/man/MAplot.Rd @@ -2,14 +2,15 @@ \alias{MAplot} \title{plot a MA plot} \description{ - This method plots a MA plot based on the exactTestafterTbT. +This method plots a MA plot based on the exactTestafterTbT. +This function will be obsoleted. 
} \usage{ - MAplot(datalist, FDR_threshold = 0.01) +MAplot(datalist, FDR_threshold = 0.01) } \arguments{ - \item{datalist}{The output from exacTestafterTbT} - \item{FDR_threshold}{Points below the threshold will be plotted in red.} + \item{datalist}{The output from exacTestafterTbT} + \item{FDR_threshold}{Points below the threshold will be plotted in red.} } \examples{ \dontrun{ diff --git a/TCC/man/NB.Rd b/TCC/man/NB.Rd index 17a1cf6..006640f 100644 --- a/TCC/man/NB.Rd +++ b/TCC/man/NB.Rd @@ -2,24 +2,29 @@ \alias{NBsample} \title{Sampling from negative biomial distribution} \description{ - This methods allow sampling from Negative Binomial distribution with - specified proportion of differentially expressed genes having - specified level of differential expression in terms of fold change. - The proportion of upregulated are also specified. - The distribution of original expression levels are generated by resampling - real data of \emph{Arabidopsis} RNA-seq data from \code{\link{arab}}. - This function will be obsoleted. Use \link{generateSimulationData} - instead. +This method allows sampling from Negative Binomial distribution with +specified proportion of differentially expressed genes having +specified level of differential expression in terms of fold change. +The proportion of upregulated are also specified. +The distribution of original expression levels are generated by resampling +real data of \emph{Arabidopsis} RNA-seq data from \code{\link{arab}}. +This function will be obsoleted. Use \link{simulateReadCounts} instead. 
} \usage{ - NBsample(DEG_foldchange=4, repA=3, repB=3, Ngene=3000, PDEG=0.15, PA=0.2) +NBsample(DEG_foldchange = 4, repA = 3, repB = 3, + Ngene = 3000, PDEG = 0.15, PA = 0.2) } \arguments{ -\item{DEG_foldchange}{Fold change value of differentially expressed genes} -\item{repA}{Replicate number for sample A} -\item{repB}{Replicate number for sample B} -\item{Ngene}{Number of genes to produce} -\item{PDEG}{Proportion of differentially expressed genes} -\item{PA}{Proportion of upregulated genes in sample A among differentially expressed -genes (DEGs)} + \item{DEG_foldchange}{Fold change value of differentially expressed genes} + \item{repA}{Replicate number for sample A} + \item{repB}{Replicate number for sample B} + \item{Ngene}{Number of genes to produce} + \item{PDEG}{Proportion of differentially expressed genes} + \item{PA}{Proportion of upregulated genes in sample A among + differentially expressed genes (DEGs)} +} +\examples{ +\dontrun{ +sample <- NBsample() +} } diff --git a/TCC/man/ROKU.Rd b/TCC/man/ROKU.Rd new file mode 100644 index 0000000..09355f5 --- /dev/null +++ b/TCC/man/ROKU.Rd @@ -0,0 +1,128 @@ +\name{ROKU} +\alias{ROKU} +\title{detect tissue-specific (or tissue-selective) patterns from +microarray data with many kinds of samples} +\description{ +ROKU is a method for detecting tissue-specific (or tissue-selective) +patterns from gene expression data for many tissues (or samples). +ROKU (i) ranks genes according to their overall tissue-specificity +using Shannon entropy after data processing and (ii) detects tissues +specific to each gene if any exist using an Akaike's information +criterion (AIC) procedure. +} +\usage{ +ROKU(data, upper.limit = 0.25, sort = FALSE) +} +\arguments{ + \item{data}{numeric matrix or data frame containing microarray data + (on log2 scale), where each row indicates the gene or probeset ID, + each column indicates the tissue, and each cell indicates a + (log2-transformed) expression value of the gene in the tissue. 
+ Numeric vector can also be accepted for a single gene expression + vector.} + \item{upper.limit}{numeric value (between 0 and 1) specifying the maximum + percentage of tissues (or samples) as outliers to each gene.} + \item{sort}{logical. If \code{TRUE}, results are sorted in descending + order of the entropy scores.} +} +\details{ +As shown in Figure 1 in the original study of ROKU (Kadota et al., 2006), +Shannon entropy \eqn{H} of a gene expression vector +(\eqn{x_{1}, x_{2}, ..., x_{N}}) for \eqn{N} tissues can range +from zero to \eqn{log_{2}N}, with the value 0 for genes expressed in a +single tissue and \eqn{log_{2}N} for genes expressed uniformly in all the +tissues. Researchers therefore rely on the low entropy score for the +identification of tissue-specific patterns. +However, direct calculation of the entropy for raw gene expression vector +works well only for detecting tissue-specific patterns when over-expressed +in a small number of tissues but unexpressed or slightly expressed in others: +The \eqn{H} scores of tissue-specific patterns such as +\eqn{(8,8,2,8,8,8,8,8,8,8)} +for the 3rd tissue-specific down-regulation (see the Figure 1e) are close +to the maximum value (\eqn{log_{2}N=3.32} when \eqn{N=10}) and cannot identify +such patterns as tissue-specific. To detect various kinds of tissue-specific +patterns by low entropy score, ROKU processes the original gene expression +vector and makes a new vector (\eqn{x_{1'}, x_{2'}, ..., x_{N'}}). +The data processing is done by subtracting the one-step Tukey biweight and +by taking the absolute value. In case of the above example, ROKU calculates +the \eqn{H} score from the processed vector \eqn{(0,0,6,0,0,0,0,0,0,0)}, +giving very low score (from \eqn{H = 3.26} before processing to \eqn{H' = 0} +after processing). A major characteristic of ROKU is, therefore, +to be able to rank various tissue-specific patterns by using the modified +entropy scores. 
+ +Note that the modified entropy does not explain to which tissue a gene is +specific, only measuring the degree of overall tissue specificity of the gene. +ROKU employs an AIC-based outlier detection method (Ueda, 1996). +Consider, for example, a hypothetical mixed-type of tissue-selective expression +pattern \eqn{(1.2, 5.1, 5.2, 5.4, 5.7, 5.9, 6.0, 6.3, 8.5, 8.8)} where we +imagine a total of three tissues are specific (down-regulated in tissue1; +up-regulated in tissues 9 and 10). The method first normalize the expression +values by subtracting the mean and dividing by the standard deviation +(i.e., \eqn{z}-score transformation), then sorted in order of increasing +magnitude by\cr +\eqn{(-2.221, -0.342, -0.294, -0.198, -0.053, 0.043, 0.092, 0.236, 1.296, +1.441)}. The method evaluates various combinations of outlier candidates +starting from both sides of the values: model1 for non-outlier, +model2 for one outlier for high-side, model3 for two outliers for high-side, +..., model\eqn{x} for one outlier for down-side, ..., modely for two outliers for +both up- and down sides, and so on. Then, it calculates AIC-like statistic +(called \eqn{U}) for each combination of model and search the best combination +that achieves the lowest \eqn{U} value and is termed the minimum AIC estimate +(MAICE). Since the upper.limit value corresponds to the maximum number of the +outlier candidates, it decides the number of combinations. The AIC-based +method output a vector (1 for up-regulated outliers, -1 for down-regulated +outliers, and 0 for non-outliers) that corresponds to the input vector. +For example, the method outputs a vector \eqn{(-1, 0, 0, 0, 0, 0, 0, 0, 1, 1)} +when using \code{upper.limit = 0.5} and \eqn{(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0)} +when using \code{upper.limit = 0.25} (as default). +See the Kadota et al., 2007 for detailed discussion about the effect of +different parameter settings. 
+}
+\value{
+A list containing the following fields:
+  \item{outlier}{A numeric matrix when the input \code{data} is a data frame
+    or matrix. A numeric vector when the input \code{data} is a numeric
+    vector.
+    Both the matrix and the vector consist of 1, -1, and 0: 1 for
+    over-expressed outliers, -1 for under-expressed outliers, and 0 for
+    non-outliers.}
+  \item{modH}{A numeric vector when the input \code{data} is a data frame or
+    matrix. A numeric scalar when the input \code{data} is a numeric
+    vector. Both the vector and the scalar consist of modified entropy
+    (\eqn{H'}) score(s) calculated from a processed gene expression vector.}
+  \item{H}{A numeric vector when the input \code{data} is a data frame or
+    matrix. A numeric scalar when the input \code{data} is a numeric
+    vector. Both the vector and the scalar consist of original entropy
+    (\eqn{H}) score(s) calculated from an original gene expression vector.}
+  \item{rank}{A numeric vector or scalar consisting of the rank(s) of
+    \code{modH}.}
+  \item{Tbw}{A numeric vector or scalar consisting of the one-step Tukey
+    biweight as an iteratively reweighted measure of central tendency.
+    This value is in general similar to the median value and the same as the
+    output of \code{tukey.biweight} with default parameter settings
+    in the \code{affy} package. The data processing is done by
+    subtracting this value for each gene expression vector and by
+    taking the absolute value.}
+}
+\references{
+Kadota K, Konishi T, Shimizu K: Evaluation of two outlier-detection-based
+methods for detecting tissue-selective genes from microarray data.
+Gene Regulation and Systems Biology 2007, 1: 9-15.
+
+Kadota K, Ye J, Nakai Y, Terada T, Shimizu K:
+ROKU: a novel method for identification of tissue-specific genes.
+BMC Bioinformatics 2006, 7: 294.
+
+Kadota K, Nishimura SI, Bono H, Nakamura S, Hayashizaki Y, Okazaki Y,
+Takahashi K: Detection of genes with tissue-specific expression patterns
+using Akaike's Information Criterion (AIC) procedure.
+Physiol Genomics 2003, 12: 251-259. + +Ueda T. Simple method for the detection of outliers. +Japanese J Appl Stat 1996, 25: 17-26. +} +\examples{ +data(hypoData_ts) + +result <- ROKU(hypoData_ts) +} diff --git a/TCC/man/TCC-class.Rd b/TCC/man/TCC-class.Rd index a626e02..16e2b46 100644 --- a/TCC/man/TCC-class.Rd +++ b/TCC/man/TCC-class.Rd @@ -1,80 +1,80 @@ \name{TCC-class} \docType{class} \alias{TCC-class} +\alias{show} +\alias{show,TCC-method} +\alias{names} +\alias{names,TCC-method} +\alias{length} +\alias{length,TCC-method} +\alias{[} +\alias{[,TCC-method} +\alias{subset,TCC-subset} \title{A container for storing information used in TCC} \description{ -This is the container class for TCC. This class initially contains +This is a container class for TCC. This class initially contains count data matrix and some information for the analysis of count data. It also provides further fields that are populated during the analysis. } \details{ -This class is implemented as a R5 reference class. -Thus the method call to this class object change the content of the -object. Functions calling such methods copies the object prior to +This class is implemented as an R5 reference class. +Functions calling such methods copies the object prior to calling the method to keep the semantics of functional programming. -This class can be created by the generic \code{new()} function with -the data for (i.e., \code{count} and \code{group}) fields. -Information of the (\code{replicates}) field can automatically be obtained -when creating this class object from the information of the \code{group} field. -Alternatively, if you supply the \code{replicates}, the group field can be -filled automatically. +This class can be created by the generic \code{new} function with +count data and associated information of experimental design. The values (defaults to all 1) in the \code{norm.factors} field will be changed after performing the \code{\link{calcNormFactors}} function. 
-The \code{stat} field stores (i) execution time for calculating normalization -factors after performing the \code{\link{calcNormFactors}} function, -and (iii) statistics (i.e., \eqn{p}-value, \eqn{q}-value, and rank) -related to the degrees of differential expression for individual genes -after performing the \code{\link{estimateDE}} function. -The \code{estimatedDEG} field stores information about -which genes are called significantly highly expressed in one group. -The threshold for determining the differentially expressed genes (DEGs) -is preliminarily indicated when performing the \code{\link{estimateDE}} function. -The \code{simulation} field stores parameters of the simulation. -The information in this field is generated by the -\code{\link{generateSimulationData}} function. +The \code{DEGES} field stores information related to our DEGES-based +normalization pipeline after performing the \code{\link{calcNormFactors}} +function. +The \code{stat} and \code{estimatedDEG} fields store results after performing +the \code{\link{estimateDE}} function. +The \code{simulation} field stores parameters +used when performing the \code{\link{simulateReadCounts}} function. } \section{Fields}{ This class contains the following fields: - \describe{ - \item{count}{numeric matrix containing count data. } - \item{group}{numeric vector indicating the numbers of - replicates for individual groups compared.} - \item{replicates}{numeric vector indicating the - experimental group for each sample (or library).} - \item{norm.factors}{numeric vector containing - normalization factors.} - \item{stat}{list for storing results after the execution - of the \code{\link{calcNormFactors}} and - \code{\link{estimateDE}} functions.} - \item{estimatedDEG}{numeric vector as a placeholder for - indicating which genes are expressed higher in particular - group compared to the others. 
The values in this field - will be populated after the execution of the - \code{\link{estimateDE}} function.} - \item{simulation}{list. This field is only used for - analyzing simulation data.} - } + \describe{ + \item{count}{numeric matrix containing count data. } + \item{gene_id}{character vector indicating the identifier of the count + unit, usually gene.} + \item{group}{data frame indicating information about experimental + design.} + \item{norm.factors}{numeric vector containing normalization factors + (default to 1).} + \item{stat}{list for storing results after the execution of the + \code{\link{calcNormFactors}} and + \code{\link{estimateDE}} functions.} + \item{estimatedDEG}{numeric vector as a placeholder for indicating + either DEGs (flagged as "1") or non-DEGs (as "0") for individual + genes. The values in this field will be populated after + the execution of the \code{\link{estimateDE}} function.} + \item{simulation}{list. This field is only used for analyzing + simulation data.} + \item{DEGES}{list for storing the information about normalization + steps.} + } } \examples{ -\dontrun{ -tcc <- generateSimulationData() +tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.2, DEG.assign = c(0.8, 0.2), + DEG.foldchange = c(4, 4), replicates = c(3, 3)) -# Check the counts. -tcc$count +# Check the TCC class object. +tcc -# Check the group and replicates annotation of counts. -tcc$group -tcc$replicates +# Check the fields of TCC class object. +names(tcc) +head(tcc$count) # Check the normalization factors. -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc$norm.factors # Check the p-values and q-values. -tcc <- estimateDE(tcc) -head(tcc$stat$p.value) -head(tcc$stat$q.value) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) +tcc # Compare the breakdowns of estimated DEGs with the truth. 
head(tcc$estimatedDEG) @@ -83,5 +83,4 @@ head(tcc$simulation$trueDEG) # M-A plotting. plot(tcc) } -} \keyword{classes} diff --git a/TCC/man/TCC.Rd b/TCC/man/TCC.Rd index 8fb5703..13c34a3 100644 --- a/TCC/man/TCC.Rd +++ b/TCC/man/TCC.Rd @@ -2,29 +2,31 @@ \docType{package} \alias{TCC} \alias{TCC-package} -\title{A package for differential expression analysis from two-group tag +\title{A package for differential expression analysis from tag count data with robust normalization strategies} \description{ -This package performs differential expression analysis from tag count -data that are produced from high-throughput sequencing (HTS) or next -generation sequencing (NGS) technology. A notable feature of this +This package performs differential expression analysis from transcriptome +data that are produced from high-throughput sequencing (HTS) and microarray +technologies. A notable feature of this package is to provide robust normalization methods whose strategy is to remove data assigned as potential differentially expressed -genes (DEGs) before performing data normalization (Kadota et al., 2012). +genes (DEGs) before performing normalization for RNA-seq count data +(Kadota et al., 2012; Sun et al., 2013). } \details{ -TCC is a package for differential expression analysis from tag count data, -focusing of RNA-seq. This package implements some functions -for calculating normalization factors, -identifying DEGs, -depicting so-called M-A plot, -and generating simulation data. +TCC is a package for differential expression analysis from +transcriptome data produced from RNA-seq and microarray data. +This package implements some functions for calculating normalization factors, +identifying DEGs, depicting so-called M-A plot, and generating simulation data. -To utilize this package, -the count matrix coupled with label information should be stored to a - \link{TCC-class} object using the \code{new} method. 
-All functions (except for some legacy functions) used in this package -require this \link{TCC-class} object. +To utilize this package, the count matrix coupled with label information +should be stored to a \link{TCC-class} object using the \code{new} method. +All functions, except for some legacy functions +(i.e., \code{\link{NBsample}}, \code{\link{do_TbT}}, +\code{\link{exactTestafterTbT}}, and \code{\link{MAplot}} for RNA-seq data) +and for two recently added functions (i.e., \code{\link{ROKU}} and +\code{\link{WAD}}) for microarray data, +used in this package require this \link{TCC-class} object. Using this object, the \code{\link{calcNormFactors}} function calculates normalization factors and the \code{\link{estimateDE}} function estimates the degree of differential expression (DE) for individual genes. @@ -32,27 +34,23 @@ The estimated normalization factors obtained by using the \code{\link{calcNormFactors}} function are used within the statistical model for differential analysis in the \code{\link{estimateDE}} function. Both two functions internally call functions from other packages -(edgeR, DESeq, and baySeq) when needed. TCC also provides a useful function -\code{\link{generateSimulationData}} for generating simulation data -with various conditions. -Convenient plotting functions are also included. +(edgeR, DESeq, baySeq, EBSeq, and samr) when specified. +TCC also provides some useful functions: \code{\link{simulateReadCounts}} +for generating simulation data with various experimental designs, +\code{\link{plot}} for depicting a M-A plot, +\code{\link{plotFCPseudocolor}} for depicting a pseudo-color image of +simulation condition that the user specified, +\code{\link{WAD}} for identifying DEGs from two-group microarray data +(single-factor design), and \code{\link{ROKU}} for identifying +tissue-specific genes from microarray data for many tissues. 
+} +\examples{ +data(hypoData) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +show(tcc) } \seealso{ \link{TCC-class} } -\references{ -Robinson MD, McCarthy DJ, Smyth GK: edgeR: a Bioconductor package for - differential expression analysis of digital gene expression data. - Bioinformatics 2010, 26(1): 139-140 - -Hardcastle TJ, Kelly KA: baySeq: empirical Bayesian methods for - identifying differential expression in sequence count data. - BMC Bioinformatics 2010, 11: 422 - -Anders S, Huber W: Differential expression analysis for sequence count data. - Genome Biol. 2010, 11: R106 - -Kadota K, Nishiyama T, Shimizu K: A normalization strategy for comparing - tag count data. Algorithms Mol Biol. 2012, 7:5 -} \keyword{packages} diff --git a/TCC/man/WAD.Rd b/TCC/man/WAD.Rd new file mode 100644 index 0000000..88f9c5d --- /dev/null +++ b/TCC/man/WAD.Rd @@ -0,0 +1,44 @@ +\name{WAD} +\alias{WAD} +\title{Calculate WAD statistic for individual genes} +\description{ +This function performs WAD method to identify differentially expressed genes +(DEGs) from two-group gene expression data. A high absolute value for the WAD +statistic is evident of a high degree of differential expression. +} +\usage{ +WAD(data, group, log.scale = FALSE, floor.value = 1, sort = FALSE) +} +\arguments{ + \item{data}{numeric matrix or data frame containing count data or + microarray data, where each row indicates the gene (or transcript + or probeset ID), each column indicates the sample (or library), + and each cell indicates the expression value (i.e., number of counts + or signal intensity) of the gene in the sample.} + \item{group}{numeric vector indicating the experimental group for each + sample (or library).} + \item{log.scale}{logical. If \code{TRUE}, the data are log scaled (base = 2) + after performing the \code{floor.value} setting. The default is + \code{log.scale = FALSE}.} + \item{floor.value}{numeric scalar specifying a floor value for the data. 
+ The default is \code{floor.value = 1}, i.e., values less than 1 are + replaced by 1. Ignored if \code{log.scale = FALSE}.} + \item{sort}{logical. If \code{TRUE}, the retrieved results are sorted + in order of the rank of absolute WAD statistic. + If \code{FALSE}, the results are retrieved by the original order.} +} +\value{ +A numeric vector of WAD statistic for individual genes +} +\references{ +Kadota K, Nakai Y, Shimizu K: A weighted average difference method for +detecting differentially expressed genes from microarray data. +Algorithms Mol Biol. 2008, 3: 8. +} +\examples{ +data(nakai) +group <- c(1, 1, 1, 1, 2, 2, 2, 2) + +wad <- WAD(nakai, group, + log.scale = TRUE, floor.value = 1, sort = TRUE) +} diff --git a/TCC/man/arab.Rd b/TCC/man/arab.Rd index 4c5b32a..e4eef83 100644 --- a/TCC/man/arab.Rd +++ b/TCC/man/arab.Rd @@ -3,38 +3,41 @@ \alias{arab} \title{Arabidopsis RNA-Seq data set} \description{ - This dataset was imported from NBPSeq package and the following - explanation is verbatim copy of their explanation: +This dataset was imported from NBPSeq package and the following +explanation is verbatim copy of their explanation: - An RNA-Seq dataset from a pilot study of the defense response of - \emph{Arabidopsis} to infection by bacteria. We performed RNA-Seq - experiments on three independent biological samples from each of - the two treatment groups. The matrix contains the frequencies of - RNA-Seq reads mapped to genes in a reference database. Rows - correspond to genes and columns correspond to independent - biological samples. +An RNA-Seq dataset from a pilot study of the defense response of +\emph{Arabidopsis} to infection by bacteria. We performed RNA-Seq +experiments on three independent biological samples from each of +the two treatment groups. The matrix contains the frequencies of +RNA-Seq reads mapped to genes in a reference database. Rows +correspond to genes and columns correspond to independent +biological samples. 
} \details{ - This dataset was imported from NBPSeq package and the following - explanation is verbatim copy of their explanation: +This dataset was imported from NBPSeq package and the following +explanation is verbatim copy of their explanation: - We challenged leaves of \emph{Arabidopsis} with the defense-eliciting - \emph{\eqn{\Delta}hrcC} mutant of \emph{Pseudomonas syringae} pathovar - \emph{tomato} - DC3000. We also infiltrated leaves of \emph{Arabidopsis} with 10mM - MgCl2 as a mock inoculation. RNA was isolated 7 hours after - inoculation, enriched for mRNA and prepared for RNA-Seq. We - sequenced one replicate per channel on the Illumina Genome - Analyzer (http://www.illumina.com). The length of the RNA-Seq - reads can vary in length depending on user preference and the - sequencing instrument. The dataset used here are derived from a - 36-cycle sequencing reaction, that we trimmed to 25mers. We - used an in-house computational pipeline to process, align, and - assign RNA-Seq reads to genes according to a reference database - we developed for \emph{Arabidopsis}. +We challenged leaves of \emph{Arabidopsis} with the defense-eliciting +\emph{\eqn{\Delta}hrcC} mutant of \emph{Pseudomonas syringae} pathovar +\emph{tomato} +DC3000. We also infiltrated leaves of \emph{Arabidopsis} with 10mM +MgCl2 as a mock inoculation. RNA was isolated 7 hours after +inoculation, enriched for mRNA and prepared for RNA-Seq. We +sequenced one replicate per channel on the Illumina Genome +Analyzer (http://www.illumina.com). The length of the RNA-Seq +reads can vary in length depending on user preference and the +sequencing instrument. The dataset used here are derived from a +36-cycle sequencing reaction, that we trimmed to 25mers. We +used an in-house computational pipeline to process, align, and +assign RNA-Seq reads to genes according to a reference database +we developed for \emph{Arabidopsis}. 
} \usage{data(arab)} \format{A 26222 by 6 matrix of RNA-Seq read frequencies.} +\examples{ +data(arab) +} \references{ Di Y, Schafer DW, Cumbie JS, and Chang JH (2011): "The NBP Negative Binomial Model for Assessing Differential Gene Expression from RNA-Seq", Statistical diff --git a/TCC/man/calcAUCValue.Rd b/TCC/man/calcAUCValue.Rd index 033debc..5c52d39 100644 --- a/TCC/man/calcAUCValue.Rd +++ b/TCC/man/calcAUCValue.Rd @@ -2,59 +2,61 @@ \alias{calcAUCValue} \title{Calculate AUC value from a TCC-class object} \description{ -This function calculates AUC (Area under the ROC curve) value from a \link{TCC-class} object for simulation study. +This function calculates AUC (Area under the ROC curve) value from +a \link{TCC-class} object for simulation study. } \usage{calcAUCValue(tcc)} \arguments{ - \item{tcc}{\link{TCC-class} object having values in both \code{stat$rank} and \code{simulation$trueDEG} fields.} + \item{tcc}{\link{TCC-class} object having values in both \code{stat$rank} + and \code{simulation$trueDEG} fields.} } \details{ This function is generally used after the \code{\link{estimateDE}} function that estimates \eqn{p}-values (and the derivatives such as the \eqn{q}-values and the ranks) for individual genes based on the statistical model for differential expression (DE) analysis. -In case of the simulation analysis, -we know which genes are DEGs or non-DEGs in advance and the information is stored in the -\code{simulation$trueDEG} field of the \code{\link{TCC-class}} object \code{tcc} -(i.e., \code{tcc$simulation$trueDEG}). +In case of the simulation analysis, we know which genes are +DEGs or non-DEGs in advance and the information is stored in +the \code{simulation$trueDEG} field of the \code{\link{TCC-class}} +object \code{tcc} (i.e., \code{tcc$simulation$trueDEG}). 
The \code{\link{calcAUCValue}} function calculates the AUC value between the ranked gene list obtained by the \code{\link{estimateDE}} function and the truth -obtained by the \code{\link{generateSimulationData}} function. -A well-ranked gene list should have a high AUC value (i.e., high sensitivity and specificity).} +obtained by the \code{\link{simulateReadCounts}} function. +A well-ranked gene list should have a high AUC value +(i.e., high sensitivity and specificity). +} \value{ numeric scalar. } \examples{ # Analyzing a simulation data for comparing two groups # (G1 vs. G2) with biological replicates. -# the first 2000 genes are DEGs, where 1800 are up in G1. +# the first 200 genes are DEGs, where 180 are up in G1. # The DE analysis is performed by an exact test in edgeR coupled # with the DEGES/edgeR normalization factors. -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(4, 4), group = c(3, 3)) -tcc <- calcNormFactors(tcc) -tcc <- estimateDE(tcc) + DEG.foldchange = c(4, 4), + replicates = c(3, 3)) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) calcAUCValue(tcc) -} # Analyzing a simulation data for comparing two groups -# (G1 vs. G2) without any replicates. +# (G1 vs. G2) without replicates. # the levels of DE are 3-fold in G1 and 7-fold in G2 # The DE analysis is performed by an negative binomial test in # DESeq coupled with the DEGES/DESeq normalization factors. 
-\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(3, 7), group = c(1, 1)) -tcc <- calcNormFactors(tcc) -tcc <- estimateDE(tcc) + DEG.foldchange = c(3, 7), + replicates = c(1, 1)) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) +tcc <- estimateDE(tcc, test.method = "deseq", FDR = 0.1) calcAUCValue(tcc) } -} \keyword{methods} diff --git a/TCC/man/calcNormFactors.Rd b/TCC/man/calcNormFactors.Rd index d75b1c8..4701a27 100644 --- a/TCC/man/calcNormFactors.Rd +++ b/TCC/man/calcNormFactors.Rd @@ -7,148 +7,140 @@ \description{ This function calculates normalization factors using a specified multi-step normalization method from a \link{TCC-class} object. -The procedure can generally be described as the \eqn{STEP1-(STEP2-STEP3)_{n}}{STEP1-(STEP2-STEP3)n} pipeline. -} -\synopsis{ -calcNormFactors(tcc, ...) +The procedure can generally be described as the +\eqn{STEP1-(STEP2-STEP3)_{n}}{STEP1-(STEP2-STEP3)n} pipeline. } \usage{ -calcNormFactors(tcc, norm.method = c("tmm", "deseq"), - test.method = c("edger", "deseq", "bayseq"), - iteration = 1, FDR = 0.1, floorPDEG = 0.05, - samplesize = 10000, processors = NULL) +\S4method{calcNormFactors}{TCC}(tcc, norm.method = NULL, test.method = NULL, + iteration = TRUE, FDR = NULL, floorPDEG = 0.05, + increment = FALSE, ...) } \arguments{ - \item{tcc}{\link{TCC-class} object.} - \item{norm.method}{character specifying normalization method that - is used in both the STEP1 and STEP3. Possible values are \code{"tmm"} - for the TMM normalization method implemented in the edgeR package, - \code{"edger"} (same as \code{"tmm"}), and \code{"deseq"} for the - method implemented in the DESeq package. 
The default is \code{"tmm"} - when analyzing the count data with multiple replicates - (i.e., \code{min(tcc$group) > 1}) and \code{"deseq"} when analyzing - the count data with a single measurement (i.e., \code{min(tcc$group) == 1})).} - \item{test.method}{character specifying method for identifying - differentially expressed genes (DEGs) used in STEP2. Possible - values are \code{"edger"}, \code{"deseq"}, and \code{"bayseq"} - for the DEG identification methods implemented in the edgeR, DESeq, - and baySeq, respectively. The default is \code{"edger"} when - analyzing the count data with multiple replicates - (i.e., \code{min(tcc$group) > 1}) and \code{"deseq"} when - analyzing the count data with a single measurement (i.e., \code{min(tcc$group) == 1))}.} - \item{iteration}{logical or numeric value specifying the number of iteration (n) - in the proposed normalization pipeline: - the \eqn{STEP1-(STEP2-STEP3)_{n}}{STEP1-(STEP2-STEP3)n} pipeline. - If \code{FALSE} or \code{0} is specified, the normalization pipeline - is performed only by the method in STEP1. If \code{TRUE} or \code{1} is - specified, the three-step normalization pipeline is performed. Integers - higher than \code{1} indicate the number of iteration in the pipeline.} - \item{FDR}{numeric value (between 0 and 1) specifying the - threshold for determining DEGs after STEP2.} - \item{floorPDEG}{numeric value (between 0 and 1) specifying the minimum value - to be eliminated as potential DEGs before performing STEP3.} - \item{samplesize}{numeric value specifying the sample size for estimating - the prior parameters if \code{test.method = "bayseq"}. See - the \code{\link[baySeq]{getPriors.NB}} function for details.} - \item{processors}{numeric value or 'snow' object for using multi - processors if \code{test.method = "bayseq"}. 
See the - \code{\link[baySeq]{getPriors.NB}} function for details.} + \item{tcc}{\link{TCC-class} object.} + \item{norm.method}{character specifying a normalization method used in + both the STEP1 and STEP3. Possible values are \code{"tmm"} for the + TMM normalization method implemented in the edgeR package, + \code{"edger"} (same as \code{"tmm"}), and \code{"deseq"} for the + method implemented in the DESeq package. The default is \code{"tmm"} + when analyzing the count data with multiple replicates + (i.e., \code{min(table(tcc$group[, 1])) > 1}) + and \code{"deseq"} when analyzing the count data without replicates + \cr(i.e., \code{min(table(tcc$group[, 1])) == 1}).} + \item{test.method}{character specifying a method for identifying + differentially expressed genes (DEGs) used in STEP2: + one of \code{"edger"}, \code{"deseq"}, \code{"bayseq"}, + \code{"ebseq"}, \code{"samseq"}, and \code{"wad"}. + See the "Details" filed in \code{\link{estimateDE}} for detail. + The default is \code{"edger"} when analyzing the count data with + multiple replicates (i.e., \code{min(table(tcc$group[, 1])) > 1}) + and \code{"deseq"} when analyzing the count data without replicates + (i.e., \code{min(table(tcc$group[, 1])) == 1))}.} + \item{iteration}{logical or numeric value specifying the number of + iteration (\eqn{n}) in the proposed normalization pipeline: the + \eqn{STEP1-(STEP2-STEP3)_{n}}{STEP1-(STEP2-STEP3)n} pipeline. + If \code{FALSE} or \code{0} is specified, the normalization pipeline + is performed only by the method in STEP1. If \code{TRUE} or \code{1} + is specified, the three-step normalization pipeline is performed. 
+ Integers higher than \code{1} indicate the number of iteration in + the pipeline.} + \item{FDR}{numeric value (between 0 and 1) specifying the threshold for + determining potential DEGs after STEP2.} + \item{floorPDEG}{numeric value (between 0 and 1) specifying the minimum + value to be eliminated as potential DEGs before performing STEP3.} + \item{increment}{logical value. if \code{increment = TRUE}, the DEGES + pipeline will perform again from the current iterated result.} + \item{...}{arguments to identify potential DEGs at STEP2. See the + "Arguments" field in \code{\link{estimateDE}} for details.} } \details{ -The \code{\link{calcNormFactors}} function is the main function in the TCC package. +The \code{\link{calcNormFactors}} function is the main function in the +TCC package. Since this pipeline employs the DEG identification method at STEP2, -our multi-step strategy can eliminate the biased effect of potential DEGs -before the second normalization at STEP3. -To fully utilize the differentially expressed gene elimination strategy (DEGES), -we strongly recommend not to use \code{iteration = 0} or \code{iteration = FALSE}. -This function internally calls functions implemented in the edgeR, -DESeq, and baySeq packages according to the specified parameters. - -If the \code{norm.method = "tmm"} is specified, -the \code{calcNormFactors} function implemented in edgeR -is used for obtaining the TMM normalization factors at both STEP1 and 3. -In case of \code{norm.method = "deseq"}, -the \code{\link[DESeq]{estimateSizeFactors}} function in DESeq is used. -Note that the original \code{\link[DESeq]{estimateSizeFactors}} function -returns the size factors (not normalization factors). -Our \code{\link{calcNormFactors}} function internally converts the size factors -into normalization factors that are comparable to the TMM normalization factors. +our multi-step strategy can eliminate the negative effect of potential DEGs +before the second normalization at STEP3. 
+To fully utilize the DEG elimination strategy (DEGES), we strongly recommend
+not to use \code{iteration = 0} or \code{iteration = FALSE}.
+This function internally calls functions implemented in the other R packages
+according to the specified value.

-If the \code{test.method = "edger"} is specified,
-a series of functions for differential expression
-analysis (\code{\link[edgeR]{estimateCommonDisp}},
-\code{\link[edgeR]{estimateTagwiseDisp}},
-and \code{\link[edgeR]{exactTest}}) in edgeR are internally used.
-Similarly, the \code{test.method = "deseq"} internally use two functions
-(\code{\link[DESeq]{estimateDispersions}} and \code{\link[DESeq]{nbinomTest}})
-in DESeq and the \code{test.method = "bayseq"}
- internally use two functions (\code{\link[baySeq]{getPriors.NB}}
-and \code{\link[baySeq]{getLikelihoods.NB}}) in baySeq.
+\itemize{
+    \item \code{norm.method = "tmm"}\cr
+          The \code{\link[edgeR]{calcNormFactors}} function implemented
+          in edgeR is used for obtaining the TMM normalization factors
+          at both STEP1 and STEP3.
+    \item \code{norm.method = "deseq"}\cr
+          The \code{\link[DESeq]{estimateSizeFactors}} function
+          implemented in DESeq is used for obtaining the size factors
+          at both STEP1 and STEP3.
+          The size factors are internally converted to normalization factors
+          that are comparable to the TMM normalization factors.
+}
 }
 \value{
-\link{TCC-class} object containing the normalization factors in the
-\code{norm.factors} field (numerical vector).
-In other words, the normalization factors in the \code{norm.factors} field are populated.
+After performing the \code{calcNormFactors} function,
+the calculated normalization factors are populated in the
+\code{norm.factors} field (i.e., \code{tcc$norm.factors}).
+Parameters used for DEGES normalization (e.g., potential DEGs
+identified in STEP2, execution times for the identification, etc.)
+are stored in the DEGES field (i.e., \code{tcc$DEGES}) as follows: + + \item{iteration}{the iteration number \eqn{n} for + \bold{the STEP1 - (STEP2 - STEP3)\eqn{_{n}} pipeline}.} + \item{pipeline}{the DEGES normalization pipeline.} + \item{threshold}{it stores + (i) the type of threshold (\code{threshold$type}), + (ii) the threshold value (\code{threshold$input}), + and (iii) the percentage of potential DEGs actually + used (\code{threshold$PDEG}). + These values depend on whether the percentage + of DEGs identified in STEP2 is higher or lower to the value + indicated by \code{floorPDEG}. + Consider, for example, the execution of \code{calcNormFactors} + function with "\code{FDR = 0.1} and \code{floorPDEG = 0.05}". + If the percentage of DEGs identified in STEP2 satisfying + \code{FDR = 0.1} was 0.14 + (i.e., higher than the \code{floorPDEG} of 0.05), + the values in the threshold fields will be + \code{threshold$type = "FDR"}, \code{threshold$input = 0.1}, + and \code{threshold$PDEG = 0.14}. + If the percentage (= 0.03) was lower than the predefined + \code{floorPDEG} value of 0.05, the values in the threshold fields + will be \code{threshold$type = "floorPDEG"}, + \code{threshold$input = 0.05}, and \code{threshold$PDEG = 0.05}.} + \item{potDEG}{numeric binary vector (0 for non-DEG or 1 for DEG) + after the evaluation of the percentage of DEGs identified in STEP2 with + the predefined \code{floorPDEG} value. If the percentage + (e.g., 2\%) is lower than the \code{floorPDEG} value (e.g., 17\%), + 17\% of elements become 1 as DEG.} + \item{prePotDEG}{numeric binary vector + (0 for non-DEG or 1 for DEG) before the evaluation of the percentage + of DEGs identified in STEP2 with the predefined \code{floorPDEG} + value. 
Regardless of the \code{floorPDEG} value, the percentage of + elements with 1 is always the same as that of DEGs identified in + STEP2.} + \item{execution.time}{computation time required for normalization.} } \examples{ -# calculating normalization factors for a hypothetical count data -# using the TbT method (the TMM-baySeq-TMM pipeline; Kadota et al., 2012) -\dontrun{ data(hypoData) -group <- c(3, 3) -tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq") -tcc$norm.factors -} +group <- c(1, 1, 1, 2, 2, 2) # calculating normalization factors using the DEGES/edgeR method # (the TMM-edgeR-TMM pipeline) -\dontrun{ tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc$norm.factors -} # calculating normalization factors using the iterative DEGES/edgeR method # (iDEGES/edgeR) with n = 3 -\dontrun{ tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc, iteration = 3) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 3, FDR = 0.1, floorPDEG = 0.05) tcc$norm.factors -} -# calculating normalization factors for count data without replicates -\dontrun{ -group <- c(1, 1) -tcc <- new("TCC", hypoData[, c(1, 4)], group) -tcc <- calcNormFactors(tcc) +# calculating normalization factors for simulation data without replicates +tcc <- simulateReadCounts(replicates = c(1, 1)) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc$norm.factors } - -# calculating normalization factors for another simulation data without -# replicates -\dontrun{ -tcc <- generateSimulationData(group = c(1, 1)) -tcc <- calcNormFactors(tcc) -tcc$norm.factors -} - -# calculating normalization factors for a simulation data using the -# TbT method with four processors -\dontrun{ -tcc <- generateSimulationData() 
-tcc <- calcNormFactors(tcc, norm.method = "tmm", - test.method = "bayseq", processors = 4) -tcc$norm.factors -} - -# calculating normalization factors for a simulation data using -# the TbT method with four processors (another way) -\dontrun{ -tcc <- generateSimulationData() -cl <- makeCluster(2, "SOCK") -tcc <- calcNormFactors(tcc, norm.method = "tmm", - test.method = "bayseq", processors = cl) -tcc$norm.factors -} -} diff --git a/TCC/man/do_TbT.Rd b/TCC/man/do_TbT.Rd index 64310c0..5be637d 100644 --- a/TCC/man/do_TbT.Rd +++ b/TCC/man/do_TbT.Rd @@ -1,21 +1,21 @@ \name{do_TbT} \alias{do_TbT} \title{Calculate normalization factors for raw tag count data using a - multi-step normalization strategy called "TbT"} +multi-step normalization strategy called "TbT"} \description{ - This method performs TMM normalization, baySeq, and again TMM - normalization to infer a good normalization factor as - described in Kadota et al. 2012, Algorithms Mol Biol 7:5. - This function will be obsoleted. - Please use \link{TCC-class} based methods. +This method performs TMM normalization, baySeq, and again TMM +normalization to infer a good normalization factor as +described in Kadota et al. 2012, Algorithms Mol Biol 7:5. +This function will be obsoleted. +Please use \link{TCC-class} based methods. } \usage{ - do_TbT(data, data.cl, sample_num = 10000) +do_TbT(data, data.cl, sample_num = 10000) } \arguments{ -\item{data}{The data matrix to be analysed. Numerical data only} -\item{data.cl}{A vector describing the data class for columns in data} -\item{sample_num}{Sample number for baysian estimation} + \item{data}{The data matrix to be analysed. 
Numerical data only} + \item{data.cl}{A vector describing the data class for columns in data} + \item{sample_num}{Sample number for Bayesian estimation} } \examples{ \dontrun{ diff --git a/TCC/man/estimateDE.Rd b/TCC/man/estimateDE.Rd index e7dafc3..e19efda 100644 --- a/TCC/man/estimateDE.Rd +++ b/TCC/man/estimateDE.Rd @@ -1,13 +1,10 @@ \name{estimateDE} \alias{estimateDE} \title{Estimate degrees of differential expression (DE) for individual genes} -\synopsis{ -estimateDE(tcc, test.method = NULL, FDR = NULL, - samplesize = 10000, processors = NULL) -} \usage{ -estimateDE(tcc, test.method = c("edger", "deseq", "bayseq"), FDR = 0.1, - samplesize = 10000, processors = NULL) +estimateDE(tcc, test.method, FDR, + dispersion, fit0, fit1, design, + contrast, coef, comparison, samplesize, floor.value = 1, cl) } \description{ This function calculates \eqn{p}-values (or the related statistics) for @@ -17,65 +14,155 @@ identifying differentially expressed genes (DEGs) from a implemented in other R packages. } \arguments{ - \item{tcc}{\link{TCC-class} object.} - \item{test.method}{character string specifying method for identifying - DEGs. Possible values are \code{"edger"}, \code{"deseq"}, and - \code{"bayseq"} for the DEG identification methods implemented in - the edgeR, DESeq, and baySeq, respectively. The default is - \code{"edger"} when analyzing the count data with replicates (i.e., - \code{min(tcc$group) > 1}) and \code{"deseq"} when analyzing the - count data without any replicates (i.e., \code{min(tcc$group) == 1)}).} - \item{FDR}{numeric value (between 0 and 1) specifying the threshold - for determining DEGs.} - \item{samplesize}{numeric value specifying the sample size for - estimating the prior parameters if \code{test.method = "bayseq"}. - See the \code{\link[baySeq]{getPriors.NB}} function for details.} - \item{processors}{numeric value or 'snow' object for using multi - processors if \code{test.method = "bayseq"}. 
See the - \code{\link[baySeq]{getPriors.NB}} function for details.} + \item{tcc}{\link{TCC-class} object.} + \item{test.method}{character string specifying a method for identifying + DEGs: one of \code{"edger"}, \code{"deseq"}, \code{"bayseq"}, + \code{"ebseq"}, \code{"samseq"}, and \code{"wad"}. + See the "Details" field for detail. + The default is \code{"edger"} when analyzing the count data with + replicates (i.e., \code{min(table(tcc$group[, 1])) > 1}), + \code{"deseq"} when analyzing the count data without replicates + (i.e., \code{min(table(tcc$group[, 1])) == 1)}).} + %, and \code{"bayseq"} when the \code{paired = TRUE} is specified.} + \item{FDR}{numeric value (between 0 and 1) specifying the threshold + for determining DEGs.} + %\item{paired}{logical. If \code{TRUE}, the data is treated as paired data.} + \item{dispersion}{numeric vector giving the dispersion of all genes + for analyzing the count data without replicates when the + \code{test.method = "edger"} is specified. + See the \code{\link[edgeR]{exactTest}} function in edgeR for details.} + \item{design}{numeric matrix giving the design matrix for the linear model. + See the \code{\link[edgeR]{glmFit}} function in edgeR for details.} + \item{contrast}{numeric vector specifying a contrast of the linear model + coefficients to be tested equal to zero. + See the \code{\link[edgeR]{glmLRT}} function in edgeR for details.} + \item{coef}{integer or character vector indicating which coefficients + of the linear model are to be tested equal to zero. + See the \code{\link[edgeR]{glmLRT}} function in edgeR for details.} + \item{fit0}{a formula for creating reduced model described in DESeq. + The left hand side must be \code{count}, the right hand side can + involve any column of \code{tcc$group} is used as the model frame. + See the \code{\link[DESeq]{fitNbinomGLMs}} function for details.} + \item{fit1}{a formula for creating full model described in DESeq. 
+ The left hand side must be \code{count}, the right hand side can + involve any column of \code{tcc$group} is used as the model frame. + See the \code{\link[DESeq]{fitNbinomGLMs}} function for details.} + \item{comparison}{numeric or character string identifying the columns in + the \code{tcc$group[, 1]} for analysis. See the \code{group} argument + in \code{\link[baySeq]{topCounts}} for details.} + \item{samplesize}{integer specifying (i) the sample size for estimating the + prior parameters if \code{test.method = "bayseq"} (defaults to 10000), + (ii) the iteration number used in EM algorithm in EBSeq if + \code{test.method = "ebseq"} (defaults to 5), + and (iii) the number of permutation in samr if + \code{test.method = "samseq"} (defaults to 10).} + \item{floor.value}{numeric scalar (>= 0) to replace the values higher + than the \code{floor.value} if \code{test.method = "wad"} + (defaults to 1).} + \item{cl}{\code{snow} object when using multi processors if + \code{test.method = "bayseq"} is specified. + See the \code{\link[baySeq]{getPriors.NB}} function in baySeq + for details.} } \details{ -\code{estimateDE} function is generally used after the -\code{\link{calcNormFactors}} function calculated normalization factors. -\code{estimateDE} constructs a statistical model for differential -expression (DE) analysis with the calculated normalization factors. -This function internally calls individual functions implemented in the edgeR, -DESeq, and baySeq packages according to the specified parameters. - -If the \code{test.method = "edger"} is specified, -a series of functions for differential expression analysis -(\code{\link[edgeR]{estimateCommonDisp}}, - \code{\link[edgeR]{estimateTagwiseDisp}}, -and \code{\link[edgeR]{exactTest}}) -in edgeR are internally called and \eqn{p}-values (and the derivative such - as the \eqn{q}-values and the ranks) are calculated. 
- -The \code{test.method = "deseq"} internally use two functions -(\code{\link[DESeq]{estimateDispersions}} and \code{\link[DESeq]{nbinomTest}}) - in DESeq. - -The \code{test.method = "bayseq"} internally use two -functions (\code{\link[baySeq]{getPriors.NB}} and - \code{\link[baySeq]{getLikelihoods.NB}}) in baySeq. +\code{estimateDE} function is generally used after performing the +\code{\link{calcNormFactors}} function that calculates normalization factors. +\code{estimateDE} constructs a statistical model for differential expression +(DE) analysis with the calculated normalization factors and returns the +\eqn{p}-values (or the derivatives). The individual functions in the other +packages are internally called according to the specified +\code{test.method} parameter. -Different from the edgeR and DESeq in which the calculated \eqn{p}-values are -stored in the \code{stat$p.value} field of the \link{TCC-class} object -\code{tcc}, baySeq outputs posterior likelihoods instead of \eqn{p}-values. -Therefore, the \eqn{(1 - likelihood)} values are stored in the corresponding - field in case of \code{test.method = "bayseq"}. +\itemize{ + \item \code{test.method = "edger"}\cr + There are two approaches (i.e., exact test and GLM) to identify DEGs + in edgeR. The two approaches are implemented in TCC. As a default, + the exact test approach is used for two-group data, + and GLM approach is used for multi-group or multi-factor data. + However, if \code{design} and one of \code{coef} or + \code{contrast} are given, the GLM approach will be used for + two-group data. 
Currently, edgeR package does not support data + without replicates.\cr + If the exact test approach is used, + \code{\link[edgeR]{estimateCommonDisp}}, + \code{\link[edgeR]{estimateTagwiseDisp}}, and + \code{\link[edgeR]{exactTest}} are internally called.\cr + If the GLM approach is used, + \code{\link[edgeR]{estimateGLMCommonDisp}}, + \code{\link[edgeR]{estimateGLMTrendedDisp}}, + \code{\link[edgeR]{estimateGLMTagwiseDisp}}, + \code{\link[edgeR]{glmFit}}, and + \code{\link[edgeR]{glmLRT}} + are internally called. + \item \code{test.method = "deseq"}\cr + DESeq supports two approaches (i.e., an exact test and + GLM approach) for identifying DEGs. As a default, + the exact test is used for two-group data, + and GLM approach is used for multi-group or multi-factor data. + However, if \code{fit0} and \code{fit1} are given, the GLM approach + will be used for two-group data.\cr + If the exact test is used, + \code{\link[DESeq]{estimateDispersions}} and + \code{\link[DESeq]{nbinomTest}} are internally called.\cr + If the GLM approach is used, + \code{\link[DESeq]{estimateDispersions}}, + \code{\link[DESeq]{fitNbinomGLMs}}, and + \code{\link[DESeq]{nbinomGLMTest}} + are internally called. + \item \code{test.method = "bayseq"}\cr + \code{\link[baySeq]{getPriors.NB}} and + \code{\link[baySeq]{getLikelihoods.NB}} in baySeq are internally + called for identifying DEGs. + %when \code{paired = FALSE}. + %\code{\link[baySeq]{getPriors.BB}} and + %\code{\link[baySeq]{getLikelihoods.BB}} in baySeq are called + %when \code{paired = TRUE}. + Since baySeq outputs posterior likelihoods instead of + \eqn{p}-values, the \eqn{(1 - likelihood)} values are + stored in the corresponding field + (i.e., \code{tcc$stat$p.value}). + \item \code{test.method = "ebseq"}\cr + \code{\link[EBSeq]{EBTest}} and \code{\link[EBSeq]{EBMultiTest}} + are internally called for two-group and multi-group data with + replicates, respectively. Currently, both functions do not + support data without replicates. 
+ Since EBSeq only outputs \eqn{q}-values, + all the values in \code{tcc$stat$p.value} are \code{NA}. + \item \code{test.method = "samseq"}\cr + \code{\link[samr]{SAMseq}} with + \code{resp.type = "Two class unpaired"} argument + in samr package is called to identify DEGs for two-group data, + and \code{resp.type = "Multiclass"} for multi-group data. + Since \code{\link[samr]{SAMseq}} outputs test statistics + instead of \eqn{p}-values, + the \code{tcc$stat$p.value} and \code{tcc$stat$q.value} + are \code{NA}. + Alternatively, the test statistics are stored in + \code{tcc$stat$testStat} field. + \item \code{test.method = "wad"}\cr + The \code{\link{WAD}} implemented in TCC is used for identifying + DEGs. Since \code{\link{WAD}} outputs test statistics instead of + \eqn{p}-values, the \code{tcc$stat$p.value} and + \code{tcc$stat$q.value} are \code{NA}. + Alternatively, the test statistics are stored in + \code{tcc$stat$testStat} field. +} } \value{ A \code{\link{TCC-class}} object containing following fields: - \item{stat$p.value}{numeric vector of \eqn{p}-values.} - \item{stat$q.value}{numeric vector of \eqn{q}-values calculated - based on the \eqn{p}-values using the \code{p.adjust} function - with default parameter settings.} - \item{stat$rank}{gene rank in order of the \eqn{p}-values.} - \item{estimatedDEG}{numeric vector consisting of 0, 1, or 2 - depending on whether each gene is classified as non-DEG, DEG - expressed at a higher level in Group 1 or Group 2, respectively. 
- The threshold for classifying DEGs or non-DEGs is preliminarily - given as the \code{FDR} argument.} + \item{stat$p.value}{numeric vector of \eqn{p}-values.} + \item{stat$q.value}{numeric vector of \eqn{q}-values calculated + based on the \eqn{p}-values using the \code{p.adjust} function + with default parameter settings.} + \item{stat$testStat}{numeric vector of test statistics if + \code{test.method = "samseq"} or \code{"wad"} is specified.} + \item{stat$rank}{gene rank in order of the \eqn{p}-values or + test statistics.} + \item{estimatedDEG}{numeric vector consisting of 0 or 1 + depending on whether each gene is classified + as non-DEG or DEG. The threshold for classifying + DEGs or non-DEGs is preliminarily given as the + \code{FDR} argument.} } \examples{ # Analyzing a simulation data for comparing two groups @@ -84,28 +171,27 @@ A \code{\link{TCC-class}} object containing following fields: # with the DEGES/edgeR normalization factors. # For retrieving the summaries of DE results, we recommend to use # the getResult function. -\dontrun{ data(hypoData) -group <- c(3, 3) +group <- c(1, 1, 1, 2, 2, 2) tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) head(tcc$stat$p.value) head(tcc$stat$q.value) head(tcc$estimatedDEG) -} +result <- getResult(tcc) # Analyzing a simulation data for comparing two groups -# (G1 vs. G2) without any replicates +# (G1 vs. G2) without replicates # The DE analysis is performed by an negative binomial (NB) test # in DESeq coupled with the DEGES/DESeq normalization factors. 
-\dontrun{ data(hypoData) -group <- c(1, 1) +group <- c(1, 2) tcc <- new("TCC", hypoData[, c(1, 4)], group) -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc <- estimateDE(tcc, test.method = "deseq", FDR = 0.1) } -} \keyword{methods} diff --git a/TCC/man/exactTestafterTbT.Rd b/TCC/man/exactTestafterTbT.Rd index 328c22a..e97df12 100644 --- a/TCC/man/exactTestafterTbT.Rd +++ b/TCC/man/exactTestafterTbT.Rd @@ -2,12 +2,12 @@ \alias{exactTestafterTbT} \title{exactTest after TMM-baySeq-TMM procedure} \description{ - This function perform exact test with edgeR after TMM-baySeq-TMM procedure - via \code{\link{do_TbT}}. - This function will be obsoleted. Use \link{TCC-class} based methods instead. +This function perform exact test with edgeR after TMM-baySeq-TMM +procedure via \code{\link{do_TbT}}. This function will be obsoleted. +Use \link{TCC-class} based methods instead. } \usage{ - exactTestafterTbT(names, counts, group, sample_num=10000) + exactTestafterTbT(names, counts, group, sample_num = 10000) } \arguments{ \item{names}{A vector containing the name of each element eg gene} diff --git a/TCC/man/filterLowCountGenes.Rd b/TCC/man/filterLowCountGenes.Rd index 77b2a39..715e29c 100644 --- a/TCC/man/filterLowCountGenes.Rd +++ b/TCC/man/filterLowCountGenes.Rd @@ -8,35 +8,31 @@ The threshold is configurable with \code{low.count} parameter. } \usage{filterLowCountGenes(tcc, low.count = 0)} \arguments{ - \item{tcc}{\link{TCC-class} object.} - \item{low.count}{numeric value (>= 0) specifying the threshold - for filtering genes. The higher value indicates the more - numbers of genes to be filtered out.} + \item{tcc}{\link{TCC-class} object.} + \item{low.count}{numeric value (>= 0) specifying the threshold + for filtering genes. 
The higher value indicates the more + numbers of genes to be filtered out.} } \value{ \link{TCC-class} object consisting of genes whose sum of the counts - across samples is equal or higher than the \code{low.count} value. +across samples is equal or higher than the \code{low.count} value. } \examples{ # Filtering genes with zero counts across samples (default) from -# a hypothetical count dataset that originally has 10,000 genes -\dontrun{ +# a hypothetical count dataset that originally has 1,000 genes data(hypoData) -group <- c(3, 3) +group <- c(1, 1, 1, 2, 2, 2) tcc <- new("TCC", hypoData, group) dim(tcc$count) tcc <- filterLowCountGenes(tcc) dim(tcc$count) -} # Filtering genes with 10 counts across samples from hypoData -\dontrun{ data(hypoData) -group <- c(3, 3) +group <- c(1, 1, 1, 2, 2, 2) tcc <- new("TCC", hypoData, group) dim(tcc$count) tcc <- filterLowCountGenes(tcc, low.count = 10) dim(tcc$count) } -} \keyword{methods} diff --git a/TCC/man/generateSimulationData.Rd b/TCC/man/generateSimulationData.Rd deleted file mode 100644 index 476958d..0000000 --- a/TCC/man/generateSimulationData.Rd +++ /dev/null @@ -1,139 +0,0 @@ -\name{generateSimulationData} -\alias{generateSimulationData} -\title{Generate simulation data from negative binomial (NB) distribution} -\description{ -This function generates simulation data with a specified condition. -It can generate not only all of the simulation data analyzed in Kadota -et al., (2012) but also simulation data with more complex design -such as two or more groups and/or without replicates. 
-} -\usage{ -generateSimulationData(Ngene = 10000, PDEG = 0.20, DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", DEG.foldchange = NULL, - group = c(3, 3)) -} -\arguments{ - \item{Ngene}{numeric scalar specifying the number of genes.} - \item{PDEG}{numeric scalar specifying the proportion of - differentially expressed genes (DEGs).} - \item{DEG.assign}{numeric vector specifying the proportions - of DEGs up-regulated in individual groups to be compared. - The number of element should be the same as the number of - groups compared.} - \item{DEG.model}{character string specifying the distribution - for sampling the fold-change (FC) values. Possible values - are \code{"uniform"} (default) and \code{"gamma"}.} - \item{DEG.foldchange}{list. If \code{DEG.model = "uniform"}, - \eqn{i}-th component contains numeric scalar specifying the - degree of FC for Group \eqn{i}. The default is - \code{DEG.foldchange = list(4, 4)}, indicating that the levels - of DE are four-fold in both groups. If \code{DEG.model = "gamma"}, - \eqn{i}-th component contains numeric vector consisting of three - elements, i.e., c(\eqn{x}, \eqn{y}, \eqn{z}), for determining - the FC value for DEGs up-regulated in Group \eqn{i}. Specifically, - the FC value for each DEG is randomly sampled from - "\eqn{x} + a gamma distribution with shape (= \eqn{y}) and - scale (= \eqn{z}) parameters", indicating that the minimum and - average FC values for DEGs correspond to \eqn{x} and - (\eqn{x} + \eqn{y} * \eqn{z}).} - \item{group}{numeric vector indicating the numbers of (biological) - replicates for individual groups compared.} -} -\details{ -The empirical distribution of read counts -used in this function is calculated from a RNA-seq dataset -obtained from \emph{Arabidopsis} data -(three biological replicates for both the treated and non-treated samples), -the arab object, in NBPSeq package (Di et al., 2011). 
-The overall design about the simulation conditions introduced -can be viewed as a pseudo-color image by the \code{\link{plotFCPseudocolor}} function. -} -\value{ -A \link{TCC-class} object containing following fields: - \item{count}{numeric matrix of simulated count data.} - \item{group}{numeric vector indicating the numbers - of (biological) replicates for individual groups compared.} - \item{replicates}{numeric vector indicating the experimental - group for each sample (or library).} - \item{norm.factors}{numeric vector as a placeholder for - normalization factors.} - \item{stat}{list for storing results after the execution of - the \code{\link{calcNormFactors}} (and \code{\link{estimateDE}}) - function.} - \item{estimatedDEG}{numeric vector as a placeholder for indicating - which genes are up-regulated in particular group compared to the - others. The values in this field will be populated after the - execution of the \code{\link{estimateDE}} function.} - \item{simulation}{list containing four fields: \code{trueDEG}, - \code{DEG.foldchange}, \code{PDEG}, and \code{group}. The - \code{trueDEG} field (numeric vector) stores information about - DEGs: 0 for non-DEG, 1 for DEG up-regulated in Group 1, 2 for - DEG up-regulated in Group 2, and so on. The information for - the remaining three fields is the same as those indicated in - the corresponding arguments.} -} -\examples{ -# Generating a simulation data for comparing two groups -# (G1 vs. G2) with biological replicates. -# the first 2000 genes are DEGs, where 1800 are up in G1. -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, - DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(4, 4), group = c(3, 3)) -dim(tcc$count) -head(tcc$count) -str(tcc$simulation) -head(tcc$simulation$trueDEG) -} - - -# Generating a simulation data for comparing two groups -# (G1 vs. G2) without any replicates. 
-# the levels of DE are 3-fold in G1 and 7-fold in G2 -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, - DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(3, 7), group = c(1, 1)) -dim(tcc$count) -head(tcc$count) -str(tcc$simulation) -head(tcc$simulation$trueDEG) -} - - -# Generating a simulation data for comparing two groups -# (G1 vs. G2) with replicates. -# the DEG.model are changed, minimum FC = 2-fold, -# and the average FC = 4.5-fold. -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, - DEG.assign = c(0.9, 0.1), - DEG.model = "gamma", - DEG.foldchange = list(c(2, 5, 0.5), c(2, 5, 0.5)), - group = c(2, 2)) -dim(tcc$count) -str(tcc$simulation) -} - - - -# Generating a simulation data for comparing three groups -# (G1 vs. G2 vs. G3) with biological replicates. -# the first 3000 genes are DEGs, where the 70%, 20%, and 10% are -# up-regulated in G1, G2, G3, respectively. The levels of DE are -# 3-, 10, and 6-fold in individual groups. -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.3, - DEG.assign = c(0.7, 0.2, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(3, 10, 6), - group = c(2, 4, 3)) -dim(tcc$count) -head(tcc$count) -str(tcc$simulation) -head(tcc$simulation$trueDEG) -} -} -\keyword{methods} diff --git a/TCC/man/getNormalizedData.Rd b/TCC/man/getNormalizedData.Rd index 76af8bd..c4c8989 100644 --- a/TCC/man/getNormalizedData.Rd +++ b/TCC/man/getNormalizedData.Rd @@ -7,7 +7,7 @@ data and calculated normalization factors. } \usage{getNormalizedData(tcc)} \arguments{ - \item{tcc}{\link{TCC-class} object.} + \item{tcc}{\link{TCC-class} object.} } \details{ This function is generally used after the \code{\link{calcNormFactors}} @@ -20,25 +20,17 @@ stored in the \code{norm.factors} field in the \link{TCC-class} object. A numeric matrix containing normalized count data. 
} \examples{ -\dontrun{ -# Note that the hypoData has non-DEGs at 2001-10000th rows -nonDEG <- 2001:10000 +# Note that the hypoData has non-DEGs at 201-1000th rows +nonDEG <- 201:1000 data(hypoData) summary(hypoData[nonDEG, ]) - -# Obtaining normalized count data after performing the -# TbT normalization method (Kadota et al., 2012) from hypoData, -# i.e., TbT-normalized data -group <- c(3, 3) -tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq") -normalized.count <- getNormalizedData(tcc) -summary(normalized.count[nonDEG, ]) +group <- c(1, 1, 1, 2, 2, 2) # Obtaining normalized count data after performing the # DEGES/edgeR normalization method, i.e., DEGES/edgeR-normalized data tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) normalized.count <- getNormalizedData(tcc) summary(normalized.count[nonDEG, ]) @@ -46,9 +38,8 @@ summary(normalized.count[nonDEG, ]) # TMM normalization method (Robinson and Oshlack, 2010), # i.e., TMM-normalized data tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc, norm.method = "tmm", iteration = FALSE) +tcc <- calcNormFactors(tcc, norm.method = "tmm", iteration = 0) normalized.count <- getNormalizedData(tcc) summary(normalized.count[nonDEG, ]) } -} \keyword{methods} diff --git a/TCC/man/getResult.Rd b/TCC/man/getResult.Rd index 9471ab3..7d48c45 100644 --- a/TCC/man/getResult.Rd +++ b/TCC/man/getResult.Rd @@ -2,64 +2,65 @@ \alias{getResult} \title{Obtain the summaries of results after the differential expression analysis} -\description{This function is generally used after the - \code{\link{estimateDE}} function. It retrieves the summaries of - differential expression (DE) results from \link{TCC-class} object. 
- The retrieved information includes \eqn{p}-values, \eqn{q}-values, - coordinates of M-A plot (i.e., M and A values), and so on.} +\description{ +This function is generally used after the +\code{\link{estimateDE}} function. It retrieves the summaries of +differential expression (DE) results from \link{TCC-class} object. +The retrieved information includes \eqn{p}-values, \eqn{q}-values, +coordinates of M-A plot (i.e., M and A values), and so on. +} \usage{getResult(tcc, sort = FALSE, floor = 0)} \arguments{ - \item{tcc}{\link{TCC-class} object} - \item{sort}{logical. If \code{TRUE}, the retrieved results are - sorted in order of the \code{stat$rank} field in the - \link{TCC-class} object. If \code{FALSE}, the results - are retrieved by the original order.} - \item{floor}{numeric scalar specifying a threshold for adjusting - low count data.} + \item{tcc}{\link{TCC-class} object} + \item{sort}{logical. If \code{TRUE}, the retrieved results are + sorted in order of the \code{stat$rank} field in the + \link{TCC-class} object. If \code{FALSE}, the results + are retrieved by the original order.} + \item{floor}{numeric scalar specifying a threshold for adjusting + low count data.} } \value{ A data frame object containing following fields: - \item{id}{(character) vector of gene ID.} - \item{a.value}{numeric vector of average expression level on log2 - scale (i.e., A-value) for each gene across the compared two groups. - It corresponds to the \eqn{x} coordinate in the M-A plot.} - \item{m.value}{numeric vector of fold-change on \eqn{\log_2} scale - (i.e., M-value) for each gene between the two groups compared. 
- It corresponds to the \eqn{y} coordinate in the M-A plot.} - \item{p.value}{numeric vector of \eqn{p}-values.} - \item{q.value}{numeric vector of \eqn{q}-values calculated - based on the \eqn{p}-values using the \code{p.adjust} - function with default parameter settings.} - \item{rank}{numeric vector of gene rank in order of - the \eqn{p}-values.} - \item{estimatedDEG}{numeric vector consisting of 0, 1, or 2 - depending on whether each gene is classified as non-DEG, DEG - up-regulated in Group 1 or Group 2, respectively. The threshold - for classifying DEGs or non-DEGs is preliminarily given when - performing \code{\link{estimateDE}}.} + \item{gene_id}{character vector indicating the id of the count unit, + usually gene.} + \item{a.value}{numeric vector of average expression level on log2 + scale (i.e., A-value) for each gene across the compared + two groups. It corresponds to the \eqn{x} coordinate + in the M-A plot.} + \item{m.value}{numeric vector of fold-change on \eqn{\log_2} scale + (i.e., M-value) for each gene between the two groups + compared. It corresponds to the \eqn{y} coordinate in + the M-A plot.} + \item{p.value}{numeric vector of \eqn{p}-values.} + \item{q.value}{numeric vector of \eqn{q}-values calculated + based on the \eqn{p}-values using the \code{p.adjust} + function with default parameter settings.} + \item{rank}{numeric vector of gene rank in order of the \eqn{p}-values.} + \item{estimatedDEG}{numeric vector consisting of 0 or 1 + depending on whether each gene is classified + as non-DEG or DEG. 
The threshold for classifying + DEGs or non-DEGs is preliminarily + given when performing \code{\link{estimateDE}}.} } \examples{ # Obtaining DE results by an exact test in edgeR coupled with # the DEGES/edgeR normalization factors -# run on 100/hypoData genes for example to save time data(hypoData) -group <- c(3, 3) -tcc <- new("TCC", hypoData[1:100*100,], group) -tcc <- calcNormFactors(tcc) +group <- c(1, 1, 1, 2, 2, 2) +tcc <- new("TCC", hypoData, group) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) result <- getResult(tcc, sort = TRUE) head(result) # Obtaining DE results by an negative binomial test in DESeq # coupled with the iterative DEGES/DESeq normalization method -# with n = 3 -\dontrun{ tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc, norm.method = "deseq", - test.method = "deseq", iteration = 3) +tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc <- estimateDE(tcc, test.method = "deseq", FDR = 0.1) result <- getResult(tcc, sort = TRUE) head(result) } -} \keyword{methods} diff --git a/TCC/man/hypoData.Rd b/TCC/man/hypoData.Rd index a5d29f2..de31807 100644 --- a/TCC/man/hypoData.Rd +++ b/TCC/man/hypoData.Rd @@ -4,35 +4,33 @@ \title{A simulation dataset for comparing two-group tag count data, focusing on RNA-seq} \description{ -A simulation dataset, consisting of 10,000 rows (or genes) and +A simulation dataset, consisting of 1,000 rows (or genes) and 6 columns (or independent biological samples). 
} \usage{data(hypoData)} -\format{hypoData is a matrix of dimension 10,000 times 6.} +\format{\code{hypoData} is a matrix of dimension 1,000 times 6.} \details{ This package typically start the differential expression analysis with - a count table matrix such as hypoData where each row indicates the - gene (or transcript), each column indicates the sample (or library), - and each cell indicates the number of counts to the gene in the sample. - The first three columns are produced from biological replicates of, - for example, Group 1 and the remaining columns are from Group 2; - i.e., {G1_rep1, G1_rep2, G1_rep3} vs. {G2_rep1, G2_rep2, G2_rep3}. -This data is generated by the \code{\link{generateSimulationData}} - function with default parameter settings. The first 2,000 genes are - differentially expressed in the two groups. Of these, the first 1,800 - genes are expressed at a higher level in Group 1 (G1) and the remaining - 200 genes are expressed at a higher level in G2. Accordingly, the - 2,001-10,000th genes are not differentially expressed (non-DEGs). - The levels of differential expression (DE) are four-fold in both groups. +a count table matrix such as \code{hypoData} where each row indicates the +gene (or transcript), each column indicates the sample (or library), +and each cell indicates the number of counts to the gene in the sample. +The first three columns are produced from biological replicates of, +for example, Group 1 and the remaining columns are from Group 2; +i.e., {G1_rep1, G1_rep2, G1_rep3} vs. {G2_rep1, G2_rep2, G2_rep3}. +This data is generated by the \code{\link{simulateReadCounts}} +function with default parameter settings. The first 200 genes are +differentially expressed in the two groups. Of these, the first 180 +genes are expressed at a higher level in Group 1 (G1) and the remaining +20 genes are expressed at a higher level in G2. Accordingly, the +201-1000th genes are not differentially expressed (non-DEGs). 
+The levels of differential expression (DE) are four-fold in both groups. } \examples{ -\dontrun{ # The 'hypoData' is generated by following commands. -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.2, - DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(4, 4), group = c(3, 3)) +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, + DEG.assign = c(0.9, 0.1), + DEG.foldchange = c(4, 4), + replicates = c(3, 3)) hypoData <- tcc$count } -} \keyword{datasets} diff --git a/TCC/man/hypoData_mg.Rd b/TCC/man/hypoData_mg.Rd new file mode 100644 index 0000000..38ddedb --- /dev/null +++ b/TCC/man/hypoData_mg.Rd @@ -0,0 +1,39 @@ +\name{hypoData_mg} +\docType{data} +\alias{hypoData_mg} +\title{A simulation dataset for comparing three-group tag count + data, focusing on RNA-seq} +\description{ +A simulation dataset, consisting of 1,000 rows (or genes) and + 9 columns (or independent biological samples). +} +\usage{data(hypoData_mg)} +\format{\code{hypoData_mg} is a matrix of dimension 1,000 times 9.} +\details{ +The \code{hypoData_mg}, a matrix object, is a simulation dataset which +consists of 1,000 rows (genes) and 9 columns (samples). +Each cell of matrix indicates the number of counts to the gene in the sample. +The first three columns are produced from biological replicates of, +for example, Group 1, the next three columns are from Group2 and the +remaining columns are from Group 3; +i.e., {G1_rep1, G1_rep2, G1_rep3} vs. {G2_rep1, G2_rep2, G2_rep3} vs. +{G3_rep1, G3_rep2, G3_rep3}. +This data is generated by the \code{\link{simulateReadCounts}} +function with the following parameters (see Examples). +The first 200 genes are differentially expressed among +the three groups. Of these, the first 140 +genes are expressed at a higher level only in Group 1 (G1), the next +40 genes are expressed at a higher level only in G2 and the last +20 genes are expressed at a higher level only in G3. 
Accordingly, the +201-1000th genes are not differentially expressed (non-DEGs). +The levels of differential expression (DE) are four-fold in each group. +} +\examples{ +# The 'hypoData_mg' is generated by following commands. +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, +                          DEG.assign = c(0.7, 0.2, 0.1), +                          DEG.foldchange = c(4, 4, 4), +                          replicates = c(3, 3, 3)) +hypoData_mg <- tcc$count +} +\keyword{datasets} diff --git a/TCC/man/hypoData_ts.Rd b/TCC/man/hypoData_ts.Rd new file mode 100644 index 0000000..8dde342 --- /dev/null +++ b/TCC/man/hypoData_ts.Rd @@ -0,0 +1,32 @@ +\name{hypoData_ts} +\docType{data} +\alias{hypoData_ts} +\title{A sample microarray data for detecting tissue-specific patterns.} +\description{ +A hypothetical microarray data consisting of eight rows (genes) and ten +columns (tissues). The expression patterns are quite similar to those +in figure 1 in Kadota et al., 2006. +} +\usage{data(hypoData_ts)} +\format{\code{hypoData_ts} is a matrix of dimension eight times ten.} +\details{ +The \code{hypoData_ts} is designed for explaining the performance of +\code{\link{ROKU}} that identifies tissue-specific expression patterns. +The \code{hypoData_ts} contains a total of eight genes having various +expression patterns across ten tissues: (1) 'up-type' genes selectively +over-expressed in a small number of tissues but unexpressed ("gene1"), +slightly expressed ("gene3"), and moderately expressed ("gene4"), +(2) 'down-type' genes selectively under-expressed ("gene5"), +and (3) 'mixed-type' genes selectively over- and under-expressed in +some tissues ("gene6"). The other genes are not tissue-specific genes +(i.e., "gene2", "gene7", and "gene8"). +} +\references{ +Kadota K, Ye J, Nakai Y, Terada T, Shimizu K: +ROKU: a novel method for identification of tissue-specific genes. +BMC Bioinformatics 2006, 7: 294. 
+} +\examples{ +data(hypoData_ts) +} +\keyword{datasets} diff --git a/TCC/man/nakai.Rd b/TCC/man/nakai.Rd new file mode 100644 index 0000000..ea80e32 --- /dev/null +++ b/TCC/man/nakai.Rd @@ -0,0 +1,21 @@ +\name{nakai} +\docType{data} +\alias{nakai} +\title{DNA microarray data set} +\description{ +This is a log2-transformed two-group microarray (Affymetrix GeneChip) data +consisting of 31,099 probesets. A total of eight samples were taken from the +rat liver: the first four samples are fed and the last four are 24-hour +fasted. The original data can be obtained from NCBI Gene Expression Omnibus +(GEO) with GSE7623. This is a subset. +} +\usage{data(nakai)} +\format{\code{nakai} is a matrix of 31,099 rows (probesets) and 8 columns +(samples or tissues).} +\references{ +Nakai Y, Hashida H., Kadota K, Minami M, Shimizu K, Matsumoto I, Kato H, Abe K., Up-regulation of genes related to the ubiquitin-proteasome system in the brown adipose tissue of 24-h-fasted rats. Bioscience Biotechnology and Biochemistry 2008, 72(1):139-148. +} +\examples{ +data(nakai) +} +\keyword{datasets} diff --git a/TCC/man/plot.TCC.Rd b/TCC/man/plot.TCC.Rd index 61542c8..7a68fe3 100644 --- a/TCC/man/plot.TCC.Rd +++ b/TCC/man/plot.TCC.Rd @@ -2,50 +2,46 @@ \docType{methods} \alias{plot.TCC} \alias{plot} -\title{Plot an log fold-change versus log average expression +\title{Plot a log fold-change versus log average expression (so-called M-A plot)} \description{This function generates a scatter plot of log fold-change (i.e., \eqn{M = \log_2 G2 - \log_2 G1}{M = log2(G2) - log2(G1)} on - the \eqn{y}-axis) between Groups 1 vs. 2) versus log average + the \eqn{y}-axis between Groups 1 vs. 
2) versus log average expression (i.e., \eqn{A = (\log_2 G1 + \log_2 G2) / 2}{A = (log2(G1) + log2(G2)) / 2} on the \eqn{x}-axis) using normalized count data.} \usage{ \method{plot}{TCC}(x, FDR = NULL, median.lines = FALSE, floor = 0, - main = NULL, - xlab = expression(A == (log[2] * G2 + log[2] * G1) / 2), - ylab = expression(M == log[2] * G2 - log[2] * G1), - xlim = NULL, ylim = NULL, cex = 0.3, pch = 19, col = NULL, ...) + groups = NULL, col.tag = NULL, normalize = TRUE, ...) } \arguments{ - \item{x}{\link{TCC-class} object.} - \item{FDR}{numeric scalar specifying a false discovery rate (FDR) - threshold for determining differentially expressed genes (DEGs)} - \item{median.lines}{logical. If \code{TRUE}, horizontal lines - specifying the median M values for non-DEGs (black), DEGs - up-regulated in Group 1 (G1; blue), and G2 (red) are drawn.} - \item{floor}{numeric scalar specifying a threshold for adjusting low count data.} - \item{main}{character string indicating the plotting title.} - \item{xlab}{character string indicating the \eqn{x}-label title.} - \item{ylab}{character string indicating the \eqn{y}-label title.} - \item{xlim}{numeric vector (consisting of two elements) specifying - the range of the \eqn{x} coordinates.} - \item{ylim}{numeric vector (consisting of two elements) specifying - the range of the \eqn{y} coordinates.} - \item{cex}{numeric scalar specifying the multiplying factor of the - size of the plotting points relative to the default (= 0.3).} - \item{pch}{numeric scalar specifying a symbol or a single character - to be used as the default in plotting points.} - \item{col}{vector specifying plotting color. The default is col - = c(1, 4, 2, 5, 6, 7, ...) 
 of color index.} - \item{\dots}{further graphical arguments, see \code{\link{plot.default}}.} + \item{x}{\link{TCC-class} object.} + \item{FDR}{numeric scalar specifying a false discovery rate (FDR) + threshold for determining differentially expressed genes + (DEGs)} + \item{median.lines}{logical. If \code{TRUE}, horizontal lines + specifying the median M values for non-DEGs (black), + DEGs up-regulated in Group 1 (G1; blue), + and G2 (red) are drawn.} + \item{floor}{numeric scalar specifying a threshold for adjusting + low count data.} + \item{groups}{numeric vector consisting of two elements for specifying which + two groups should be drawn when data contains more than + three groups.} + \item{col.tag}{numeric vector specifying the index of \code{col} + for coloring the points of the genes.} + \item{normalize}{logical. If \code{FALSE}, the coordinates of M-A plot + are calculated from the raw data.} + \item{\dots}{further graphical arguments, see \code{\link{plot.default}}.} } \details{ This function generates roughly three different M-A plots depending on the conditions for \link{TCC-class} objects. When the function is performed just after the \code{new} method, -all the genes (points) are treated as non-DEGs (the default is black; see Example 1). -The \code{\link{generateSimulationData}} function followed -by the \code{\link{plot}} function can classify the genes as \emph{true} non-DEGs (black), +all the genes (points) are treated as non-DEGs +(the default is black; see Example 1). +The \code{\link{simulateReadCounts}} function followed +by the \code{\link{plot}} function can classify the genes +as \emph{true} non-DEGs (black), \emph{true} DEGs up-regulated in Group 1 (G1; blue), and \emph{true} DEGs up-regulated in G2 (red) (see Example 2). 
The \code{\link{estimateDE}} function followed @@ -59,8 +55,8 @@ Similar to the \code{\link[edgeR]{plotSmear}} function in edgeR package, \code{\link{plot}} function plots those points at the left side of the minimum A (i.e., log average expression) value. The \eqn{x} coordinate of those points is the minimum A value minus 1. -The \eqn{y} coordinate is calculated as if the 0 count was the minimum observed -non-0 count in each group. +The \eqn{y} coordinate is calculated as if the 0 count was +the minimum observed non-0 count in each group. } \value{ A scatter plot to the current graphic device. @@ -75,7 +71,7 @@ A scatter plot to the current graphic device. # because the information about DEG or non-DEG for each gene is # not indicated. data(hypoData) -group <- c(3, 3) +group <- c(1, 1, 1, 2, 2, 2) tcc <- new("TCC", hypoData, group) plot(tcc) @@ -84,41 +80,30 @@ colSums(normalized.count) colSums(hypoData) mean(colSums(hypoData)) -# 2-1. -# M-A plotting of simulation data with default parameter settings. -# non-DEGs are in black, DEGs up-regulated in G1 and G2 are in -# blue and red. -\dontrun{ -tcc <- generateSimulationData() -plot(tcc, median.lines = TRUE) -} - -# 2-2. +# 2. # M-A plotting of DEGES/edgeR-normalized simulation data. # It can be seen that the median M value for non-DEGs approaches -# zero. -\dontrun{ -tcc <- generateSimulationData() -tcc <- calcNormFactors(tcc) +# zero. Note that non-DEGs are in black, DEGs up-regulated in +# G1 and G2 are in blue and red. +tcc <- simulateReadCounts() +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) plot(tcc, median.lines = TRUE) -} # 3. # M-A plotting of DEGES/edgeR-normalized hypoData after performing # DE analysis. 
-\dontrun{ data(hypoData) -group <- c(3, 3) +group <- c(1, 1, 1, 2, 2, 2) tcc <- new("TCC", hypoData, group) -tcc <- calcNormFactors(tcc) +tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", + iteration = 1, FDR = 0.1, floorPDEG = 0.05) tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) plot(tcc) -} - # Changing the FDR threshold -\dontrun{plot(tcc, FDR = 0.7)} +plot(tcc, FDR = 0.7) } \keyword{methods} diff --git a/TCC/man/plotFCPseudocolor.Rd b/TCC/man/plotFCPseudocolor.Rd index c9ce869..93d4671 100644 --- a/TCC/man/plotFCPseudocolor.Rd +++ b/TCC/man/plotFCPseudocolor.Rd @@ -10,14 +10,14 @@ and the breakdowns for individual groups from a \link{TCC-class} object. plotFCPseudocolor(tcc, main, xlab, ylab) } \arguments{ - \item{tcc}{\link{TCC-class} object.} - \item{main}{character string indicating the plotting title.} - \item{xlab}{character string indicating the \emph{x}-label title.} - \item{ylab}{character string indicating the \emph{y}-label title.} + \item{tcc}{\link{TCC-class} object.} + \item{main}{character string indicating the plotting title.} + \item{xlab}{character string indicating the \emph{x}-label title.} + \item{ylab}{character string indicating the \emph{y}-label title.} } \details{ This function should be used after the -\code{\link{generateSimulationData}} function that generates +\code{\link{simulateReadCounts}} function that generates simulation data with arbitrary defined conditions. The largest log fold-change (FC) values are in magenta and no-changes are in white. @@ -25,26 +25,22 @@ in magenta and no-changes are in white. \examples{ # Generating a simulation data for comparing two groups # (G1 vs. G2) with biological replicates. -# the first 20 genes are DEGs, where 18 are up in G1. -tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, - DEG.assign = c(0.9, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(4, 4), - group = c(3, 3)) +# the first 200 genes are DEGs, where 180 are up in G1. 
+tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.2, + DEG.assign = c(0.9, 0.1), + DEG.foldchange = c(4, 4), + replicates = c(3, 3)) plotFCPseudocolor(tcc) # Generating a simulation data for comparing three groups # (G1 vs. G2 vs. G3) with biological replicates. -# the first 3000 genes are DEGs, where the 70%, 20%, and 10% are +# the first 300 genes are DEGs, where the 70%, 20%, and 10% are # up-regulated in G1, G2, G3, respectively. The levels of DE are # 3-, 10, and 6-fold in individual groups. -\dontrun{ -tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.3, - DEG.assign = c(0.7, 0.2, 0.1), - DEG.model = "uniform", - DEG.foldchange = list(3, 10, 6), - group = c(2, 4, 3)) +tcc <- simulateReadCounts(Ngene = 1000, PDEG = 0.3, + DEG.assign = c(0.7, 0.2, 0.1), + DEG.foldchange = c(3, 10, 6), + replicates = c(3, 3, 3)) plotFCPseudocolor(tcc) } -} \keyword{methods} diff --git a/TCC/man/simulateReadCounts.Rd b/TCC/man/simulateReadCounts.Rd new file mode 100644 index 0000000..f029b12 --- /dev/null +++ b/TCC/man/simulateReadCounts.Rd @@ -0,0 +1,134 @@ +\name{simulateReadCounts} +\alias{simulateReadCounts} +\title{Generate simulation data from negative binomial (NB) distribution} +\description{ +This function generates simulation data with arbitrary defined +experimental condition. +} +\usage{ +simulateReadCounts(Ngene = 10000, PDEG = 0.20, DEG.assign = NULL, + DEG.foldchange = NULL, replicates = NULL, group = NULL) +} +\arguments{ + \item{Ngene}{numeric scalar specifying the number of genes.} + \item{PDEG}{numeric scalar specifying the proportion of + differentially expressed genes (DEGs).} + \item{DEG.assign}{numeric vector specifying the proportion of DEGs up- or + down-regulated in individual groups to be compared. The number of + elements should be the same as that of \code{replicates} if + \code{replicates} is specified. The indication of \code{replicates} + means a single-factor experimental design. 
The number of elements in + \code{DEG.assign} should be the same as the number of columns in + \code{DEG.foldchange}. Both \code{DEG.foldchange} as data frame + and \code{group} should simultaneously be specified and those + indication means a multi-factor experimental design.} + \item{DEG.foldchange}{numeric vector for single-factor experimental design + and data frame for multi-factor experimental design. Both + \code{DEG.foldchange} as numeric vector and \code{replicates} should + simultaneously be specified for single-factor experimental design. + The \eqn{i}-th element in \code{DEG.foldchange} vector indicates the + degree of fold-change for Group \eqn{i}. The default is + \code{DEG.foldchange = c(4, 4)}, indicating that the levels of DE + are four-fold in both groups.\cr + Both \code{DEG.foldchange} as data frame and \code{group} should + simultaneously be specified for multi-factor experimental design. + Numeric values in the \code{DEG.foldchange} object indicate the + degree of fold-change for individual conditions or factors.} + \item{replicates}{numeric vector indicating the numbers of (biological) + replicates for individual groups compared. Ignored if \code{group} + is specified.} + \item{group}{data frame specifying the multi-factor experimental design.} +} +\details{ +The empirical distribution of read counts +used in this function is calculated from a RNA-seq dataset +obtained from \emph{Arabidopsis} data +(three biological replicates for both the treated and non-treated samples), +the \code{arab} object, in NBPSeq package (Di et al., 2011). +The overall design about the simulation conditions introduced +can be viewed as a pseudo-color image by the +\code{\link{plotFCPseudocolor}} function. 
+} +\value{ +A \link{TCC-class} object containing following fields: + \item{count}{numeric matrix of simulated count data.} + \item{group}{data frame indicating which group (or condition or factor) + each sample belongs to.} + \item{norm.factors}{numeric vector as a placeholder for + normalization factors.} + \item{stat}{list for storing results after the execution of + the \code{\link{calcNormFactors}} + (and \code{\link{estimateDE}}) function.} + \item{estimatedDEG}{numeric vector as a placeholder for indicating + which genes are up-regulated in particular group + compared to the others. The values in this field + will be populated after the execution of the + \code{\link{estimateDE}} function.} + \item{simulation}{list containing four fields: \code{trueDEG}, + \code{DEG.foldchange}, \code{PDEG}, and \code{params}. + The \code{trueDEG} field (numeric vector) stores + information about DEGs: 0 for non-DEG, 1 for + DEG up-regulated in Group 1, 2 for DEG up-regulated + in Group 2, and so on. The information for + the remaining three fields is the same as those + indicated in the corresponding arguments.} +} +\examples{ +# Generating a simulation data for comparing two groups +# (G1 vs. G2) without replicates (single-factor experimental design). +# the levels of DE are 3-fold in G1 and 7-fold in G2 +tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.2, + DEG.assign = c(0.9, 0.1), + DEG.foldchange = c(3, 7), + replicates = c(1, 1)) +dim(tcc$count) +head(tcc$count) +str(tcc$simulation) +head(tcc$simulation$trueDEG) + + +# Generating a simulation data for comparing three groups +# (G1 vs. G2 vs. G3) with biological replicates +# (single-factor experimental design). +# the first 3000 genes are DEGs, where the 70%, 20%, and 10% are +# up-regulated in G1, G2, G3, respectively. The levels of DE are +# 3-, 10-, and 6-fold in individual groups. 
+tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.3, + DEG.assign = c(0.7, 0.2, 0.1), + DEG.foldchange = c(3, 10, 6), + replicates = c(2, 4, 3)) +dim(tcc$count) +head(tcc$count) +str(tcc$simulation) +head(tcc$simulation$trueDEG) + + +# Generating a simulation data consisting of 10,000 rows (i.e., Ngene = 10000) +# and 8 columns (samples) for two-factor experimental design +# (condition and time). The first 3,000 genes are DEGs (i.e., PDEG = 0.3). +# Of the 3,000 DEGs, 40% are differentially expressed in condition (or GROUP) "A" +# compared to the other condition (i.e., condition "B"), 40% are differentially +# expressed in condition (or GROUP) "B" compared to the other condition +# (i.e., condition "A"), and the remaining 20% are differentially expressed at +# "10h" in association with the second factor: DEG.assign = c(0.4, 0.4, 0.2). +# The levels of fold-change are (i) 2-fold up-regulation in condition "A" for +# the first 40% of DEGs, (ii) 4-fold up-regulation in condition "B" for the +# second 40%, and (iii) 0.4- and 0.6-fold up-regulation at "10h" in "A" and +# 5-fold up-regulation at "10h" in "B". 
+ +group <- data.frame( + GROUP = c("A", "A", "A", "A", "B", "B", "B", "B"), + TIME = c("2h", "2h", "10h", "10h", "2h", "2h", "10h", "10h") +) +DEG.foldchange <- data.frame( + FACTOR1.1 = c(2, 2, 2, 2, 1, 1, 1, 1), + FACTOR1.2 = c(1, 1, 1, 1, 4, 4, 4, 4), + FACTOR2 = c(1, 1, 0.4, 0.6, 1, 1, 5, 5) +) +tcc <- simulateReadCounts(Ngene = 10000, PDEG = 0.3, + DEG.assign = c(0.4, 0.4, 0.2), + DEG.foldchange = DEG.foldchange, + group = group) +tcc +} +\keyword{methods} diff --git a/TCC/tests/DESDES.R b/TCC/tests/DESDES.R deleted file mode 100644 index 5f65e49..0000000 --- a/TCC/tests/DESDES.R +++ /dev/null @@ -1,12 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")!=""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", - FDR = 0.1, floorPDEG = 0.05) - cat("tcc$norm.factors: ") - cat(tcc$norm.factors) - cat("\n") - cat("tcc$stat$execution.time: ") - cat(tcc$stat$execution.time) - cat("\n") -} diff --git a/TCC/tests/DESDES1.R b/TCC/tests/DESDES1.R deleted file mode 100644 index 4ec49d8..0000000 --- a/TCC/tests/DESDES1.R +++ /dev/null @@ -1,12 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")!=""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1),group=c(1,1)) - tcc <- calcNormFactors(tcc, norm.method = "deseq", test.method = "deseq", - FDR = 0.1, floorPDEG = 0.05) - cat("tcc$norm.factors: ") - cat(tcc$norm.factors) - cat("\n") - cat("tcc$stat$execution.time: ") - cat(tcc$stat$execution.time) - cat("\n") -} diff --git a/TCC/tests/DESTbT1.R b/TCC/tests/DESTbT1.R deleted file mode 100644 index bdf60ae..0000000 --- a/TCC/tests/DESTbT1.R +++ /dev/null @@ -1,11 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST") != ""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "bayseq") - cat("tcc$norm.factors: ") - 
cat(tcc$norm.factors) - cat("\n") - cat("tcc$stat$execution.time: ") - cat(tcc$stat$execution.time) - cat("\n") -} diff --git a/TCC/tests/DESedgeR.R b/TCC/tests/DESedgeR.R deleted file mode 100644 index a2735d2..0000000 --- a/TCC/tests/DESedgeR.R +++ /dev/null @@ -1,11 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST") != ""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger") - cat("tcc$norm.factors: ") - cat(tcc$norm.factors) - cat("\n") - cat("tcc$stat$execution.time: ") - cat(tcc$stat$execution.time) - cat("\n") -} diff --git a/TCC/tests/DESedgeR3.R b/TCC/tests/DESedgeR3.R deleted file mode 100644 index 5844e56..0000000 --- a/TCC/tests/DESedgeR3.R +++ /dev/null @@ -1,11 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST") != ""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- calcNormFactors(tcc, DES = 3) - cat("tcc$norm.factors: ") - cat(tcc$norm.factors) - cat("\n") - cat("tcc$stat$execution.time: ") - cat(tcc$stat$execution.time) - cat("\n") -} diff --git a/TCC/tests/DEbaySeq.R b/TCC/tests/DEbaySeq.R deleted file mode 100644 index 956ce98..0000000 --- a/TCC/tests/DEbaySeq.R +++ /dev/null @@ -1,12 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")!=""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- calcNormFactors(tcc) - tcc <- estimateDE(tcc, test.method = "bayseq", FDR = 0.1, samplesize = 1000) - result <- getResult(tcc, sort = TRUE) - print(head(result)) - table(tcc$estimatedDEG) - png("plot4b.png", 600, 500) - plot(tcc) - dev.off() -} diff --git a/TCC/tests/DEedgeR.R b/TCC/tests/DEedgeR.R deleted file mode 100644 index 851d2b5..0000000 --- a/TCC/tests/DEedgeR.R +++ /dev/null @@ -1,12 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")!=""){ - library(TCC) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) - tcc <- 
calcNormFactors(tcc) - tcc <- estimateDE(tcc, test.method = "edger", FDR = 0.1) - result <- getResult(tcc, sort = TRUE) - print(head(result)) - table(tcc$estimatedDEG) - png("plot4.png", 600, 500) - plot(tcc) - dev.off() -} diff --git a/TCC/tests/basic.R b/TCC/tests/basic.R deleted file mode 100644 index 6ae0662..0000000 --- a/TCC/tests/basic.R +++ /dev/null @@ -1,11 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")){ - library(TCC) - data(hypoData) - tcc <- generateSimulationData(Ngene = 100, PDEG = 0.2, DEG.assign = c(0.9, 0.1)) -# group <- c(3, 3) -# tcc <- new("TCC", hypoData, c(3,3)) - tcc$calcNormFactors(norm.method = "tmm", test.method = "bayseq") - tcc$estimateDE(test.method = "edger", FDR = 0.1) - result <- getResult(tcc, sort = TRUE) - print(head(result)) -} diff --git a/TCC/tests/dataframe.R b/TCC/tests/dataframe.R deleted file mode 100644 index 6b2e97b..0000000 --- a/TCC/tests/dataframe.R +++ /dev/null @@ -1,7 +0,0 @@ -library(TCC) -data(hypoData) -df<-data.frame(row.names = paste('a', rownames(hypoData), sep=""), - A1 = hypoData[,1], A2 = hypoData[,2], A3 = hypoData[,3], - B1 = hypoData[,4], B2 = hypoData[,5], B3 = hypoData[,6]) -tccdata=new("TCC",df,c(3,3)) -head(tccdata$names) diff --git a/TCC/tests/filterUnexpressed.R b/TCC/tests/filterUnexpressed.R deleted file mode 100644 index dadea92..0000000 --- a/TCC/tests/filterUnexpressed.R +++ /dev/null @@ -1,19 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST") != ""){ - library(TCC) - data(hypoData) - group <- c(3, 3) - tcc <- new("TCC", hypoData, c(3,3)) - cat("tcc$count: ") - cat(dim(tcc$count)) - cat("\n") - tccf <- filterUnexpressedGenes(tcc) - cat("dim(tcc$count): ") - cat(dim(tccf$count)) - cat("\n") - cat("dim(hypoData): ") - cat(dim(hypoData)) - cat("\n") - cat("dim(hypoData[as.logical(rowSums(hypoData)>0),]): ") - cat(dim(hypoData[as.logical(rowSums(hypoData) > 0),])) - cat("\n") -} diff --git a/TCC/tests/genSim.R b/TCC/tests/genSim.R deleted file mode 100644 index 3aa379e..0000000 --- a/TCC/tests/genSim.R +++ 
/dev/null @@ -1,26 +0,0 @@ -if (Sys.getenv("TCC_REAL_TEST")!=""){ -library(TCC) - set.seed(1234567) - tcc <- generateSimulationData(Ngene = 10000, PDEG = 0.3, - DEG.assign = c(0.6, 0.2, 0.2), - DEG.foldchange = list(3, 10, 6), - group = c(2, 4, 3)) - cat("dim(tcc$count): ") - cat(dim(tcc$count)) - cat("\n") - cat("tcc$group: ") - cat(tcc$group) - cat("\n") - cat("tcc$replicates: ") - cat(tcc$replicates) - cat("\n") - cat("tcc$count: ") - cat("\n") - print(head(tcc$count)) - png("plot5-3-1.png", 500, 500) - plotFCPseudocolor(tcc) - dev.off() - png("plot5-3-2.png", 600, 600) - plot(tcc) - dev.off() -} diff --git a/TCC/tests/runTests.R b/TCC/tests/runTests.R new file mode 100644 index 0000000..afbdefe --- /dev/null +++ b/TCC/tests/runTests.R @@ -0,0 +1 @@ +BiocGenerics:::testPackage("TCC") diff --git a/do_install.sh b/do_install.sh deleted file mode 100755 index 3da10cd..0000000 --- a/do_install.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -R --slave <