diff --git a/DESCRIPTION b/DESCRIPTION index 8ff2cc3..f482154 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,8 @@ Description: A collection of tools for analyzing and visualizing bisulfite sequencing data. Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"), email = "kasperdanielhansen@gmail.com"), - person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com")) + person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"), + person(c("Shan", "V."), "Andrews", role = "ctb", email = "sandre17@jhu.edu")) Depends: R (>= 3.3), methods, @@ -55,6 +56,7 @@ Collate: BSmooth.fstat.R BSseqStat_class.R getStats.R + getSex.R hdf5_utils.R combine_utils.R DelayedArray_utils.R diff --git a/NAMESPACE b/NAMESPACE index 235c7d0..efa93e9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,8 +10,7 @@ importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames", "strand", "strand<-", "union", "unique", "updateObject") importFrom(stats, "approxfun", "fisher.test", "ppoints", "predict", "preplot", "qchisq", - "qnorm", "qqplot", "qunif", "cov2cor", - "plogis") + "qnorm", "qqplot", "qunif", "cov2cor","kmeans", "plogis") importFrom(graphics, "abline", "axis", "layout", "legend", "lines", "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text") import(parallel) @@ -72,7 +71,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats", "read.umtab", "read.umtab2", "read.bsmooth", "read.bismark", "poissonGoodnessOfFit", "binomialGoodnessOfFit", "data.frame2GRanges", "BSseqTstat", - "BSseqStat") + "BSseqStat","getSex") S3method("print", "chisqGoodnessOfFit") S3method("plot", "chisqGoodnessOfFit") diff --git a/R/getSex.R b/R/getSex.R new file mode 100644 index 0000000..1b9550e --- /dev/null +++ b/R/getSex.R @@ -0,0 +1,72 @@ +getSex <- function(BSseq, cutoff = 0){ + stopifnot(is(BSseq, "BSseq")) + xChr <- match.arg(unique(as.character(seqnames(BSseq))), c("X","chrX"), several.ok = TRUE) + yChr <- match.arg(unique(as.character(seqnames(BSseq))), c("Y","chrY"), several.ok = TRUE) + + BSseq.x <- chrSelectBSseq(BSseq, seqnames = xChr) + BSseq.y <- chrSelectBSseq(BSseq, seqnames = yChr) + + xCov <- getCoverage(BSseq.x, what = "perRegionAverage") + yCov <- getCoverage(BSseq.y, what = "perRegionAverage") + metric <- xCov - yCov + sexclust <- kmeans(metric, 2) + + if(all(sexclust$centers > cutoff)){ + message("[getSex] No samples with coverage of Y chromosome. Predicting all female samples.") + df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("F", ncol(BSseq))) + rownames(df) <- sampleNames(BSseq) + stop(return(df)) + } + if(all(sexclust$centers < cutoff)){ + message("[getSex] No samples without coverage of Y chromosme. Predicting all male samples.") + df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("M", ncol(BSseq))) + rownames(df) <- sampleNames(BSseq) + stop(return(df)) + } + predictedSex <- ifelse(metric > cutoff, "F", "M") + + xClust <- kmeans(xCov, centers=c(min(xCov), max(xCov))) + yClust <- kmeans(yCov, centers=c(max(yCov), min(yCov))) + + predSex <- rep(NA, ncol(BSseq)) + names(predSex) <- sampleNames(BSseq) + predSex[(xClust$cluster + yClust$cluster) == 2] <- "M" + predSex[(xClust$cluster + yClust$cluster) == 4] <- "F" + + if(!identical(predictedSex, predSex)) + warning("[getSex] Possible discrepancy with Sample ID ",paste(sampleNames(BSseq)[(predictedSex!=predSex | is.na(predSex))],collapse=", "),". Check via plotSex().") + + df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = predictedSex) + rownames(df) <- sampleNames(BSseq) + df +} + +addSex <- function(BSseq, sex = NULL) { + if(is.null(sex)) + sex <- getSex(BSseq)$predictedSex + if(is(sex, "DataFrame") && "predictedSex" %in% names(sex)) + sex <- sex$predictedSex + pd <- pData(BSseq) + if ("predictedSex" %in% names(pd)) + warning("Replacing existing 'predictedSex' column") + pd$predictedSex <- sex + pData(BSseq) <- pd + BSseq +} + +plotSex <- function(df, id = NULL) { + stopifnot(all(c("predictedSex", "xCov", "yCov") %in% names(df))) + if(is.null(id)) + id <- rownames(df) + if(length(id) != length(df$predictedSex)) + stop("id length must match number of samples.") + plot(df$xCov, df$yCov, type = "n", + xlab = "X chr, Average coverage", + ylab = "Y chr, Average coverage") + pointcol <- rep("black",nrow(df)) + pointcol[df$predictedSex == "M"] <- "deepskyblue" + pointcol[df$predictedSex == "F"] <- "deeppink3" + text(df$xCov, df$yCov, id, + col=ifelse(df$predictedSex == "M", "deepskyblue", "deeppink3")) + legend("bottomleft", c("M","F"), col = c("deepskyblue", "deeppink3"), pch = 16) +} \ No newline at end of file diff --git a/man/getSex.Rd b/man/getSex.Rd new file mode 100644 index 0000000..bdaf56b --- /dev/null +++ b/man/getSex.Rd @@ -0,0 +1,54 @@ +\name{getSex} +\alias{getSex} +\alias{addSex} +\alias{plotSex} +\title{ + Estimating sample sex based on sex chromosome coverage +} +\description{ + Estimates sample sex based on sex chromosome coverage +} +\usage{ +getSex(BSseq, cutoff = 0) +addSex(BSseq, sex = NULL) +plotSex(df, id = NULL) +} +\arguments{ + \item{BSseq}{An object of class \code{BSseq}.} + \item{cutoff}{Value used to distinguish males and females when evaluating + difference between coverage of X and Y chromosomes.} + \item{sex}{An optional character vector of sex (with values \code{M} + and \code{F}.} + \item{df}{A data frame structured like the output of \code{getSex}, + with columns \code{predictedSex} \code{xCov} and \code{yCov}} + \item{id}{Text used as plotting symbols in the \code{plotSex} + function. Used for sample identification on the plot.} +} +\details{ + Estimation of sex is based on coverage of the X and Y chromosomes. + If a sample exhibits high coverage of the X chromosome and low (no) + coverage of the Y chromosome, we predict female sex. In the opposite + case, we predict male sex. + + This function should be run on a \code{BSseq} object prior to any CpG-based + filtering. +} +\value{ + For \code{getSex}, a \code{DataFrame} with columns \code{predictedSex} + (a character with values \code{M} and \code{F}), \code{xCov} and + \code{yCov} which are the average coverage values of the X and Y chromosomes, + respectively. + + For \code{addSex}, an object of the same type as \code{BSseq} but + with a column named \code{predictedSex} added to the pheno data. +} +\author{ + Shan V. Andrews \email{sandre17@jhu.edu} +} +\examples{ +\dontrun{ +#Not applicable to test data (only chr 22) +sex<-getSex(BSseq) +plotSex(sex) +} +}