hansenlab · sandrews5 · Mar 30, 2017 · Mar 30, 2017 · Mar 30, 2017 · Mar 30, 2017
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -6,7 +6,8 @@ Description: A collection of tools for analyzing and visualizing bisulfite
     sequencing data.
 Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"),
     email = "kasperdanielhansen@gmail.com"),
-    person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"))
+    person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"),
+	person(c("Shan", "V."), "Andrews", role = "ctb", email = "sandre17@jhu.edu"))
 Depends:
     R (>= 3.3),
     methods,
@@ -55,6 +56,7 @@ Collate:
     BSmooth.fstat.R
     BSseqStat_class.R
     getStats.R
+  	getSex.R
     hdf5_utils.R
     combine_utils.R
     DelayedArray_utils.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,8 +10,7 @@ importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames",
            "strand", "strand<-", "union", "unique", "updateObject")
 importFrom(stats, "approxfun", "fisher.test", "ppoints",
            "predict", "preplot", "qchisq",
-           "qnorm", "qqplot", "qunif", "cov2cor",
-           "plogis")
+           "qnorm", "qqplot", "qunif", "cov2cor","kmeans", "plogis")
 importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
            "mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
 import(parallel)
@@ -72,7 +71,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
        "read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
        "poissonGoodnessOfFit", "binomialGoodnessOfFit",
        "data.frame2GRanges", "BSseqTstat",
-       "BSseqStat")
+       "BSseqStat","getSex")
 
 S3method("print", "chisqGoodnessOfFit")
 S3method("plot", "chisqGoodnessOfFit")

diff --git a/R/getSex.R b/R/getSex.R
@@ -0,0 +1,72 @@
+getSex <- function(BSseq, cutoff = 0){
+	stopifnot(is(BSseq, "BSseq"))
+	xChr <- match.arg(unique(as.character(seqnames(BSseq))), c("X","chrX"), several.ok = TRUE)
+	yChr <- match.arg(unique(as.character(seqnames(BSseq))), c("Y","chrY"), several.ok = TRUE)
+
+	BSseq.x <- chrSelectBSseq(BSseq, seqnames = xChr)
+	BSseq.y <- chrSelectBSseq(BSseq, seqnames = yChr)
+
+	xCov <- getCoverage(BSseq.x, what = "perRegionAverage")
+	yCov <- getCoverage(BSseq.y, what = "perRegionAverage")
+	metric <- xCov - yCov
+	sexclust <- kmeans(metric, 2)
+
+	if(all(sexclust$centers > cutoff)){
+		message("[getSex] No samples with coverage of Y chromosome. Predicting all female samples.")
+		df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("F", ncol(BSseq)))
+		rownames(df) <- sampleNames(BSseq)
+		stop(return(df))
+	}
+	if(all(sexclust$centers < cutoff)){
+		message("[getSex] No samples without coverage of Y chromosme. Predicting all male samples.")
+		df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("M", ncol(BSseq)))
+		rownames(df) <- sampleNames(BSseq)
+		stop(return(df))
+	}
+	predictedSex <- ifelse(metric > cutoff, "F", "M")
+
+	xClust <- kmeans(xCov, centers=c(min(xCov), max(xCov)))
+	yClust <- kmeans(yCov, centers=c(max(yCov), min(yCov)))
+
+	predSex <- rep(NA, ncol(BSseq))
+	names(predSex) <- sampleNames(BSseq)
+	predSex[(xClust$cluster + yClust$cluster) ==  2] <- "M"
+	predSex[(xClust$cluster + yClust$cluster) ==  4] <- "F"
+
+	if(!identical(predictedSex, predSex))
+		warning("[getSex] Possible discrepancy with Sample ID ",paste(sampleNames(BSseq)[(predictedSex!=predSex | is.na(predSex))],collapse=", "),". Check via plotSex().")
+
+	df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = predictedSex)
+    rownames(df) <- sampleNames(BSseq)
+    df
+}
+
+addSex <- function(BSseq, sex = NULL) {
+    if(is.null(sex))
+        sex <- getSex(BSseq)$predictedSex
+    if(is(sex, "DataFrame") && "predictedSex" %in% names(sex))
+        sex <- sex$predictedSex
+	pd <- pData(BSseq)
+	if ("predictedSex" %in% names(pd))
+		warning("Replacing existing 'predictedSex' column")
+	pd$predictedSex <- sex
+	pData(BSseq) <- pd
+	BSseq
+}
+
+plotSex <- function(df, id = NULL) {
+    stopifnot(all(c("predictedSex", "xCov", "yCov") %in% names(df)))
+    if(is.null(id))
+        id <- rownames(df)
+    if(length(id) != length(df$predictedSex))
+        stop("id length must match number of samples.")
+    plot(df$xCov, df$yCov, type = "n",
+         xlab = "X chr, Average coverage",
+         ylab = "Y chr, Average coverage")
+	pointcol <- rep("black",nrow(df))
+	pointcol[df$predictedSex == "M"] <- "deepskyblue"
+	pointcol[df$predictedSex == "F"] <- "deeppink3"
+    text(df$xCov, df$yCov, id,
+         col=ifelse(df$predictedSex == "M", "deepskyblue", "deeppink3"))
+    legend("bottomleft", c("M","F"), col = c("deepskyblue", "deeppink3"), pch = 16)
+}
diff --git a/man/getSex.Rd b/man/getSex.Rd
@@ -0,0 +1,54 @@
+\name{getSex}
+\alias{getSex}
+\alias{addSex}
+\alias{plotSex}
+\title{
+  Estimating sample sex based on sex chromosome coverage
+}
+\description{
+  Estimates sample sex based on sex chromosome coverage
+}
+\usage{
+getSex(BSseq, cutoff = 0)
+addSex(BSseq, sex = NULL)
+plotSex(df, id = NULL)
+}
+\arguments{
+  \item{BSseq}{An object of class \code{BSseq}.}
+  \item{cutoff}{Value used to distinguish males and females when evaluating
+  difference between coverage of X and Y chromosomes.}
+  \item{sex}{An optional character vector of sex (with values \code{M}
+    and \code{F}.}
+  \item{df}{A data frame structured like the output of \code{getSex},
+	with columns \code{predictedSex} \code{xCov} and \code{yCov}}
+  \item{id}{Text used as plotting symbols in the \code{plotSex}
+    function.  Used for sample identification on the plot.}
+}
+\details{
+	Estimation of sex is based on coverage of the X and Y chromosomes.
+	If a sample exhibits high coverage of the X chromosome and low (no)
+	coverage of the Y chromosome, we predict female sex. In the opposite
+	case, we predict male sex. 
+
+	This function should be run on a \code{BSseq} object prior to any CpG-based
+	filtering.
+}
+\value{
+  For \code{getSex}, a \code{DataFrame} with columns \code{predictedSex}
+  (a character with values \code{M} and \code{F}), \code{xCov} and
+  \code{yCov} which are the average coverage values of the X and Y chromosomes,
+  respectively.
+
+  For \code{addSex}, an object of the same type as \code{BSseq} but
+  with a column named \code{predictedSex} added to the pheno data.
+}
+\author{
+  Shan V. Andrews \email{sandre17@jhu.edu}
+}
+\examples{
+\dontrun{
+#Not applicable to test data (only chr 22)
+sex<-getSex(BSseq)
+plotSex(sex)
+}
+}