Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ Description: A collection of tools for analyzing and visualizing bisulfite
sequencing data.
Authors@R: c(person(c("Kasper", "Daniel"), "Hansen", role = c("aut", "cre"),
email = "kasperdanielhansen@gmail.com"),
person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"))
person("Peter", "Hickey", role = "aut", email = "peter.hickey@gmail.com"),
person(c("Shan", "V."), "Andrews", role = "ctb", email = "sandre17@jhu.edu"))
Depends:
R (>= 3.3),
methods,
Expand Down Expand Up @@ -55,6 +56,7 @@ Collate:
BSmooth.fstat.R
BSseqStat_class.R
getStats.R
getSex.R
hdf5_utils.R
combine_utils.R
DelayedArray_utils.R
Expand Down
5 changes: 2 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames",
"strand", "strand<-", "union", "unique", "updateObject")
importFrom(stats, "approxfun", "fisher.test", "ppoints",
"predict", "preplot", "qchisq",
"qnorm", "qqplot", "qunif", "cov2cor",
"plogis")
"qnorm", "qqplot", "qunif", "cov2cor","kmeans", "plogis")
importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
"mtext", "par", "plot", "points", "polygon", "rect", "rug", "text")
import(parallel)
Expand Down Expand Up @@ -72,7 +71,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
"read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
"poissonGoodnessOfFit", "binomialGoodnessOfFit",
"data.frame2GRanges", "BSseqTstat",
"BSseqStat")
"BSseqStat","getSex")

S3method("print", "chisqGoodnessOfFit")
S3method("plot", "chisqGoodnessOfFit")
Expand Down
72 changes: 72 additions & 0 deletions R/getSex.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
getSex <- function(BSseq, cutoff = 0){
stopifnot(is(BSseq, "BSseq"))
xChr <- match.arg(unique(as.character(seqnames(BSseq))), c("X","chrX"), several.ok = TRUE)
yChr <- match.arg(unique(as.character(seqnames(BSseq))), c("Y","chrY"), several.ok = TRUE)

BSseq.x <- chrSelectBSseq(BSseq, seqnames = xChr)
BSseq.y <- chrSelectBSseq(BSseq, seqnames = yChr)

xCov <- getCoverage(BSseq.x, what = "perRegionAverage")
yCov <- getCoverage(BSseq.y, what = "perRegionAverage")
metric <- xCov - yCov
sexclust <- kmeans(metric, 2)

if(all(sexclust$centers > cutoff)){
message("[getSex] No samples with coverage of Y chromosome. Predicting all female samples.")
df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("F", ncol(BSseq)))
rownames(df) <- sampleNames(BSseq)
stop(return(df))
}
if(all(sexclust$centers < cutoff)){
message("[getSex] No samples without coverage of Y chromosme. Predicting all male samples.")
df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = rep("M", ncol(BSseq)))
rownames(df) <- sampleNames(BSseq)
stop(return(df))
}
predictedSex <- ifelse(metric > cutoff, "F", "M")

xClust <- kmeans(xCov, centers=c(min(xCov), max(xCov)))
yClust <- kmeans(yCov, centers=c(max(yCov), min(yCov)))

predSex <- rep(NA, ncol(BSseq))
names(predSex) <- sampleNames(BSseq)
predSex[(xClust$cluster + yClust$cluster) == 2] <- "M"
predSex[(xClust$cluster + yClust$cluster) == 4] <- "F"

if(!identical(predictedSex, predSex))
warning("[getSex] Possible discrepancy with Sample ID ",paste(sampleNames(BSseq)[(predictedSex!=predSex | is.na(predSex))],collapse=", "),". Check via plotSex().")

df <- DataFrame(xCov = xCov, yCov = yCov, predictedSex = predictedSex)
rownames(df) <- sampleNames(BSseq)
df
}

addSex <- function(BSseq, sex = NULL) {
if(is.null(sex))
sex <- getSex(BSseq)$predictedSex
if(is(sex, "DataFrame") && "predictedSex" %in% names(sex))
sex <- sex$predictedSex
pd <- pData(BSseq)
if ("predictedSex" %in% names(pd))
warning("Replacing existing 'predictedSex' column")
pd$predictedSex <- sex
pData(BSseq) <- pd
BSseq
}

plotSex <- function(df, id = NULL) {
stopifnot(all(c("predictedSex", "xCov", "yCov") %in% names(df)))
if(is.null(id))
id <- rownames(df)
if(length(id) != length(df$predictedSex))
stop("id length must match number of samples.")
plot(df$xCov, df$yCov, type = "n",
xlab = "X chr, Average coverage",
ylab = "Y chr, Average coverage")
pointcol <- rep("black",nrow(df))
pointcol[df$predictedSex == "M"] <- "deepskyblue"
pointcol[df$predictedSex == "F"] <- "deeppink3"
text(df$xCov, df$yCov, id,
col=ifelse(df$predictedSex == "M", "deepskyblue", "deeppink3"))
legend("bottomleft", c("M","F"), col = c("deepskyblue", "deeppink3"), pch = 16)
}
54 changes: 54 additions & 0 deletions man/getSex.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
\name{getSex}
\alias{getSex}
\alias{addSex}
\alias{plotSex}
\title{
Estimating sample sex based on sex chromosome coverage
}
\description{
Estimates sample sex based on sex chromosome coverage
}
\usage{
getSex(BSseq, cutoff = 0)
addSex(BSseq, sex = NULL)
plotSex(df, id = NULL)
}
\arguments{
\item{BSseq}{An object of class \code{BSseq}.}
\item{cutoff}{Value used to distinguish males and females when evaluating
difference between coverage of X and Y chromosomes.}
\item{sex}{An optional character vector of sex (with values \code{M}
and \code{F}.}
\item{df}{A data frame structured like the output of \code{getSex},
with columns \code{predictedSex} \code{xCov} and \code{yCov}}
\item{id}{Text used as plotting symbols in the \code{plotSex}
function. Used for sample identification on the plot.}
}
\details{
Estimation of sex is based on coverage of the X and Y chromosomes.
If a sample exhibits high coverage of the X chromosome and low (no)
coverage of the Y chromosome, we predict female sex. In the opposite
case, we predict male sex.

This function should be run on a \code{BSseq} object prior to any CpG-based
filtering.
}
\value{
For \code{getSex}, a \code{DataFrame} with columns \code{predictedSex}
(a character with values \code{M} and \code{F}), \code{xCov} and
\code{yCov} which are the average coverage values of the X and Y chromosomes,
respectively.

For \code{addSex}, an object of the same type as \code{BSseq} but
with a column named \code{predictedSex} added to the pheno data.
}
\author{
Shan V. Andrews \email{sandre17@jhu.edu}
}
\examples{
\dontrun{
#Not applicable to test data (only chr 22)
sex<-getSex(BSseq)
plotSex(sex)
}
}