-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathassign_ks_sex.R
More file actions
101 lines (87 loc) · 5.5 KB
/
assign_ks_sex.R
File metadata and controls
101 lines (87 loc) · 5.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# This code should be run in an interactive R session, not as an automated script, in order to inspect the plots and adjust parameters.
# Please read the comments for instructions.
#
# The code demonstrates how to create a chrX vs. chrY normalized coverage ratio (gamma_i) plot for checking, correcting, and/or assigning
# genetic (karyotypic) sex. This sex assignment should be a binary male/female assignment representing the "expected" sex karyotype for
# the sample. Specifically, it will control whether chrX is expected to be diploid for read coverage and allele balance analysis.
library(ggplot2)
# Set output_dir variable, should enable matching to "output_dir"/gammai_matrix.txt file path as generated by the calculate_gammai.py method
output_dir <- ""
if (output_dir == "") {
print("`output_dir` variable not set");
}
f_gammai_matrix <- paste(output_dir,"/gammai_matrix.txt")
gammai <- as.data.frame(read.table(f_gammai_matrix, sep="\t", header=T, row.names=NULL))
# chrY dups are identified "naively" using a 1.5*median(gamma_chrY) threshold in the male sample subset. Setting a default for now.
karyoscan_chrY_median <- 0.001
chrY_dup_factor <- 1.5
# If you have predefined sex assignments that you want to project on the chrX/chrY plot, the following code reads in the meta-data file and
# matches based on the "SampleID" columns.
# Set the following two variables to enable this functionality:
# f_qcmeta meta-data file path (string)
# sex_colname = column name (read from the file header) for the sex data you want to project onto the plot. Default will be the last column.
f_qcmeta <- ""
sex_colname <- ""
# Look for additional sample meta-data file for pre-defined sex projection
if (f_qcmeta != "") {
if (sex_colname == "") {
sex_colname <- tail(colnames(qcmeta), n=1);
print(paste("Using ",sex_colname," column as pre-defined sex. Change `sex_colname` variable if this is incorrect.",sep=""));
}
if (! "SampleID" %in% colnames(f_qcmeta)) {
print("Expecting the sample name column to be \"SampleID\". Change \"by.y\" variable in merge command to use the correct column name");
}
# Merge on "SampleID" columns and check to make sure nothing was lost
pre_len <- nrow(gammai)
gammai <- merge(gammai, f_qcmeta, by.x="SampleID", by.y="SampleID")
post_len <- nrow(gammai)
if (pre_len != post_len) {
print(paste("The sample lists in ",f_gammai_matrix," and ",f_qcmeta," do not match. Proceeding with only the intersection.", sep=""));
}
# Reset chrY_median to match input male sample subset (requires male coding to be in the list below)
karyoscan_chrY_median <- median(gammai[gammi[[sex_colname]] %in% c("Male","MALE","male","m","M","1"),"chrY"]))
}
# Plot gamma_chrX vs. gamma_chrY (normalized coverage ratios)
xy_plot <- ggplot(gammai, aes(x=chrX, y=chrY));
if (sex_colname == "") {
xy_plot <- xy_plot + geom_point();
} else {
xy_plot <- xy_plot + geom_point(aes_string(colour=sex_colname));
}
# Plot chrY median (male subset) and 1.5*median threshold to identify chrY duplications
xy_plot <- xy_plot + geom_hline(yintercept=karyoscan_chrY_median)
xy_plot <- xy_plot + geom_hline(yintercept=chrY_dup_factor*karyoscan_chrY_median, linetype="dashed")
# Plot zoom, default xy-boundaries
x_plot_limits <- c()
y_plot_limits <- c()
# Ex. code to change zoom
#x_plot_limits <- c(0.03, 0.08)
#y_plot_limits <- c(0, 0.002)
xy_plot <- xy_plot + coord_cartesian(xlim=x_plot_limits, ylim=y_plot_limits)
# Display the plot:
xy_plot
# If all sex assignments look good, you can use existing assignemnts from the meta-data file and simply exit this R session.
# If you need to make or change sex assignments, the following code provides a basic framework. It creates a new "KARYOTYPIC_SEX"
# column that you will define to match your requirements. This example represents the step-function cutoffs from the KaryoScan paper,
# but you should change it for your data. Once the assignments are made, it will save a sample-karyotypic sex map to the output file:
# "output_dir/karyotypic_sexes.txt"
# The chrY duplication threshold (dashed line) should be inspected (see next comment block).
# Apply sex assignment function & replot
chrY_dup_factor <- 1.5 # Changes to this variable must also be reflected in the KaryoScan config.txt file
gammai$KARYOTYPIC_SEX <- ifelse((gammai$chrY > 0.0001) | (gammai$chrY > 0.000015 & gammai$chrX < 0.05), "Male", "Female")
karyoscan_chrY_median <- median(subset(gammai, KARYOTYPIC_SEX == "Male")$chrY)
xy_plot_new <- ggplot(gammai, aes(x=chrX, y=chrY))
xy_plot_new <- xy_plot_new + geom_point(aes(colour=KARYOTYPIC_SEX))
xy_plot_new <- xy_plot_new + geom_hline(yintercept=karyoscan_chrY_median)
xy_plot_new <- xy_plot_new + geom_hline(yintercept=chrY_dup_factor*karyoscan_chrY_median, linetype="dashed")
xy_plot_new <- xy_plot_new + coord_cartesian(xlim=x_plot_limits, ylim=y_plot_limits)
xy_plot_new
# Iterate the assignment function/replotting until assignments are satistfactory
# If the chrY duplicatoin threshold is off, first make sure the "Male" sample subset and its chrY median are accurate (solid line).
# If they are, adjust the chrY_dup_factor to a better fitting threshold. If you change it from 1.5, the change **MUST** be reflected
# in the KaryoScan config.txt file.
# Save output file
write.table(gammai[,c("SampleID","KARYOTYPIC_SEX")], paste(output_dir,"/karyotypic_sexes.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=T)
# Plots can be saved with "ggsave()" function
# Ex: ggsave("Karyotypic_Sex.png", xy_plot_new, units="in", width=16, height=10);
quit()