Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions GSE46691/50-build_gse46691_hugo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# ---- OPTIONS ----
#
# * Summarize Function *
# When HUGO Gene names are associated with multiple probesets, the probset
# expression values will be summarized using the following function:
SUMMARIZE_FUNCTION <- median

# * HUGO Dictionary URL*
# This URL points to most recent HUGO gene names information.
# See https://beta.genenames.org/download/custom/ for more information.
HUGO_DICT_URL <- "https://beta.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"

# ---- Load Packages ----
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(Biobase)
pacman::p_load(tidyverse)
pacman::p_load(GEOquery)

# Source functions for these scripts
source_files <- dir("functions", pattern = "\\.R$", full.names = TRUE)
purrr::walk(source_files, source, local = globalenv())

# ---- Gather Data ----
data_dir <- gather_gse46691("data")

# ---- Load GPL and Series Matrix ----
gse_46691 <- build_gse_46691(file_exprs = NULL, data_dir = data_dir)

# ---- Parse and Tidy Gene Assignment ----
# Get gene_assignment from GPL5188 annotation
gse_46691_genes <- pData(gse_46691$featureData) %>%
select(ID, gene_assignment) %>%
as_tibble() %>%
tidy_gene_assignment(gene_assignment) %>%
# gene_assignment_1 and _2 are now a list_cols but we can splat them out
# which also drops anything that didn't have an assignment, i.e. "---"
tidyr::unnest() %>%
select(-gene_assignment)

gse_46691_genes2 <- tidyr::gather(gse_46691_genes, drop, gene_name, -ID) %>%
select(-drop)

# ---- Get Probeset Annotations ----
# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st
# Download: http://www.affymetrix.com/Auth/analysis/downloads/na36/wtexon/HuEx-1_0-st-v2.na36.hg19.probeset.csv.zip
# Save zip file in "data" and extract.
huex_annotation_path <- file.path("data", "HuEx-1_0-st-v2.na36.hg19.probeset.csv", "HuEx-1_0-st-v2.na36.hg19.probeset.csv")
huex_header_lines <- readLines(huex_annotation_path, n = 50)
huex_header_lines <- max(which(grepl("^#", huex_header_lines)))
huex_annotation <- readr::read_csv(
huex_annotation_path,
col_types = cols_only(
gene_assignment = col_character(),
probeset_id = col_integer()
),
skip = huex_header_lines
) %>%
tidy_gene_assignment(gene_assignment) %>%
tidyr::unnest() %>%
select(-gene_assignment)

# ---- Get Latest HUGO names ----
download_files(
urls = c("hgnc_dict.tsv" = HUGO_DICT_URL),
dest.dir = "data"
)

hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE) %>%
mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", ""))

# First, match gene symbol provided by Affymetrix annotation to HGNC (Hugo)
probe2hugo <- huex_annotation %>%
distinct(probeset_id, gene_assignment_2) %>%
inner_join(hgnc_dict, by = c(gene_assignment_2 = "Approved symbol")) %>%
rename(gene_assignment = gene_assignment_2)

# Then try matching using RefSeq (public sequence ID from Affymetrix annotation)
probe2refseq2hugo <- huex_annotation %>%
filter(!probeset_id %in% probe2hugo$probeset_id) %>%
distinct(probeset_id, gene_assignment_1) %>%
inner_join(hgnc_dict, by = c(gene_assignment_1 = "RefSeq IDs")) %>%
rename(`RefSeq IDs` = gene_assignment_1, gene_assignment = `Approved symbol`)

probe2hugo <- bind_rows(probe2hugo, probe2refseq2hugo) %>% arrange(probeset_id, gene_assignment)

# # This part is not needed but kept for future reference. Uncomment if the
# # probset-hugo mapping contains un-approved symbols (check `Status` column)
#
# probes_with_only_unapproved_names <- probe2hugo %>%
# group_by(ID, Status) %>%
# count() %>%
# tidyr::spread(Status, n, fill = 0) %>%
# filter(Approved < 1, `Entry Withdrawn` > 0 | `Symbol Withdrawn` > 0)
#
# # At this point, there are many HUGO names with Status = "Symbol withdrawn".
# # These entries point to a replacing symbol in the `Approved name` column
# # with the syntax "see XXX", e.g. "symbol withdrawn, see AGAP4" for "AGAP8".
# # So replace with the new name and re-merge with hgnc_dict.
# probe2hugo <- probe2hugo %>%
# mutate(gene_assignment = if_else(
# Status == "Symbol Withdrawn",
# str_extract(`Approved name`, "(?<=see )(.+)$"),
# gene_assignment
# )) %>%
# distinct(probeset_id, gene_assignment) %>%
# inner_join(hgnc_dict, by = c(gene_assignment = "Approved symbol"))

gse_46691$exprs <- read_tsv_filtered(
file.path(data_dir, "GSE46691_quantile_normalized.txt"),
ID_REF %in% probe2hugo$probeset_id,
chunk_size = 10000,
col_types = readr::cols(ID_REF = col_integer(), .default = col_double())
)

# ---- Save Checkpoint ----
dir.create("out")
saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds"))
saveRDS(probe2hugo, file.path("out", "probe2hugo.rds"))

## To restart from here without having to re-run the above:
# gse_46691 <- readRDS(file.path("out", "gse46691_hugo_checkpoint.rds"))
# probe2hugo <- readRDS(file.path("out", "probe2hugo.rds"))


# ---- Probe ID to HUGO ----
gse_46691_exprs_prepped <- gse_46691$exprs %>%
tidyr::gather("sample", "value", -ID_REF) %>%
mutate(sample = str_replace(sample, "\\.CEL$", "")) %>%
left_join(
select(probe2hugo, probeset_id, hugo_name = gene_assignment),
.,
by = c(probeset_id = "ID_REF")
) %>%
group_by(sample, hugo_name) %>%
summarize(value = SUMMARIZE_FUNCTION(value)) %>%
tidyr::spread(hugo_name, value)

saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.rds"))
write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.tsv"))


# ---- Additional Patient Information ----

# Clean up pheno data and then merge with gse
gse_46691_pdata <- pData(gse_46691$phenoData) %>%
as_tibble() %>%
remove_common(description) %>%
clean_channel_vars() %>%
select(-matches("title|characteristics|supplementary")) %>%
mutate_all(as.character) %>%
readr::type_convert()

## To merge with expression dataset, run the following, which will prepend four
## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and
## `metastatic event`.
#
gse_46691_exprs_pdata <- gse_46691_pdata %>%
rename(sample = description) %>%
left_join(gse_46691_exprs_prepped, by = "sample")

saveRDS(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds"))
write_tsv(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))
57 changes: 56 additions & 1 deletion GSE46691/functions/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,60 @@ download_files <- function(
}
}

dest.dir
invisible(dest.dir)
}

#' Remove all common variables from a data frame
#'
#' ...where common is defined as having the same value across all observations.
remove_common <- function(x, ..., quiet = FALSE) {
keep <- tidyselect::vars_select(names(x), !!! quos(...))
remove_common_except(x, keep, quiet)
}

remove_common_except <- function(x, keep = NULL, quiet = FALSE) {
# remove columns with common values across all observations
# except for those named in `keep`
len_unique <- vapply(x, function(col) length(unique(col)), integer(1))
common <- colnames(x)[len_unique == 1]
common_can_remove <- setdiff(common, keep)
if (!quiet) {
value_text <- function(...) crayon::italic(encodeString(paste0(...), quote = "'"))
field <- function(...) crayon::green(paste0(...))
value <- function(...) crayon::blue(encodeString(paste0(...), quote = "'"))

cli::cat_line("The following columns contain common information across all observations and have been removed.")
cli::cat_line(glue::glue("You can access this metadata in the {value('metadata')} attribute."))
kept <- intersect(keep, common)
if (length(kept)) cli::cat_line(glue::glue(
"{if (plural) 'Columns' else 'Column'} {kept_vars} ",
"{if (plural) 'do' else 'does'} not vary accross observations but ",
"{if (plural) 'have' else 'has'} been retained by user request",
plural = length(kept) > 1,
kept_vars = glue::glue_collapse(glue::glue("`{kept}`"), sep = ", ")
))
for (col in common_can_remove) {
cli::cat_line(field(stringr::str_pad(col, max(nchar(common_can_remove)))), ': ',
value_text(x[[col]][1]))
}
}
removed <- purrr::map(x[, common_can_remove], unique)
x <- x[, dplyr::setdiff(colnames(x), common_can_remove)]
attr(x, "metadata") <- removed
x
}

#' Clean up channel variables from GEO datasets
clean_channel_vars <- function(x) {
idx <- grep(":ch[12]$", colnames(x))
ch_cols <- colnames(x)[idx]
stripped <- sub(":ch[12]$", "", ch_cols)
dups <- vapply(
stripped,
function(key) length(which(key == stripped)) > 1,
FUN.VALUE = logical(1))
# only replace the unqiuely named characteristics
ch_cols[!dups] <- stripped[!dups]
colnames(x)[idx] <- ch_cols
x
}