GerkeLab · gadenbuie · Jul 30, 2018 · Jul 30, 2018 · Jul 30, 2018 · Jul 30, 2018
diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
@@ -0,0 +1,163 @@
+# ---- OPTIONS ----
+# 
+# * Summarize Function *
+# When HUGO Gene names are associated with multiple probesets, the probset
+# expression values will be summarized using the following function:
+SUMMARIZE_FUNCTION <- median
+
+# * HUGO Dictionary URL*
+# This URL points to most recent HUGO gene names information.
+# See https://beta.genenames.org/download/custom/ for more information.
+HUGO_DICT_URL <- "https://beta.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
+
+# ---- Load Packages ----
+if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
+pacman::p_load(Biobase)
+pacman::p_load(tidyverse)
+pacman::p_load(GEOquery)
+
+# Source functions for these scripts
+source_files <- dir("functions", pattern = "\\.R$", full.names = TRUE)
+purrr::walk(source_files, source, local = globalenv())
+
+# ---- Gather Data ----
+data_dir <- gather_gse46691("data")
+
+# ---- Load GPL and Series Matrix ----
+gse_46691 <- build_gse_46691(file_exprs = NULL, data_dir = data_dir)
+
+# ---- Parse and Tidy Gene Assignment ----
+# Get gene_assignment from GPL5188 annotation
+gse_46691_genes <- pData(gse_46691$featureData) %>% 
+  select(ID, gene_assignment) %>% 
+  as_tibble() %>% 
+  tidy_gene_assignment(gene_assignment) %>% 
+  # gene_assignment_1 and _2 are now a list_cols but we can splat them out
+  # which also drops anything that didn't have an assignment, i.e. "---"
+  tidyr::unnest() %>% 
+  select(-gene_assignment)
+
+gse_46691_genes2 <- tidyr::gather(gse_46691_genes, drop, gene_name, -ID) %>% 
+  select(-drop)
+
+# ---- Get Probeset Annotations ----
+# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st
+# Download: http://www.affymetrix.com/Auth/analysis/downloads/na36/wtexon/HuEx-1_0-st-v2.na36.hg19.probeset.csv.zip
+# Save zip file in "data" and extract.
+huex_annotation_path <- file.path("data", "HuEx-1_0-st-v2.na36.hg19.probeset.csv", "HuEx-1_0-st-v2.na36.hg19.probeset.csv")
+huex_header_lines <- readLines(huex_annotation_path, n = 50)
+huex_header_lines <- max(which(grepl("^#", huex_header_lines)))
+huex_annotation <- readr::read_csv(
+  huex_annotation_path, 
+  col_types = cols_only(
+    gene_assignment = col_character(), 
+    probeset_id = col_integer()
+  ), 
+  skip = huex_header_lines
+) %>% 
+  tidy_gene_assignment(gene_assignment) %>% 
+  tidyr::unnest() %>% 
+  select(-gene_assignment)
+
+# ---- Get Latest HUGO names ----
+download_files(
+  urls = c("hgnc_dict.tsv" = HUGO_DICT_URL),
+  dest.dir = "data"
+)
+
+hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv", 
+    "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
+  mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", ""))
+
+# First, match gene symbol provided by Affymetrix annotation to HGNC (Hugo)
+probe2hugo <- huex_annotation %>% 
+  distinct(probeset_id, gene_assignment_2) %>% 
+  inner_join(hgnc_dict, by = c(gene_assignment_2 = "Approved symbol")) %>% 
+  rename(gene_assignment = gene_assignment_2)
+
+# Then try matching using RefSeq (public sequence ID from Affymetrix annotation)
+probe2refseq2hugo <- huex_annotation %>% 
+  filter(!probeset_id %in% probe2hugo$probeset_id) %>% 
+  distinct(probeset_id, gene_assignment_1) %>% 
+  inner_join(hgnc_dict, by = c(gene_assignment_1 = "RefSeq IDs")) %>% 
+  rename(`RefSeq IDs` = gene_assignment_1, gene_assignment = `Approved symbol`)
+
+probe2hugo <- bind_rows(probe2hugo, probe2refseq2hugo) %>% arrange(probeset_id, gene_assignment)
+
+# # This part is not needed but kept for future reference. Uncomment if the
+# # probset-hugo mapping contains un-approved symbols (check `Status` column)
+# 
+# probes_with_only_unapproved_names <- probe2hugo %>% 
+#   group_by(ID, Status) %>%
+#   count() %>%
+#   tidyr::spread(Status, n, fill = 0) %>%
+#   filter(Approved < 1, `Entry Withdrawn` > 0 | `Symbol Withdrawn` > 0)
+# 
+# # At this point, there are many HUGO names with Status = "Symbol withdrawn".
+# # These entries point to a replacing symbol in the `Approved name` column
+# # with the syntax "see XXX", e.g. "symbol withdrawn, see AGAP4" for "AGAP8".
+# # So replace with the new name and re-merge with hgnc_dict.
+# probe2hugo <- probe2hugo %>%
+#   mutate(gene_assignment = if_else(
+#     Status == "Symbol Withdrawn",
+#     str_extract(`Approved name`, "(?<=see )(.+)$"),
+#     gene_assignment
+#   )) %>% 
+#   distinct(probeset_id, gene_assignment) %>% 
+#   inner_join(hgnc_dict, by = c(gene_assignment = "Approved symbol"))
+
+gse_46691$exprs <- read_tsv_filtered(
+  file.path(data_dir, "GSE46691_quantile_normalized.txt"),
+  ID_REF %in% probe2hugo$probeset_id,
+  chunk_size = 10000, 
+  col_types = readr::cols(ID_REF = col_integer(), .default = col_double())
+)
+
+# ---- Save Checkpoint ----
+dir.create("out")
+saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds"))
+saveRDS(probe2hugo, file.path("out", "probe2hugo.rds"))
+
+## To restart from here without having to re-run the above:
+# gse_46691 <- readRDS(file.path("out", "gse46691_hugo_checkpoint.rds"))
+# probe2hugo <- readRDS(file.path("out", "probe2hugo.rds"))
+
+
+# ---- Probe ID to HUGO ----
+gse_46691_exprs_prepped <- gse_46691$exprs %>% 
+  tidyr::gather("sample", "value", -ID_REF) %>% 
+  mutate(sample = str_replace(sample, "\\.CEL$", "")) %>% 
+  left_join(
+    select(probe2hugo, probeset_id, hugo_name = gene_assignment), 
+    ., 
+    by = c(probeset_id = "ID_REF")
+  ) %>% 
+  group_by(sample, hugo_name) %>% 
+  summarize(value = SUMMARIZE_FUNCTION(value)) %>% 
+  tidyr::spread(hugo_name, value)
+
+saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.rds"))
+write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.tsv"))
+
+
+# ---- Additional Patient Information ----
+
+# Clean up pheno data and then merge with gse
+gse_46691_pdata <- pData(gse_46691$phenoData) %>% 
+  as_tibble() %>% 
+  remove_common(description) %>% 
+  clean_channel_vars() %>% 
+  select(-matches("title|characteristics|supplementary")) %>% 
+  mutate_all(as.character) %>% 
+  readr::type_convert()
+
+## To merge with expression dataset, run the following, which will prepend four
+## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and
+## `metastatic event`.
+#
+gse_46691_exprs_pdata <- gse_46691_pdata %>%
+  rename(sample = description) %>%
+  left_join(gse_46691_exprs_prepped, by = "sample")
+
+saveRDS(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds"))
+write_tsv(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))
diff --git a/GSE46691/functions/utils.R b/GSE46691/functions/utils.R
@@ -133,5 +133,60 @@ download_files <- function(
     }
   }
 
-  dest.dir
+  invisible(dest.dir)
+}
+
+#' Remove all common variables from a data frame
+#' 
+#' ...where common is defined as having the same value across all observations.
+remove_common <- function(x, ..., quiet = FALSE) {
+  keep <- tidyselect::vars_select(names(x), !!! quos(...))
+  remove_common_except(x, keep, quiet)
+}
+
+remove_common_except <- function(x, keep = NULL, quiet = FALSE) {
+  # remove columns with common values across all observations
+  # except for those named in `keep`
+  len_unique <- vapply(x, function(col) length(unique(col)), integer(1))
+  common <- colnames(x)[len_unique == 1]
+  common_can_remove <- setdiff(common, keep)
+  if (!quiet) {
+    value_text <- function(...) crayon::italic(encodeString(paste0(...), quote = "'"))
+    field <- function(...) crayon::green(paste0(...))
+    value <- function(...) crayon::blue(encodeString(paste0(...), quote = "'"))
+
+    cli::cat_line("The following columns contain common information across all observations and have been removed.")
+    cli::cat_line(glue::glue("You can access this metadata in the {value('metadata')} attribute."))
+    kept <- intersect(keep, common)
+    if (length(kept)) cli::cat_line(glue::glue(
+      "{if (plural) 'Columns' else 'Column'} {kept_vars} ",
+      "{if (plural) 'do' else 'does'} not vary accross observations but ",
+      "{if (plural) 'have' else 'has'} been retained by user request",
+      plural = length(kept) > 1,
+      kept_vars = glue::glue_collapse(glue::glue("`{kept}`"), sep = ", ")
+    ))
+    for (col in common_can_remove) {
+      cli::cat_line(field(stringr::str_pad(col, max(nchar(common_can_remove)))), ': ',
+                    value_text(x[[col]][1]))
+    }
+  }
+  removed <- purrr::map(x[, common_can_remove], unique)
+  x <- x[, dplyr::setdiff(colnames(x), common_can_remove)]
+  attr(x, "metadata") <- removed
+  x
+}
+
+#' Clean up channel variables from GEO datasets
+clean_channel_vars <- function(x) {
+  idx <- grep(":ch[12]$", colnames(x))
+  ch_cols <- colnames(x)[idx]
+  stripped <- sub(":ch[12]$", "", ch_cols)
+  dups <- vapply(
+    stripped,
+    function(key) length(which(key == stripped)) > 1,
+    FUN.VALUE = logical(1))
+  # only replace the unqiuely named characteristics
+  ch_cols[!dups] <- stripped[!dups]
+  colnames(x)[idx] <- ch_cols
+  x
 }