From c6536affba83adf9abbb157e8b80e4b8fcbe10de Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie <gadenbuie@mail.usf.edu>
Date: Mon, 30 Jul 2018 13:19:30 -0400
Subject: [PATCH 1/5] Modify pipeline for whole transcriptome using HUGO names

---
 GSE46691/50-build_gse46691_hugo.R | 151 ++++++++++++++++++++++++++++++
 GSE46691/functions/utils.R        |  57 ++++++++++-
 2 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 GSE46691/50-build_gse46691_hugo.R

diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
new file mode 100644
index 0000000..14b28e1
--- /dev/null
+++ b/GSE46691/50-build_gse46691_hugo.R
@@ -0,0 +1,151 @@
+# ---- OPTIONS ----
+# 
+# * Summarize Function *
+# When HUGO Gene names are associated with multiple probesets, the probset
+# expression values will be summarized using the following function:
+SUMMARIZE_FUNCTION <- median
+
+# * HUGO Dictionary URL*
+# This URL points to most recent HUGO gene names information.
+# See https://beta.genenames.org/download/custom/ for more information.
+HUGO_DICT_URL <- "https://beta.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
+
+# ---- Load Packages ----
+if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
+pacman::p_load(Biobase)
+pacman::p_load(tidyverse)
+pacman::p_load(GEOquery)
+
+# Source functions for these scripts
+source_files <- dir("functions", pattern = "\\.R$", full.names = TRUE)
+purrr::walk(source_files, source, local = globalenv())
+
+# ---- Gather Data ----
+data_dir <- gather_gse46691("data")
+
+# ---- Load GPL and Series Matrix ----
+gse_46691 <- build_gse_46691(file_exprs = NULL, data_dir = data_dir)
+
+# ---- Parse and Tidy Gene Assignment ----
+# Get gene_assignment from GPL5188 annotation
+gse_46691_genes <- pData(gse_46691$featureData) %>% 
+  select(ID, gene_assignment) %>% 
+  as_tibble() %>% 
+  tidy_gene_assignment(gene_assignment) %>% 
+  # gene_assignment_1 and _2 are now a list_cols but we can splat them out
+  # which also drops anything that didn't have an assignment, i.e. "---"
+  tidyr::unnest() %>% 
+  select(-gene_assignment)
+
+gse_46691_genes2 <- tidyr::gather(gse_46691_genes, drop, gene_name, -ID) %>% 
+  select(-drop)
+
+# ---- Get Probeset Annotations ----
+# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st
+# Download: http://www.affymetrix.com/Auth/analysis/downloads/na36/wtexon/HuEx-1_0-st-v2.na36.hg19.probeset.csv.zip
+# Save zip file in "data" and extract.
+huex_annotation_path <- file.path("data", "HuEx-1_0-st-v2.na36.hg19.probeset.csv", "HuEx-1_0-st-v2.na36.hg19.probeset.csv")
+huex_header_lines <- readLines(huex_annotation_path, n = 50)
+huex_header_lines <- max(which(grepl("^#", huex_header_lines)))
+huex_annotation <- readr::read_csv(
+  huex_annotation_path, 
+  col_types = cols_only(
+    gene_assignment = col_character(), 
+    probeset_id = col_integer()
+  ), 
+  skip = huex_header_lines
+) %>% 
+  tidy_gene_assignment(gene_assignment) %>% 
+  tidyr::unnest() %>% 
+  select(probeset_id, gene_assignment = gene_assignment_1)
+
+# ---- Get Latest HUGO names ----
+download_files(
+  urls = c("hgnc_dict.tsv" = HUGO_DICT_URL),
+  dest.dir = "data"
+)
+
+hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv", 
+    "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
+  mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", ""))
+
+probe2hugo <- huex_annotation %>% 
+  distinct() %>% 
+  inner_join(hgnc_dict, by = c(gene_assignment = "RefSeq IDs"))
+
+# # This part is not needed but kept for future reference. Uncomment if the
+# # probset-hugo mapping contains un-approved symbols (check `Status` column)
+# 
+# probes_with_only_unapproved_names <- probe2hugo %>% 
+#   group_by(ID, Status) %>%
+#   count() %>%
+#   tidyr::spread(Status, n, fill = 0) %>%
+#   filter(Approved < 1, `Entry Withdrawn` > 0 | `Symbol Withdrawn` > 0)
+# 
+# # At this point, there are many HUGO names with Status = "Symbol withdrawn".
+# # These entries point to a replacing symbol in the `Approved name` column
+# # with the syntax "see XXX", e.g. "symbol withdrawn, see AGAP4" for "AGAP8".
+# # So replace with the new name and re-merge with hgnc_dict.
+# probe2hugo <- probe2hugo %>%
+#   mutate(gene_assignment = if_else(
+#     Status == "Symbol Withdrawn",
+#     str_extract(`Approved name`, "(?<=see )(.+)$"),
+#     gene_assignment
+#   )) %>% 
+#   distinct(probeset_id, gene_assignment) %>% 
+#   inner_join(hgnc_dict, by = c(gene_assignment = "Approved symbol"))
+
+gse_46691$exprs <- read_tsv_filtered(
+  file.path(data_dir, "GSE46691_quantile_normalized.txt"),
+  ID_REF %in% probe2hugo$probeset_id,
+  chunk_size = 10000, 
+  col_types = readr::cols(ID_REF = col_integer(), .default = col_double())
+)
+
+# ---- Save Checkpoint ----
+dir.create("out")
+saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds"))
+saveRDS(gse_46691$exprs, file.path("out", "gse46691_hugo_exprs.rds"))
+saveRDS(probe2hugo, file.path("out", "probe2hugo.rds"))
+
+## To restart from here without having to re-run the above:
+# gse_46691 <- readRDS(file.path("out", "gse46691_hugo_checkpoint.rds"))
+# probe2hugo <- readRDS(file.path("out", "probe2hugo.rds"))
+
+
+# ---- Probe ID to HUGO ----
+gse_46691_exprs_prepped <- gse_46691$exprs %>% 
+  tidyr::gather("sample", "value", -ID_REF) %>% 
+  mutate(sample = str_replace(sample, "\\.CEL$", "")) %>% 
+  left_join(
+    select(probe2hugo, probeset_id, hugo_name = `Approved symbol`), 
+    ., 
+    by = c(probeset_id = "ID_REF")
+  ) %>% 
+  group_by(sample, hugo_name) %>% 
+  summarize(value = SUMMARIZE_FUNCTION(value)) %>% 
+  tidyr::spread(hugo_name, value)
+
+saveRDS(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.rds"))
+
+write_tsv(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.tsv"))
+
+
+# ---- Additional Patient Information ----
+
+# Clean up pheno data and then merge with gse
+gse_46691_pdata <- pData(gse_46691$phenoData) %>% 
+  as_tibble() %>% 
+  remove_common(description) %>% 
+  clean_channel_vars() %>% 
+  select(-matches("title|characteristics|supplementary")) %>% 
+  mutate_all(as.character) %>% 
+  readr::type_convert()
+
+## To merge with expression dataset, run the following, which will prepend four
+## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and
+## `metastatic event`.
+#
+# gse_46691_pdata %>% 
+#   rename(sample = description) %>% 
+#   left_join(gse_46691_exprs_prepped, by = "sample")
diff --git a/GSE46691/functions/utils.R b/GSE46691/functions/utils.R
index 2b20b5c..1a72cf1 100644
--- a/GSE46691/functions/utils.R
+++ b/GSE46691/functions/utils.R
@@ -133,5 +133,60 @@ download_files <- function(
     }
   }
   
-  dest.dir
+  invisible(dest.dir)
+}
+
+#' Remove all common variables from a data frame
+#' 
+#' ...where common is defined as having the same value across all observations.
+remove_common <- function(x, ..., quiet = FALSE) {
+  keep <- tidyselect::vars_select(names(x), !!! quos(...))
+  remove_common_except(x, keep, quiet)
+}
+
+remove_common_except <- function(x, keep = NULL, quiet = FALSE) {
+  # remove columns with common values across all observations
+  # except for those named in `keep`
+  len_unique <- vapply(x, function(col) length(unique(col)), integer(1))
+  common <- colnames(x)[len_unique == 1]
+  common_can_remove <- setdiff(common, keep)
+  if (!quiet) {
+    value_text <- function(...) crayon::italic(encodeString(paste0(...), quote = "'"))
+    field <- function(...) crayon::green(paste0(...))
+    value <- function(...) crayon::blue(encodeString(paste0(...), quote = "'"))
+    
+    cli::cat_line("The following columns contain common information across all observations and have been removed.")
+    cli::cat_line(glue::glue("You can access this metadata in the {value('metadata')} attribute."))
+    kept <- intersect(keep, common)
+    if (length(kept)) cli::cat_line(glue::glue(
+      "{if (plural) 'Columns' else 'Column'} {kept_vars} ",
+      "{if (plural) 'do' else 'does'} not vary accross observations but ",
+      "{if (plural) 'have' else 'has'} been retained by user request",
+      plural = length(kept) > 1,
+      kept_vars = glue::glue_collapse(glue::glue("`{kept}`"), sep = ", ")
+    ))
+    for (col in common_can_remove) {
+      cli::cat_line(field(stringr::str_pad(col, max(nchar(common_can_remove)))), ': ',
+                    value_text(x[[col]][1]))
+    }
+  }
+  removed <- purrr::map(x[, common_can_remove], unique)
+  x <- x[, dplyr::setdiff(colnames(x), common_can_remove)]
+  attr(x, "metadata") <- removed
+  x
+}
+
+#' Clean up channel variables from GEO datasets
+clean_channel_vars <- function(x) {
+  idx <- grep(":ch[12]$", colnames(x))
+  ch_cols <- colnames(x)[idx]
+  stripped <- sub(":ch[12]$", "", ch_cols)
+  dups <- vapply(
+    stripped,
+    function(key) length(which(key == stripped)) > 1,
+    FUN.VALUE = logical(1))
+  # only replace the unqiuely named characteristics
+  ch_cols[!dups] <- stripped[!dups]
+  colnames(x)[idx] <- ch_cols
+  x
 }
\ No newline at end of file

From bb1a74360598999537f10500995048106053fe72 Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie <gadenbuie@mail.usf.edu>
Date: Mon, 30 Jul 2018 13:48:32 -0400
Subject: [PATCH 2/5] no need to duplicate checkpoint of exprs data

---
 GSE46691/50-build_gse46691_hugo.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
index 14b28e1..10ae0e3 100644
--- a/GSE46691/50-build_gse46691_hugo.R
+++ b/GSE46691/50-build_gse46691_hugo.R
@@ -105,7 +105,6 @@ gse_46691$exprs <- read_tsv_filtered(
 # ---- Save Checkpoint ----
 dir.create("out")
 saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds"))
-saveRDS(gse_46691$exprs, file.path("out", "gse46691_hugo_exprs.rds"))
 saveRDS(probe2hugo, file.path("out", "probe2hugo.rds"))
 
 ## To restart from here without having to re-run the above:

From d60e713c8a41faf20bf201b45dfa978041f211c2 Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie <gadenbuie@mail.usf.edu>
Date: Mon, 30 Jul 2018 13:54:58 -0400
Subject: [PATCH 3/5] Minor typo (extra 6 in gse accession number)

---
 GSE46691/50-build_gse46691_hugo.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
index 10ae0e3..000db75 100644
--- a/GSE46691/50-build_gse46691_hugo.R
+++ b/GSE46691/50-build_gse46691_hugo.R
@@ -125,9 +125,9 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>%
   summarize(value = SUMMARIZE_FUNCTION(value)) %>% 
   tidyr::spread(hugo_name, value)
 
-saveRDS(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.rds"))
+saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.rds"))
 
-write_tsv(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.tsv"))
+write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.tsv"))
 
 
 # ---- Additional Patient Information ----

From 2ad0eba9335de0228db6f1bce456c05d7da93c9b Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie <gadenbuie@mail.usf.edu>
Date: Mon, 30 Jul 2018 15:30:51 -0400
Subject: [PATCH 4/5] Update matching on hugo name

---
 GSE46691/50-build_gse46691_hugo.R | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
index 000db75..eaf1157 100644
--- a/GSE46691/50-build_gse46691_hugo.R
+++ b/GSE46691/50-build_gse46691_hugo.R
@@ -57,7 +57,7 @@ huex_annotation <- readr::read_csv(
 ) %>% 
   tidy_gene_assignment(gene_assignment) %>% 
   tidyr::unnest() %>% 
-  select(probeset_id, gene_assignment = gene_assignment_1)
+  select(-gene_assignment)
 
 # ---- Get Latest HUGO names ----
 download_files(
@@ -69,9 +69,20 @@ hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv",
     "\t", escape_double = FALSE, trim_ws = TRUE) %>% 
   mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", ""))
 
+# First, match gene symbol provided by Affymetrix annotation to HGNC (Hugo)
 probe2hugo <- huex_annotation %>% 
-  distinct() %>% 
-  inner_join(hgnc_dict, by = c(gene_assignment = "RefSeq IDs"))
+  distinct(probeset_id, gene_assignment_2) %>% 
+  inner_join(hgnc_dict, by = c(gene_assignment_2 = "Approved symbol")) %>% 
+  rename(gene_assignment = gene_assignment_2)
+
+# Then try matching using RefSeq (public sequence ID from Affymetrix annotation)
+probe2refseq2hugo <- huex_annotation %>% 
+  filter(!probeset_id %in% probe2hugo$probeset_id) %>% 
+  distinct(probeset_id, gene_assignment_1) %>% 
+  inner_join(hgnc_dict, by = c(gene_assignment_1 = "RefSeq IDs")) %>% 
+  rename(`RefSeq IDs` = gene_assignment_1, gene_assignment = `Approved symbol`)
+
+probe2hugo <- bind_rows(probe2hugo, probe2refseq2hugo) %>% arrange(probeset_id, gene_assignment)
 
 # # This part is not needed but kept for future reference. Uncomment if the
 # # probset-hugo mapping contains un-approved symbols (check `Status` column)
@@ -125,9 +136,8 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>%
   summarize(value = SUMMARIZE_FUNCTION(value)) %>% 
   tidyr::spread(hugo_name, value)
 
-saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.rds"))
-
-write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.tsv"))
+saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.rds"))
+write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.tsv"))
 
 
 # ---- Additional Patient Information ----
@@ -145,6 +155,9 @@ gse_46691_pdata <- pData(gse_46691$phenoData) %>%
 ## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and
 ## `metastatic event`.
 #
-# gse_46691_pdata %>% 
-#   rename(sample = description) %>% 
-#   left_join(gse_46691_exprs_prepped, by = "sample")
+gse_46691_exprs_pdata <- gse_46691_pdata %>%
+  rename(sample = description) %>%
+  left_join(gse_46691_exprs_prepped, by = "sample")
+
+saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds"))
+write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))

From aa1e03e6261b3c1f22cfa1ac5b73082bb105d965 Mon Sep 17 00:00:00 2001
From: Garrick Aden-Buie <gadenbuie@mail.usf.edu>
Date: Mon, 30 Jul 2018 16:25:20 -0400
Subject: [PATCH 5/5] Fix typos in refactoring

---
 GSE46691/50-build_gse46691_hugo.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R
index eaf1157..549585b 100644
--- a/GSE46691/50-build_gse46691_hugo.R
+++ b/GSE46691/50-build_gse46691_hugo.R
@@ -128,7 +128,7 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>%
   tidyr::gather("sample", "value", -ID_REF) %>% 
   mutate(sample = str_replace(sample, "\\.CEL$", "")) %>% 
   left_join(
-    select(probe2hugo, probeset_id, hugo_name = `Approved symbol`), 
+    select(probe2hugo, probeset_id, hugo_name = gene_assignment), 
     ., 
     by = c(probeset_id = "ID_REF")
   ) %>% 
@@ -159,5 +159,5 @@ gse_46691_exprs_pdata <- gse_46691_pdata %>%
   rename(sample = description) %>%
   left_join(gse_46691_exprs_prepped, by = "sample")
 
-saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds"))
-write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))
+saveRDS(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds"))
+write_tsv(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))