From c6536affba83adf9abbb157e8b80e4b8fcbe10de Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Mon, 30 Jul 2018 13:19:30 -0400 Subject: [PATCH 1/5] Modify pipeline for whole transcriptome using HUGO names --- GSE46691/50-build_gse46691_hugo.R | 151 ++++++++++++++++++++++++++++++ GSE46691/functions/utils.R | 57 ++++++++++- 2 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 GSE46691/50-build_gse46691_hugo.R diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R new file mode 100644 index 0000000..14b28e1 --- /dev/null +++ b/GSE46691/50-build_gse46691_hugo.R @@ -0,0 +1,151 @@ +# ---- OPTIONS ---- +# +# * Summarize Function * +# When HUGO Gene names are associated with multiple probesets, the probset +# expression values will be summarized using the following function: +SUMMARIZE_FUNCTION <- median + +# * HUGO Dictionary URL* +# This URL points to most recent HUGO gene names information. +# See https://beta.genenames.org/download/custom/ for more information. +HUGO_DICT_URL <- "https://beta.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=gd_pub_refseq_ids&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit" + +# ---- Load Packages ---- +if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman") +pacman::p_load(Biobase) +pacman::p_load(tidyverse) +pacman::p_load(GEOquery) + +# Source functions for these scripts +source_files <- dir("functions", pattern = "\\.R$", full.names = TRUE) +purrr::walk(source_files, source, local = globalenv()) + +# ---- Gather Data ---- +data_dir <- gather_gse46691("data") + +# ---- Load GPL and Series Matrix ---- +gse_46691 <- build_gse_46691(file_exprs = NULL, data_dir = data_dir) + +# ---- Parse and Tidy Gene Assignment ---- +# Get gene_assignment from GPL5188 annotation +gse_46691_genes <- pData(gse_46691$featureData) %>% + select(ID, gene_assignment) %>% + as_tibble() %>% + tidy_gene_assignment(gene_assignment) %>% + # gene_assignment_1 and _2 are now a list_cols but we can splat them out + # which also drops anything that didn't have an assignment, i.e. "---" + tidyr::unnest() %>% + select(-gene_assignment) + +gse_46691_genes2 <- tidyr::gather(gse_46691_genes, drop, gene_name, -ID) %>% + select(-drop) + +# ---- Get Probeset Annotations ---- +# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st +# Download: http://www.affymetrix.com/Auth/analysis/downloads/na36/wtexon/HuEx-1_0-st-v2.na36.hg19.probeset.csv.zip +# Save zip file in "data" and extract. +huex_annotation_path <- file.path("data", "HuEx-1_0-st-v2.na36.hg19.probeset.csv", "HuEx-1_0-st-v2.na36.hg19.probeset.csv") +huex_header_lines <- readLines(huex_annotation_path, n = 50) +huex_header_lines <- max(which(grepl("^#", huex_header_lines))) +huex_annotation <- readr::read_csv( + huex_annotation_path, + col_types = cols_only( + gene_assignment = col_character(), + probeset_id = col_integer() + ), + skip = huex_header_lines +) %>% + tidy_gene_assignment(gene_assignment) %>% + tidyr::unnest() %>% + select(probeset_id, gene_assignment = gene_assignment_1) + +# ---- Get Latest HUGO names ---- +download_files( + urls = c("hgnc_dict.tsv" = HUGO_DICT_URL), + dest.dir = "data" +) + +hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv", + "\t", escape_double = FALSE, trim_ws = TRUE) %>% + mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", "")) + +probe2hugo <- huex_annotation %>% + distinct() %>% + inner_join(hgnc_dict, by = c(gene_assignment = "RefSeq IDs")) + +# # This part is not needed but kept for future reference. Uncomment if the +# # probset-hugo mapping contains un-approved symbols (check `Status` column) +# +# probes_with_only_unapproved_names <- probe2hugo %>% +# group_by(ID, Status) %>% +# count() %>% +# tidyr::spread(Status, n, fill = 0) %>% +# filter(Approved < 1, `Entry Withdrawn` > 0 | `Symbol Withdrawn` > 0) +# +# # At this point, there are many HUGO names with Status = "Symbol withdrawn". +# # These entries point to a replacing symbol in the `Approved name` column +# # with the syntax "see XXX", e.g. "symbol withdrawn, see AGAP4" for "AGAP8". +# # So replace with the new name and re-merge with hgnc_dict. +# probe2hugo <- probe2hugo %>% +# mutate(gene_assignment = if_else( +# Status == "Symbol Withdrawn", +# str_extract(`Approved name`, "(?<=see )(.+)$"), +# gene_assignment +# )) %>% +# distinct(probeset_id, gene_assignment) %>% +# inner_join(hgnc_dict, by = c(gene_assignment = "Approved symbol")) + +gse_46691$exprs <- read_tsv_filtered( + file.path(data_dir, "GSE46691_quantile_normalized.txt"), + ID_REF %in% probe2hugo$probeset_id, + chunk_size = 10000, + col_types = readr::cols(ID_REF = col_integer(), .default = col_double()) +) + +# ---- Save Checkpoint ---- +dir.create("out") +saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds")) +saveRDS(gse_46691$exprs, file.path("out", "gse46691_hugo_exprs.rds")) +saveRDS(probe2hugo, file.path("out", "probe2hugo.rds")) + +## To restart from here without having to re-run the above: +# gse_46691 <- readRDS(file.path("out", "gse46691_hugo_checkpoint.rds")) +# probe2hugo <- readRDS(file.path("out", "probe2hugo.rds")) + + +# ---- Probe ID to HUGO ---- +gse_46691_exprs_prepped <- gse_46691$exprs %>% + tidyr::gather("sample", "value", -ID_REF) %>% + mutate(sample = str_replace(sample, "\\.CEL$", "")) %>% + left_join( + select(probe2hugo, probeset_id, hugo_name = `Approved symbol`), + ., + by = c(probeset_id = "ID_REF") + ) %>% + group_by(sample, hugo_name) %>% + summarize(value = SUMMARIZE_FUNCTION(value)) %>% + tidyr::spread(hugo_name, value) + +saveRDS(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.rds")) + +write_tsv(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.tsv")) + + +# ---- Additional Patient Information ---- + +# Clean up pheno data and then merge with gse +gse_46691_pdata <- pData(gse_46691$phenoData) %>% + as_tibble() %>% + remove_common(description) %>% + clean_channel_vars() %>% + select(-matches("title|characteristics|supplementary")) %>% + mutate_all(as.character) %>% + readr::type_convert() + +## To merge with expression dataset, run the following, which will prepend four +## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and +## `metastatic event`. +# +# gse_46691_pdata %>% +# rename(sample = description) %>% +# left_join(gse_46691_exprs_prepped, by = "sample") diff --git a/GSE46691/functions/utils.R b/GSE46691/functions/utils.R index 2b20b5c..1a72cf1 100644 --- a/GSE46691/functions/utils.R +++ b/GSE46691/functions/utils.R @@ -133,5 +133,60 @@ download_files <- function( } } - dest.dir + invisible(dest.dir) +} + +#' Remove all common variables from a data frame +#' +#' ...where common is defined as having the same value across all observations. +remove_common <- function(x, ..., quiet = FALSE) { + keep <- tidyselect::vars_select(names(x), !!! quos(...)) + remove_common_except(x, keep, quiet) +} + +remove_common_except <- function(x, keep = NULL, quiet = FALSE) { + # remove columns with common values across all observations + # except for those named in `keep` + len_unique <- vapply(x, function(col) length(unique(col)), integer(1)) + common <- colnames(x)[len_unique == 1] + common_can_remove <- setdiff(common, keep) + if (!quiet) { + value_text <- function(...) crayon::italic(encodeString(paste0(...), quote = "'")) + field <- function(...) crayon::green(paste0(...)) + value <- function(...) crayon::blue(encodeString(paste0(...), quote = "'")) + + cli::cat_line("The following columns contain common information across all observations and have been removed.") + cli::cat_line(glue::glue("You can access this metadata in the {value('metadata')} attribute.")) + kept <- intersect(keep, common) + if (length(kept)) cli::cat_line(glue::glue( + "{if (plural) 'Columns' else 'Column'} {kept_vars} ", + "{if (plural) 'do' else 'does'} not vary accross observations but ", + "{if (plural) 'have' else 'has'} been retained by user request", + plural = length(kept) > 1, + kept_vars = glue::glue_collapse(glue::glue("`{kept}`"), sep = ", ") + )) + for (col in common_can_remove) { + cli::cat_line(field(stringr::str_pad(col, max(nchar(common_can_remove)))), ': ', + value_text(x[[col]][1])) + } + } + removed <- purrr::map(x[, common_can_remove], unique) + x <- x[, dplyr::setdiff(colnames(x), common_can_remove)] + attr(x, "metadata") <- removed + x +} + +#' Clean up channel variables from GEO datasets +clean_channel_vars <- function(x) { + idx <- grep(":ch[12]$", colnames(x)) + ch_cols <- colnames(x)[idx] + stripped <- sub(":ch[12]$", "", ch_cols) + dups <- vapply( + stripped, + function(key) length(which(key == stripped)) > 1, + FUN.VALUE = logical(1)) + # only replace the unqiuely named characteristics + ch_cols[!dups] <- stripped[!dups] + colnames(x)[idx] <- ch_cols + x } \ No newline at end of file From bb1a74360598999537f10500995048106053fe72 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Mon, 30 Jul 2018 13:48:32 -0400 Subject: [PATCH 2/5] no need to duplicate checkpoint of exprs data --- GSE46691/50-build_gse46691_hugo.R | 1 - 1 file changed, 1 deletion(-) diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R index 14b28e1..10ae0e3 100644 --- a/GSE46691/50-build_gse46691_hugo.R +++ b/GSE46691/50-build_gse46691_hugo.R @@ -105,7 +105,6 @@ gse_46691$exprs <- read_tsv_filtered( # ---- Save Checkpoint ---- dir.create("out") saveRDS(gse_46691, file.path("out", "gse46691_hugo_checkpoint.rds")) -saveRDS(gse_46691$exprs, file.path("out", "gse46691_hugo_exprs.rds")) saveRDS(probe2hugo, file.path("out", "probe2hugo.rds")) ## To restart from here without having to re-run the above: From d60e713c8a41faf20bf201b45dfa978041f211c2 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Mon, 30 Jul 2018 13:54:58 -0400 Subject: [PATCH 3/5] Minor typo (extra 6 in gse accession number) --- GSE46691/50-build_gse46691_hugo.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R index 10ae0e3..000db75 100644 --- a/GSE46691/50-build_gse46691_hugo.R +++ b/GSE46691/50-build_gse46691_hugo.R @@ -125,9 +125,9 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>% summarize(value = SUMMARIZE_FUNCTION(value)) %>% tidyr::spread(hugo_name, value) -saveRDS(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.rds")) +saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.rds")) -write_tsv(gse_46691_exprs_prepped, file.path("out", "gse466691_hugo_exprs_prepped.tsv")) +write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.tsv")) # ---- Additional Patient Information ---- From 2ad0eba9335de0228db6f1bce456c05d7da93c9b Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Mon, 30 Jul 2018 15:30:51 -0400 Subject: [PATCH 4/5] Update matching on hugo name --- GSE46691/50-build_gse46691_hugo.R | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R index 000db75..eaf1157 100644 --- a/GSE46691/50-build_gse46691_hugo.R +++ b/GSE46691/50-build_gse46691_hugo.R @@ -57,7 +57,7 @@ huex_annotation <- readr::read_csv( ) %>% tidy_gene_assignment(gene_assignment) %>% tidyr::unnest() %>% - select(probeset_id, gene_assignment = gene_assignment_1) + select(-gene_assignment) # ---- Get Latest HUGO names ---- download_files( @@ -69,9 +69,20 @@ hgnc_dict <- readr::read_delim("data/hgnc_dict.tsv", "\t", escape_double = FALSE, trim_ws = TRUE) %>% mutate(`Approved symbol` = str_replace(`Approved symbol`, "~withdrawn", "")) +# First, match gene symbol provided by Affymetrix annotation to HGNC (Hugo) probe2hugo <- huex_annotation %>% - distinct() %>% - inner_join(hgnc_dict, by = c(gene_assignment = "RefSeq IDs")) + distinct(probeset_id, gene_assignment_2) %>% + inner_join(hgnc_dict, by = c(gene_assignment_2 = "Approved symbol")) %>% + rename(gene_assignment = gene_assignment_2) + +# Then try matching using RefSeq (public sequence ID from Affymetrix annotation) +probe2refseq2hugo <- huex_annotation %>% + filter(!probeset_id %in% probe2hugo$probeset_id) %>% + distinct(probeset_id, gene_assignment_1) %>% + inner_join(hgnc_dict, by = c(gene_assignment_1 = "RefSeq IDs")) %>% + rename(`RefSeq IDs` = gene_assignment_1, gene_assignment = `Approved symbol`) + +probe2hugo <- bind_rows(probe2hugo, probe2refseq2hugo) %>% arrange(probeset_id, gene_assignment) # # This part is not needed but kept for future reference. Uncomment if the # # probset-hugo mapping contains un-approved symbols (check `Status` column) @@ -125,9 +136,8 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>% summarize(value = SUMMARIZE_FUNCTION(value)) %>% tidyr::spread(hugo_name, value) -saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.rds")) - -write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs_prepped.tsv")) +saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.rds")) +write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-prepped.tsv")) # ---- Additional Patient Information ---- @@ -145,6 +155,9 @@ gse_46691_pdata <- pData(gse_46691$phenoData) %>% ## columns before the exprs data: `geo_accession`, `sample`, `gleason score` and ## `metastatic event`. # -# gse_46691_pdata %>% -# rename(sample = description) %>% -# left_join(gse_46691_exprs_prepped, by = "sample") +gse_46691_exprs_pdata <- gse_46691_pdata %>% + rename(sample = description) %>% + left_join(gse_46691_exprs_prepped, by = "sample") + +saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds")) +write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv")) From aa1e03e6261b3c1f22cfa1ac5b73082bb105d965 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Mon, 30 Jul 2018 16:25:20 -0400 Subject: [PATCH 5/5] Fix typos in refactoring --- GSE46691/50-build_gse46691_hugo.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GSE46691/50-build_gse46691_hugo.R b/GSE46691/50-build_gse46691_hugo.R index eaf1157..549585b 100644 --- a/GSE46691/50-build_gse46691_hugo.R +++ b/GSE46691/50-build_gse46691_hugo.R @@ -128,7 +128,7 @@ gse_46691_exprs_prepped <- gse_46691$exprs %>% tidyr::gather("sample", "value", -ID_REF) %>% mutate(sample = str_replace(sample, "\\.CEL$", "")) %>% left_join( - select(probe2hugo, probeset_id, hugo_name = `Approved symbol`), + select(probe2hugo, probeset_id, hugo_name = gene_assignment), ., by = c(probeset_id = "ID_REF") ) %>% @@ -159,5 +159,5 @@ gse_46691_exprs_pdata <- gse_46691_pdata %>% rename(sample = description) %>% left_join(gse_46691_exprs_prepped, by = "sample") -saveRDS(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds")) -write_tsv(gse_46691_exprs_prepped, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv")) +saveRDS(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.rds")) +write_tsv(gse_46691_exprs_pdata, file.path("out", "gse46691_hugo_exprs-w-phenotype.tsv"))