forked from gchoonoo/HNSCC_Notebook
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathformat_gistic.Rmd
More file actions
82 lines (59 loc) · 2.55 KB
/
format_gistic.Rmd
File metadata and controls
82 lines (59 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
---
title: "Process Gistic Files"
author: "Ted Laderas"
date: "8/15/2019"
output: html_document
---
```{r setup, include=FALSE}
# Set the default chunk option so that every chunk's source code is echoed
# in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
## Transforming the Gistic Data
This is a simple script that transforms the GISTIC calls for the HNSCC data (the `all_thresholded.by_genes.txt` file) into a long format that is better suited for use in the mutation workflow.
```{r}
library(data.table)
library(dtplyr)
library(dplyr)
library(here)
# Read the thresholded, per-gene GISTIC table.
gistic_file <- data.table::fread(
  here("data/gdac.broadinstitute.org_HNSC-TP.CopyNumber_Gistic2.Level_4.2012122100.0.0/all_thresholded.by_genes.txt")
)

# Reshape to long format: every column except the three gene annotation
# columns is stacked into a tcga_id / cnv_call pair. The id is then cut down
# to its first 12 characters and rows whose call is 0 are dropped.
gistic_long <- gistic_file %>%
  tidyr::gather(
    key = "tcga_id",
    value = "cnv_call",
    -`Gene Symbol`,
    -`Locus ID`,
    -Cytoband
  ) %>%
  mutate(tcga_id = substr(tcga_id, 1, 12)) %>%
  filter(cnv_call != 0)

# Keep only the -2 and +2 calls, mark them all as "CNV", and tag -2 as "DEL"
# and 2 as "AMP".
gistic_long_calls <- gistic_long %>%
  filter(cnv_call %in% c(-2, 2)) %>%
  mutate(
    Variant_Classification = "CNV",
    Variant_Type = case_when(
      cnv_call == -2 ~ "DEL",
      cnv_call == 2 ~ "AMP"
    )
  )

# Persist the filtered calls so later chunks can reload them.
saveRDS(gistic_long_calls, file = here("data/gistic_long_calls.rds"))
```
```{r}
# Preview the first rows of the filtered -2 / +2 call table.
head(gistic_long_calls)
```
```{r}
library(data.table)
library(dplyr)
# Reload the filtered GISTIC calls from disk so this chunk does not depend on
# objects left in memory by an earlier chunk.
gistic_long_calls <- readRDS(here::here("data/gistic_long_calls.rds"))

# These are the columns the MAF-style table needs to provide.
cols <- c(
  "Hugo_Symbol", "Chrom", "Ncbi_Build",
  "Start_Position", "Reference_Allele",
  "Tumor_Sample_Barcode", "Tumor_Seq_Allele1",
  "Tumor_Seq_Allele2", "Variant_Classification",
  "Variant_Type", "Dbsnp_Rs", "Dbsnp_Val_Status"
)

# Read an existing MAF file and preview it (presumably as a reference for the
# expected column layout -- it is not used further in this chunk).
tcga_samp <- fread(here::here("data/TCGA.HNSC.HPV-positive.maf"))
head(tcga_samp)

# Add MAF-named columns to the GISTIC calls. Gene symbol, cytoband and sample
# id are copied from the existing columns; fields that GISTIC does not provide
# are filled with NA placeholders.
# NOTE(review): the build is hard-coded as "GRCh38" -- confirm this matches
# the reference genome used for the GISTIC run.
new_gistic_long <- gistic_long_calls %>%
  mutate(
    Hugo_Symbol = `Gene Symbol`,
    Chrom = Cytoband,
    Tumor_Sample_Barcode = tcga_id,
    Start_Position = NA,
    Reference_Allele = NA,
    Ncbi_Build = "GRCh38",
    Tumor_Seq_Allele1 = NA,
    Tumor_Seq_Allele2 = NA,
    Dbsnp_Rs = NA,
    Dbsnp_Val_Status = NA,
    Sequencer = NA
  )

# Fail early, with the offending names, if any needed column is absent (for
# example when the saved RDS predates the Variant_* columns).
missing_cols <- setdiff(cols, names(new_gistic_long))
if (length(missing_cols) > 0) {
  stop(
    "new_gistic_long is missing needed columns: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Write the table as a tab-separated file without row names.
write.table(
  new_gistic_long,
  file = here::here("data/gistic_long_file.maf"),
  sep = "\t",
  row.names = FALSE
)

# The checks below are wrapped in try() so that an error in one of them is
# reported without stopping the remaining code.
# NOTE(review): `packageDir` looks like a placeholder namespace -- confirm the
# actual package that provides these internal functions.

# show that this dummied data passes dataCleaningAndCheck
try(packageDir:::dataCleaningAndCheck(head(new_gistic_long)))

# show that it passes addPidColumn
try(packageDir:::addPidColumn(head(new_gistic_long)))

# run FilterDuplicates on the full table (not just the first rows)
try(packageDir:::FilterDuplicates(new_gistic_long))
```