-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathacc_functions.R
More file actions
1952 lines (1728 loc) · 76.2 KB
/
acc_functions.R
File metadata and controls
1952 lines (1728 loc) · 76.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#accessory functions
if(!require("RUnit")){
install.packages("RUnit")
library("RUnit")
}
if(!require("tcltk")){
install.packages("tcltk")
library("tcltk")
}
acc_loaded=T
source('./summaryTable5.R')
source('./pathway_functions.R')
source('./InitiateDataStructures.R')
source('./OverlapAnalysisFunctions.R')
source('./path_paint.R')
source('./save_and_load_data.R')
source('./SettingsObjectInterface.R')
# source("http://bioconductor.org/biocLite.R")
# biocLite("RCytoscape")
library("RCytoscape")
library("graphite")
if(!exists("VERBOSE")) VERBOSE = F
.pardefault <- par(no.readonly = T)
.parpin <-par()$pin
# .pardefault <- par()
promptNumeric <- function (prompt) {
while(T){
line=readline(prompt)
if(line=="") break
line = suppressWarnings(expr=as.numeric(line))
if(!is.na(line)) break
print("Sorry, your input could not be understood, please try again")
}
return(line)
}
longTextBarPlot<-function(data, lab, main=""){
bpdata = barplot(data, horiz=T, main=main)
xmin = par("usr")[1]
xrange = par("usr")[2] - par("usr")[1]
xcoord = xmin + (xrange/40)
nlev = length(data)
text(x=xcoord, y=bpdata, labels=lab, pos=4)
}
save.plot<-function(pname){
return(AutoSavePlot(pname))
}
AutoSavePlot<-function(pname){
if(is.null(pname)){
pname = "tmp_image_file"
}
if(!grepl(pattern=".png$", x=pname)) pname = paste(pname,".png",sep="")
pfname = plotFileName(pname)
dev.copy(png,pfname)
dev.off()
return(pfname)
}
plotFileName<-function(pname){
#takes a name, makes sure there's a directory there, adds time stamp to make it unique, adds a postfix to make sure
autoincrament = F
root="./output/imageTemp/"
if(grepl(pattern="[/]",x=pname)){#the a file path was passed, find the root
root = paste(paste(strsplit(x=pname, split="[/]")[[1]][1:(length(strsplit(x=pname, split="[/]")[[1]])-1)],collapse="/"),
"/",
sep="")
pname = strsplit(x=pname, split="[/]")[[1]][length(strsplit(x=pname, split="[/]")[[1]])]#the last part of the path is the actual file name
}
dir.create(path=root, recursive=T, showWarnings=F)
fullPath = paste(root, "graphic",gsub(pattern="[-:]", replacement=".", x=as.character(Sys.time())),".", pname, sep="")
#check if it exists, if it does, append a number
while(file.exists(fullPath)&autoincrament){
pname = paste("1", pname, sep=".")
fullPath = paste(root, "graphic",gsub(pattern="[-:]", replacement=".", x=as.character(Sys.time())),".", pname, sep="")
}
return(fullPath)
}
#pgvmFromStacked makes a patient gene value matrix from data in stacked format
#rows = genes, columns = patients, values = value for gene, ie, for a gene which has a missense mutation,
# the value would be "missense"
#if more than one value are given for a particular patient-gene combination
#takes: stackedData: data frame with patient gene data in stacked format
# valueCol: the name of the column with the gene values
# patientIdCol: the name of the column with patient ids
# geneCol: the name of the column with the gene identifiers
# blankSymbol: what is placed in the pvm when the stacked file does not give a value
# for a given patient-gene combination
#returns: patient gene value matrix
pgvmFromStacked<-function(stackedData, valueCol, patientIdCol, geneCol, blankSymbol=NA){
#1: find list of unique pids
pids = unique(stackedData[,patientIdCol])
#2: find lis of unique genes
ugenes = unique(stackedData[,geneCol])
#case to handle: if patient does not have a value for a gene,
#it will be set to NULL
pvm = matrix(data = blankSymbol, ncol=length(pids), nrow=length(ugenes), dimnames=list(ugenes, pids))
#3: put values
for(p in pids){
prows = stackedData[stackedData[,patientIdCol] == p,]
for(g in ugenes){
pgrows = prows[prows[,geneCol]==g,]
if(nrow(pgrows) == 1){
pvm[g,p] = pgrows[1,valueCol]
}else if(nrow(pgrows) > 1){
#case to handle: if there are more than one value for a gene..
cat("\nNote: multiple values found for this patient-gene combo:", p, g, "\n")
pvm[g,p] = paste(pgrows[,valueCol], collapse=" ")
}
}
}
return(pvm)
}
PGMFromVector<-function(genevector){
if(is.data.frame(genevector)) genevector = as.matrix(genevector)
if(is.matrix(genevector)) genevector = as.vector(genevector)
pgmout = matrix(data=1, nrow=length(genevector), ncol=1, dimnames=list(genevector, "Genes_in_set"))
return(pgmout)
}
#selectionList()
#prompts the user with a numbered list of selection options
#takes a vector of options to choose from (does not have to be unique set)
#returns logic vector indicating which items in the input vector match the/those
# item(s) chosen by user
selectionList<-function(valcol){
usel = unique(valcol)
useltmp = gsub(pattern="_",replacement=" ",x=usel)
uselmat = matrix(data=1:length(usel), ncol=1, dimnames=list(useltmp, "selection number"))
print(uselmat)
uin = readline("Please enter the number(s) corresponding to your selection(s)\n")
uin = as.integer(strsplit(x=uin, split=" ")[[1]])
lvout = valcol%in%usel[uin]
return(lvout)
}
filePrompt<-function(defaultfile){
#prompts user to select file for data input
#provides default file option
#returns file name
fsel = readline(paste("\nTo select a data file, Enter s\n",
"To load the default data from \n",defaultfile,",\njust press enter \n",sep=""))
if(fsel=="s"){
pfile = file.choose()
}else{
pfile = defaultfile
}
cat("\nLoading data from:\n",pfile,"\n")
return(pfile)
}
open.PGM<-function(fname = NULL){
#opens a patient gene matrix file
#takes: 1) file name
# or 2) no arg (in this case, user will be prompted to select a file)
#returns: patient gene matrix
if(is.null(fname)){
fname = file.choose()
}
pgmdat = read.table(file=fname,check.names=T,
header=T,
row.names=1,
sep="\t",
stringsAsFactors=F)
pgmdat = as.matrix(pgmdat)
return(pgmdat)
}
#extract_pid_w_matchnormal
#like extract_pid but leaves the sample type code at the end (match/normal)
#takes: n: a string that might contain a TCGA id, ex: TCGA-IQ-7632-01 in unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt
#returns: the TCGA id (with -'s replaced by .'s) if an id is found; NULL if there is not a TCGA id in the input string
extract_pid_w_matchnormal<-function(n=NULL)
{
if(is.null(n)){n="unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt"}
#first replace all special characters with spaces
#en = gsub(pattern="[[:punct:]]", replacement=" ", x=n)
sen = strsplit(n, split="[[:punct:]]")[[1]]
#find where "TCGA" is
i = grep(pattern="TCGA", x=sen)
if(length(i)<1){
return(NULL)
}
#take out the TCGA part, plus the next two parts
sub = sen[c(i, i+1, i+2, i+3)]
sub[4] = gsub(pattern="[a-zA-Z]*", "", x=sub[4])
out = paste(sub, sep=".", collapse=".")
return(out)
}
#cleanGeneSymbols
#cleanes input gene symbols, removing leading and trailing spaces, quotes,
#repaces spaces with dashes and periods with dashes
#takes & returns vector of gene symbols
cleanGeneSymbols<-function(genes){
#remove leading and trailing spaces
ltspace2 = gsub(pattern="^ ",replacement="",x=genes)
#remove trailing spaces
ltspace2 = gsub(pattern=" $",replacement="",x=ltspace2)
#replace " " with "-"
spfixed2 = gsub(pattern=" ",replacement="-",x=ltspace2)
spfixed2 = gsub(pattern="\\.",replacement="-",x=spfixed2)
spfixed2 = gsub(pattern="\"",replacement="",x=spfixed2)
out = toupper(spfixed2)
return(out)
}
#corListCheck
#if the HUGO symbols reference file has changed,
#corListCheck will assure the symbol changes are consistent with the new reference file
#takes: cl: corrections list
# htab: hugo reference table
#returns: corrections list data frame
#
#works with files: "./reference_data/gene_symbol_corrections_list.txt"
#
corListCheck<-function(cl=NULL, htab=NULL){
cat("\nScreening symbol correction table...\n")
curhugofname = "./reference_data/current_hugo_table.txt"
correctionsfile="./reference_data/gene_symbol_corrections_list.txt"
if(is.null(cl)){
cl = read.delim(file=correctionsfile, header=T, sep="\t", stringsAsFactors=F,quote="", na.strings="-")
}
if(is.null(htab)){
cat("\nOpening HUGO symbol reference table...\n")
htab=read.table(file=curhugofname,#"./reference_data/current_hugo_table.txt","./reference_data/slim_current_hugo_table.txt"
header=T,
sep="\t",
quote="",
comment.char="",
stringsAsFactors=F,na.strings="-")
}
#handle three cases:
#1: symbol added to HUGO ref
#2: symbol removed from HUGO ref
#3: symbol changed
#for 2 and 3:
#check if any of the "new" are withdrawn:
wnew = paste(cl$new_symbol, "~withdrawn", sep="")
iswithdrawn = wnew%in%htab$Approved.Symbol
if(sum(iswithdrawn)){#if some of the symbols have been withdrawn, see if replacement symbols can be found
cltmp = cbind.data.frame(cl[iswithdrawn,], wnew[iswithdrawn], stringsAsFactors=F)
colnames(cltmp)[3] = "withdrawn"
#for each row in cltmp, merg it's corresponding row in htab
htabtmp = htab[htab$Approved.Symbol%in%cltmp$withdrawn,1:8]#extract the rows from htab
chmerge = merge(x=cltmp, y=htabtmp, by.x="withdrawn", by.y="Approved.Symbol")
####### issue 2
###### check if some symbols were withdrawn and there is no replacement
if(sum(chmerge$Status=="Entry Withdrawn")){
cat("\nOf the symbols in the symbol correction table, these entries were\nwithdrawn as official HUGO sybols, and no replacements were provided:\n")
print(chmerge$new_symbol[chmerge$Status%in%"Entry Withdrawn"])
}
chmerge = chmerge[chmerge$Status == "Symbol Withdrawn",]
if(nrow(chmerge)){
tmp= sapply(chmerge$Approved.Name, function(x) strsplit(x=x, split="symbol withdrawn, see ")[[1]][2])
chmerge$new_symbol = tmp
clt = rbind(cl, chmerge[,c("old_symbol","new_symbol")])
clt = clt[!duplicated(x=clt$old_symbol, fromLast=T),]
cl = clt
cl = corListCheck(cl=cl,htab=htab)#recursive call,
#handles case where symbol was changed more than once
}
}
### now scan the old_symbols for appoved symbols
#remove rows where the old_symbol is an approved symbol
if(sum(cl$old_symbol%in%htab$Approved.Symbol)){
cat("\nSome rows in the symbol correction table were ",
"found to correct symbols that were already approved.",
"These rows will be removed to prevent data inconsistencies\n")
cl = cl[!cl$old_symbol%in%htab$Approved.Symbol,]
}
return(cl)
}
toPGMWithCoverage<-function(sds, coverageSet){
pgm = toPGM(sds)
additionalRowNames = setdiff(coverageSet, rownames(pgm))
newChunk = matrix(data=F,
nrow=length(additionalRowNames),
ncol=ncol(pgm),
dimnames=list(additionalRowNames, colnames(pgm)))
}
##toPGM
# converts a stacked format to a patient gene matrix
#
#takes: a data frame with columns "Symbol" and "Ids"
#returns: patient gene matrix: rownames = gene names,
# colnames = patientIDs
# cell values: logical, T if gene is active in that patient
toPGM<-function(sds){
#find unique patient ids
pids = unique(sds$Ids)
pids = make.names(pids)
sds$Ids = make.names(sds$Ids)
#find unique genes affected
syms = unique(sds$Symbol)
#make the matrix
out = matrix(data=F, nrow=length(syms), ncol=length(pids), dimnames=list(syms,pids))
#for each patient, find the set of genes
for(p in pids){
#get the current patient's active genes
cursyms = sds$Symbol[sds$Ids == p]
#assign the cells for that patient, for those genes
out[cursyms, p] = T
}
return(out)
}
# toBipartateGraph
# takes a data frame with two columns
# makes a bipartate graph from the two columns
toBipartateGraph<-function(dfin){
colnames(dfin)<-c("Ids", "Symbol")
#find unique patient ids
pids = unique(dfin$Ids)
pids = make.names(pids)
dfin$Ids = make.names(dfin$Ids)
#find unique genes affected
syms = unique(dfin$Symbol)
#make the matrix
out = matrix(data=F, nrow=length(syms), ncol=length(pids), dimnames=list(syms,pids))
#for each patient, find the set of genes
for(p in pids){
#get the current patient's active genes
cursyms = dfin$Symbol[dfin$Ids == p]
#assign the cells for that patient, for those genes
out[cursyms, p] = T
}
return(out)
}
#switchIds
#takes set of ids (vector)
#returns set of ids with as many switched as possible
switchIds<-function(idv,htab=NULL){
#see which can be switched
#idv%in%htab$
dict = c("testSymbol")
names(dict) = c("UniProt:P12956")
for(i in 1:length(idv)){
swap = dict[idv[i]]
if(!is.na(swap)){
idv[i] = swap
}
}
return(idv)
}
paths<-function(smd){
return(smd@paths)
}
#rounds columns in table to "figs" number of significant digits
#and replaces underscores in col names with spaces
cleanTables<-function(tab,figs=3, verbose=F){
cat("\nCleaning table..")
if(!(is.data.frame(tab)|is.matrix(tab))){
cat("WAIT, that's not a table..\n")
return(tab)
}
if(nrow(tab)){
for(i in 1:ncol(tab)){
ccol = tab[,i]
cname = colnames(tab)[i]
if(verbose){
print(cname)
print(tab[1,i])
print(typeof(ccol))
print(mode(ccol))
}
if(typeof(tab[,i])%in%c("integer","double")){
tab[,i] = round(tab[,i],digits=figs)
}
}
}else{
print("table has no rows")
}
if(ncol(tab)){
#replace underscores:
colnames(tab)<-gsub(pattern="[_]", replacement=" ", x=colnames(tab))
}else{
print("table has no columns")
}
cat("cleaned\n")
return(tab)
}
#getHugoSymbols()
#paths_detail: paths object: if passed, this function will excise the currently used hugo symbol set and return them.<them?>
#curhugofname: the name of the
#returns HUGO lookup table
#
#function allows re-download of hugo cross ref file.
getHugoSymbols<-function(paths_detail=NULL,
curhugofname="./reference_data/current_hugo_table_slim.txt",
verbose=F){
if(class(paths_detail)=="Study"){
paths_detail=paths_detail@studyMetaData@paths
}
if(is.null(paths_detail)){
cat("\nLoading official HUGO gene symbols\n")
#cref = read.table(file=curhugofname, header=T, stringsAsFactors=F)
cref=read.table(file=curhugofname,#"./reference_data/current_hugo_table.txt","./reference_data/slim_current_hugo_table.txt"
header=T,
sep="\t",
quote="",
comment.char="",
stringsAsFactors=F, na.strings="-")
cref$Approved.Symbol = cleanGeneSymbols(cref$Approved.Symbol)
cat("\nUsing a symbol correction file downloaded from http://www.genenames.org/",
"on",as.character(file.info(curhugofname)$mtime),".\n")
ginfo = ""
if(verbose){
ginfo=readline(paste("To get info on how to update this file enter \"i\"\n",
"Otherwise, just press enter to continue"))
}else{
cat("\nTo get info on how to update HUGO symbol cross reference file \n",
"or automatically re-download cross ref file, re-run the getHugoSymbols() function\n",
"using the argument verbose=T (default: verbose=F)\n")
}
if(ginfo=="i"){
if("r"==readline("To attempt to download a current cross reference table of HUGO symbols enter r \n(note: this can take more than 10 minutes to download)\nIf you tried this once and it didn't work, press enter to get other options. ")){
full_hurl = "http://www.genenames.org/cgi-bin/hgnc_downloads?title=HGNC+output+data&hgnc_dbtag=on&preset=all&status=Approved&status=Entry+Withdrawn&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag"
reopenedfurl = read.table(file=full_hurl,sep="\t",comment.char="",header=T,quote="", stringsAsFactors=F,na.strings="-")
write.table(x=reopenedfurl,file=curhugofname,quote=F,sep="\t")
cref = read.table(file=curhugofname,sep="\t",comment.char="",header=T,quote="", stringsAsFactors=F, na.strings="-")
cref$Approved.Symbol = cleanGeneSymbols(cref$Approved.Symbol)
}else{
cat("Go to http://www.genenames.org/ and find the biomart interface.\n",
"Make sure to check the boxes to download Approved symbol, previous symbol\n",
"synonyms, status, approved name, date approved and name synonyms.\n")
cat("Make sure column names match those in the file",curhugofname,"\n",
"and replace that file with the one downloaded (it is suggested that\n",
"you rename the old file so as not to loose it if anything goes wrong\n")
cat("These are the column names from the current hugo cross reference file:")
print(colnames(cref))
readline("Press enter to continue.\nPress escape to exit the program so that you can update the cross ref file.")
}
}
return(cref)
}else{
return(paths_detail$hugo_lookup)
}#if/else
}#getHugoSymbols
#corsym
#corrects symbols in somatic data
#takes symbol_set: 1 of 2 options
# 1)table with columns:
# "Hugo_Symbol": the set of symbols which should be checked for symbols that need correction
# col2: The chromosome in the genome that the gene symbols are associated with
# 2)vector of symbols to be corrected
# curhugofname: the relative file path to a hugo lookup table
# verbose: if this is set to true, no text output will be given and previously official symbols will not be checked.
#
#returns: vector: set of gene symbols, corrected to hugo symbols
#
#usage: som_select[,"Hugo_Symbol"] = corsym(som_select[,c("Hugo_Symbol", col2)], "./reference_data/hugo_dl.txt", verbose=verbose)
corsym<-function(symbol_set, hugoref=NULL, verbose=T, col2="Chrom", correctionsfile="./reference_data/gene_symbol_corrections_list.txt"){
if(verbose) cat("\n\nChecking that gene symbols match official HUGO gene symbols. . . . . \n")
curhugofname="./reference_data/current_hugo_table.txt"
if(is.null(hugoref)){
cref=getHugoSymbols(curhugofname=curhugofname)
}else if(is.character(hugoref)){#if hugoref is a character, it is a file name; open the file
cref=getHugoSymbols(curhugofname=hugoref)#the direct passing of a hugo file name. . .not sure if we're doing that any more...
}else if(class(hugoref)=="Study"){
cref=hugoref@studyMetaData@paths$HUGOtable
hugoref=cref
}else{#the hugo table was passed directly
cref = hugoref
}
#make sure the input symbol set is a table
if(is.vector(symbol_set)){
symbol_set = cbind(symbol_set,rep("",times=length(symbol_set)))
}
#correct the column names
if(!sum(c("Hugo_Symbol",col2)%in%colnames(symbol_set))){
colnames(symbol_set)<-c("Hugo_Symbol",col2)
}
if(verbose){
cat(sep="","\n\n",nrow(cref)," symbol records loaded from ",curhugofname,
"\nNote: these do not all correspond to currently approved HUGO symbols.",
"\nSome are place holders from previously used or withdrawn symbols\n")
}
###########################################################
# Correct gene names
###########################################################
#check how many symbols are in both TCGA somatic data and HUGO
#cross refernce table : if chasm output cant so easily be matched to chasm input
#change all symbols to upper case:
cref[,"Approved.Symbol"] = toupper(cref[,"Approved.Symbol"])
symbol_set[,"Hugo_Symbol"] = toupper(symbol_set[,"Hugo_Symbol"])
if(verbose){
cat("\nTo conduct symbol comparrisons and corrections, these reformattings were made:")
cat("\nConversion to upper case.\nRemoval of leading and trailing spaces.",
"\nConversion of spaces and periods to dashes.\n",
"***NOTE: These reformattings are not saved or recorded to any file!!\n")
}
symbol_set[,"Hugo_Symbol"]= cleanGeneSymbols(symbol_set[,"Hugo_Symbol"])
tcga_hugo = intersect(cref[,"Approved.Symbol"], symbol_set[,"Hugo_Symbol"])
if(verbose){
cat(paste("\nOut of the ", as.character(length(unique(symbol_set[,"Hugo_Symbol"])))," symbols in current data set,\n",
length(tcga_hugo), " were found to be currently approved HUGO symbols.\n",sep=""))
}
not_approvedi = which(!symbol_set[,"Hugo_Symbol"] %in% cref[,"Approved.Symbol"])
not_approved = symbol_set[not_approvedi,c("Hugo_Symbol", col2), drop=F]
not_approved = unique(not_approved)
if(!max(0,nrow(not_approved))){#if all symbols match approved hugo symbols, this will be true
return(symbol_set[,"Hugo_Symbol"])
}
if((max(0,nrow(not_approved))<50) & verbose){
cat("\nThese are the symbols that were not found to be approved HUGO symbols:\n")
print(not_approved)
}else if(verbose){
cat("\nThese are the first 50 out of",max(nrow(not_approved),0),"symbols that were not found to be approved HUGO symbols:\n")
print(not_approved[1:50,])
}
#check how many appear to be Micro RNA genes
possible_miRNAs = grep(pattern="^MIR|-MIR",x=not_approved,ignore.case=T)
if(length(possible_miRNAs)&verbose){
cat("\nOf the aformentioned symbols,",length(possible_miRNAs),"symbols begin with \"MIR-\", or contain the string \"-MIR\"",
"and thus appear to be symbols for microRNAs, as opposed to protein-coding genes.\n",
"Note: it is possible that gene symbols not containing \"MIR\" correspond to microRNAs as well.\n")
}
if(verbose){cat("\nA full list of symbols found not to be approved, before attempted",
"correction, can be found at ./output/not_approved_symbols_from_last_run.txt\n")}
write.table(x=not_approved,file="./output/not_approved_symbols_from_last_run.txt",sep="\t",row.names=F)
correction_set = NULL#the set of mappings from old names to new names, which is retreived from the corrections file
correctedSymbols = NULL#this will contain mappings to be made in this run of the program
new_corrections = NULL
################################### Check Corrections File
if(file.exists(correctionsfile)){
raw_correction_set = read.delim(file=correctionsfile, header=T, sep="\t", stringsAsFactors=F,quote="", na.strings="-")
resave = F
#check for repeated rows
dupcheck = duplicated(raw_correction_set)
if(sum(dupcheck)){
raw_correction_set = raw_correction_set[!dupcheck,]
resave=T
}
#check that individual old symbol do not have multiple entries
dupcheck2 = duplicated(raw_correction_set$old_symbol)
if(sum(dupcheck2)){
allolddup = dupcheck2 | duplicated(raw_correction_set$old_symbol,fromLast=T)
cat("\nWarning, multimapping issue found!\n",
"These symbol corrections indicate multiple, different corrections for the same symbols:\n")
print(raw_correction_set[allolddup,])
readline("Please edit these symbols in the symbol correction file and re-run the program to prevent errors\nPress enter to continue")
}
if(sum(raw_correction_set$old_symbol == raw_correction_set$new_symbol)){#if there are symbols that are changed to the same thing
#remove symbols that are the same btwx old and new
chsymindex = which(raw_correction_set$old_symbol != raw_correction_set$new_symbol)
correction_set = raw_correction_set[chsymindex,]
resave=T
}
raw_correction_set_tmp=corListCheck(cl=raw_correction_set, htab=hugoref)
if(!all.equal(target=raw_correction_set_tmp, current=raw_correction_set)==T){
resave = T
}
raw_correction_set = raw_correction_set_tmp
correction_set = raw_correction_set
numcor = intersect(x=correction_set[,1], not_approved[,"Hugo_Symbol"])#numcor is the number of symbols in symbols set
#that can be corrected from the symbol correction file
if(length(numcor)){
if(verbose){
cat("\nA previously made corrections file was found at",correctionsfile,"\nThis file contains corrections for ",
as.character(length(numcor)), "of the ",nrow(not_approved),"unapproved gene symbols.\n")
}
use_previous=""
if(verbose){
use_previous = readline(paste("Press enter to use these corrections.\nEnter anything else to skip using these corrections\n",
"(You will be provided with a chance to select your own corrections): "))
}
if(use_previous==""){
symbol_set[,"Hugo_Symbol"] = swapsymbols2(corrected=correction_set, genelist=symbol_set[,"Hugo_Symbol"])
not_approved = which(!symbol_set[,"Hugo_Symbol"] %in% cref[,"Approved.Symbol"])#temporary state of not_approved
not_approved = symbol_set[not_approved,c("Hugo_Symbol", col2), drop=F]#not approved now has two columns
not_approved = unique(not_approved)
if(verbose) cat("\nThere are now", as.character(max(0,nrow(not_approved))), "symbols remaining which do not match approved HUGO symbols.")
}
print("not approved 1.2:")
print(not_approved)
if(resave){
write.table(x=correction_set, file=correctionsfile,
quote=F, sep="\t", row.names=F, col.names=c("old_symbol", "new_symbol"))
raw_correction_set = read.delim(file=correctionsfile, header=T, sep="\t", stringsAsFactors=F,quote="", na.strings="-")
}
}
if(max(0,nrow(not_approved))==0){
return(symbol_set[,"Hugo_Symbol"])
}
}
if(verbose){
if(max(0,nrow(not_approved))){
################################### Check previously used symbols
checkprev = readline("\nWould you like to check previously official HUGO symbols for the remaining unmatching symbols? \n(enter y or n)")
if(checkprev=="y"){
######## Check previous HUGO symbols
switches = checkPreviousSymbols(symbols=not_approved, indexes=1:nrow(not_approved), hugolookup=cref, col2=col2)
if(nrow(switches)){
print(switches)
symbol_set[,"Hugo_Symbol"]=swapsymbols2(corrected=switches, genelist=symbol_set[,"Hugo_Symbol"])
not_approved = which(!symbol_set[,"Hugo_Symbol"] %in% cref[,"Approved.Symbol"])#temporary state of not_approved
not_approved = symbol_set[not_approved,c("Hugo_Symbol", col2)]#not approved now has two columns
not_approved = unique(not_approved)
new_corrections = rbind(switches, new_corrections)
cat("\n",nrow(switches),"mystery symbols were found in the previously official hugo symbols.\n")
cat("There is/are now", as.character(nrow(not_approved)), "symbol(s) remaining which do/does not match approved HUGO symbols.\n")
print(not_approved)
}else{
cat("\nNo matches were found in the previously used HUGO symbols\n")
}
}
}
print(not_approved)
if(max(nrow(not_approved),0)){
################################### Check synonyms
checksyn = readline("\nWould you like to check synonyms for the remaining unmatching symbols? (enter y or n) ")
if(checksyn=="y"){
syncor = checkSynonyms(symbols=not_approved, indexes=1:nrow(not_approved), hugolookup=cref, col2=col2)
if(nrow(syncor)){
colnames(syncor)<-c("old_symbol","new_symbol")
symbol_set[,"Hugo_Symbol"]=swapsymbols2(corrected=syncor, genelist=symbol_set[,"Hugo_Symbol"])
not_approved = which(!symbol_set[,"Hugo_Symbol"] %in% cref[,"Approved.Symbol"])#temporary state of not_approved
not_approved = symbol_set[not_approved,c("Hugo_Symbol", col2), drop=F]#not approved now has two columns
not_approved = unique(not_approved)
new_corrections = rbind(new_corrections, syncor)
}
cat("\nThere is/are now", as.character(nrow(not_approved)), "symbol(s) remaining which do not match approved HUGO symbols.\n")
print(not_approved)
}
if(verbose){cat("\nA full list of symbols which were found not to be approved and which could\n",
"not be corrected can be found at ./output/not_approved_not_correctable_symbols_from_last_run.txt\n")}
write.table(x=not_approved,file="./output/not_approved_not_correctable_symbols_from_last_run.txt",sep="\t",row.names=F)
if(length(new_corrections)){
cat("\nThese are the new corrections that were just added to the symbol corrections file:\n")
print(new_corrections)
if(readline("Would you like to save the set of corrections just made to the corrections file (y/n)")=="y"){
addCorrections(new_corrections=new_corrections, correction_set=correction_set, correctionsfile=correctionsfile)
}
}
}
}#if verbose
return(symbol_set[,"Hugo_Symbol"])
}#corsym function
#'@title addCorrections()
#'@description Adds corrections to the corrections file
#'@param new_corrections: a two column matrix; column 1 = old, incorrect symbols, column 2 = new, corrected symbols
#'@param correctionsfile: the file name of the corrections file
#'@param correction set: the original contents of teh corrections file before new corrections were made
addCorrections<-function(new_corrections, correctionsfile="./reference_data/gene_symbol_corrections_list.txt", correction_set=NULL){
if(is.null(correction_set)){
if(file.exists(correctionsfile)){
correction_set = as.matrix(read.delim(file=correctionsfile, header=T, sep="\t", stringsAsFactors=F,quote="", na.strings="-"))
}else{
correction_set = matrix(data="", ncol=2, nrow=0, dimnames=list(NULL, c("old_symbol", "new_symbol")))
}
}
colnames(new_corrections)<-colnames(correction_set)
final_corrections = rbind(correction_set, new_corrections)
#screen for NA values
final_corrections = final_corrections[!is.na(final_corrections[,1]),,drop=F]
final_corrections = final_corrections[!is.na(final_corrections[,2]),,drop=F]
#now attempt to clean the final_collections of rows where no changes are made (ie, old symbol is the same as new symbol)
final_corrections=final_corrections[final_corrections[,1]!=final_corrections[,2],,drop=F]
final_corrections = unique(final_corrections)
write.table(x=final_corrections, file=correctionsfile,
quote=F, sep="\t", row.names=F, col.names=c("old_symbol", "new_symbol"))
cat("\nSymbol corrections recorded in file:", correctionsfile, "\n")
}
#examineHugoSet
#get summary information on a set of HUGO symbols
#takes: symbol_set: the list of symbols
#study_name and data_type describe the source of the symbols, for display on the read out
#
examineHugoSet<-function(symbol_set,study_name,data_type,curhugofname="./reference_data/current_hugo_table.txt"){
cref = read.delim(file=curhugofname, header=T, stringsAsFactors=F, na.strings="-")
approvedHugoFile = paste("./output/",study_name,"_approved_hugo_w_annotation_from_",data_type,"_data.txt",sep="")
cat("\nA table of approved HUGO symbols, including those just corrected, can be found in the file",approvedHugoFile,"\n")
in_study = cref[cref[,"Approved.Symbol"]%in%symbol_set,]
not_hugo = setdiff(x=symbol_set, y=cref[,"Approved.Symbol"])
nhtab = matrix(nrow=length(not_hugo),ncol=ncol(in_study))
colnames(nhtab)<-colnames(in_study)
nhtab[,"Approved.Symbol"] = not_hugo
nhtab[,"Locus.Type"] = rep("Not HUGO symbol",times=length(not_hugo))
nhtab[,"Locus.Group"] = rep("Not HUGO symbol",times=length(not_hugo))
in_study = rbind(in_study, nhtab)
write.table(x=in_study,file=approvedHugoFile,quote=F,sep="\t",row.names=F,col.names=T)
return(in_study)
}
#takes: indexes: indexes of symbols to be checked
# symbols: list of symbols which the indexes refer to
# (those to be switched and those to remain)
# hugolookup: hugo look up table with columns : <Approved.Symbols>, <Previous.Symbols>
#returns: table: <index to be switched> <what it should be switched to>
checkPreviousSymbols<-function(symbols, indexes, hugolookup, col2){
hugolookup[,"Previous.Symbols"] = toupper(hugolookup[,"Previous.Symbols"])
oldsyms = NULL #output set of those which are previous symbols
ssymbols = NULL #
for(i in 1:length(indexes))
{
cur = symbols[indexes[i],"Hugo_Symbol"]
curchrom = symbols[indexes[i], col2]
cursym = gsub(pattern="[[:punct:]]", replacement=" ", x=cur)
#grep the row against the previous symbols column
pat = paste("(^|[[:blank:]])", cursym, "(,|$)", collapse="", sep="")
ind = grep(pattern=pat, x=hugolookup[["Previous.Symbols"]],ignore.case=T)
if(length(ind)==1){#if grep caught something
ssymbols = c(ssymbols, as.character(hugolookup[ind,"Approved.Symbol"]))
oldsyms = c(oldsyms, cur)
}else if(length(ind)>1){#if grep found more than one matching previous symbol
dcols = colnames(hugolookup)[grep("date", colnames(hugolookup), ignore.case=T)]
dcols = c(dcols,colnames(hugolookup)[grep(col2, colnames(hugolookup), ignore.case=T)])
cat("\nCorrecting gene symbol", as.character(i), "of", as.character(length(indexes)), "symbols that need correction.\n")
cat("\nMultiple previous symbols were found to match", as.character(cursym), " (from Chrom ", as.character(curchrom), "). \n")
cat("\nThese are the symbols that were found to match:\n")
print(hugolookup[ind,c("Approved.Symbol", "Previous.Symbols", dcols)])
line = readline("Please enter the correct HUGO name if it appears above under Approved.Symbol.\nPress enter to continue with out making any symbol corrections.")
if(line != ""){
ssymbols = c(ssymbols, line)
oldsyms = c(oldsyms, cur)
}
}
}
#cat("\n",length(oldsyms),"symbols were found to match previously official symbols, and are being corrected.\n")
out = cbind.data.frame(oldsyms, ssymbols, stringsAsFactors=F)
return(out)
}
#takes: indexes: indexes of symbols to be checked
# symbols: list of symbols which the indexes refer to
# (those to be switched and those to remain)
# hugolookup: hugo look up table with columns : <Approved.Symbols>, <Previous.Symbols>
#returns: table: <index to be switched> <what it should be switched to>
checkSynonyms<-function(symbols, indexes, hugolookup, col2){
sindexes = NULL #output set of those which are previous symbols
ssymbols = NULL
for(i in 1:length(indexes))
{
osym = symbols[indexes[i],"Hugo_Symbol"]
curchrom = symbols[indexes[i], col2]
cursym = gsub(pattern="[[:punct:]]", replacement=" ", x=osym)
sterm=cursym
acc = NULL
while(T){
print("start while(T)")
#grep the row against the previous symbols column
pat = paste("(^|[[:blank:]])", sterm, "(,|$)", collapse="", sep="")
res1 = grep(pattern=pat, x=hugolookup[["Synonyms"]])
res2 = grep(pattern=sterm, x=hugolookup[["Synonyms"]], ignore.case=T)
res3 = grep(pattern=pat, x=hugolookup[["Approved.Symbol"]], ignore.case=T)
cat("\n###################################################### Searching synonyms for", cursym,"\n")
cat("#####in the data being processed, this symbol is associated with chromosome", curchrom, "\n\n########## Original query:", osym,
"\n########## Current search term: ", sterm, "\n")
if(length(res2)>0){
cat("\n ############ These near matches were found: \n")
print(hugolookup[res2, c("Approved.Symbol","Approved.Name","Status", "Synonyms",
colnames(hugolookup)[grep("date", colnames(hugolookup), ignore.case=T)],
colnames(hugolookup)[grep(col2, colnames(hugolookup), ignore.case=T)])])
}else{cat("\nNo synonyms were found for the search term.\n")}
if(length(res3)==1){
cat("\n ************ This exactly matching approved HUGO symbol was found: \n")
print(hugolookup[res3, c("Approved.Symbol","Approved.Name",
colnames(hugolookup)[grep(col2, colnames(hugolookup), ignore.case=T)])])
if(readline(prompt="If you would like to accept this exactly matching approved HUGO symbol, press ENTER.")==""){
acc = as.character(hugolookup[res3, "Approved.Symbol"])
break
}
}
if(length(res1)==1){
cat("\n ************ This exactly matching synonym was found: \n")
print(hugolookup[res1, c("Approved.Symbol","Approved.Name","Status", "Synonyms",
colnames(hugolookup)[grep("date.approved", colnames(hugolookup), ignore.case=T)],
colnames(hugolookup)[grep(col2, colnames(hugolookup), ignore.case=T)])])
if(""==readline("If you would like to accept the exactly matching synonym above, please press ENTER\nIf you enter anything else, more options will be provided.")){
acc = as.character(hugolookup[res1, "Approved.Symbol"])
break
}
}
line = readline(paste("\nIf you would like to accept the current search term, \"",
sterm,
"\" \nas the symbol correction, please press ENTER.\nTo continue with out making a correction, enter c\nIf you would prefer to enter a correction, please key it in now: "))
if(line ==""){
acc = sterm
break
}
if(line == "c"){
acc = ""
break
}
sterm = line
}#while
print("Out of while loop")
if(acc != ""){
ssymbols = c(ssymbols, acc)
sindexes = c(sindexes, osym)
}
}#for
out = cbind.data.frame(sindexes, ssymbols, stringsAsFactors=F)
if(nrow(out)) colnames(out)<-c("old_symbol","new_symbol")
print("about to return")
return(out)
}#checkSynonyms
#qinfo: gives you quick info about a data structure
#takes: ds: vector, dataframe or matrix
qinfo<-function(ds,mx=25){
print(mode(ds))
print(typeof(ds))
if(is.vector(ds)){
print(head(ds))
print(summary(ds))
}else if(is.matrix(ds)|is.data.frame(ds)){
print(dim(ds))
if(ncol(ds)<5){
print(head(ds))
print(summary(ds))
}else{
print(ds[1:min(nrow(ds),5),1:4])
print(summary(ds[,1:4]))
print(colnames(ds)[1:min(ncol(ds),mx)])
print(rownames(ds)[1:min(nrow(ds),mx)])
}
}else if(!is.list(ds)){
print(ds)
}else if(is.list(ds)){
print(names(ds)[1:min(mx,length(ds))])
for(i in 1:length(ds)){
print(names(ds)[i])
qinfo(ds[[i]])
}
}
}
#getTargetGenesFromDrugs
#unpacks drug targets from drug-gene target matrix
#examines which drugs are present on panel to determine which genes are targeted by panel
#takes: drugTargetsFileName: path to file containing drug target matrix: bipartate graph with drug names on one axis
# panelFileName : path to panel file (must have column <Drugs> containing drug names)
#returns:
getTargetGenesFromDrugs<-function(drugTargetsFileName="./input/drug_targets_2.txt", panelFileName="./input/exp006_extract_for_db.txt",verbose=F){
cat("\nCurrently this procedure only provides a list of genes which are targeted,",
"\nbut does not give the ihibitory magnitude.\n")
drug_targets <- read.delim(drugTargetsFileName, header=F, quote="",na.strings="-")
drugs = drug_targets[1,]
genes = as.character(drug_targets[,1])
sysnames = sapply(genes, function(x){strsplit(x, "\\(")[[1]][1]} )
recs = (nrow(drug_targets)-1) * (ncol(drug_targets)-1)
drug = rep("blank", recs)
tgene = rep("blank", recs)
sysname = rep("blank", recs)
degree = rep(-1, recs)
c = 0
for(i in 2:nrow(drug_targets))#for each gene
{
for(j in 2:ncol(drug_targets))#for each drug
{
c = c + 1
drug[c] = as.character(drug_targets[1,j])
tgene[c] = genes[i]
sysname[c] = sysnames[i]
degree[c] = as.integer(drug_targets[i,j]) -1
}
}
drug_gene_xref = data.frame(cbind(drug=toupper(drug), tgene=toupper(tgene), sysname=toupper(sysname), degree=degree), stringsAsFactors=F)
mds = read.delim(panelFileName, header=T, sep="\t", stringsAsFactors=F,na.strings="-")
paneldrugs = toupper(mds[,"Drug"]) #there are 158 unique names here,but some of them are replicates
#now see which rows of out have drug names that are in dkmdrugs
drugsinboth = drug_gene_xref[drug_gene_xref$drug%in%paneldrugs,]
cat("\n",length(unique(drugsinboth[,"drug"])), " drugs from drug panel file match the drugs in the drug target matrix.\n",sep="")
if(length(unique(drugsinboth[,"drug"]))!=length(unique(paneldrugs))){
cat("\nThese are the drugs from the panel that were not found in the drug-target matrix:\n")
notFoundSet = setdiff(paneldrugs, drug_gene_xref$drug)
print(notFoundSet)
cat("Some of these may be replicates (typically ending in the pattern _<replicate number>). ",
"\nthose that are not replicates may have variations in notation or may need to be added",
"\nto the target matrix file.")
fnameDrugsNotFound="./output/drug_names_not_found_in_target_matrix.txt"
cat("\n\nThe list of drugs not found in the target matrix was saved to", fnameDrugsNotFound,"\n")
write.table(x=notFoundSet, file=fnameDrugsNotFound,quote=F, sep="\t")
}
#now get unique genes from dkmrows
ugenes = unique(drugsinboth[,"sysname"])
if(verbose){readline("\nPress enter to continue")}
return(ugenes)
}
#extract_pid
#takes: n: a string that might contain a TCGA id, ex: TCGA-IQ-7632-01 in unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt
#returns: the TCGA id (with -'s replaced by .'s) if an id is found; NULL if there is not a TCGA id in the input string
extract_pid<-function(n=NULL)
{
if(is.null(n)){
n="unc.edu__IlluminaHiSeq_RNASeqV2__TCGA-IQ-7632-01A-11R-2081-07__expression_rsem_isoforms_normalized.txt"
cat("\nNote: extracting example pid from\n", n, "\n")
}
#first replace all special characters with spaces
#en = gsub(pattern="[[:punct:]]", replacement=" ", x=n)
sen = strsplit(n, split="[[:punct:]]")[[1]]
#find where "TCGA" is
i = grep(pattern="TCGA", x=sen)
if(length(i)<1){
print("** Note: TCGA id not found, returning barcodes as pids.")
return(n)
}
#take out the TCGA part, plus the next two parts
sub = sen[c(i, i+1, i+2)]
out = paste(sub, sep=".", collapse=".")
return(out)
}
###summarize_by
#summarize table by categories in a particular column
#used for making a quick summary of different types of somatic mutations in TCGA somatic mutation tables
#takes: the table
# the column to be summarized
# whether or not you want to display the summary to screen and bar graph
#returns: a summary table: category column, count column
summarize_by<-function(col, display=F, barPlotTitle="Counts across types", left_margin_factor=1)
{
ctab = table(col)
out = cbind.data.frame(rownames(ctab), as.vector(ctab), stringsAsFactors=F)
colnames(out)<-c("types", "counts")
out = out[order(out$types),]
out$counts = as.numeric(out$counts)
types = out$types