-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhubFunctions.R
More file actions
128 lines (102 loc) · 5.18 KB
/
hubFunctions.R
File metadata and controls
128 lines (102 loc) · 5.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
require(ape)
#Creates Tree-based resamples and re-runs the tree based clustering method on each
runFullMulti <- function(iFile, dateFormat, maxDs, newMark, prop=0.8, n=100, varInd=c(1,2), reVars='_') {
#@param iFile: An input full fasta file for splitting and resampling
#@param dateFormat: Passed to multi run
#@param maxDs: Passed to multiGAICRun()
#@param newMark: The marker defining new sequences from a set of time-defined seqs
# This must be something that exists in the headers of ONLY new sequences
#@param prop: passed to sampleFasta() by default matches sampleFasta() default
#@param n: passed to sampleFasta() by default matches sampleFasta() default
#@param varInd: The indices which represent variables in the header, passed to import.tree()
#@param reVars: The character which splits the header, passed to import.tree()
# This allows variables to be separate and recognizable
##-TODO: Make Generalizeable
source("git/tn/pplacer_utils.R")
source("~/git/tn/subT_Lib.R")
#Sample Folder Set Up
newSplit(iFile, newMark)
oFileO <- paste0(gsub(".fasta$|.fas$", "_Old.fasta", iFile))
nFile <- paste0(gsub(c(".fasta$|.fas$"), "_New.fasta", iFile))
sampsDir <- paste0(gsub(".fasta$", "Samps", iFile))
sampleFasta(oFileO, sampsDir, n=n, prop=prop, nFile =nFile)
multiTree(sampsDir)
multiTaxit(sampsDir)
#Run Pplacer and guppy on all created refpackages
multiPplacer(sampsDir)
multiGuppy(sampsDir)
#Save multiple parallel run info to output file
oFile <- paste0(gsub(".fasta$|.fas$", "_ROB.rds", iFile))
res <- multiGAICRun(sampsDir, maxDs=maxDs, dateFormat=dateFormat, varInd=varInd)
saveRDS(res, oFile)
}
#Run GAIC test for tree-based data
#See Comparison for ease of use
runTreeGAIC <- function(tFile, reVars="_", varInd=c(1,2), dateFormat="%Y", addVarN=character(0), fullF=NA, oDir="~/",
gFile=NA, maxDs=NA, minBs=0, cluFun=STClu, nCores=1, logF=NA, program="IQ-TREE", short="seq",
modFormula=New~Old+Time, propVar="Time", propTrans=list(function(x){mean(x)})) {
#@param tFile: An input tree, passed to import.tree()
#@param dateFormat: Passed to GAICrun
#@param maxDs: Passed to GAICrun()
#@param varInd: The indices which represent variables in the header, passed to import.tree()
#@param reVars: The character which splits the header, passed to import.tree()
# This allows variables to be separate and recognizable
#@param gFile: A file containing growth info generated by guppy, passed to growthSim()
#@param logF: The optional log file location to eventually create a growthfile through pplacer and guppy
# This is passed to translator() function.
#@param program: Also passed to translator()
#@param fullF: Passed to taxitCreate()
source("git/tn/subT_Lib.R")
source("git/tn/pplacer_utils.R")
print("Creating Tree")
##UNTESTED
if(is.na(gFile)) {
taxitCreate(treeF=tFile, logF = logF, fullF = fullF, oDir = oDir, program = program)
multiPplacer(oDir)
multiGuppy(oDir)
}
oT <- import.tree(iFile=tFile, reVars=reVars, varInd=varInd, dateFormat=dateFormat, addVarN=addVarN)
oT$g <- growthSim(iT=oT, gFile=gFile)
if(all(attr(CPClu, "srcref")==attr(cluFun, "srcref"))){
oT$n <- extendInfo(iT=oT)
}
print("Defining Clusters")
clus <- multiSTClu(iT=oT, maxDs=maxDs, nCores=nCores, cluFun=cluFun, minBs =minBs)
print("Obtaining Results")
res <- GAICRun(clus=clus, nCores=nCores, modFormula=modFormula, propVar=propVar, propTrans=propTrans)
return(res)
}
#Test scripts
if(F){
#Set inputs for test
reVars <- '_'
varInd <- c(1,7,2)
dateFormat <- "%Y"
addVarN <- "SubType"
tFile <- "~/Data/paperData/beijFullTree/beijFull_PRO_Old.fasta.treefile"
gFile <- "~/Data/paperData/beijFullTree/beijFull_PRO_Growth.tre"
oT <- import.tree(tFile, reVars, varInd, dateFormat, addVarN)
oT <- growthSim(oT, gFile)
maxDs <- seq(0, 0.04, 0.001)
res <- GAICRun(oT, maxDs, minB=0.90, nCores=8)
plot(maxDs, res$GAIC, xlab="Thresholds", ylab="AIC Loss", ylim =c(-100, 3))
lines(maxDs, res$GAIC)
lines(maxDs, res$randAIC-res$nullAIC, col="red")
#tFile <- "~/Data/Seattle/IqTree_Bootstrap/SeattleB_PRO_Filt.fasta.treefile"
#gFile <- "~/Data/Seattle/IqTree_Bootstrap/st.tre"
#tFile <- "~/Data/Tennessee/tn_ColTreeData/tn.refpkg/tn_Coltree.nwk"
#gFile <- "~/Data/Tennessee/tn_ColTreeData/tn_ColGrowth.tre"
#tFile <- "~/Data/Tennessee/tn_DiagTreeData/tn.refpkg/tn.tre"
#gFile <- "~/Data/Tennessee/tn_DiagTreeData/tnTGrowth.tre"
#tFile <- "~/Data/Seattle/stKingTrees/stKing_PRO_H_Filt.fasta.treefile"
#gFile <- "~/Data/Seattle/stKingTrees/stKing.tre"
#tFile <- "~/Data/NAlberta/IqTree_Bootstrap/na.refpkg/NorthAlbertaB_PRO_Filt.fasta.treefile"
#gFile <- "~/Data/NAlberta/IqTree_Bootstrap/na.tre"
oT <- import.tree(tFile, reVars='_', varInd = c(2,3), dateFormat = "%Y-%m-%d")
oT <- growthSim(oT, gFile)
maxDs <- seq(0, 0.04, 0.001)
res <- GAICRun(oT, maxDs, minB=0.90, nCores=8, modFormula=New~Old+Recency+DominantSubType)
plot(maxDs, res$GAIC, xlab="Thresholds", ylab="AIC Loss")
lines(maxDs, res$GAIC)
#lines(maxDs, res$randAIC-res$nullAIC, col="red")
}