tn/preProcessing_Utils.R at master · PoonLab/tn · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
require("ape")

#A few Basic checks to ensure the quality of an alignment
##-TO-DO: This should be optional in final package -##
alignPrePro <- function(iFile, ambiThresh=0.05, gapThresh=0.15) {
  #@param iFile: Input fasta file.
  #@param gapThresh: The threshold for gapped regions. If greater than this percentage of a given sequence are gaps, reject
  #@param ambiThresh: The threshold for sequence ambiguity. If greater than this percentage of a given sequence (without gaps) are ambiguous, reject

  #Take fasta file input,
  seqs <- read.dna(iFile, format = "fasta", as.character = T)

  #Collect Frequencies of ambiguity
  ambi <- sapply(1:nrow(seqs), function(i) {
    iSeq <- seqs[i,]
    lGaps <- length(iSeq[iSeq%in%"-"])
    lAmbi <- length(iSeq[iSeq%in%c("y","r","w","s","k","m","d","v","h","b","n",
                                   "Y","R","W","S","K","M","D","V","H","B","N")])
    lAmbi/(length(iSeq)-lGaps)
  })

  #Collect Lengths of sequences
  gaps <- sapply(1:nrow(seqs), function(i) {
    iSeq <- seqs[i,]
    lGaps <- length(iSeq[iSeq%in%"-"])
    lGaps/length(iSeq)
  })

  #Sequences that are labelled "Bad", do not meet one or more of the threshold requirements
  badSeq <- union(which(ambi>=ambiThresh),
                  which(gaps>=gapThresh))

  if(length(badSeq)==0) {
    print("No unnacceptable Sequences")
  } else {
    #Write only those sequences with ambiguity below 1.5% and sequence length above 85%
    write.dna(seqs[-badSeq,], colsep = "",
              gsub(".fasta$", "_PRO.fasta", iFile), "fasta")
  }

  return(seqs[-badSeq,])
}

#Print a graph of positions and their ambiguity as well as their gapped rate
positScan <- function(iFile, showPlot=T) {
  #@param iFile: Input fasta file.
  #@param showPlot: True false option for printing the plot of data
  #@return: A data frame of results that the plot is based

  #Take fasta file inpjut,
  seqs <- read.dna(iFile, format = "fasta", as.character = T)

  #Check highly gapped regions for trimming
  positGaps <- sapply(1:ncol(seqs), function(i) {
    iSeq <- seqs[,i]
    lGaps <- length(iSeq[iSeq%in%"-"])
    lGaps/length(iSeq)
  })

  #Collect Frequencies of ambiguity
  positAmbi <- sapply(1:ncol(seqs), function(i) {
    iSeq <- seqs[,i]
    lAmbi <- length(iSeq[iSeq%in%c("y","r","w","s","k","m","d","v","h","b","n",
                                   "Y","R","W","S","K","M","D","V","H","B","N")])
    lAmbi/(length(iSeq))
  })

  df <- data.frame(Gaps=positGaps, Amiguity=positAmbi)

  if(showPlot) {
    plot(df, type="n", xlim=c(1, nrow(df)), ylim = c(0,1), xlab="Position", ylab="Proportion")
    rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "antiquewhite")
    abline(h=seq(0,1,0.2), col="white", lwd=2.5)
    abline(h=seq(0.1,0.9,0.2), col="white", lwd=1)
    lines(df$Gaps, lty=2)
    lines(df$Amiguity, lty=1)
  }

  return(df)
}