-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path04_fasta.r
More file actions
executable file
·35 lines (27 loc) · 1.4 KB
/
04_fasta.r
File metadata and controls
executable file
·35 lines (27 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/usr/bin/env Rscript
library(seqinr)
source(file="constants.r")
d <- read.table("01_annotate.dat", header=T)
# Get only the sequences from the first and last time point.
day.extremes <- rbind(aggregate(day~patient, d, max), aggregate(day~patient, d, min))
d <- merge(d, day.extremes)
# Remove sequences whose total count (in an individual sample) is less than the
# cutoff (min.count, defined in constants.r).
sample.counts <- aggregate(count~seq+patient+day, d, sum)
d <- merge(d, sample.counts, by=c("seq", "patient", "day"), suffixes=c("", ".sample"))
d <- d[d$count.sample >= min.count,]
# Join the names of identical sequences with a "|", and remove duplicates.
names <- aggregate(taxa~seq+patient, d, paste, collapse="|")
d <- merge(d, names, by=c("seq", "patient"), suffixes=c("", ".collapsed"))
# Write a data file, to make it easier later.
stopifnot(duplicated(d$seq) == duplicated(cbind(d$seq, d$patient)))
stopifnot(duplicated(d$seq) == duplicated(d$taxa.collapsed))
write.table(d, "04_fasta.dat", col.names=T, row.names=F, quote=F)
# Write the sequences to fasta files, excluding duplicates.
d <- d[!duplicated(d$seq),]
d$seq <- as.character(d$seq)
. <- by(cbind(d$seq, d$taxa.collapsed, d$patient), d$patient, function (x) {
fn <- sprintf("04_fasta/%02d.fasta", as.integer(as.character(x$V3[1])))
seqs <- strsplit(as.character(x$V1), split=NULL)
write.fasta(sequences=seqs, names=x$V2, file.out=fn)
})