-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplotCondProb.R
More file actions
59 lines (43 loc) · 2.22 KB
/
plotCondProb.R
File metadata and controls
59 lines (43 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
## Import (packages)
library(scales)
library(ggplot2)
library(Hmisc)
## Parse input args
# Usage: Rscript plotCondProb.R <input_table> <output_prefix>
#   <input_table>   whitespace-delimited table with a header row
#   <output_prefix> path prefix; the suffixes "_min5.png", "_ev2ByRank.png"
#                   and "_embedEv2ByRank.png" are appended to it
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("Usage: Rscript plotCondProb.R <input_table> <output_prefix>",
       call. = FALSE)
}
orig_csv <- args[1]
output_name <- args[2]
output_nameMin5 <- paste0(output_name, "_min5.png")
output_name_freqRawRank <- paste0(output_name, "_ev2ByRank.png")
output_name_embed <- paste0(output_name, "_embedEv2ByRank.png")

## Helpers
# Write a scatter plot of already-sorted values against their rank to a PNG.
# The device is closed on exit, even if plotting fails, so the file is always
# flushed and no graphics device is left open.
save_rank_plot <- function(path, sorted_values, ylab) {
  png(path)
  on.exit(dev.off(), add = TRUE)
  plot(seq_along(sorted_values), unclass(sorted_values),
       xlab = "Rank", ylab = ylab)
  invisible(path)
}

## Read in data
# sep = "" makes read.csv split fields on any run of whitespace.
data <- read.csv(orig_csv, header = TRUE, sep = "")
#data$ev2GivenMatrixCorrectProb=data$ev2GivenMatrix/data$numEC

# Fail early with a clear message if the table does not have the layout the
# rest of the script relies on (columns 6, 7, 9 by position, three by name).
required_cols <- c("X5.p.ec.matrix.", "X8.p.ev2.matrix.", "X12.p.ev2.embed.")
missing_cols <- setdiff(required_cols, colnames(data))
if (ncol(data) < 9 || length(missing_cols) > 0) {
  stop("Input table '", orig_csv, "' does not have the expected layout ",
       "(needs at least 9 columns; missing named columns: ",
       paste(missing_cols, collapse = ", "), ")",
       call. = FALSE)
}

# Explicit print() so the diagnostics also appear when the script is source()d.
print(colnames(data))
cat(sprintf("Total verbs: %s\n", nrow(data)))

# Rows whose 7th column is at least 5.
dataCleanMin5 <- data[data[, 7] > 4, ]
cat(sprintf("Verbs with ev2 at least five times: %s\n", nrow(dataCleanMin5)))

# Rows whose 6th column is at least 100.
dataCleanMinEC <- data[data[, 6] >= 100, ]
cat(sprintf("Lemmas with numCanTellIfRaised at least 100: %s\n", nrow(dataCleanMinEC)))
ev2RankDataRaw <- dataCleanMinEC$X8.p.ev2.matrix.
ev2RankDataRawSorted <- sort(ev2RankDataRaw, decreasing = TRUE)
print(ev2RankDataRawSorted)

# Rows whose 9th column (X9.highestEmbedVerbCount) is at least 1000.
dataCleanMinEmbed <- data[data[, 9] > 999, ]
cat(sprintf("Lemmas with highestEmbedVerbCount at least 1000: %s\n", nrow(dataCleanMinEmbed)))
ev2EmbedRank <- dataCleanMinEmbed$X12.p.ev2.embed.
ev2EmbedRankSorted <- sort(ev2EmbedRank, decreasing = TRUE)

## Rank plots: values sorted in decreasing order, plotted against their rank
#png(output_name_freqRank)
#plot(seq_along(ev2RankDataSorted), unclass(ev2RankDataSorted), xlab="Rank", ylab="Log transformed P(ev2|matrix)")
save_rank_plot(output_name_freqRawRank, ev2RankDataRawSorted, "P(ev2|matrix)")
save_rank_plot(output_name_embed, ev2EmbedRankSorted, "P(ev2|embed)")
#png(output_name_freqRawScaledRank)
#plot(seq_along(ev2RankDataRawSortedScaled), unclass(ev2RankDataRawSortedScaled), xlab="Rank", ylab="P(matrix|ev2) freq. normed")

## Scatter plot of X5.p.ec.matrix. against X8.p.ev2.matrix. (rows passing the
## column-7 filter), with a fitted linear trend
# The log-transformed columns are only used by the disabled log-log variant
# of the plot below.
dataCleanMin5$logX5.p.ec.matrix. <- log(dataCleanMin5$X5.p.ec.matrix.)
dataCleanMin5$logX8.p.ev2.matrix. <- log(dataCleanMin5$X8.p.ev2.matrix.)

min5_plot <- ggplot(dataCleanMin5, aes(X8.p.ev2.matrix., X5.p.ec.matrix.)) +
  geom_jitter(colour = alpha("black", 0.15)) +
  geom_smooth(method = lm)
#ggplot(dataCleanMin5, aes(logX8.p.ev2.matrix., logX5.p.ec.matrix., ) ) + geom_jitter(colour=alpha("black",0.15)) + geom_smooth(method=lm)

png(output_nameMin5)
# ggplot objects are only drawn when printed; do it explicitly rather than
# relying on top-level auto-printing.
print(min5_plot)
invisible(dev.off())

message("Finished")