forked from emilyhmoore/ProblemSet2
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLalorPS02.R
More file actions
209 lines (172 loc) · 7.16 KB
/
LalorPS02.R
File metadata and controls
209 lines (172 loc) · 7.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Maggie Lalor
# PS 02 Due 02/06/2014
##1) Calculating Violations
# Two ways of testing violations of Benford’s law are proposed below. Let Xi represent the observed
#proportional frequency of the integer i in observed vote totals.
# Leemis' m statistic and Cho-Gains' d
#write a function to calculate these statistics:
#input: (i) a matrix or vector of election returns and options(s) for m,d, both
# output: list containing the results, including full digit distribution
##Calc Function
violationstats <- function(data, stats="both") { #default to calculate both
if (!stats %in% c("both","leemis","chogaines")) { #Check that option is correct
print("Error: incorrect stats option")
break
}
#Process dataset
output <- list() #Output for later
#Extract first digit
extract <- function(x){
return(as.numeric(substr(as.character(x),1,1)))
}
#First digit for each piece of data
sigs <- sapply (data, extract)
#Frequency calculation
digitfrequency <- table(sort(sigs))
output$data <- as.list(digitfrequency)
#Change from frequency to proportion
for (i in 1:length(output$data)) {
output$data[[i]] <- output$data[[i]]/length(data)
}
#Calculate stats
if (stats %in% c("both","leemis")) { #If Leemis or both
vals <- rep(0, length(output$data))
for (i in 1:length(output$data)) { #all potential Leemis stats
vals[i] <- abs(output$data[[i]] - log10( 1+ (1/as.numeric(names(output$data[i]))) ))
}
output$leemis <- max(vals)*sqrt(length(data)) #Select max * sqrt(n) per Murrow
} # end L if
if (stats %in% c("both","chogaines")) { #If Cho-Gaines or both
val <- 0
for (i in 1:length(output$data)) { #sum square of (actual proportion - benford predicted)
val <- val + (output$data[[i]] - log10(1+(1/as.numeric(names(output$data[i])))))^2 #distance is sum
}
output$chogaines <- sqrt(length(data))*sqrt(val) #Cho-Gaines is squareroot of sum * sqrt(n)
} #end CG if
return(output)
}
#Initial test - outputs ok
#datamatrix<- matrix(seq(1,99,by=3), ncol=3) #Test matrix
#datavector <- seq(1,33,3)
#(outmatrix <- violationstats(datamatrix))
#(outvec <- violationstats(datavector, stats="leemis"))
outbaddist <- violationstats(notmetsample) #Run for non-Benford data
##2) Critical Values
#names(outmatrix)
#violoutput <- outmatrix #Test, has both
#Mcrit <- list("0.10" = 0.851,"0.05" = 0.967,"0.01" = 1.212) #Listed Leemis crits
#Dcrit <- list("0.10" = 1.212,"0.05" = 1.330,"0.01" = 1.569) #Listed CG crits
## PRINT FUNCTION
print.benfords <- function(violoutput) {
#Reject Null of no fraud if reach critical values:
cat("--------------------------------------------------\n" )
if ("leemis" %in% names(violoutput)) { #Print Leemis' value
stars <- ""
if (violoutput$leemis[1] > 1.212 ) { #if above .01 crit
stars <-"***"
} else if (violoutput$leemis[1] > 0.967 ) { # if above .05 crit
stars <-"**"
} else if (violoutput$leemis[1] > 0.851 ) { # if above .10 crit
stars <-"*"
}
#print line
cat("Leemis' m:", signif(violoutput$leemis[1], digits=4),stars,"\n") #3 sig digits given crits
} #end Leemis if
if ("chogaines" %in% names(violoutput)) { #Print Cho-Gaines' value
stars <- ""
if (violoutput$chogaines[1] > 1.569 ) { #if above .01 crit
stars <-"***"
} else if (violoutput$chogaines[1] > 1.330 ) { # if above .05 crit
stars <-"**"
} else if (violoutput$chogaines[1] > 1.212 ) { # if above .10 crit
stars <-"*"
}
#print line
cat("Cho-Gaines' d:", signif(violoutput$chogaines[1], digits=4),stars,"\n") #3 sig digits given crits
} #end Cho-Gaines if
# Print legend
cat("Sig code: \"*\" = .10, \"**\" = .05, \"***\" = .01\n" )
cat("--------------------------------------------------\n" )
} #end function
#test - output ok
#print.benfords(violoutput) #both
#print.benfords(outvec) #just leemis
#print.benfords(outbaddist) #non-Benford dataset
##3) Testing
#function to unit test function
##FUNCTION##
#truetestdist is true Beneford distribution for dataset inputdata
#truelem is true Leemis' m for dataset inputdata
#truecg is true Cho-Gaines' d for dataset inputdata
test.funct <- function(inputdata, truetestdist, truelem, truecg) { #will test both stats values
Pass <- "TRUE"
testout <- violationstats(inputdata)
#Compare distributions
#Make sure lengths are the same
if (length(testout$data) != length(truetestdist)) {
print("Distribution Unit Test Failed")
Pass <- "FALSE"
}
#Compare each proportion, don't check names so input doesn't have to be named list
for(i in 1:length(testout$data)) {
if (signif(testout$data[[i]], digits = 3) != signif(truetestdist[[i]], digits = 3)) {
print("Distribution Unit Test Failed")
Pass <- "FALSE"
}
}
#Compare Leemis Stat
if (signif(testout$leemis, digits = 3) != signif(truelem, digits = 3)) {
print("Leemis' Unit Test Failed")
Pass <- "FALSE"
}
#Compare Cho-Gaines Stat
if (signif(testout$chogaines, digits = 3) != signif(truecg, digits = 3)) {
print("Cho-Gaines' Unit Test Failed")
Pass <- "FALSE"
}
return(Pass)
}
##TESTING##
#data where Benford's law is met
library(BenfordTests)
set.seed(25)
#random sample satisfying Benford's law
benfordsample <- rbenf(25000)
#sample NOT satisfying Benford's law
notmetsample <- c(1,2,3,4,rep(5,5),6,7, rep(9,3^9)) #most values being high int opposite of Benford
#Compare my function to results
results <- violationstats(benfordsample)
#From Beneford Package
firstdig <- signifd(benfordsample) #first digits function
#Distribution calculation
digitfrequency <- table(sort(firstdig)) #frequencies
truetestdist <- (digitfrequency/length(firstdig)) #change to proportion
#truetestdist == results$data #TRUE
#Stat Tests
#Calculate d: sqrt(n)* sqrt (sum( observed frequencies - benford predicted)^2 )
val <- 0
for (i in 1:length(truetestdist)) { #all potential Leemis stats - do for incase some digits missing
val<- val + (truetestdist[[i]] -pbenf(digits=1)[[as.numeric(names(truetestdist[i]))]])^2
}
truecg <- (sqrt(length(notmetsample))*sqrt(val))
#same as edist.benftest in BenfordTests package
#results$chogaines == truecg #TRUE
#Calcualte m: sqrt(n) Max abs(observed frequencies - benford predicted)
vals <- rep(0, length(truetestdist))
for (i in 1:length(truetestdist)) { #all potential Leemis stats - do for incase some digits missing
vals[i] <- abs(truetestdist[[i]] -pbenf(digits=1)[[as.numeric(names(truetestdist[i]))]])
}
truelem <- max(vals)*sqrt(length(notmetsample))
}
#same as edist.benftest in BenfordTests package
#results$leemis == truelem #TRUE
#use your functions above to compare to the “truth” for the digit distributions and two test statistics
#Test Function: Beneford data
resultsmet <- test.funct(benfordsample, truetestdist, truelem, truecg)
#resultsmet #TRUE when proper data for variables truetestdist, truelem, truecg
#Test Function: Non-Benford data
resultsmet <- test.funct(notmetsample, truetestdist, truelem, truecg)
#resultsmet #TRUE when proper data for variables truetestdist, truelem, truecg
##3) Part 2 For each way that the function can fail this test, create a
#branch where you edit the code in some way to make the code fail to pass the unit testing
#MESS UP CODE