This repository was archived by the owner on Sep 27, 2023. It is now read-only.
Changes from all commits
23 commits
6e0e7c6
Revert "Merge branch 'master' of github.com:google/rappor into drop-QP"
ilyamironov Jul 13, 2015
4f0bb85
FitLasso to handle the case of a single variable correctly.
ilyamironov Jul 13, 2015
7bf1a2d
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Jul 13, 2015
11ae8b3
Merging drop-QP and master.
ilyamironov Jul 13, 2015
4d5214b
Removing alternative.R
ilyamironov Jul 13, 2015
ff66ab0
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 13, 2015
04d9769
Restoring gen_counts_test.R
ilyamironov Jul 14, 2015
4f3aa3a
Fixing reviewer's comments. Functionality of TestDecode is restored.
ilyamironov Jul 14, 2015
6b38d61
Adjusting the parameter of the Lasso algorithm, and fixing elapsed ti…
ilyamironov Jul 21, 2015
0c76391
Requiring that the estimated count be at least two SD away from 0.
ilyamironov Jul 21, 2015
2255a34
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Jul 21, 2015
ec3c320
Requiring that the estimated count be at least two SD away from 0.
ilyamironov Jul 21, 2015
f4b7222
Deleting obsolete code stemming from PerformInference calls, shaving …
ilyamironov Jul 22, 2015
f548a29
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 22, 2015
9557eb9
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 22, 2015
6f185e3
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
2b5a518
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
e27ccc0
Another drop = FALSE error
ilyamironov Aug 13, 2015
f451872
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
4986be9
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Sep 9, 2015
fd62d97
Dropping reference to alternative.R
ilyamironov Sep 9, 2015
7e4d00b
Changing the cutoff from two SD to one for outputting a candidate
ilyamironov Sep 11, 2015
60c64ea
Changing the number of iterations from 20 to 10.
ilyamironov Sep 11, 2015
83 changes: 0 additions & 83 deletions analysis/R/alternative.R

This file was deleted.

143 changes: 57 additions & 86 deletions analysis/R/decode.R
@@ -23,8 +23,6 @@ source.rappor <- function(rel_path) {
source(abs_path)
}

source.rappor('analysis/R/alternative.R')

EstimateBloomCounts <- function(params, obs_counts) {
# Estimates the number of times each bit in each cohort was set in original
# Bloom filters.
@@ -103,22 +101,32 @@ FitLasso <- function(X, Y, intercept = TRUE) {
# a vector of size ncol(X) of coefficients.

# TODO(mironov): Test cv.glmnet instead of glmnet
mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
lower.limits = 0, # outputs are non-negative
# Cap the number of non-zero coefficients to 500 or
# 80% of the length of Y, whichever is less. The 500 cap
# is for performance reasons, 80% is to avoid overfitting.
pmax = min(500, length(Y) * .8)),
silent = TRUE)

# If fitting fails, return an empty data.frame.
if (class(mod)[1] == "try-error") {
coefs <- setNames(rep(0, ncol(X)), colnames(X))
} else {
coefs <- coef(mod)
coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept
}
coefs

# Cap the number of non-zero coefficients to 1,000 or 80% of the number of
# constraints, whichever is less. The 1,000 cap is for performance reasons, 80%
# is to avoid overfitting.
cap <- min(1000, nrow(X) * .8, ncol(X))

if (ncol(X) == 1)
XX <- cbind2(X, rep(0, nrow(X))) # add a dummy variable since glmnet can't handle a single-column matrix
else
XX <- X

mod <- glmnet(XX, Y, standardize = TRUE, intercept = intercept,
lower.limits = 0, # outputs are non-negative
lambda.min.ratio = 1E-4,
pmax = cap)

coefs <- coef(mod)
coefs <- coefs[-1, , drop = FALSE] # drop the intercept
l1cap <- sum(colSums(coefs) <= 1.0) # count solutions with L1 norm <= 1; the norms are nondecreasing along the path, so this is also the index of the last such solution
if (l1cap > 0) {
distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1
if (ncol(X) == 1)
distr <- distr[1] # dropping the dummy variable
} else
distr <- setNames(rep(0, ncol(X)), colnames(X))
distr
}

PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
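
The rewritten FitLasso above scans glmnet's regularization path: because the coefficients are constrained to be non-negative (lower.limits = 0), colSums() gives each solution's L1 norm, and counting the solutions with norm <= 1 yields the index of the last one whose total mass is still a valid sub-distribution. A minimal sketch of that selection idea, assuming glmnet is installed (synthetic data, not this repo's API):

library(glmnet)

set.seed(1)
X <- matrix(runif(200 * 5), nrow = 200)           # 200 constraints, 5 candidates
Y <- as.vector(X %*% c(.4, .3, 0, 0, 0)) + rnorm(200, sd = .01)

mod <- glmnet(X, Y, intercept = FALSE, lower.limits = 0,
              lambda.min.ratio = 1E-4)
coefs <- as.matrix(coef(mod))[-1, , drop = FALSE] # drop the intercept row

l1 <- colSums(coefs)    # L1 norm of each solution along the path
last <- sum(l1 <= 1.0)  # norms are nondecreasing, so this is the last index with mass <= 1
if (last > 0) print(coefs[, last])
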
@@ -216,43 +224,25 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
privacy
}

FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
FitDistribution <- function(estimates, map, quiet = FALSE) {
# Find a distribution over rows of map that approximates estimates_stds best
#
# Input:
# estimates_stds: a list of two m x k matrices, one for estimates, another
# for standard errors
# map : an (m * k) x S boolean matrix
# estimates: an m x k matrix of estimates
# map : an (m * k) x S boolean matrix
#
# Output:
# a float vector of length S, so that a distribution over map's rows sampled
# according to this vector approximates estimates

S <- ncol(map) # total number of candidates

support_coefs <- 1:S

if (S > length(estimates_stds$estimates) * .8) {
# the system is close to being underdetermined
lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))

# Select non-zero coefficients.
support_coefs <- which(lasso > 0)

if(!quiet)
cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
}
# an S-long vector of non-negative floats, so that a distribution over
# the map's rows sampled according to this vector approximates estimates

coefs <- setNames(rep(0, S), colnames(map))
lasso <- FitLasso(map, estimates)

if(length(support_coefs) > 0) { # LASSO may return an empty list
constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
estimates_stds)
if (!quiet)
cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")

coefs[support_coefs] <- constrained_coefs
}
names(lasso) <- colnames(map)

coefs
lasso
}

Resample <- function(e) {
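
For context, the lasso-only FitDistribution above treats decoding as an inverse problem: the bit-level estimates are approximately map %*% distribution, so a non-negative lasso over the map's columns recovers a sparse distribution directly, without the old ConstrainedLinModel refit. A small self-contained sketch of that framing (synthetic data, assuming glmnet; not this repo's API):

library(glmnet)

set.seed(2)
S <- 20                                       # number of candidates
mk <- 256                                     # m * k bit estimates
M <- matrix(rbinom(mk * S, 1, .5), nrow = mk)
d <- c(.5, .3, .2, rep(0, S - 3))             # true sparse distribution
e <- as.vector(M %*% d) + rnorm(mk, sd = .01)

fit <- glmnet(M, e, lower.limits = 0)
path <- as.matrix(coef(fit))[-1, , drop = FALSE]
round(path[, ncol(path)], 2)                  # ~ (.5, .3, .2, 0, ..., 0)
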
@@ -266,7 +256,7 @@ Resample <- function(e) {
list(estimates = estimates, stds = stds)
}

Decode <- function(counts, map, params, alpha = 0.05,
Decode <- function(counts, map, params, alpha = 0.05, iterations = 10,
correction = c("Bonferroni"), quiet = FALSE, ...) {
k <- params$k
p <- params$p
@@ -293,37 +283,39 @@ Decode <- function(counts, map, params, alpha = 0.05,

coefs_all <- vector()

# Run the fitting procedure several times (5 seems to be sufficient and not
# too many) to estimate standard deviation of the output.
for(r in 1:5) {
if(r > 1)
# Run the fitting procedure several times (5 seems to be the minimum that
# makes sense) to estimate standard deviation of the output.
groups <- sample(1:iterations, length(filter_bits), replace = TRUE)

for(r in 1:iterations) {
if (r > 1)
e <- Resample(estimates_stds_filtered)
else
e <- estimates_stds_filtered

e_vec <- as.vector(t(e$estimates))
subset <- filter_bits[groups != r]

coefs_all <- rbind(coefs_all,
FitDistribution(e, map[filter_bits, , drop = FALSE],
FitDistribution(e_vec[subset],
map[subset, , drop = FALSE],
quiet))
}

coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations
coefs_ave <- N * apply(coefs_all, 2, mean)

# Only select coefficients more than two standard deviations from 0. May
# Only select coefficients more than one standard deviation from 0. May
# inflate empirical SD of the estimates.
reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)

mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

if (correction == "Bonferroni") {
alpha <- alpha / S
}
fit <- data.frame(string = colnames(map[, reported, drop = FALSE]),
Estimate = matrix(mod$coefs, ncol = 1),
SD = mod$stds,
stringsAsFactors = FALSE)

inf <- PerformInference(map[filter_bits, reported, drop = FALSE],
as.vector(t(estimates_stds_filtered$estimates)),
N, mod, params, alpha,
correction)
fit <- inf$fit
# If this is a basic RAPPOR instance, just use the counts for the estimate
# (Check if the map is diagonal to tell if this is basic RAPPOR.)
if (sum(map) == sum(diag(map))) {
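
Two stabilizing devices meet in the loop above: every round after the first perturbs the estimates (Resample, a parametric bootstrap) and drops one random group of bits (a delete-a-group jackknife), and a candidate is then reported only if its averaged coefficient exceeds one sample standard deviation plus a small epsilon. A toy base-R illustration of the selection rule on made-up numbers:

coefs_all <- rbind(c(.10, .01, .05),        # 3 fitting rounds x 3 candidates
                   c(.12, .00, .02),
                   c(.11, .02, .08))
N <- 1000
coefs_ave <- N * apply(coefs_all, 2, mean)  # 110, 10, 50
coefs_ssd <- N * apply(coefs_all, 2, sd)    # 10, 10, 30
which(coefs_ave > 1E-6 + 1 * coefs_ssd)     # 1 and 3; candidate 2 is within one SD of 0
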
Expand Down Expand Up @@ -351,38 +343,17 @@ Decode <- function(counts, map, params, alpha = 0.05,
allocated_mass <- sum(fit$proportion)
num_detected <- nrow(fit)

ss <- round(inf$SS, digits = 3)
explained_var <- ss[[1]]
missing_var <- ss[[2]]
noise_var <- ss[[3]]

noise_std_dev <- round(inf$resid_sigma, digits = 3)

# Compute summary of the fit.
parameters <-
c("Candidate strings", "Detected strings",
"Sample size (N)", "Discovered Prop (out of N)",
"Explained Variance", "Missing Variance", "Noise Variance",
"Theoretical Noise Std. Dev.")
values <- c(S, num_detected, N, allocated_mass,
explained_var, missing_var, noise_var, noise_std_dev)

res_summary <- data.frame(parameters = parameters, values = values)

privacy <- ComputePrivacyGuarantees(params, alpha, N)
params <- data.frame(parameters =
c("k", "h", "m", "p", "q", "f", "N", "alpha"),
values = c(k, h, m, p, q, f, N, alpha))

# This is a list of decode stats in a better format than 'summary'.
# TODO: Delete summary.
metrics <- list(sample_size = N,
allocated_mass = allocated_mass,
num_detected = num_detected,
explained_var = explained_var,
missing_var = missing_var)
num_detected = num_detected
)

list(fit = fit, summary = res_summary, privacy = privacy, params = params,
list(fit = fit, privacy = privacy, params = params,
lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)),
counts = counts[, -1], resid = NULL, metrics = metrics)
}
10 changes: 6 additions & 4 deletions analysis/R/decode_test.R
@@ -205,10 +205,12 @@ CheckDecodeAveAndStds <- function(...) {
results <- RunMultipleTests(...)

estimates <- matrix(nrow = 0, ncol = 0)
lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates))

stds <- matrix(nrow = 0, ncol = 0)
lapply(results, function(r) MatrixVectorMerge(stds, r$stds))

for(i in 1:length(results)) {
estimates <- MatrixVectorMerge(estimates, results[[i]]$estimates)
stds <- MatrixVectorMerge(stds, results[[i]]$stds)
}

empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE)
estimated_stds <- apply(stds, 2, mean, na.rm = TRUE)
@@ -217,7 +219,7 @@ CheckDecodeAveAndStds <- function(...) {
checkTrue(any(estimated_stds > empirical_stds / 2),
"Our estimate for the standard deviation is too low")

checkTrue(any(estimated_stds < empirical_stds * 3),
checkTrue(any(estimated_stds < empirical_stds * 2),
"Our estimate for the standard deviation is too high")
}
}
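
The for loop introduced above also fixes a real bug: the old lapply calls discarded their return values, and assignment inside the callback binds to a local copy, so estimates and stds stayed empty matrices. A minimal base-R illustration of the pitfall:

acc <- c()
invisible(lapply(1:3, function(i) acc <- c(acc, i)))
length(acc)                   # 0 -- the callback assigned to a local copy

for (i in 1:3) acc <- c(acc, i)
length(acc)                   # 3 -- the for loop updates acc in place
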
2 changes: 0 additions & 2 deletions bin/decode_dist.R
@@ -16,8 +16,6 @@ source.rappor("analysis/R/read_input.R")
source.rappor("analysis/R/decode.R")
source.rappor("analysis/R/util.R")

source.rappor("analysis/R/alternative.R")

options(stringsAsFactors = FALSE)

# Do command line parsing first to catch errors. Loading libraries in R is
2 changes: 0 additions & 2 deletions tests/compare_dist.R
@@ -51,8 +51,6 @@ source("analysis/R/read_input.R")
source("analysis/R/decode.R")
source("analysis/R/util.R")

source("analysis/R/alternative.R") # temporary

LoadContext <- function(prefix_case) {
# Creates the context, filling it with privacy parameters
# Arg:
17 changes: 8 additions & 9 deletions tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs
counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)

for(i in 2:4)
for(i in 2:5)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]),
@@ -64,14 +64,13 @@

counts2 <- counts2 / v

for(i in 2:4)
for(i in 2:5)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]),
p = c(.5, .5))$p.value)

checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
"Chi-squared test failed")
checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
}

TestRandomPartition <- function() {
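
The assertion in TestGenerateCounts above now keeps only the lower-tail check: with p = q = 0.5 each reported bit is a fair coin, so a chi-squared test of set vs. unset counts should essentially never produce a p-value below 1E-9. A minimal base-R sketch of that check pattern (synthetic counts, not this repo's API):

set.seed(3)
n <- 10000
bits_set <- rbinom(1, size = n, prob = .5)  # fair-coin bit counts
p_value <- chisq.test(c(n - bits_set, bits_set), p = c(.5, .5))$p.value
stopifnot(p_value > 1E-9)                   # under the null, fails with probability 1E-9
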
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value

# Apply the chi squared test and fail if p.value is too high or too low.
# Probability of failure is 2 * 1E-9, which should never happen.
checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
# Apply the chi-squared test and fail if p.value is too low.
# Probability of failure is 1E-9, which should never happen.
checkTrue(p.value > 1E-9)
}

TestAll <- function(){
CheckAll <- function(){
TestRandomPartition()
TestGenerateCounts()
}

TestAll()
CheckAll()