diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R
deleted file mode 100644
index 3f0e66d3..00000000
--- a/analysis/R/alternative.R
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2014 Google Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-library(limSolve)
-library(Matrix)
-
-# The next two functions create a matrix (G) and a vector (H) encoding
-# linear inequality constraints that a solution vector (x) must satisfy:
-#           G * x >= H
-
-# Currently represent three sets of constraints on the solution vector:
-#  - all solution coefficients are nonnegative
-#  - the sum total of all solution coefficients is no more than 1
-#  - in each of the coordinates of the target vector (estimated Bloom filter)
-#    we don't overshoot by more than three standard deviations.
-MakeG <- function(n, X) {
-  d <- Diagonal(n)
-  last <- rep(-1, n)
-  rbind2(rbind2(d, last), -X)
-}
-
-MakeH <- function(n, Y, stds) {
-  # set the floor at 0.01 to avoid degenerate cases
-  YY <- apply(Y + 3 * stds,  # in each bin don't overshoot by more than 3 stds
-              1:2,
-              function(x) min(1, max(0.01, x)))  # clamp the bound to [0.01,1]
-
-  c(rep(0, n),         # non-negativity condition
-    -1,                # coefficients sum up to no more than 1
-    -as.vector(t(YY))  # t is important!
-    )
-}
-
-MakeLseiModel <- function(X, Y, stds) {
-  m <- dim(X)[1]
-  n <- dim(X)[2]
-
-#   no slack variables for now
-#   slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE)
-#   colnames(slack) <- 1:m
-#   diag(slack) <- TRUE
-#
-#   G <- MakeG(n + m)
-#   H <- MakeH(n + m)
-#
-#   G[n+m+1,n:(n+m)] <- -0.1
-#   A = cbind2(X, slack)
-
-  w <- as.vector(t(1 / stds))
-  w_median <- median(w[!is.infinite(w)])
-  if(is.na(w_median))  # all w are infinite
-    w_median <- 1
-  w[w > w_median * 2] <- w_median * 2
-  w <- w / mean(w)
-
-  list(# coerce sparse Boolean matrix X to sparse numeric matrix
-       A = Diagonal(x = w) %*% (X + 0),
-       B = as.vector(t(Y)) * w,  # transform to vector in the row-first order
-       G = MakeG(n, X),
-       H = MakeH(n, Y, stds),
-       type = 2)  # Since there are no equality constraints, lsei defaults to
-                  # solve.QP anyway, but outputs a warning unless type == 2.
-}
-
-# CustomLM(X, Y)
-ConstrainedLinModel <- function(X,Y) {
-  model <- MakeLseiModel(X, Y$estimates, Y$stds)
-  coefs <- do.call(lsei, model)$X
-  names(coefs) <- colnames(X)
-
-  coefs
-}
\ No newline at end of file
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
index 50f64702..7d651145 100644
--- a/analysis/R/decode.R
+++ b/analysis/R/decode.R
@@ -23,8 +23,6 @@ source.rappor <- function(rel_path) {
   source(abs_path)
 }
 
-source.rappor('analysis/R/alternative.R')
-
 EstimateBloomCounts <- function(params, obs_counts) {
   # Estimates the number of times each bit in each cohort was set in original
   # Bloom filters.
@@ -103,22 +101,32 @@ FitLasso <- function(X, Y, intercept = TRUE) {
   #    a vector of size ncol(X) of coefficients.
 
   # TODO(mironov): Test cv.glmnet instead of glmnet
-  mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
-                    lower.limits = 0,  # outputs are non-negative
-                    # Cap the number of non-zero coefficients to 500 or
-                    # 80% of the length of Y, whichever is less. The 500 cap
-                    # is for performance reasons, 80% is to avoid overfitting.
-                    pmax = min(500, length(Y) * .8)),
-             silent = TRUE)
-
-  # If fitting fails, return an empty data.frame.
-  if (class(mod)[1] == "try-error") {
-    coefs <- setNames(rep(0, ncol(X)), colnames(X))
-  } else {
-    coefs <- coef(mod)
-    coefs <- coefs[-1, ncol(coefs), drop = FALSE]  # coefs[1] is the intercept
-  }
-  coefs
+
+  # Cap the number of non-zero coefficients at 1,000 or 80% of the number of
+  # constraints, whichever is less (never more than the number of candidates).
+  # The 1,000 cap is for performance; the 80% cap guards against overfitting.
+  cap <- min(1000, nrow(X) * .8, ncol(X))
+
+  if (ncol(X) == 1)
+    XX <- cbind2(X, rep(0, nrow(X)))  # glmnet needs at least two columns
+  else
+    XX <- X
+
+  mod <- glmnet(XX, Y, standardize = TRUE, intercept = intercept,
+                lower.limits = 0,  # outputs are non-negative
+                lambda.min.ratio = 1E-4,
+                pmax = cap)
+
+  coefs <- coef(mod)
+  coefs <- coefs[-1, , drop = FALSE]  # drop the intercept
+  l1cap <- sum(colSums(coefs) <= 1.0)  # columns with L1 norm <= 1 (coefs >= 0)
+  if (l1cap > 0) {
+    distr <- coefs[, l1cap]  # the last (least regularized) set with L1 <= 1
+    if (ncol(X) == 1)
+      distr <- distr[1]  # drop the dummy variable
+  } else
+    distr <- setNames(rep(0, ncol(X)), colnames(X))
+  distr
 }
 
 PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
@@ -216,43 +224,25 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
   privacy
 }
 
-FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
+FitDistribution <- function(estimates, map, quiet = FALSE) {
   # Find a distribution over rows of map that approximates estimates_stds best
   #
   # Input:
-  #   estimates_stds: a list of two m x k matrices, one for estimates, another
-  #                   for standard errors
-  #   map           : an (m * k) x S boolean matrix
+  #   estimates: a vector of estimates, one per row of map
+  #   map      : an (m * k) x S boolean matrix
   #
   # Output:
-  #   a float vector of length S, so that a distribution over map's rows sampled
-  #   according to this vector approximates estimates
-
-  S <- ncol(map)  # total number of candidates
-
-  support_coefs <- 1:S
-
-  if (S > length(estimates_stds$estimates) * .8) {
-    # the system is close to being underdetermined
-    lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
-
-    # Select non-zero coefficients.
-    support_coefs <- which(lasso > 0)
-
-    if(!quiet)
-      cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
-  }
+  #   an S-long vector of non-negative floats, so that a distribution over
+  #   the map's rows sampled according to this vector approximates estimates
 
-  coefs <- setNames(rep(0, S), colnames(map))
+  lasso <- FitLasso(map, estimates)
 
-  if(length(support_coefs) > 0) {  # LASSO may return an empty list
-    constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
-                                             estimates_stds)
+  if (!quiet)
+    cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")
 
-    coefs[support_coefs] <- constrained_coefs
-  }
+  names(lasso) <- colnames(map)
 
-  coefs
+  lasso
 }
 
 Resample <- function(e) {
@@ -266,7 +256,7 @@ Resample <- function(e) {
   list(estimates = estimates, stds = stds)
 }
 
-Decode <- function(counts, map, params, alpha = 0.05,
+Decode <- function(counts, map, params, alpha = 0.05, iterations = 10,
                    correction = c("Bonferroni"), quiet = FALSE, ...) {
   k <- params$k
   p <- params$p
@@ -293,37 +283,39 @@ Decode <- function(counts, map, params, alpha = 0.05,
 
   coefs_all <- vector()
 
-  # Run the fitting procedure several times (5 seems to be sufficient and not
-  # too many) to estimate standard deviation of the output.
-  for(r in 1:5) {
-    if(r > 1)
+  # Run the fitting procedure `iterations` times (5 seems to be the minimum
+  # that makes sense) to estimate the standard deviation of the output.
+  groups <- sample(1:iterations, length(filter_bits), replace = TRUE)
+
+  for(r in 1:iterations) {
+    if (r > 1)
      e <- Resample(estimates_stds_filtered)
     else
      e <- estimates_stds_filtered
 
+    e_vec <- as.vector(t(e$estimates))
+    subset <- filter_bits[groups != r]
+
     coefs_all <- rbind(coefs_all,
-                       FitDistribution(e, map[filter_bits, , drop = FALSE],
+                       FitDistribution(e_vec[subset],
+                                       map[subset, , drop = FALSE],
                                        quiet))
   }
 
   coefs_ssd <- N * apply(coefs_all, 2, sd)  # compute sample standard deviations
   coefs_ave <- N * apply(coefs_all, 2, mean)
 
-  # Only select coefficients more than two standard deviations from 0. May
+  # Only select coefficients more than one standard deviation from 0. May
   # inflate empirical SD of the estimates.
-  reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
+  reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)
 
   mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
 
-  if (correction == "Bonferroni") {
-    alpha <- alpha / S
-  }
+  fit <- data.frame(string = colnames(map[, reported, drop = FALSE]),
+                    Estimate = matrix(mod$coefs, ncol = 1),
+                    SD = mod$stds,
+                    stringsAsFactors = FALSE)
 
-  inf <- PerformInference(map[filter_bits, reported, drop = FALSE],
-                          as.vector(t(estimates_stds_filtered$estimates)),
-                          N, mod, params, alpha,
-                          correction)
-  fit <- inf$fit
   # If this is a basic RAPPOR instance, just use the counts for the estimate
   # (Check if the map is diagonal to tell if this is basic RAPPOR.)
   if (sum(map) == sum(diag(map))) {
@@ -351,38 +343,17 @@ Decode <- function(counts, map, params, alpha = 0.05,
   allocated_mass <- sum(fit$proportion)
   num_detected <- nrow(fit)
 
-  ss <- round(inf$SS, digits = 3)
-  explained_var <- ss[[1]]
-  missing_var <- ss[[2]]
-  noise_var <- ss[[3]]
-
-  noise_std_dev <- round(inf$resid_sigma, digits = 3)
-
-  # Compute summary of the fit.
-  parameters <-
-      c("Candidate strings", "Detected strings",
-        "Sample size (N)", "Discovered Prop (out of N)",
-        "Explained Variance", "Missing Variance", "Noise Variance",
-        "Theoretical Noise Std. Dev.")
Dev.") - values <- c(S, num_detected, N, allocated_mass, - explained_var, missing_var, noise_var, noise_std_dev) - - res_summary <- data.frame(parameters = parameters, values = values) - privacy <- ComputePrivacyGuarantees(params, alpha, N) params <- data.frame(parameters = c("k", "h", "m", "p", "q", "f", "N", "alpha"), values = c(k, h, m, p, q, f, N, alpha)) - # This is a list of decode stats in a better format than 'summary'. - # TODO: Delete summary. metrics <- list(sample_size = N, allocated_mass = allocated_mass, - num_detected = num_detected, - explained_var = explained_var, - missing_var = missing_var) + num_detected = num_detected + ) - list(fit = fit, summary = res_summary, privacy = privacy, params = params, + list(fit = fit, privacy = privacy, params = params, lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)), counts = counts[, -1], resid = NULL, metrics = metrics) } diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R index 71db7aa5..c4fbdfcf 100755 --- a/analysis/R/decode_test.R +++ b/analysis/R/decode_test.R @@ -205,10 +205,12 @@ CheckDecodeAveAndStds <- function(...) { results <- RunMultipleTests(...) estimates <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates)) - stds <- matrix(nrow = 0, ncol = 0) - lapply(results, function(r) MatrixVectorMerge(stds, r$stds)) + + for(i in 1:length(results)) { + estimates <- MatrixVectorMerge(estimates, results[[i]]$estimates) + stds <- MatrixVectorMerge(stds, results[[i]]$stds) + } empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE) estimated_stds <- apply(stds, 2, mean, na.rm = TRUE) @@ -217,7 +219,7 @@ CheckDecodeAveAndStds <- function(...) { checkTrue(any(estimated_stds > empirical_stds / 2), "Our estimate for the standard deviation is too low") - checkTrue(any(estimated_stds < empirical_stds * 3), + checkTrue(any(estimated_stds < empirical_stds * 2), "Our estimate for the standard deviation is too high") } } diff --git a/bin/decode_dist.R b/bin/decode_dist.R index f61e155b..6ed38b18 100755 --- a/bin/decode_dist.R +++ b/bin/decode_dist.R @@ -16,8 +16,6 @@ source.rappor("analysis/R/read_input.R") source.rappor("analysis/R/decode.R") source.rappor("analysis/R/util.R") -source.rappor("analysis/R/alternative.R") - options(stringsAsFactors = FALSE) # Do command line parsing first to catch errors. 
diff --git a/tests/compare_dist.R b/tests/compare_dist.R
index e54978b4..fba920eb 100755
--- a/tests/compare_dist.R
+++ b/tests/compare_dist.R
@@ -51,8 +51,6 @@ source("analysis/R/read_input.R")
 source("analysis/R/decode.R")
 source("analysis/R/util.R")
 
-source("analysis/R/alternative.R")  # temporary
-
 LoadContext <- function(prefix_case) {
   # Creates the context, filling it with privacy parameters
   # Arg:
diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R
index 49ad3be5..e91de68e 100755
--- a/tests/gen_counts_test.R
+++ b/tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
   noise1 <- list(p = .5, q = .5, f = 0)  # truly random IRRs
   counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)
 
-  for(i in 2:4)
+  for(i in 2:5)
     for(j in 1:2)
       pvalues <- c(pvalues, chisq.test(c(counts1[j,1] - counts1[j,i],
                                          counts1[j,i]),
@@ -64,14 +64,13 @@ TestGenerateCounts <- function() {
 
   counts2 <- counts2 / v
 
-  for(i in 2:4)
+  for(i in 2:5)
    for(j in 1:2)
      pvalues <- c(pvalues, chisq.test(c(counts2[j,1] - counts2[j,i],
                                         counts2[j,i]),
                                       p = c(.5, .5))$p.value)
 
-  checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
-            "Chi-squared test failed")
+  checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
 }
 
 TestRandomPartition <- function() {
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
   p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
   p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value
 
-  # Apply the chi squared test and fail if p.value is too high or too low.
-  # Probability of failure is 2 * 1E-9, which should never happen.
-  checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
+  # Apply the chi squared test and fail if p.value is too low.
+  # Probability of failure is 1E-9, which should never happen.
+  checkTrue(p.value > 1E-9)
 }
 
-TestAll <- function(){
+CheckAll <- function(){
   TestRandomPartition()
   TestGenerateCounts()
 }
 
-TestAll()
\ No newline at end of file
+CheckAll()
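
Note on the new FitLasso (not part of the patch itself): glmnet fits the entire regularization path in one call, and the decoder now picks the last, i.e. least regularized, coefficient set on that path whose L1 norm is still at most 1, so the result can be read directly as a (sub-)probability distribution over candidate strings. Below is a minimal standalone sketch of that selection rule on synthetic data; the names map_matrix, true_distr, and bit_estimates are made up for illustration, and the glmnet arguments mirror the ones used in the patch.

library(glmnet)  # also loads Matrix, which supplies colSums for sparse matrices

set.seed(1)
# Fake decoding problem: 200 Bloom-filter bits, 10 candidate strings.
map_matrix <- matrix(rbinom(200 * 10, 1, 0.3), nrow = 200)
true_distr <- c(0.6, 0.3, rep(0, 7), 0.1)  # sums to 1
bit_estimates <- as.vector(map_matrix %*% true_distr) + rnorm(200, sd = 0.01)

mod <- glmnet(map_matrix, bit_estimates, standardize = TRUE, intercept = TRUE,
              lower.limits = 0,        # coefficients must be non-negative
              lambda.min.ratio = 1E-4)

coefs <- coef(mod)[-1, , drop = FALSE]  # drop the intercept row
# Coefficients are non-negative, so colSums gives the L1 norm at each lambda.
l1cap <- sum(colSums(coefs) <= 1.0)
if (l1cap > 0) {
  distr <- coefs[, l1cap]  # least regularized solution with total mass <= 1
} else {
  distr <- setNames(rep(0, ncol(map_matrix)), colnames(map_matrix))
}
print(round(distr, 2))  # should roughly recover true_distr

Because the path is computed from the most to the least regularized lambda, the columns of coef(mod) have monotonically growing L1 norm, which is why indexing the last column with norm <= 1 yields the best-fitting solution that still looks like a distribution.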