This repository was archived by the owner on Sep 27, 2023. It is now read-only.
Changes from all commits
23 commits
6e0e7c6
Revert "Merge branch 'master' of github.com:google/rappor into drop-QP"
ilyamironov Jul 13, 2015
4f0bb85
FitLasso to handle the case of a single variable correctly.
ilyamironov Jul 13, 2015
7bf1a2d
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Jul 13, 2015
11ae8b3
Merging drop-QP and master.
ilyamironov Jul 13, 2015
4d5214b
Removing alternative.R
ilyamironov Jul 13, 2015
ff66ab0
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 13, 2015
04d9769
Restoring gen_counts_test.R
ilyamironov Jul 14, 2015
4f3aa3a
Fixing reviewer's comments. Functionality of TestDecode is restored.
ilyamironov Jul 14, 2015
6b38d61
Adjusting the parameter of the Lasso algorithm, and fixing elapsed ti…
ilyamironov Jul 21, 2015
0c76391
Requiring that the estimated count be at least two SD away from 0.
ilyamironov Jul 21, 2015
2255a34
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Jul 21, 2015
ec3c320
Requiring that the estimated count be at least two SD away from 0.
ilyamironov Jul 21, 2015
f4b7222
Deleting obsolete code stemming from PerformInference calls, shaving …
ilyamironov Jul 22, 2015
f548a29
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 22, 2015
9557eb9
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Jul 22, 2015
6f185e3
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
2b5a518
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
e27ccc0
Another drop = FALSE error
ilyamironov Aug 13, 2015
f451872
Merge branch 'drop-QP' of github.com:google/rappor into drop-QP
ilyamironov Aug 13, 2015
4986be9
Merge branch 'master' of github.com:google/rappor into drop-QP
ilyamironov Sep 9, 2015
fd62d97
Dropping reference to alternative.R
ilyamironov Sep 9, 2015
7e4d00b
Changing the cutoff from two SD to one for outputting a candidate
ilyamironov Sep 11, 2015
60c64ea
Changing the number of iterations from 20 to 10.
ilyamironov Sep 11, 2015
83 changes: 0 additions & 83 deletions analysis/R/alternative.R

This file was deleted.

143 changes: 57 additions & 86 deletions analysis/R/decode.R
@@ -23,8 +23,6 @@ source.rappor <- function(rel_path) {
source(abs_path)
}

source.rappor('analysis/R/alternative.R')

EstimateBloomCounts <- function(params, obs_counts) {
# Estimates the number of times each bit in each cohort was set in original
# Bloom filters.
@@ -103,22 +101,32 @@ FitLasso <- function(X, Y, intercept = TRUE) {
# a vector of size ncol(X) of coefficients.

# TODO(mironov): Test cv.glmnet instead of glmnet
mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
lower.limits = 0, # outputs are non-negative
# Cap the number of non-zero coefficients to 500 or
# 80% of the length of Y, whichever is less. The 500 cap
# is for performance reasons, 80% is to avoid overfitting.
pmax = min(500, length(Y) * .8)),
silent = TRUE)

# If fitting fails, return an empty data.frame.
if (class(mod)[1] == "try-error") {
coefs <- setNames(rep(0, ncol(X)), colnames(X))
} else {
coefs <- coef(mod)
coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept
}
coefs

# Cap the number of non-zero coefficients to 1,000 or 80% of the number of
# constraints, whichever is less. The 1,000 cap is for performance reasons, 80%
# is to avoid overfitting.
cap <- min(1000, nrow(X) * .8, ncol(X))

if (ncol(X) == 1)
XX <- cbind2(X, rep(0, nrow(X))) # add a dummy variable since glmnet can't handle a single-column matrix
else
XX <- X

mod <- glmnet(XX, Y, standardize = TRUE, intercept = intercept,
lower.limits = 0, # outputs are non-negative
lambda.min.ratio = 1E-4,
pmax = cap)

coefs <- coef(mod)
coefs <- coefs[-1, , drop = FALSE] # drop the intercept
l1cap <- sum(colSums(coefs) <= 1.0) # count solutions with L1 norm <= 1; the norms are nondecreasing along the path, so this is also the index of the last such solution
if (l1cap > 0) {
distr <- coefs[, l1cap] # return the last set of coefficients with L1 <= 1
if (ncol(X) == 1)
distr <- distr[1] # dropping the dummy variable
} else
distr <- setNames(rep(0, ncol(X)), colnames(X))
distr
}

PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
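
The rewritten FitLasso above scans glmnet's regularization path: because the coefficients are constrained to be non-negative (lower.limits = 0), colSums() gives each solution's L1 norm, and counting the solutions with norm <= 1 yields the index of the last one whose total mass is still a valid sub-distribution. A minimal sketch of that selection idea, assuming glmnet is installed (synthetic data, not this repo's API):

library(glmnet)

set.seed(1)
X <- matrix(runif(200 * 5), nrow = 200)           # 200 constraints, 5 candidates
Y <- as.vector(X %*% c(.4, .3, 0, 0, 0)) + rnorm(200, sd = .01)

mod <- glmnet(X, Y, intercept = FALSE, lower.limits = 0,
              lambda.min.ratio = 1E-4)
coefs <- as.matrix(coef(mod))[-1, , drop = FALSE] # drop the intercept row

l1 <- colSums(coefs)    # L1 norm of each solution along the path
last <- sum(l1 <= 1.0)  # norms are nondecreasing, so this is the last index with mass <= 1
if (last > 0) print(coefs[, last])
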
@@ -216,43 +224,25 @@ ComputePrivacyGuarantees <- function(params, alpha, N) {
privacy
}

FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
FitDistribution <- function(estimates, map, quiet = FALSE) {
# Find a distribution over rows of map that approximates estimates_stds best
#
# Input:
# estimates_stds: a list of two m x k matrices, one for estimates, another
# for standard errors
# map : an (m * k) x S boolean matrix
# estimates: an m x k matrix of estimates
# map : an (m * k) x S boolean matrix
#
# Output:
# a float vector of length S, so that a distribution over map's rows sampled
# according to this vector approximates estimates

S <- ncol(map) # total number of candidates

support_coefs <- 1:S

if (S > length(estimates_stds$estimates) * .8) {
# the system is close to being underdetermined
lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))

# Select non-zero coefficients.
support_coefs <- which(lasso > 0)

if(!quiet)
cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
}
# an S-long vector of non-negative floats, so that a distribution over
# the map's rows sampled according to this vector approximates estimates

coefs <- setNames(rep(0, S), colnames(map))
lasso <- FitLasso(map, estimates)

if(length(support_coefs) > 0) { # LASSO may return an empty list
constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
estimates_stds)
if (!quiet)
cat("LASSO selected ", sum(lasso > 0), " non-zero coefficients.\n")

coefs[support_coefs] <- constrained_coefs
}
names(lasso) <- colnames(map)

coefs
lasso
}

Resample <- function(e) {
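
For context, the lasso-only FitDistribution above treats decoding as an inverse problem: the bit-level estimates are approximately map %*% distribution, so a non-negative lasso over the map's columns recovers a sparse distribution directly, without the old ConstrainedLinModel refit. A small self-contained sketch of that framing (synthetic data, assuming glmnet; not this repo's API):

library(glmnet)

set.seed(2)
S <- 20                                       # number of candidates
mk <- 256                                     # m * k bit estimates
M <- matrix(rbinom(mk * S, 1, .5), nrow = mk)
d <- c(.5, .3, .2, rep(0, S - 3))             # true sparse distribution
e <- as.vector(M %*% d) + rnorm(mk, sd = .01)

fit <- glmnet(M, e, lower.limits = 0)
path <- as.matrix(coef(fit))[-1, , drop = FALSE]
round(path[, ncol(path)], 2)                  # ~ (.5, .3, .2, 0, ..., 0)
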
@@ -266,7 +256,7 @@ Resample <- function(e) {
list(estimates = estimates, stds = stds)
}

Decode <- function(counts, map, params, alpha = 0.05,
Decode <- function(counts, map, params, alpha = 0.05, iterations = 10,
correction = c("Bonferroni"), quiet = FALSE, ...) {
k <- params$k
p <- params$p
@@ -293,37 +283,39 @@ Decode <- function(counts, map, params, alpha = 0.05,

coefs_all <- vector()

# Run the fitting procedure several times (5 seems to be sufficient and not
# too many) to estimate standard deviation of the output.
for(r in 1:5) {
if(r > 1)
# Run the fitting procedure several times (5 seems to be the minimum that
# makes sense) to estimate standard deviation of the output.
groups <- sample(1:iterations, length(filter_bits), replace = TRUE)

for(r in 1:iterations) {
if (r > 1)
e <- Resample(estimates_stds_filtered)
else
e <- estimates_stds_filtered

e_vec <- as.vector(t(e$estimates))
subset <- filter_bits[groups != r]

coefs_all <- rbind(coefs_all,
FitDistribution(e, map[filter_bits, , drop = FALSE],
FitDistribution(e_vec[subset],
map[subset, , drop = FALSE],
quiet))
}

coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations
coefs_ave <- N * apply(coefs_all, 2, mean)

# Only select coefficients more than two standard deviations from 0. May
# Only select coefficients more than one standard deviation from 0. May
# inflate empirical SD of the estimates.
reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
reported <- which(coefs_ave > 1E-6 + 1 * coefs_ssd)

mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])

if (correction == "Bonferroni") {
alpha <- alpha / S
}
fit <- data.frame(string = colnames(map[, reported, drop = FALSE]),
Estimate = matrix(mod$coefs, ncol = 1),
SD = mod$stds,
stringsAsFactors = FALSE)

inf <- PerformInference(map[filter_bits, reported, drop = FALSE],
as.vector(t(estimates_stds_filtered$estimates)),
N, mod, params, alpha,
correction)
fit <- inf$fit
# If this is a basic RAPPOR instance, just use the counts for the estimate
# (Check if the map is diagonal to tell if this is basic RAPPOR.)
if (sum(map) == sum(diag(map))) {
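
Two stabilizing devices meet in the loop above: every round after the first perturbs the estimates (Resample, a parametric bootstrap) and drops one random group of bits (a delete-a-group jackknife), and a candidate is then reported only if its averaged coefficient exceeds one sample standard deviation plus a small epsilon. A toy base-R illustration of the selection rule on made-up numbers:

coefs_all <- rbind(c(.10, .01, .05),        # 3 fitting rounds x 3 candidates
                   c(.12, .00, .02),
                   c(.11, .02, .08))
N <- 1000
coefs_ave <- N * apply(coefs_all, 2, mean)  # 110, 10, 50
coefs_ssd <- N * apply(coefs_all, 2, sd)    # 10, 10, 30
which(coefs_ave > 1E-6 + 1 * coefs_ssd)     # 1 and 3; candidate 2 is within one SD of 0
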
Expand Down Expand Up @@ -351,38 +343,17 @@ Decode <- function(counts, map, params, alpha = 0.05,
allocated_mass <- sum(fit$proportion)
num_detected <- nrow(fit)

ss <- round(inf$SS, digits = 3)
explained_var <- ss[[1]]
missing_var <- ss[[2]]
noise_var <- ss[[3]]

noise_std_dev <- round(inf$resid_sigma, digits = 3)

# Compute summary of the fit.
parameters <-
c("Candidate strings", "Detected strings",
"Sample size (N)", "Discovered Prop (out of N)",
"Explained Variance", "Missing Variance", "Noise Variance",
"Theoretical Noise Std. Dev.")
values <- c(S, num_detected, N, allocated_mass,
explained_var, missing_var, noise_var, noise_std_dev)

res_summary <- data.frame(parameters = parameters, values = values)

privacy <- ComputePrivacyGuarantees(params, alpha, N)
params <- data.frame(parameters =
c("k", "h", "m", "p", "q", "f", "N", "alpha"),
values = c(k, h, m, p, q, f, N, alpha))

# This is a list of decode stats in a better format than 'summary'.
# TODO: Delete summary.
metrics <- list(sample_size = N,
allocated_mass = allocated_mass,
num_detected = num_detected,
explained_var = explained_var,
missing_var = missing_var)
num_detected = num_detected
)

list(fit = fit, summary = res_summary, privacy = privacy, params = params,
list(fit = fit, privacy = privacy, params = params,
lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)),
counts = counts[, -1], resid = NULL, metrics = metrics)
}
10 changes: 6 additions & 4 deletions analysis/R/decode_test.R
@@ -205,10 +205,12 @@ CheckDecodeAveAndStds <- function(...) {
results <- RunMultipleTests(...)

estimates <- matrix(nrow = 0, ncol = 0)
lapply(results, function(r) MatrixVectorMerge(estimates, r$estimates))

stds <- matrix(nrow = 0, ncol = 0)
lapply(results, function(r) MatrixVectorMerge(stds, r$stds))

for(i in 1:length(results)) {
estimates <- MatrixVectorMerge(estimates, results[[i]]$estimates)
stds <- MatrixVectorMerge(stds, results[[i]]$stds)
}

empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE)
estimated_stds <- apply(stds, 2, mean, na.rm = TRUE)
@@ -217,7 +219,7 @@ CheckDecodeAveAndStds <- function(...) {
checkTrue(any(estimated_stds > empirical_stds / 2),
"Our estimate for the standard deviation is too low")

checkTrue(any(estimated_stds < empirical_stds * 3),
checkTrue(any(estimated_stds < empirical_stds * 2),
"Our estimate for the standard deviation is too high")
}
}
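
The for loop introduced above also fixes a real bug: the old lapply calls discarded their return values, and assignment inside the callback binds to a local copy, so estimates and stds stayed empty matrices. A minimal base-R illustration of the pitfall:

acc <- c()
invisible(lapply(1:3, function(i) acc <- c(acc, i)))
length(acc)                   # 0 -- the callback assigned to a local copy

for (i in 1:3) acc <- c(acc, i)
length(acc)                   # 3 -- the for loop updates acc in place
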
2 changes: 0 additions & 2 deletions bin/decode_dist.R
@@ -16,8 +16,6 @@ source.rappor("analysis/R/read_input.R")
source.rappor("analysis/R/decode.R")
source.rappor("analysis/R/util.R")

source.rappor("analysis/R/alternative.R")

options(stringsAsFactors = FALSE)

# Do command line parsing first to catch errors. Loading libraries in R is
2 changes: 0 additions & 2 deletions tests/compare_dist.R
@@ -51,8 +51,6 @@ source("analysis/R/read_input.R")
source("analysis/R/decode.R")
source("analysis/R/util.R")

source("analysis/R/alternative.R") # temporary

LoadContext <- function(prefix_case) {
# Creates the context, filling it with privacy parameters
# Arg:
17 changes: 8 additions & 9 deletions tests/gen_counts_test.R
@@ -51,7 +51,7 @@ TestGenerateCounts <- function() {
noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs
counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)

for(i in 2:4)
for(i in 2:5)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]),
@@ -64,14 +64,13 @@

counts2 <- counts2 / v

for(i in 2:4)
for(i in 2:5)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]),
p = c(.5, .5))$p.value)

checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
"Chi-squared test failed")
checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
}

TestRandomPartition <- function() {
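
The assertion in TestGenerateCounts above now keeps only the lower-tail check: with p = q = 0.5 each reported bit is a fair coin, so a chi-squared test of set vs. unset counts should essentially never produce a p-value below 1E-9. A minimal base-R sketch of that check pattern (synthetic counts, not this repo's API):

set.seed(3)
n <- 10000
bits_set <- rbinom(1, size = n, prob = .5)  # fair-coin bit counts
p_value <- chisq.test(c(n - bits_set, bits_set), p = c(.5, .5))$p.value
stopifnot(p_value > 1E-9)                   # under the null, fails with probability 1E-9
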
@@ -97,14 +96,14 @@ TestRandomPartition <- function() {
p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value

# Apply the chi squared test and fail if p.value is too high or too low.
# Probability of failure is 2 * 1E-9, which should never happen.
checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
# Apply the chi-squared test and fail if p.value is too low.
# Probability of failure is 1E-9, which should never happen.
checkTrue(p.value > 1E-9)
}

TestAll <- function(){
CheckAll <- function(){
TestRandomPartition()
TestGenerateCounts()
}

TestAll()
CheckAll()